]> Raphaël G. Git Repositories - youtubedl/commitdiff
Imported Upstream version 2014.06.07
authorRogério Brito <rbrito@ime.usp.br>
Sun, 8 Jun 2014 13:58:42 +0000 (10:58 -0300)
committerRogério Brito <rbrito@ime.usp.br>
Sun, 8 Jun 2014 13:58:42 +0000 (10:58 -0300)
234 files changed:
CHANGELOG [deleted file]
MANIFEST.in
Makefile
README.md
README.txt
devscripts/make_readme.py
devscripts/prepare_manpage.py [new file with mode: 0644]
devscripts/release.sh
docs/.gitignore [new file with mode: 0644]
docs/Makefile [new file with mode: 0644]
docs/conf.py [new file with mode: 0644]
docs/index.rst [new file with mode: 0644]
docs/module_guide.rst [new file with mode: 0644]
test/helper.py
test/test_InfoExtractor.py [new file with mode: 0644]
test/test_YoutubeDL.py
test/test_age_restriction.py
test/test_all_urls.py
test/test_download.py
test/test_playlists.py
test/test_subtitles.py
test/test_utils.py
test/test_youtube_lists.py
youtube-dl
youtube-dl.1
youtube-dl.bash-completion
youtube_dl/InfoExtractors.py [deleted file]
youtube_dl/YoutubeDL.py [changed mode: 0644->0755]
youtube_dl/__init__.py
youtube_dl/downloader/common.py
youtube_dl/downloader/f4m.py
youtube_dl/downloader/hls.py
youtube_dl/downloader/http.py
youtube_dl/downloader/rtmp.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/academicearth.py
youtube_dl/extractor/addanime.py
youtube_dl/extractor/aftonbladet.py [new file with mode: 0644]
youtube_dl/extractor/aol.py [new file with mode: 0644]
youtube_dl/extractor/appletrailers.py
youtube_dl/extractor/ard.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/auengine.py
youtube_dl/extractor/bandcamp.py
youtube_dl/extractor/bbccouk.py
youtube_dl/extractor/bilibili.py [new file with mode: 0644]
youtube_dl/extractor/blinkx.py
youtube_dl/extractor/bliptv.py
youtube_dl/extractor/bloomberg.py
youtube_dl/extractor/br.py [new file with mode: 0644]
youtube_dl/extractor/breakcom.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/byutv.py [new file with mode: 0644]
youtube_dl/extractor/c56.py
youtube_dl/extractor/canal13cl.py [new file with mode: 0644]
youtube_dl/extractor/canalc2.py
youtube_dl/extractor/canalplus.py
youtube_dl/extractor/cbsnews.py [new file with mode: 0644]
youtube_dl/extractor/ceskatelevize.py [new file with mode: 0644]
youtube_dl/extractor/cinemassacre.py
youtube_dl/extractor/clipfish.py
youtube_dl/extractor/clipsyndicate.py
youtube_dl/extractor/clubic.py [new file with mode: 0644]
youtube_dl/extractor/cmt.py
youtube_dl/extractor/cnet.py [new file with mode: 0644]
youtube_dl/extractor/cnn.py
youtube_dl/extractor/collegehumor.py
youtube_dl/extractor/comedycentral.py
youtube_dl/extractor/common.py
youtube_dl/extractor/condenast.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/daum.py
youtube_dl/extractor/depositfiles.py [deleted file]
youtube_dl/extractor/discovery.py
youtube_dl/extractor/divxstage.py [new file with mode: 0644]
youtube_dl/extractor/ehow.py
youtube_dl/extractor/empflix.py [new file with mode: 0644]
youtube_dl/extractor/engadget.py [new file with mode: 0644]
youtube_dl/extractor/extremetube.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/fc2.py [new file with mode: 0644]
youtube_dl/extractor/firstpost.py
youtube_dl/extractor/fivemin.py [new file with mode: 0644]
youtube_dl/extractor/fourtube.py
youtube_dl/extractor/franceculture.py [new file with mode: 0644]
youtube_dl/extractor/francetv.py
youtube_dl/extractor/funnyordie.py
youtube_dl/extractor/gamekings.py
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/gdcvault.py [new file with mode: 0644]
youtube_dl/extractor/generic.py
youtube_dl/extractor/googlesearch.py
youtube_dl/extractor/hentaistigma.py [new file with mode: 0644]
youtube_dl/extractor/huffpost.py
youtube_dl/extractor/iconosquare.py [moved from youtube_dl/extractor/statigram.py with 76% similarity]
youtube_dl/extractor/ign.py
youtube_dl/extractor/infoq.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/iprima.py
youtube_dl/extractor/ivi.py
youtube_dl/extractor/jukebox.py
youtube_dl/extractor/justintv.py
youtube_dl/extractor/keezmovies.py
youtube_dl/extractor/kickstarter.py
youtube_dl/extractor/kontrtube.py
youtube_dl/extractor/ku6.py [new file with mode: 0644]
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/liveleak.py
youtube_dl/extractor/lynda.py
youtube_dl/extractor/mailru.py [new file with mode: 0644]
youtube_dl/extractor/mdr.py
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/metacritic.py
youtube_dl/extractor/mit.py
youtube_dl/extractor/mixcloud.py
youtube_dl/extractor/mooshare.py
youtube_dl/extractor/morningstar.py [new file with mode: 0644]
youtube_dl/extractor/motorsport.py [new file with mode: 0644]
youtube_dl/extractor/moviezine.py [new file with mode: 0644]
youtube_dl/extractor/movshare.py [new file with mode: 0644]
youtube_dl/extractor/mpora.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/musicplayon.py [new file with mode: 0644]
youtube_dl/extractor/myvideo.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/nba.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/ndr.py
youtube_dl/extractor/newstube.py [new file with mode: 0644]
youtube_dl/extractor/nfb.py
youtube_dl/extractor/niconico.py
youtube_dl/extractor/ninegag.py
youtube_dl/extractor/noco.py [new file with mode: 0644]
youtube_dl/extractor/normalboots.py
youtube_dl/extractor/novamov.py
youtube_dl/extractor/nowness.py
youtube_dl/extractor/nowvideo.py
youtube_dl/extractor/nrk.py [new file with mode: 0644]
youtube_dl/extractor/ntv.py [new file with mode: 0644]
youtube_dl/extractor/nuvid.py [new file with mode: 0644]
youtube_dl/extractor/nytimes.py [new file with mode: 0644]
youtube_dl/extractor/oe1.py [new file with mode: 0644]
youtube_dl/extractor/ooyala.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/parliamentliveuk.py [new file with mode: 0644]
youtube_dl/extractor/pbs.py
youtube_dl/extractor/photobucket.py
youtube_dl/extractor/playvid.py [new file with mode: 0644]
youtube_dl/extractor/podomatic.py
youtube_dl/extractor/pornhd.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/prosiebensat1.py [new file with mode: 0644]
youtube_dl/extractor/pyvideo.py
youtube_dl/extractor/radiofrance.py
youtube_dl/extractor/ro220.py
youtube_dl/extractor/roxwel.py
youtube_dl/extractor/rtbf.py [new file with mode: 0644]
youtube_dl/extractor/rtlnow.py
youtube_dl/extractor/rts.py [new file with mode: 0644]
youtube_dl/extractor/rtve.py [new file with mode: 0644]
youtube_dl/extractor/rutube.py
youtube_dl/extractor/rutv.py [new file with mode: 0644]
youtube_dl/extractor/savefrom.py [new file with mode: 0644]
youtube_dl/extractor/scivee.py [new file with mode: 0644]
youtube_dl/extractor/slashdot.py [deleted file]
youtube_dl/extractor/slideshare.py
youtube_dl/extractor/slutload.py [new file with mode: 0644]
youtube_dl/extractor/smotri.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/space.py
youtube_dl/extractor/spankwire.py
youtube_dl/extractor/spiegeltv.py [new file with mode: 0644]
youtube_dl/extractor/spike.py
youtube_dl/extractor/steam.py
youtube_dl/extractor/streamcz.py
youtube_dl/extractor/swrmediathek.py [new file with mode: 0644]
youtube_dl/extractor/syfy.py
youtube_dl/extractor/tagesschau.py [new file with mode: 0644]
youtube_dl/extractor/teachertube.py [new file with mode: 0644]
youtube_dl/extractor/teachingchannel.py [new file with mode: 0644]
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/ted.py
youtube_dl/extractor/testurl.py [new file with mode: 0644]
youtube_dl/extractor/tf1.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/tinypic.py
youtube_dl/extractor/tlc.py [new file with mode: 0644]
youtube_dl/extractor/toypics.py [new file with mode: 0644]
youtube_dl/extractor/trutube.py [new file with mode: 0644]
youtube_dl/extractor/tube8.py
youtube_dl/extractor/tvigle.py [new file with mode: 0644]
youtube_dl/extractor/udemy.py [new file with mode: 0644]
youtube_dl/extractor/urort.py [new file with mode: 0644]
youtube_dl/extractor/ustream.py
youtube_dl/extractor/veoh.py
youtube_dl/extractor/vesti.py
youtube_dl/extractor/vevo.py
youtube_dl/extractor/vh1.py [new file with mode: 0644]
youtube_dl/extractor/vice.py [deleted file]
youtube_dl/extractor/videobam.py [new file with mode: 0644]
youtube_dl/extractor/videodetective.py
youtube_dl/extractor/videolecturesnet.py [new file with mode: 0644]
youtube_dl/extractor/videott.py [new file with mode: 0644]
youtube_dl/extractor/videoweed.py [new file with mode: 0644]
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/vube.py
youtube_dl/extractor/vuclip.py [new file with mode: 0644]
youtube_dl/extractor/washingtonpost.py [new file with mode: 0644]
youtube_dl/extractor/wat.py
youtube_dl/extractor/wdr.py [new file with mode: 0644]
youtube_dl/extractor/weibo.py
youtube_dl/extractor/wimp.py
youtube_dl/extractor/worldstarhiphop.py
youtube_dl/extractor/xbef.py [new file with mode: 0644]
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xnxx.py
youtube_dl/extractor/xtube.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/youporn.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zdf.py
youtube_dl/jsinterp.py [new file with mode: 0644]
youtube_dl/postprocessor/__init__.py
youtube_dl/postprocessor/atomicparsley.py [new file with mode: 0644]
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/postprocessor/xattrpp.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/CHANGELOG b/CHANGELOG
deleted file mode 100644 (file)
index 3fa1167..0000000
--- a/CHANGELOG
+++ /dev/null
@@ -1,14 +0,0 @@
-2013.01.02  Codename: GIULIA
-
-    * Add support for ComedyCentral clips <nto>
-    * Corrected Vimeo description fetching <Nick Daniels>
-    * Added the --no-post-overwrites argument <Barbu Paul - Gheorghe>
-    * --verbose offers more environment info
-    * New info_dict field: uploader_id
-    * New updates system, with signature checking
-    * New IEs: NBA, JustinTV, FunnyOrDie, TweetReel, Steam, Ustream
-    * Fixed IEs: BlipTv
-    * Fixed for Python 3 IEs: Xvideo, Youku, XNXX, Dailymotion, Vimeo, InfoQ
-    * Simplified IEs and test code
-    * Various (Python 3 and other) fixes
-    * Revamped and expanded tests
index 8f8af7a7ffbce0c3ec1d6c6eeb4dd053b10bedb4..d43cc1f3ba95e2ec16728320b5dd64b8a3558abb 100644 (file)
@@ -3,3 +3,4 @@ include test/*.py
 include test/*.json
 include youtube-dl.bash-completion
 include youtube-dl.1
+recursive-include docs Makefile conf.py *.rst
index c6d09932bcd4f45b8910e828255703403c2df0d7..c079761efa9b2e60887575f4cd7626d0abe469a2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 all: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion
 
 clean:
-       rm -rf youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz
+       rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz
 
 cleanall: clean
        rm -f youtube-dl youtube-dl.exe
@@ -55,7 +55,9 @@ README.txt: README.md
        pandoc -f markdown -t plain README.md -o README.txt
 
 youtube-dl.1: README.md
-       pandoc -s -f markdown -t man README.md -o youtube-dl.1
+       python devscripts/prepare_manpage.py >youtube-dl.1.temp.md
+       pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1
+       rm -f youtube-dl.1.temp.md
 
 youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in
        python devscripts/bash-completion.py
@@ -72,8 +74,9 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
                --exclude '__pycache' \
                --exclude '.git' \
                --exclude 'testdata' \
+               --exclude 'docs/_build' \
                -- \
-               bin devscripts test youtube_dl \
-               CHANGELOG LICENSE README.md README.txt \
+               bin devscripts test youtube_dl docs \
+               LICENSE README.md README.txt \
                Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion setup.py \
                youtube-dl
index 35876d979242b38d7d697219153c1314e0fa68a8..2bea609bfc397940d369628d27a8142ddcdbf867 100644 (file)
--- a/README.md
+++ b/README.md
@@ -1,11 +1,24 @@
-% YOUTUBE-DL(1)
-
-# NAME
 youtube-dl - download videos from youtube.com or other video platforms
 
 # SYNOPSIS
 **youtube-dl** [OPTIONS] URL [URL...]
 
+# INSTALLATION
+
+To install it right away for all UNIX users (Linux, OS X, etc.), type:
+
+    sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+    sudo chmod a+x /usr/local/bin/youtube-dl
+
+If you do not have curl, you can alternatively use a recent wget:
+
+    sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl
+    sudo chmod a+x /usr/local/bin/youtube-dl
+
+Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
+
+Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html .
+
 # DESCRIPTION
 **youtube-dl** is a small command-line program to download videos from
 YouTube.com and a few more sites. It requires the Python interpreter, version
@@ -20,7 +33,7 @@ which means you can modify it, redistribute it or use it however you like.
                                      sure that you have sufficient permissions
                                      (run with sudo if needed)
     -i, --ignore-errors              continue on download errors, for example to
-                                     to skip unavailable videos in a playlist
+                                     skip unavailable videos in a playlist
     --abort-on-error                 Abort downloading of further videos (in the
                                      playlist or the command line) if an error
                                      occurs
@@ -28,6 +41,9 @@ which means you can modify it, redistribute it or use it however you like.
     --user-agent UA                  specify a custom user agent
     --referer REF                    specify a custom referer, use if the video
                                      access is restricted to one domain
+    --add-header FIELD:VALUE         specify a custom HTTP header and its value,
+                                     separated by a colon ':'. You can use this
+                                     option multiple times
     --list-extractors                List all supported extractors and the URLs
                                      they would handle
     --extractor-descriptions         Output descriptions of all supported
@@ -36,6 +52,9 @@ which means you can modify it, redistribute it or use it however you like.
                                      an empty string (--proxy "") for direct
                                      connection
     --no-check-certificate           Suppress HTTPS certificate validation.
+    --prefer-insecure                Use an unencrypted connection to retrieve
+                                     information about the video. (Currently
+                                     supported only for YouTube)
     --cache-dir DIR                  Location in the filesystem where youtube-dl
                                      can store some downloaded information
                                      permanently. By default $XDG_CACHE_HOME
@@ -59,6 +78,7 @@ which means you can modify it, redistribute it or use it however you like.
                                      configuration in ~/.config/youtube-dl.conf
                                      (%APPDATA%/youtube-dl/config.txt on
                                      Windows)
+    --encoding ENCODING              Force the specified encoding (experimental)
 
 ## Video Selection:
     --playlist-start NUMBER          playlist video to start at (default is 1)
@@ -124,8 +144,12 @@ which means you can modify it, redistribute it or use it however you like.
                                      video id, %(playlist)s for the playlist the
                                      video is in, %(playlist_index)s for the
                                      position in the playlist and %% for a
-                                     literal percent. Use - to output to stdout.
-                                     Can also be used to download to a different
+                                     literal percent. %(height)s and %(width)s
+                                     for the width and height of the video
+                                     format. %(resolution)s for a textual
+                                     description of the resolution of the video
+                                     format. Use - to output to stdout. Can also
+                                     be used to download to a different
                                      directory, for example with -o '/my/downloa
                                      ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
     --autonumber-size NUMBER         Specifies the number of digits in
@@ -159,6 +183,7 @@ which means you can modify it, redistribute it or use it however you like.
 
 ## Verbosity / Simulation Options:
     -q, --quiet                      activates quiet mode
+    --no-warnings                    Ignore warnings
     -s, --simulate                   do not download the video and do not write
                                      anything to disk
     --skip-download                  do not download the video
@@ -170,7 +195,9 @@ which means you can modify it, redistribute it or use it however you like.
     --get-duration                   simulate, quiet but print video length
     --get-filename                   simulate, quiet but print output filename
     --get-format                     simulate, quiet but print output format
-    -j, --dump-json                  simulate, quiet but print JSON information
+    -j, --dump-json                  simulate, quiet but print JSON information.
+                                     See --output for a description of available
+                                     keys.
     --newline                        output progress bar as new lines
     --no-progress                    do not print progress bar
     --console-title                  display progress in console titlebar
@@ -187,9 +214,9 @@ which means you can modify it, redistribute it or use it however you like.
                                      preference using slashes: "-f 22/17/18".
                                      "-f mp4" and "-f flv" are also supported.
                                      You can also use the special names "best",
-                                     "bestaudio", "worst", and "worstaudio". By
-                                     default, youtube-dl will pick the best
-                                     quality.
+                                     "bestvideo", "bestaudio", "worst",
+                                     "worstvideo" and "worstaudio". By default,
+                                     youtube-dl will pick the best quality.
     --all-formats                    download all available video formats
     --prefer-free-formats            prefer free video formats unless a specific
                                      one is requested
@@ -236,6 +263,7 @@ which means you can modify it, redistribute it or use it however you like.
                                      default
     --embed-subs                     embed subtitles in the video (only for mp4
                                      videos)
+    --embed-thumbnail                embed thumbnail in the audio as cover art
     --add-metadata                   write metadata to the video file
     --xattrs                         write metadata to the video file's xattrs
                                      (using dublin core and xdg standards)
@@ -246,7 +274,7 @@ which means you can modify it, redistribute it or use it however you like.
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
+You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
 
 # OUTPUT TEMPLATE
 
@@ -281,12 +309,14 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb
  
 Examples:
 
-  $ # Download only the videos uploaded in the last 6 months
-       $ youtube-dl --dateafter now-6months
-  $ # Download only the videos uploaded on January 1, 1970
-       $ youtube-dl --date 19700101
-  $ # will only download the videos uploaded in the 200x decade
-       $ youtube-dl --dateafter 20000101 --datebefore 20091231
+    # Download only the videos uploaded in the last 6 months
+    $ youtube-dl --dateafter now-6months
+
+    # Download only the videos uploaded on January 1, 1970
+    $ youtube-dl --date 19700101
+
+    $ # will only download the videos uploaded in the 200x decade
+    $ youtube-dl --dateafter 20000101 --datebefore 20091231
 
 # FAQ
 
@@ -355,7 +385,67 @@ If you want to create a build of youtube-dl yourself, you'll need
 
 ### Adding support for a new site
 
-If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py Test_Download.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
+If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`):
+
+1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
+2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
+3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
+        # coding: utf-8
+        from __future__ import unicode_literals
+
+        import re
+
+        from .common import InfoExtractor
+        
+        
+        class YourExtractorIE(InfoExtractor):
+            _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+            _TEST = {
+                'url': 'http://yourextractor.com/watch/42',
+                'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+                'info_dict': {
+                    'id': '42',
+                    'ext': 'mp4',
+                    'title': 'Video title goes here',
+                    # TODO more properties, either as:
+                    # * A value
+                    # * MD5 checksum; start the string with md5:
+                    # * A regular expression; start the string with re:
+                    # * Any Python type (for example int or float)
+                }
+            }
+
+            def _real_extract(self, url):
+                mobj = re.match(self._VALID_URL, url)
+                video_id = mobj.group('id')
+
+                # TODO more code goes here, for example ...
+                webpage = self._download_webpage(url, video_id)
+                title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+                return {
+                    'id': video_id,
+                    'title': title,
+                    # TODO more properties (see youtube_dl/extractor/common.py)
+                }
+
+
+5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done.
+7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want.
+8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501).
+9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this:
+
+        $ git add youtube_dl/extractor/__init__.py
+        $ git add youtube_dl/extractor/yourextractor.py
+        $ git commit -m '[yourextractor] Add new extractor'
+        $ git push origin yourextractor
+
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+
+In any case, thank you very much for your contributions!
 
 # BUGS
 
@@ -381,7 +471,7 @@ If your report is shorter than two lines, it is almost certainly missing some of
 
 For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information.
 
-Site support requests must contain an example URL. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.
+Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL.
 
 ###  Are you using the latest version?
 
index 0015a74e060e5e92a04e919256356b80ff255540..4757a338b25780dcca13935d81efb5c1f5fb33b0 100644 (file)
@@ -1,6 +1,3 @@
-NAME
-====
-
 youtube-dl - download videos from youtube.com or other video platforms
 
 SYNOPSIS
@@ -8,6 +5,27 @@ SYNOPSIS
 
 youtube-dl OPTIONS URL [URL...]
 
+INSTALLATION
+============
+
+To install it right away for all UNIX users (Linux, OS X, etc.), type:
+
+    sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+    sudo chmod a+x /usr/local/bin/youtube-dl
+
+If you do not have curl, you can alternatively use a recent wget:
+
+    sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl
+    sudo chmod a+x /usr/local/bin/youtube-dl
+
+Windows users can download a .exe file and place it in their home
+directory or any other location on their PATH.
+
+Alternatively, refer to the developer instructions below for how to
+check out and work with the git repository. For further options,
+including PGP signatures, see
+https://rg3.github.io/youtube-dl/download.html .
+
 DESCRIPTION
 ===========
 
@@ -27,7 +45,7 @@ OPTIONS
                                      sure that you have sufficient permissions
                                      (run with sudo if needed)
     -i, --ignore-errors              continue on download errors, for example to
-                                     to skip unavailable videos in a playlist
+                                     skip unavailable videos in a playlist
     --abort-on-error                 Abort downloading of further videos (in the
                                      playlist or the command line) if an error
                                      occurs
@@ -35,6 +53,9 @@ OPTIONS
     --user-agent UA                  specify a custom user agent
     --referer REF                    specify a custom referer, use if the video
                                      access is restricted to one domain
+    --add-header FIELD:VALUE         specify a custom HTTP header and its value,
+                                     separated by a colon ':'. You can use this
+                                     option multiple times
     --list-extractors                List all supported extractors and the URLs
                                      they would handle
     --extractor-descriptions         Output descriptions of all supported
@@ -43,6 +64,9 @@ OPTIONS
                                      an empty string (--proxy "") for direct
                                      connection
     --no-check-certificate           Suppress HTTPS certificate validation.
+    --prefer-insecure                Use an unencrypted connection to retrieve
+                                     information about the video. (Currently
+                                     supported only for YouTube)
     --cache-dir DIR                  Location in the filesystem where youtube-dl
                                      can store some downloaded information
                                      permanently. By default $XDG_CACHE_HOME
@@ -66,6 +90,7 @@ OPTIONS
                                      configuration in ~/.config/youtube-dl.conf
                                      (%APPDATA%/youtube-dl/config.txt on
                                      Windows)
+    --encoding ENCODING              Force the specified encoding (experimental)
 
 Video Selection:
 ----------------
@@ -137,8 +162,12 @@ Filesystem Options:
                                      video id, %(playlist)s for the playlist the
                                      video is in, %(playlist_index)s for the
                                      position in the playlist and %% for a
-                                     literal percent. Use - to output to stdout.
-                                     Can also be used to download to a different
+                                     literal percent. %(height)s and %(width)s
+                                     for the width and height of the video
+                                     format. %(resolution)s for a textual
+                                     description of the resolution of the video
+                                     format. Use - to output to stdout. Can also
+                                     be used to download to a different
                                      directory, for example with -o '/my/downloa
                                      ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' .
     --autonumber-size NUMBER         Specifies the number of digits in
@@ -174,6 +203,7 @@ Verbosity / Simulation Options:
 -------------------------------
 
     -q, --quiet                      activates quiet mode
+    --no-warnings                    Ignore warnings
     -s, --simulate                   do not download the video and do not write
                                      anything to disk
     --skip-download                  do not download the video
@@ -185,7 +215,9 @@ Verbosity / Simulation Options:
     --get-duration                   simulate, quiet but print video length
     --get-filename                   simulate, quiet but print output filename
     --get-format                     simulate, quiet but print output format
-    -j, --dump-json                  simulate, quiet but print JSON information
+    -j, --dump-json                  simulate, quiet but print JSON information.
+                                     See --output for a description of available
+                                     keys.
     --newline                        output progress bar as new lines
     --no-progress                    do not print progress bar
     --console-title                  display progress in console titlebar
@@ -204,9 +236,9 @@ Video Format Options:
                                      preference using slashes: "-f 22/17/18".
                                      "-f mp4" and "-f flv" are also supported.
                                      You can also use the special names "best",
-                                     "bestaudio", "worst", and "worstaudio". By
-                                     default, youtube-dl will pick the best
-                                     quality.
+                                     "bestvideo", "bestaudio", "worst",
+                                     "worstvideo" and "worstaudio". By default,
+                                     youtube-dl will pick the best quality.
     --all-formats                    download all available video formats
     --prefer-free-formats            prefer free video formats unless a specific
                                      one is requested
@@ -259,6 +291,7 @@ Post-processing Options:
                                      default
     --embed-subs                     embed subtitles in the video (only for mp4
                                      videos)
+    --embed-thumbnail                embed thumbnail in the audio as cover art
     --add-metadata                   write metadata to the video file
     --xattrs                         write metadata to the video file's xattrs
                                      (using dublin core and xdg standards)
@@ -272,7 +305,7 @@ CONFIGURATION
 
 You can configure youtube-dl by placing default arguments (such as
 --extract-audio --no-mtime to always extract the audio and not copy the
-mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf. On
+mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl/config. On
 Windows, the configuration file locations are
 %APPDATA%\youtube-dl\config.txt and C:\Users\<Yourname>\youtube-dl.conf.
 
@@ -330,11 +363,14 @@ Videos can be filtered by their upload date using the options --date,
 
 Examples:
 
-$ # Download only the videos uploaded in the last 6 months $ youtube-dl
---dateafter now-6months $ # Download only the videos uploaded on January
-1, 1970 $ youtube-dl --date 19700101 $ # will only download the videos
-uploaded in the 200x decade $ youtube-dl --dateafter 20000101
---datebefore 20091231
+    # Download only the videos uploaded in the last 6 months
+    $ youtube-dl --dateafter now-6months
+
+    # Download only the videos uploaded on January 1, 1970
+    $ youtube-dl --date 19700101
+
+    $ # will only download the videos uploaded in the 200x decade
+    $ youtube-dl --dateafter 20000101 --datebefore 20091231
 
 FAQ
 ===
@@ -433,14 +469,76 @@ If you want to create a build of youtube-dl yourself, you'll need
 
 Adding support for a new site
 
-If you want to add support for a new site, copy any recently modified
-file in youtube_dl/extractor, add an import in
-youtube_dl/extractor/__init__.py. Have a look at
-youtube_dl/common/extractor/common.py for possible helper methods and a
-detailed description of what your extractor should return. Don't forget
-to run the tests with
-python test/test_download.py Test_Download.test_YourExtractor! For a
-detailed tutorial, refer to this blog post.
+If you want to add support for a new site, you can follow this quick
+list (assuming your service is called yourextractor):
+
+1.  Fork this repository
+2.  Check out the source code with
+    git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
+3.  Start a new git branch with
+    cd youtube-dl; git checkout -b yourextractor
+4.  Start with this simple template and save it to
+    youtube_dl/extractor/yourextractor.py:
+
+        # coding: utf-8
+        from __future__ import unicode_literals
+
+        import re
+
+        from .common import InfoExtractor
+
+
+        class YourExtractorIE(InfoExtractor):
+            _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
+            _TEST = {
+                'url': 'http://yourextractor.com/watch/42',
+                'md5': 'TODO: md5 sum of the first 10KiB of the video file',
+                'info_dict': {
+                    'id': '42',
+                    'ext': 'mp4',
+                    'title': 'Video title goes here',
+                    # TODO more properties, either as:
+                    # * A value
+                    # * MD5 checksum; start the string with md5:
+                    # * A regular expression; start the string with re:
+                    # * Any Python type (for example int or float)
+                }
+            }
+
+            def _real_extract(self, url):
+                mobj = re.match(self._VALID_URL, url)
+                video_id = mobj.group('id')
+
+                # TODO more code goes here, for example ...
+                webpage = self._download_webpage(url, video_id)
+                title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
+
+                return {
+                    'id': video_id,
+                    'title': title,
+                    # TODO more properties (see youtube_dl/extractor/common.py)
+                }
+
+5.  Add an import in youtube_dl/extractor/__init__.py.
+6.  Run python test/test_download.py TestDownload.test_YourExtractor.
+    This should fail at first, but you can continually re-run it until
+    you're done.
+7.  Have a look at youtube_dl/common/extractor/common.py for possible
+    helper methods and a detailed description of what your extractor
+    should return. Add tests and code for as many as you want.
+8.  If you can, check the code with pyflakes (a good idea) and pep8
+    (optional, ignore E501).
+9.  When the tests pass, add the new files and commit them and push the
+    result, like this:
+
+        $ git add youtube_dl/extractor/__init__.py
+        $ git add youtube_dl/extractor/yourextractor.py
+        $ git commit -m '[yourextractor] Add new extractor'
+        $ git push origin yourextractor
+
+10. Finally, create a pull request. We'll then review and merge it.
+
+In any case, thank you very much for your contributions!
 
 BUGS
 ====
index cae1fa4f24fe0f190760e942666a379c47d041d7..70fa942dd12f7a75ee71b1fc9668e615cda0d747 100755 (executable)
@@ -15,7 +15,7 @@ header = oldreadme[:oldreadme.index('# OPTIONS')]
 footer = oldreadme[oldreadme.index('# CONFIGURATION'):]
 
 options = helptext[helptext.index('  General Options:') + 19:]
-options = re.sub(r'^  (\w.+)$', r'## \1', options, flags=re.M)
+options = re.sub(r'(?m)^  (\w.+)$', r'## \1', options)
 options = '# OPTIONS\n' + options + '\n'
 
 with io.open(README_FILE, 'w', encoding='utf-8') as f:
diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py
new file mode 100644 (file)
index 0000000..d9c8570
--- /dev/null
@@ -0,0 +1,20 @@
+
+import io
+import os.path
+import sys
+import re
+
+ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+README_FILE = os.path.join(ROOT_DIR, 'README.md')
+
+with io.open(README_FILE, encoding='utf-8') as f:
+    readme = f.read()
+
+PREFIX = '%YOUTUBE-DL(1)\n\n# NAME\n'
+readme = re.sub(r'(?s)# INSTALLATION.*?(?=# DESCRIPTION)', '', readme)
+readme = PREFIX + readme
+
+if sys.version_info < (3, 0):
+    print(readme.encode('utf-8'))
+else:
+    print(readme)
index 323acf8cfa92cc7662c21fac44795790867901f4..453087e5f70fa92906926ef12ab3b192087c51c3 100755 (executable)
 
 set -e
 
-skip_tests=false
-if [ "$1" = '--skip-test' ]; then
-    skip_tests=true
+skip_tests=true
+if [ "$1" = '--run-tests' ]; then
+    skip_tests=false
     shift
 fi
 
 if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
 version="$1"
+major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p')
+if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then
+    echo "$version does not start with today's date!"
+    exit 1
+fi
+
 if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi
 if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi
 useless_files=$(find youtube_dl -type f -not -name '*.py')
@@ -39,9 +45,9 @@ fi
 /bin/echo -e "\n### Changing version in version.py..."
 sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py
 
-/bin/echo -e "\n### Committing CHANGELOG README.md and youtube_dl/version.py..."
+/bin/echo -e "\n### Committing README.md and youtube_dl/version.py..."
 make README.md
-git add CHANGELOG README.md youtube_dl/version.py
+git add README.md youtube_dl/version.py
 git commit -m "release $version"
 
 /bin/echo -e "\n### Now tagging, signing and pushing..."
@@ -70,7 +76,7 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz"
 git checkout HEAD -- youtube-dl youtube-dl.exe
 
 /bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..."
-for f in $RELEASE_FILES; do gpg --detach-sig "build/$version/$f"; done
+for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done
 scp -r "build/$version" ytdl@yt-dl.org:html/tmp/
 ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/"
 ssh ytdl@yt-dl.org "sh html/update_latest.sh $version"
@@ -97,7 +103,7 @@ rm -rf build
 
 make pypi-files
 echo "Uploading to PyPi ..."
-python setup.py sdist upload
+python setup.py sdist bdist_wheel upload
 make clean
 
 /bin/echo -e "\n### DONE!"
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644 (file)
index 0000000..69fa449
--- /dev/null
@@ -0,0 +1 @@
+_build/
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644 (file)
index 0000000..7122180
--- /dev/null
@@ -0,0 +1,177 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
+
+help:
+       @echo "Please use \`make <target>' where <target> is one of"
+       @echo "  html       to make standalone HTML files"
+       @echo "  dirhtml    to make HTML files named index.html in directories"
+       @echo "  singlehtml to make a single large HTML file"
+       @echo "  pickle     to make pickle files"
+       @echo "  json       to make JSON files"
+       @echo "  htmlhelp   to make HTML files and a HTML help project"
+       @echo "  qthelp     to make HTML files and a qthelp project"
+       @echo "  devhelp    to make HTML files and a Devhelp project"
+       @echo "  epub       to make an epub"
+       @echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+       @echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+       @echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+       @echo "  text       to make text files"
+       @echo "  man        to make manual pages"
+       @echo "  texinfo    to make Texinfo files"
+       @echo "  info       to make Texinfo files and run them through makeinfo"
+       @echo "  gettext    to make PO message catalogs"
+       @echo "  changes    to make an overview of all changed/added/deprecated items"
+       @echo "  xml        to make Docutils-native XML files"
+       @echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+       @echo "  linkcheck  to check all external links for integrity"
+       @echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+
+clean:
+       rm -rf $(BUILDDIR)/*
+
+html:
+       $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+       $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+       @echo
+       @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+       $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+       @echo
+       @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+       $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+       @echo
+       @echo "Build finished; now you can process the pickle files."
+
+json:
+       $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+       @echo
+       @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+       $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+       @echo
+       @echo "Build finished; now you can run HTML Help Workshop with the" \
+             ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+       $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+       @echo
+       @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+             ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+       @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/youtube-dl.qhcp"
+       @echo "To view the help file:"
+       @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/youtube-dl.qhc"
+
+devhelp:
+       $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+       @echo
+       @echo "Build finished."
+       @echo "To view the help file:"
+       @echo "# mkdir -p $$HOME/.local/share/devhelp/youtube-dl"
+       @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/youtube-dl"
+       @echo "# devhelp"
+
+epub:
+       $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+       @echo
+       @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo
+       @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+       @echo "Run \`make' in that directory to run these through (pdf)latex" \
+             "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through pdflatex..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+       $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+       @echo "Running LaTeX files through platex and dvipdfmx..."
+       $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+       @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+       $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+       @echo
+       @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+       $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+       @echo
+       @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo
+       @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+       @echo "Run \`make' in that directory to run these through makeinfo" \
+             "(use \`make info' here to do that automatically)."
+
+info:
+       $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+       @echo "Running Texinfo files through makeinfo..."
+       make -C $(BUILDDIR)/texinfo info
+       @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+       $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+       @echo
+       @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+       $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+       @echo
+       @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+       $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+       @echo
+       @echo "Link check complete; look for any errors in the above output " \
+             "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+       $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+       @echo "Testing of doctests in the sources finished, look at the " \
+             "results in $(BUILDDIR)/doctest/output.txt."
+
+xml:
+       $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+       @echo
+       @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+       $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+       @echo
+       @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644 (file)
index 0000000..4a04ad7
--- /dev/null
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+#
+# youtube-dl documentation build configuration file, created by
+# sphinx-quickstart on Fri Mar 14 21:05:43 2014.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+# Allows to import youtube_dl
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+# -- General configuration ------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'youtube-dl'
+copyright = u'2014, Ricardo Garcia Gonzalez'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+import youtube_dl
+version = youtube_dl.__version__
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+html_theme = 'default'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'youtube-dldoc'
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644 (file)
index 0000000..b746ff9
--- /dev/null
@@ -0,0 +1,23 @@
+Welcome to youtube-dl's documentation!
+======================================
+
+*youtube-dl* is a command-line program to download videos from YouTube.com and more sites.
+It can also be used in Python code.
+
+Developer guide
+---------------
+
+This section contains information for using *youtube-dl* from Python programs.
+
+.. toctree::
+    :maxdepth: 2
+
+    module_guide
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/docs/module_guide.rst b/docs/module_guide.rst
new file mode 100644 (file)
index 0000000..03d7288
--- /dev/null
@@ -0,0 +1,67 @@
+Using the ``youtube_dl`` module
+===============================
+
+When using the ``youtube_dl`` module, you start by creating an instance of :class:`YoutubeDL` and adding all the available extractors:
+
+.. code-block:: python
+
+    >>> from youtube_dl import YoutubeDL
+    >>> ydl = YoutubeDL()
+    >>> ydl.add_default_info_extractors()
+
+Extracting video information
+----------------------------
+
+You use the :meth:`YoutubeDL.extract_info` method for getting the video information, which returns a dictionary:
+
+.. code-block:: python
+
+    >>> info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False)
+    [youtube] Setting language
+    [youtube] BaW_jenozKc: Downloading webpage
+    [youtube] BaW_jenozKc: Downloading video info webpage
+    [youtube] BaW_jenozKc: Extracting video information
+    >>> info['title']
+    'youtube-dl test video "\'/\\ä↭𝕐'
+    >>> info['height'], info['width']
+    (720, 1280)
+
+If you want to download or play the video you can get its url:
+
+.. code-block:: python
+
+    >>> info['url']
+    'https://...'
+
+Extracting playlist information
+-------------------------------
+
+The playlist information is extracted in a similar way, but the dictionary is a bit different:
+
+.. code-block:: python
+
+    >>> playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False)
+    [TED] open_source_open_world: Downloading playlist webpage
+    ...
+    >>> playlist['title']
+    'Open-source, open world'
+
+
+
+You can access the videos in the playlist with the ``entries`` field:
+
+.. code-block:: python
+
+    >>> for video in playlist['entries']:
+    ...     print('Video #%d: %s' % (video['playlist_index'], video['title']))
+
+    Video #1: How Arduino is open-sourcing imagination
+    Video #2: The year open data went worldwide
+    Video #3: Massive-scale online collaboration
+    Video #4: The art of asking
+    Video #5: How cognitive surplus will change the world
+    Video #6: The birth of Wikipedia
+    Video #7: Coding a better government
+    Video #8: The era of open innovation
+    Video #9: The currency of the new economy is trust
+
index b1f421ac58331bad23328502f42a0e1316df853d..230d2bd67ab06b4db552bff30c5620f83673ca93 100644 (file)
@@ -9,7 +9,10 @@ import sys
 
 import youtube_dl.extractor
 from youtube_dl import YoutubeDL
-from youtube_dl.utils import preferredencoding
+from youtube_dl.utils import (
+    compat_str,
+    preferredencoding,
+)
 
 
 def get_params(override=None):
@@ -71,15 +74,77 @@ class FakeYDL(YoutubeDL):
             old_report_warning(message)
         self.report_warning = types.MethodType(report_warning, self)
 
-def get_testcases():
+
+def gettestcases(include_onlymatching=False):
     for ie in youtube_dl.extractor.gen_extractors():
         t = getattr(ie, '_TEST', None)
         if t:
-            t['name'] = type(ie).__name__[:-len('IE')]
-            yield t
-        for t in getattr(ie, '_TESTS', []):
+            assert not hasattr(ie, '_TESTS'), \
+                '%s has _TEST and _TESTS' % type(ie).__name__
+            tests = [t]
+        else:
+            tests = getattr(ie, '_TESTS', [])
+        for t in tests:
+            if not include_onlymatching and t.get('only_matching', False):
+                continue
             t['name'] = type(ie).__name__[:-len('IE')]
             yield t
 
 
 md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+def expect_info_dict(self, expected_dict, got_dict):
+    for info_field, expected in expected_dict.items():
+        if isinstance(expected, compat_str) and expected.startswith('re:'):
+            got = got_dict.get(info_field)
+            match_str = expected[len('re:'):]
+            match_rex = re.compile(match_str)
+
+            self.assertTrue(
+                isinstance(got, compat_str) and match_rex.match(got),
+                u'field %s (value: %r) should match %r' % (info_field, got, match_str))
+        elif isinstance(expected, type):
+            got = got_dict.get(info_field)
+            self.assertTrue(isinstance(got, expected),
+                u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got)))
+        else:
+            if isinstance(expected, compat_str) and expected.startswith('md5:'):
+                got = 'md5:' + md5(got_dict.get(info_field))
+            else:
+                got = got_dict.get(info_field)
+            self.assertEqual(expected, got,
+                u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
+
+    # Check for the presence of mandatory fields
+    for key in ('id', 'url', 'title', 'ext'):
+        self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key)
+    # Check for mandatory fields that are automatically set by YoutubeDL
+    for key in ['webpage_url', 'extractor', 'extractor_key']:
+        self.assertTrue(got_dict.get(key), u'Missing field: %s' % key)
+
+    # Are checkable fields missing from the test case definition?
+    test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
+        for key, value in got_dict.items()
+        if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location'))
+    missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys())
+    if missing_keys:
+        sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
+        self.assertFalse(
+            missing_keys,
+            'Missing keys in test definition: %s' % (
+                ', '.join(sorted(missing_keys))))
+
+
+def assertRegexpMatches(self, text, regexp, msg=None):
+    if hasattr(self, 'assertRegexpMatches'):
+        return self.assertRegexpMatches(text, regexp, msg)
+    else:
+        m = re.match(regexp, text)
+        if not m:
+            note = 'Regexp didn\'t match: %r not found in %r' % (regexp, text)
+            if msg is None:
+                msg = note
+            else:
+                msg = note + ', ' + msg
+            self.assertTrue(m, msg)
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
new file mode 100644 (file)
index 0000000..13c18ed
--- /dev/null
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import FakeYDL
+from youtube_dl.extractor.common import InfoExtractor
+from youtube_dl.extractor import YoutubeIE, get_info_extractor
+
+
+class TestIE(InfoExtractor):
+    pass
+
+
+class TestInfoExtractor(unittest.TestCase):
+    def setUp(self):
+        self.ie = TestIE(FakeYDL())
+
+    def test_ie_key(self):
+        self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE)
+
+    def test_html_search_regex(self):
+        html = '<p id="foo">Watch this <a href="http://www.youtube.com/watch?v=BaW_jenozKc">video</a></p>'
+        search = lambda re, *args: self.ie._html_search_regex(re, html, *args)
+        self.assertEqual(search(r'<p id="foo">(.+?)</p>', 'foo'), 'Watch this video')
+
+    def test_opengraph(self):
+        ie = self.ie
+        html = '''
+            <meta name="og:title" content='Foo'/>
+            <meta content="Some video's description " name="og:description"/>
+            <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/>
+            '''
+        self.assertEqual(ie._og_search_title(html), 'Foo')
+        self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
+        self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
+
+if __name__ == '__main__':
+    unittest.main()
index 37e7b9b28fd476c64dd4d7b2a371b0f9db3d32c6..e794cc97f0e643c5f05539fd3d0313d30dc98f8d 100644 (file)
@@ -8,7 +8,7 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import FakeYDL
+from test.helper import FakeYDL, assertRegexpMatches
 from youtube_dl import YoutubeDL
 from youtube_dl.extractor import YoutubeIE
 
@@ -26,16 +26,27 @@ class YDL(FakeYDL):
         self.msgs.append(msg)
 
 
+def _make_result(formats, **kwargs):
+    res = {
+        'formats': formats,
+        'id': 'testid',
+        'title': 'testttitle',
+        'extractor': 'testex',
+    }
+    res.update(**kwargs)
+    return res
+
+
 class TestFormatSelection(unittest.TestCase):
     def test_prefer_free_formats(self):
         # Same resolution => download webm
         ydl = YDL()
         ydl.params['prefer_free_formats'] = True
         formats = [
-            {'ext': 'webm', 'height': 460},
-            {'ext': 'mp4',  'height': 460},
+            {'ext': 'webm', 'height': 460, 'url': 'x'},
+            {'ext': 'mp4', 'height': 460, 'url': 'y'},
         ]
-        info_dict = {'formats': formats, 'extractor': 'test'}
+        info_dict = _make_result(formats)
         yie = YoutubeIE(ydl)
         yie._sort_formats(info_dict['formats'])
         ydl.process_ie_result(info_dict)
@@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase):
         ydl = YDL()
         ydl.params['prefer_free_formats'] = True
         formats = [
-            {'ext': 'webm', 'height': 720},
-            {'ext': 'mp4', 'height': 1080},
+            {'ext': 'webm', 'height': 720, 'url': 'a'},
+            {'ext': 'mp4', 'height': 1080, 'url': 'b'},
         ]
         info_dict['formats'] = formats
         yie = YoutubeIE(ydl)
@@ -56,13 +67,13 @@ class TestFormatSelection(unittest.TestCase):
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['ext'], 'mp4')
 
-        # No prefer_free_formats => prefer mp4 and flv for greater compatibilty
+        # No prefer_free_formats => prefer mp4 and flv for greater compatibility
         ydl = YDL()
         ydl.params['prefer_free_formats'] = False
         formats = [
-            {'ext': 'webm', 'height': 720},
-            {'ext': 'mp4', 'height': 720},
-            {'ext': 'flv', 'height': 720},
+            {'ext': 'webm', 'height': 720, 'url': '_'},
+            {'ext': 'mp4', 'height': 720, 'url': '_'},
+            {'ext': 'flv', 'height': 720, 'url': '_'},
         ]
         info_dict['formats'] = formats
         yie = YoutubeIE(ydl)
@@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase):
         ydl = YDL()
         ydl.params['prefer_free_formats'] = False
         formats = [
-            {'ext': 'flv', 'height': 720},
-            {'ext': 'webm', 'height': 720},
+            {'ext': 'flv', 'height': 720, 'url': '_'},
+            {'ext': 'webm', 'height': 720, 'url': '_'},
         ]
         info_dict['formats'] = formats
         yie = YoutubeIE(ydl)
@@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase):
             {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3},
             {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4},
         ]
-        info_dict = {
-            'formats': formats, 'extractor': 'test', 'id': 'testvid'}
+        info_dict = _make_result(formats)
 
         ydl = YDL()
         ydl.process_ie_result(info_dict)
@@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase):
 
     def test_format_selection(self):
         formats = [
-            {'format_id': '35', 'ext': 'mp4', 'preference': 1},
-            {'format_id': '45', 'ext': 'webm', 'preference': 2},
-            {'format_id': '47', 'ext': 'webm', 'preference': 3},
-            {'format_id': '2', 'ext': 'flv', 'preference': 4},
+            {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'},
+            {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'},
+            {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'},
+            {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'},
         ]
-        info_dict = {'formats': formats, 'extractor': 'test'}
+        info_dict = _make_result(formats)
 
         ydl = YDL({'format': '20/47'})
         ydl.process_ie_result(info_dict.copy())
@@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase):
 
     def test_format_selection_audio(self):
         formats = [
-            {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'},
-            {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'},
-            {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'},
-            {'format_id': 'vid', 'ext': 'mp4', 'preference': 4},
+            {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'},
+            {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'},
+            {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'},
+            {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'},
         ]
-        info_dict = {'formats': formats, 'extractor': 'test'}
+        info_dict = _make_result(formats)
 
         ydl = YDL({'format': 'bestaudio'})
         ydl.process_ie_result(info_dict.copy())
@@ -172,16 +182,34 @@ class TestFormatSelection(unittest.TestCase):
         self.assertEqual(downloaded['format_id'], 'audio-low')
 
         formats = [
-            {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1},
-            {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2},
+            {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'},
+            {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'},
         ]
-        info_dict = {'formats': formats, 'extractor': 'test'}
+        info_dict = _make_result(formats)
 
         ydl = YDL({'format': 'bestaudio/worstaudio/best'})
         ydl.process_ie_result(info_dict.copy())
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['format_id'], 'vid-high')
 
+    def test_format_selection_video(self):
+        formats = [
+            {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'},
+            {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'},
+            {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'},
+        ]
+        info_dict = _make_result(formats)
+
+        ydl = YDL({'format': 'bestvideo'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'dash-video-high')
+
+        ydl = YDL({'format': 'worstvideo'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'dash-video-low')
+
     def test_youtube_format_selection(self):
         order = [
             '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13',
@@ -199,10 +227,12 @@ class TestFormatSelection(unittest.TestCase):
         for f1id, f2id in zip(order, order[1:]):
             f1 = YoutubeIE._formats[f1id].copy()
             f1['format_id'] = f1id
+            f1['url'] = 'url:' + f1id
             f2 = YoutubeIE._formats[f2id].copy()
             f2['format_id'] = f2id
+            f2['url'] = 'url:' + f2id
 
-            info_dict = {'formats': [f1, f2], 'extractor': 'youtube'}
+            info_dict = _make_result([f1, f2], extractor='youtube')
             ydl = YDL()
             yie = YoutubeIE(ydl)
             yie._sort_formats(info_dict['formats'])
@@ -210,7 +240,7 @@ class TestFormatSelection(unittest.TestCase):
             downloaded = ydl.downloaded_info_dicts[0]
             self.assertEqual(downloaded['format_id'], f1id)
 
-            info_dict = {'formats': [f2, f1], 'extractor': 'youtube'}
+            info_dict = _make_result([f2, f1], extractor='youtube')
             ydl = YDL()
             yie = YoutubeIE(ydl)
             yie._sort_formats(info_dict['formats'])
@@ -244,6 +274,12 @@ class TestFormatSelection(unittest.TestCase):
         # Replace missing fields with 'NA'
         self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4')
 
+    def test_format_note(self):
+        ydl = YoutubeDL()
+        self.assertEqual(ydl._format_note({}), '')
+        assertRegexpMatches(self, ydl._format_note({
+            'vbr': 10,
+        }), '^\s*10k$')
 
 if __name__ == '__main__':
     unittest.main()
index c9cdb96cb30578d58724ddadb4328ad790316a39..71e80b037a5cc99fd0cb1a6711d20cfb59e01b34 100644 (file)
@@ -13,7 +13,7 @@ from youtube_dl import YoutubeDL
 
 
 def _download_restricted(url, filename, age):
-    """ Returns true iff the file has been downloaded """
+    """ Returns true if the file has been downloaded """
 
     params = {
         'age_limit': age,
index aa8e4e4bdc9d2e407ad122b6d15bd3e6e0ade143..4b56137cebb63287e4410959c091b09b6ffd785b 100644 (file)
@@ -9,7 +9,7 @@ import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
-from test.helper import get_testcases
+from test.helper import gettestcases
 
 from youtube_dl.extractor import (
     FacebookIE,
@@ -49,6 +49,7 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
         self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
         self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
+        self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
 
     def test_youtube_channel_matching(self):
         assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
@@ -68,21 +69,28 @@ class TestAllURLsMatching(unittest.TestCase):
     def test_youtube_show_matching(self):
         self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
 
+    def test_youtube_truncated(self):
+        self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url'])
+
+    def test_youtube_search_matching(self):
+        self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+        self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+
     def test_justin_tv_channelid_matching(self):
-        self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv"))
-        self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv"))
-        self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv"))
-        self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv"))
-        self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv"))
-        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv"))
-        self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/"))
-        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/"))
+        self.assertTrue(JustinTVIE.suitable('justin.tv/vanillatv'))
+        self.assertTrue(JustinTVIE.suitable('twitch.tv/vanillatv'))
+        self.assertTrue(JustinTVIE.suitable('www.justin.tv/vanillatv'))
+        self.assertTrue(JustinTVIE.suitable('www.twitch.tv/vanillatv'))
+        self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv'))
+        self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv'))
+        self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv/'))
+        self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/'))
 
     def test_justintv_videoid_matching(self):
-        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483"))
+        self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/b/328087483'))
 
     def test_justin_tv_chapterid_matching(self):
-        self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361"))
+        self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361'))
 
     def test_youtube_extract(self):
         assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
@@ -98,7 +106,7 @@ class TestAllURLsMatching(unittest.TestCase):
 
     def test_no_duplicates(self):
         ies = gen_extractors()
-        for tc in get_testcases():
+        for tc in gettestcases(include_onlymatching=True):
             url = tc['url']
             for ie in ies:
                 if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'):
@@ -117,6 +125,8 @@ class TestAllURLsMatching(unittest.TestCase):
 
     def test_vimeo_matching(self):
         self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
+        self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel'])
+        self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo'])
         self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user'])
         self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user'])
         self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review'])
@@ -132,6 +142,40 @@ class TestAllURLsMatching(unittest.TestCase):
     def test_pbs(self):
         # https://github.com/rg3/youtube-dl/issues/2350
         self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS'])
+        self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS'])
+
+    def test_ComedyCentralShows(self):
+        self.assertMatch(
+            'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
+            ['ComedyCentralShows'])
+        self.assertMatch(
+            'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
+            ['ComedyCentralShows'])
+
+    def test_yahoo_https(self):
+        # https://github.com/rg3/youtube-dl/issues/2701
+        self.assertMatch(
+            'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
+            ['Yahoo'])
+
 
 if __name__ == '__main__':
     unittest.main()
index 7587a18aa18fee1cdf61a735246c62dbf91e691d..f171c10bad84a876a9fe4caba2b71a984c3169ec 100644 (file)
@@ -8,10 +8,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from test.helper import (
     get_params,
-    get_testcases,
-    try_rm,
+    gettestcases,
+    expect_info_dict,
     md5,
-    report_warning
+    try_rm,
+    report_warning,
 )
 
 
@@ -50,7 +51,7 @@ def _file_md5(fn):
     with open(fn, 'rb') as f:
         return hashlib.md5(f.read()).hexdigest()
 
-defs = get_testcases()
+defs = gettestcases()
 
 
 class TestDownload(unittest.TestCase):
@@ -72,9 +73,7 @@ def generator(test_case):
         if 'playlist' not in test_case:
             info_dict = test_case.get('info_dict', {})
             if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')):
-                print_skipping('The output file cannot be know, the "file" '
-                    'key is missing or the info_dict is incomplete')
-                return
+                raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?')
         if 'skip' in test_case:
             print_skipping(test_case['skip'])
             return
@@ -136,27 +135,8 @@ def generator(test_case):
                     self.assertEqual(md5_for_file, tc['md5'])
                 with io.open(info_json_fn, encoding='utf-8') as infof:
                     info_dict = json.load(infof)
-                for (info_field, expected) in tc.get('info_dict', {}).items():
-                    if isinstance(expected, compat_str) and expected.startswith('md5:'):
-                        got = 'md5:' + md5(info_dict.get(info_field))
-                    else:
-                        got = info_dict.get(info_field)
-                    self.assertEqual(expected, got,
-                        u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got))
-
-                # If checkable fields are missing from the test case, print the info_dict
-                test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value))
-                    for key, value in info_dict.items()
-                    if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location'))
-                if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()):
-                    sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n')
-
-                # Check for the presence of mandatory fields
-                for key in ('id', 'url', 'title', 'ext'):
-                    self.assertTrue(key in info_dict.keys() and info_dict[key])
-                # Check for mandatory fields that are automatically set by YoutubeDL
-                for key in ['webpage_url', 'extractor', 'extractor_key']:
-                    self.assertTrue(info_dict.get(key), u'Missing field: %s' % key)
+
+                expect_info_dict(self, tc.get('info_dict', {}), info_dict)
         finally:
             try_rm_tcs_files()
 
index 1de9e8ec1ab1a9f636a8b3220d6614a663ba94b2..465b07b9e28e48ce9fe3b8a0a477a712b9f06940 100644 (file)
@@ -9,8 +9,11 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import FakeYDL
-
+from test.helper import (
+    assertRegexpMatches,
+    expect_info_dict,
+    FakeYDL,
+)
 
 from youtube_dl.extractor import (
     AcademicEarthCourseIE,
@@ -20,9 +23,12 @@ from youtube_dl.extractor import (
     VimeoUserIE,
     VimeoAlbumIE,
     VimeoGroupsIE,
+    VineUserIE,
     UstreamChannelIE,
     SoundcloudSetIE,
     SoundcloudUserIE,
+    SoundcloudPlaylistIE,
+    TeacherTubeClassroomIE,
     LivestreamIE,
     NHLVideocenterIE,
     BambuserChannelIE,
@@ -36,6 +42,12 @@ from youtube_dl.extractor import (
     RutubeChannelIE,
     GoogleSearchIE,
     GenericIE,
+    TEDIE,
+    ToypicsUserIE,
+    XTubeUserIE,
+    InstagramUserIE,
+    CSpanIE,
+    AolIE,
 )
 
 
@@ -92,13 +104,20 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['title'], 'Rolex Awards for Enterprise')
         self.assertTrue(len(result['entries']) > 72)
 
+    def test_vine_user(self):
+        dl = FakeYDL()
+        ie = VineUserIE(dl)
+        result = ie.extract('https://vine.co/Visa')
+        self.assertIsPlaylist(result)
+        self.assertTrue(len(result['entries']) >= 50)
+
     def test_ustream_channel(self):
         dl = FakeYDL()
         ie = UstreamChannelIE(dl)
         result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty')
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], '5124905')
-        self.assertTrue(len(result['entries']) >= 11)
+        self.assertTrue(len(result['entries']) >= 6)
 
     def test_soundcloud_set(self):
         dl = FakeYDL()
@@ -116,6 +135,17 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['id'], '9615865')
         self.assertTrue(len(result['entries']) >= 12)
 
+    def test_soundcloud_playlist(self):
+        dl = FakeYDL()
+        ie = SoundcloudPlaylistIE(dl)
+        result = ie.extract('http://api.soundcloud.com/playlists/4110309')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], '4110309')
+        self.assertEqual(result['title'], 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]')
+        assertRegexpMatches(
+            self, result['description'], r'TILT Brass - Bowery Poetry Club')
+        self.assertEqual(len(result['entries']), 6)
+
     def test_livestream_event(self):
         dl = FakeYDL()
         ie = LivestreamIE(dl)
@@ -170,30 +200,30 @@ class TestPlaylists(unittest.TestCase):
     def test_AcademicEarthCourse(self):
         dl = FakeYDL()
         ie = AcademicEarthCourseIE(dl)
-        result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/')
+        result = ie.extract('http://academicearth.org/playlists/laws-of-nature/')
         self.assertIsPlaylist(result)
-        self.assertEqual(result['id'], 'building-dynamic-websites')
-        self.assertEqual(result['title'], 'Building Dynamic Websites')
-        self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
-        self.assertEqual(len(result['entries']), 10)
+        self.assertEqual(result['id'], 'laws-of-nature')
+        self.assertEqual(result['title'], 'Laws of Nature')
+        self.assertEqual(result['description'],u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')# u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.")
+        self.assertEqual(len(result['entries']), 4)
         
     def test_ivi_compilation(self):
         dl = FakeYDL()
         ie = IviCompilationIE(dl)
-        result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel')
+        result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa')
         self.assertIsPlaylist(result)
-        self.assertEqual(result['id'], 'dezhurnyi_angel')
-        self.assertEqual(result['title'], 'Ð\94ежÑ\83Ñ\80нÑ\8bй Ð°Ð½Ð³ÐµÐ» (2010 - 2012)')
-        self.assertTrue(len(result['entries']) >= 36)
-        
+        self.assertEqual(result['id'], 'dvoe_iz_lartsa')
+        self.assertEqual(result['title'], 'Ð\94вое Ð¸Ð· Ð»Ð°Ñ\80Ñ\86а (2006 - 2008)')
+        self.assertTrue(len(result['entries']) >= 24)
+
     def test_ivi_compilation_season(self):
         dl = FakeYDL()
         ie = IviCompilationIE(dl)
-        result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2')
+        result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa/season1')
         self.assertIsPlaylist(result)
-        self.assertEqual(result['id'], 'dezhurnyi_angel/season2')
-        self.assertEqual(result['title'], 'Ð\94ежÑ\83Ñ\80нÑ\8bй Ð°Ð½Ð³ÐµÐ» (2010 - 2012) 2 сезон')
-        self.assertTrue(len(result['entries']) >= 20)
+        self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1')
+        self.assertEqual(result['title'], 'Ð\94вое Ð¸Ð· Ð»Ð°Ñ\80Ñ\86а (2006 - 2008) 1 сезон')
+        self.assertTrue(len(result['entries']) >= 12)
         
     def test_imdb_list(self):
         dl = FakeYDL()
@@ -248,7 +278,96 @@ class TestPlaylists(unittest.TestCase):
         self.assertIsPlaylist(result)
         self.assertEqual(result['id'], 'python language')
         self.assertEqual(result['title'], 'python language')
-        self.assertTrue(len(result['entries']) == 15)
+        self.assertEqual(len(result['entries']), 15)
+
+    def test_generic_rss_feed(self):
+        dl = FakeYDL()
+        ie = GenericIE(dl)
+        result = ie.extract('http://phihag.de/2014/youtube-dl/rss.xml')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'http://phihag.de/2014/youtube-dl/rss.xml')
+        self.assertEqual(result['title'], 'Zero Punctuation')
+        self.assertTrue(len(result['entries']) > 10)
+
+    def test_ted_playlist(self):
+        dl = FakeYDL()
+        ie = TEDIE(dl)
+        result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], '10')
+        self.assertEqual(result['title'], 'Who are the hackers?')
+        self.assertTrue(len(result['entries']) >= 6)
+
+    def test_toypics_user(self):
+        dl = FakeYDL()
+        ie = ToypicsUserIE(dl)
+        result = ie.extract('http://videos.toypics.net/Mikey')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'Mikey')
+        self.assertTrue(len(result['entries']) >= 17)
+
+    def test_xtube_user(self):
+        dl = FakeYDL()
+        ie = XTubeUserIE(dl)
+        result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'greenshowers')
+        self.assertTrue(len(result['entries']) >= 155)
+
+    def test_InstagramUser(self):
+        dl = FakeYDL()
+        ie = InstagramUserIE(dl)
+        result = ie.extract('http://instagram.com/porsche')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'porsche')
+        self.assertTrue(len(result['entries']) >= 2)
+        test_video = next(
+            e for e in result['entries']
+            if e['id'] == '614605558512799803_462752227')
+        dl.add_default_extra_info(test_video, ie, '(irrelevant URL)')
+        dl.process_video_result(test_video, download=False)
+        EXPECTED = {
+            'id': '614605558512799803_462752227',
+            'ext': 'mp4',
+            'title': '#Porsche Intelligent Performance.',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'uploader': 'Porsche',
+            'uploader_id': 'porsche',
+            'timestamp': 1387486713,
+            'upload_date': '20131219',
+        }
+        expect_info_dict(self, EXPECTED, test_video)
+
+    def test_CSpan_playlist(self):
+        dl = FakeYDL()
+        ie = CSpanIE(dl)
+        result = ie.extract(
+            'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], '342759')
+        self.assertEqual(
+            result['title'], 'General Motors Ignition Switch Recall')
+        whole_duration = sum(e['duration'] for e in result['entries'])
+        self.assertEqual(whole_duration, 14855)
+
+    def test_aol_playlist(self):
+        dl = FakeYDL()
+        ie = AolIE(dl)
+        result = ie.extract(
+            'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], '152147')
+        self.assertEqual(
+            result['title'], 'Brace Yourself - Today\'s Weirdest News')
+        self.assertTrue(len(result['entries']) >= 10)
+
+    def test_TeacherTubeClassroom(self):
+        dl = FakeYDL()
+        ie = TeacherTubeClassroomIE(dl)
+        result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'rbhagwati2')
+        self.assertTrue(len(result['entries']) >= 20)
 
 if __name__ == '__main__':
     unittest.main()
index 79991e6462ac3b868a9ecbb98234e55df483e725..5736fe58112fc88b5ae15a53863221aa806ba4eb 100644 (file)
@@ -181,7 +181,7 @@ class TestTedSubtitles(BaseTestSubtitles):
         self.DL.params['writesubtitles'] = True
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
-        self.assertEqual(len(subtitles.keys()), 28)
+        self.assertTrue(len(subtitles.keys()) >= 28)
 
     def test_list_subtitles(self):
         self.DL.expect_warning(u'Automatic Captions not supported by this server')
index 84553b94386236352e9ad856782c4b3428f3587b..51eb0b6b936c7ea5d21cfef9bdc0b70f2ee7663a 100644 (file)
@@ -9,6 +9,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
 # Various small unit tests
+import io
+import json
 import xml.etree.ElementTree
 
 #from youtube_dl.utils import htmlentity_transform
@@ -21,6 +23,7 @@ from youtube_dl.utils import (
     orderedSet,
     PagedList,
     parse_duration,
+    read_batch_urls,
     sanitize_filename,
     shell_quote,
     smuggle_url,
@@ -31,7 +34,11 @@ from youtube_dl.utils import (
     unified_strdate,
     unsmuggle_url,
     url_basename,
+    urlencode_postdata,
     xpath_with_ns,
+    parse_iso8601,
+    strip_jsonp,
+    uppercase_escape,
 )
 
 if sys.version_info < (3, 0):
@@ -250,5 +257,32 @@ class TestUtil(unittest.TestCase):
     def test_struct_unpack(self):
         self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,))
 
+    def test_read_batch_urls(self):
+        f = io.StringIO(u'''\xef\xbb\xbf foo
+            bar\r
+            baz
+            # More after this line\r
+            ; or after this
+            bam''')
+        self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam'])
+
+    def test_urlencode_postdata(self):
+        data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'})
+        self.assertTrue(isinstance(data, bytes))
+
+    def test_parse_iso8601(self):
+        self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266)
+        self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266)
+        self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266)
+
+    def test_strip_jsonp(self):
+        stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);')
+        d = json.loads(stripped)
+        self.assertEqual(d, [{"id": "532cb", "x": 3}])
+
+    def test_uppercase_escpae(self):
+        self.assertEqual(uppercase_escape(u'aä'), u'aä')
+        self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐')
+
 if __name__ == '__main__':
     unittest.main()
index 38ac989ce706a347e575f62a9dd6b60b8fece8c9..3aadedd64cf5af38ab1d18b640b10301c2073de2 100644 (file)
@@ -16,6 +16,7 @@ from youtube_dl.extractor import (
     YoutubeChannelIE,
     YoutubeShowIE,
     YoutubeTopListIE,
+    YoutubeSearchURLIE,
 )
 
 
@@ -111,13 +112,15 @@ class TestYoutubeLists(unittest.TestCase):
     def test_youtube_mix(self):
         dl = FakeYDL()
         ie = YoutubePlaylistIE(dl)
-        result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y')
+        result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w')
         entries = result['entries']
         self.assertTrue(len(entries) >= 20)
         original_video = entries[0]
-        self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
+        self.assertEqual(original_video['id'], 'OQpdSVF_k_w')
 
     def test_youtube_toptracks(self):
+        print('Skipping: The playlist page gives error 500')
+        return
         dl = FakeYDL()
         ie = YoutubePlaylistIE(dl)
         result = ie.extract('https://www.youtube.com/playlist?list=MCUS')
@@ -131,5 +134,14 @@ class TestYoutubeLists(unittest.TestCase):
         entries = result['entries']
         self.assertTrue(len(entries) >= 5)
 
+    def test_youtube_search_url(self):
+        dl = FakeYDL()
+        ie = YoutubeSearchURLIE(dl)
+        result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video')
+        entries = result['entries']
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], 'youtube-dl test video')
+        self.assertTrue(len(entries) >= 5)
+
 if __name__ == '__main__':
     unittest.main()
index 063e40dc742bd6eafc3ba996dbc5a6443dc6e651..b98d36a19b220dd47c8259ed766b2634f84f9251 100755 (executable)
Binary files a/youtube-dl and b/youtube-dl differ
index 7abbe59d13a6171bc16af51c471c3e747fc9ed64..f17adddce87ce25ef3f7a24f3b3daaf29b3e242f 100644 (file)
@@ -24,7 +24,7 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ sure\ that\ you\ have\ sufficient\ permissions
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (run\ with\ sudo\ if\ needed)
 \-i,\ \-\-ignore\-errors\ \ \ \ \ \ \ \ \ \ \ \ \ \ continue\ on\ download\ errors,\ for\ example\ to
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ to\ skip\ unavailable\ videos\ in\ a\ playlist
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ skip\ unavailable\ videos\ in\ a\ playlist
 \-\-abort\-on\-error\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Abort\ downloading\ of\ further\ videos\ (in\ the
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ playlist\ or\ the\ command\ line)\ if\ an\ error
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ occurs
@@ -32,6 +32,9 @@ redistribute it or use it however you like.
 \-\-user\-agent\ UA\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ user\ agent
 \-\-referer\ REF\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ referer,\ use\ if\ the\ video
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ access\ is\ restricted\ to\ one\ domain
+\-\-add\-header\ FIELD:VALUE\ \ \ \ \ \ \ \ \ specify\ a\ custom\ HTTP\ header\ and\ its\ value,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ separated\ by\ a\ colon\ \[aq]:\[aq].\ You\ can\ use\ this
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ option\ multiple\ times
 \-\-list\-extractors\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ List\ all\ supported\ extractors\ and\ the\ URLs
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ they\ would\ handle
 \-\-extractor\-descriptions\ \ \ \ \ \ \ \ \ Output\ descriptions\ of\ all\ supported
@@ -40,6 +43,9 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ an\ empty\ string\ (\-\-proxy\ "")\ for\ direct
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ connection
 \-\-no\-check\-certificate\ \ \ \ \ \ \ \ \ \ \ Suppress\ HTTPS\ certificate\ validation.
+\-\-prefer\-insecure\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ an\ unencrypted\ connection\ to\ retrieve
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ information\ about\ the\ video.\ (Currently
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ supported\ only\ for\ YouTube)
 \-\-cache\-dir\ DIR\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Location\ in\ the\ filesystem\ where\ youtube\-dl
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ can\ store\ some\ downloaded\ information
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ permanently.\ By\ default\ $XDG_CACHE_HOME
@@ -63,6 +69,7 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ configuration\ in\ ~/.config/youtube\-dl.conf
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (%APPDATA%/youtube\-dl/config.txt\ on
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Windows)
+\-\-encoding\ ENCODING\ \ \ \ \ \ \ \ \ \ \ \ \ \ Force\ the\ specified\ encoding\ (experimental)
 \f[]
 .fi
 .SS Video Selection:
@@ -140,8 +147,12 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ id,\ %(playlist)s\ for\ the\ playlist\ the
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ is\ in,\ %(playlist_index)s\ for\ the
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ position\ in\ the\ playlist\ and\ %%\ for\ a
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ literal\ percent.\ Use\ \-\ to\ output\ to\ stdout.
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Can\ also\ be\ used\ to\ download\ to\ a\ different
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ literal\ percent.\ %(height)s\ and\ %(width)s
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ for\ the\ width\ and\ height\ of\ the\ video
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ format.\ %(resolution)s\ for\ a\ textual
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ description\ of\ the\ resolution\ of\ the\ video
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ format.\ Use\ \-\ to\ output\ to\ stdout.\ Can\ also
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ be\ used\ to\ download\ to\ a\ different
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ directory,\ for\ example\ with\ \-o\ \[aq]/my/downloa
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ds/%(uploader)s/%(title)s\-%(id)s.%(ext)s\[aq]\ .
 \-\-autonumber\-size\ NUMBER\ \ \ \ \ \ \ \ \ Specifies\ the\ number\ of\ digits\ in
@@ -179,6 +190,7 @@ redistribute it or use it however you like.
 .nf
 \f[C]
 \-q,\ \-\-quiet\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ activates\ quiet\ mode
+\-\-no\-warnings\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Ignore\ warnings
 \-s,\ \-\-simulate\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video\ and\ do\ not\ write
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ anything\ to\ disk
 \-\-skip\-download\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video
@@ -190,7 +202,9 @@ redistribute it or use it however you like.
 \-\-get\-duration\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ video\ length
 \-\-get\-filename\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ filename
 \-\-get\-format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ format
-\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information
+\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information.
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ See\ \-\-output\ for\ a\ description\ of\ available
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ keys.
 \-\-newline\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ progress\ bar\ as\ new\ lines
 \-\-no\-progress\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ print\ progress\ bar
 \-\-console\-title\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ display\ progress\ in\ console\ titlebar
@@ -211,9 +225,9 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ preference\ using\ slashes:\ "\-f\ 22/17/18".
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "\-f\ mp4"\ and\ "\-f\ flv"\ are\ also\ supported.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ You\ can\ also\ use\ the\ special\ names\ "best",
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "bestaudio",\ "worst",\ and\ "worstaudio".\ By
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default,\ youtube\-dl\ will\ pick\ the\ best
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ quality.
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "bestvideo",\ "bestaudio",\ "worst",
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "worstvideo"\ and\ "worstaudio".\ By\ default,
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ youtube\-dl\ will\ pick\ the\ best\ quality.
 \-\-all\-formats\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ all\ available\ video\ formats
 \-\-prefer\-free\-formats\ \ \ \ \ \ \ \ \ \ \ \ prefer\ free\ video\ formats\ unless\ a\ specific
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ one\ is\ requested
@@ -272,6 +286,7 @@ redistribute it or use it however you like.
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default
 \-\-embed\-subs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ embed\ subtitles\ in\ the\ video\ (only\ for\ mp4
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ videos)
+\-\-embed\-thumbnail\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ embed\ thumbnail\ in\ the\ audio\ as\ cover\ art
 \-\-add\-metadata\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ metadata\ to\ the\ video\ file
 \-\-xattrs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ metadata\ to\ the\ video\ file\[aq]s\ xattrs
 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (using\ dublin\ core\ and\ xdg\ standards)
@@ -286,7 +301,7 @@ redistribute it or use it however you like.
 You can configure youtube\-dl by placing default arguments (such as
 \f[C]\-\-extract\-audio\ \-\-no\-mtime\f[] to always extract the audio
 and not copy the mtime) into \f[C]/etc/youtube\-dl.conf\f[] and/or
-\f[C]~/.config/youtube\-dl.conf\f[].
+\f[C]~/.config/youtube\-dl/config\f[].
 On Windows, the configuration file locations are
 \f[C]%APPDATA%\\youtube\-dl\\config.txt\f[] and
 \f[C]C:\\Users\\<Yourname>\\youtube\-dl.conf\f[].
@@ -359,12 +374,19 @@ Relative dates: Dates in the format
 \f[C](now|today)[+\-][0\-9](day|week|month|year)(s)?\f[]
 .PP
 Examples:
-.PP
-$ # Download only the videos uploaded in the last 6 months $ youtube\-dl
-\-\-dateafter now\-6months $ # Download only the videos uploaded on
-January 1, 1970 $ youtube\-dl \-\-date 19700101 $ # will only download
-the videos uploaded in the 200x decade $ youtube\-dl \-\-dateafter
-20000101 \-\-datebefore 20091231
+.IP
+.nf
+\f[C]
+#\ Download\ only\ the\ videos\ uploaded\ in\ the\ last\ 6\ months
+$\ youtube\-dl\ \-\-dateafter\ now\-6months
+
+#\ Download\ only\ the\ videos\ uploaded\ on\ January\ 1,\ 1970
+$\ youtube\-dl\ \-\-date\ 19700101
+
+$\ #\ will\ only\ download\ the\ videos\ uploaded\ in\ the\ 200x\ decade
+$\ youtube\-dl\ \-\-dateafter\ 20000101\ \-\-datebefore\ 20091231
+\f[]
+.fi
 .SH FAQ
 .SS Can you please put the \-b option back?
 .PP
@@ -473,19 +495,108 @@ zip
 nosetests
 .SS Adding support for a new site
 .PP
-If you want to add support for a new site, copy \f[I]any\f[] recently
-modified (https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor)
-file in \f[C]youtube_dl/extractor\f[], add an import in
+If you want to add support for a new site, you can follow this quick
+list (assuming your service is called \f[C]yourextractor\f[]):
+.IP " 1." 4
+Fork this repository (https://github.com/rg3/youtube-dl/fork)
+.IP " 2." 4
+Check out the source code with
+\f[C]git\ clone\ git\@github.com:YOUR_GITHUB_USERNAME/youtube\-dl.git\f[]
+.IP " 3." 4
+Start a new git branch with
+\f[C]cd\ youtube\-dl;\ git\ checkout\ \-b\ yourextractor\f[]
+.IP " 4." 4
+Start with this simple template and save it to
+\f[C]youtube_dl/extractor/yourextractor.py\f[]:
+.RS 4
+.IP
+.nf
+\f[C]
+#\ coding:\ utf\-8
+from\ __future__\ import\ unicode_literals
+
+import\ re
+
+from\ .common\ import\ InfoExtractor
+
+
+class\ YourExtractorIE(InfoExtractor):
+\ \ \ \ _VALID_URL\ =\ r\[aq]https?://(?:www\\.)?yourextractor\\.com/watch/(?P<id>[0\-9]+)\[aq]
+\ \ \ \ _TEST\ =\ {
+\ \ \ \ \ \ \ \ \[aq]url\[aq]:\ \[aq]http://yourextractor.com/watch/42\[aq],
+\ \ \ \ \ \ \ \ \[aq]md5\[aq]:\ \[aq]TODO:\ md5\ sum\ of\ the\ first\ 10KiB\ of\ the\ video\ file\[aq],
+\ \ \ \ \ \ \ \ \[aq]info_dict\[aq]:\ {
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ \[aq]42\[aq],
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]ext\[aq]:\ \[aq]mp4\[aq],
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ \[aq]Video\ title\ goes\ here\[aq],
+\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties,\ either\ as:
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ value
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ MD5\ checksum;\ start\ the\ string\ with\ md5:
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ regular\ expression;\ start\ the\ string\ with\ re:
+\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ Any\ Python\ type\ (for\ example\ int\ or\ float)
+\ \ \ \ \ \ \ \ }
+\ \ \ \ }
+
+\ \ \ \ def\ _real_extract(self,\ url):
+\ \ \ \ \ \ \ \ mobj\ =\ re.match(self._VALID_URL,\ url)
+\ \ \ \ \ \ \ \ video_id\ =\ mobj.group(\[aq]id\[aq])
+
+\ \ \ \ \ \ \ \ #\ TODO\ more\ code\ goes\ here,\ for\ example\ ...
+\ \ \ \ \ \ \ \ webpage\ =\ self._download_webpage(url,\ video_id)
+\ \ \ \ \ \ \ \ title\ =\ self._html_search_regex(r\[aq]<h1>(.*?)</h1>\[aq],\ webpage,\ \[aq]title\[aq])
+
+\ \ \ \ \ \ \ \ return\ {
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ video_id,
+\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ title,
+\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties\ (see\ youtube_dl/extractor/common.py)
+\ \ \ \ \ \ \ \ }
+\f[]
+.fi
+.RE
+.IP " 5." 4
+Add an import in
 \f[C]youtube_dl/extractor/__init__.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py).
+.IP " 6." 4
+Run
+\f[C]python\ test/test_download.py\ TestDownload.test_YourExtractor\f[].
+This \f[I]should fail\f[] at first, but you can continually re\-run it
+until you\[aq]re done.
+.IP " 7." 4
 Have a look at
 \f[C]youtube_dl/common/extractor/common.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py)
 for possible helper methods and a detailed description of what your
 extractor should
 return (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38).
-Don\[aq]t forget to run the tests with
-\f[C]python\ test/test_download.py\ Test_Download.test_YourExtractor\f[]!
-For a detailed tutorial, refer to this blog
-post (http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/).
+Add tests and code for as many as you want.
+.IP " 8." 4
+If you can, check the code with
+pyflakes (https://pypi.python.org/pypi/pyflakes) (a good idea) and
+pep8 (https://pypi.python.org/pypi/pep8) (optional, ignore E501).
+.IP " 9." 4
+When the tests pass,
+add (https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the
+new files and
+commit (https://www.kernel.org/pub/software/scm/git/docs/git-commit.html)
+them and
+push (https://www.kernel.org/pub/software/scm/git/docs/git-push.html)
+the result, like this:
+.RS 4
+.IP
+.nf
+\f[C]
+$\ git\ add\ youtube_dl/extractor/__init__.py
+$\ git\ add\ youtube_dl/extractor/yourextractor.py
+$\ git\ commit\ \-m\ \[aq][yourextractor]\ Add\ new\ extractor\[aq]
+$\ git\ push\ origin\ yourextractor
+\f[]
+.fi
+.RE
+.IP "10." 4
+Finally, create a pull
+request (https://help.github.com/articles/creating-a-pull-request).
+We\[aq]ll then review and merge it.
+.PP
+In any case, thank you very much for your contributions!
 .SH BUGS
 .PP
 Bugs and suggestions should be reported at:
@@ -537,7 +648,7 @@ For bug reports, this means that your report should contain the
 The error message you get for (most) bugs even says so, but you would
 not believe how many of our bug reports do not contain this information.
 .PP
-Site support requests must contain an example URL.
+Site support requests \f[B]must contain an example URL\f[].
 An example URL is a URL you might want to download, like
 http://www.youtube.com/watch?v=BaW_jenozKc .
 There should be an obvious video present.
index a5398bbae26cb0a76f4f7fae50552bb9b389bc68..498e841dc762910ea6a1fcb73daa104f5d5af70d 100644 (file)
@@ -4,7 +4,7 @@ __youtube_dl()
     COMPREPLY=()
     cur="${COMP_WORDS[COMP_CWORD]}"
     prev="${COMP_WORDS[COMP_CWORD-1]}"
-    opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --socket-timeout --bidi-workaround --default-search --ignore-config --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --no-playlist --age-limit --download-archive --include-ads --youtube-include-dash-manifest --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --load-info --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --print-traffic --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata --xattrs --prefer-avconv --prefer-ffmpeg"
+    opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --add-header --list-extractors --extractor-descriptions --proxy --no-check-certificate --prefer-insecure --cache-dir --no-cache-dir --socket-timeout --bidi-workaround --default-search --ignore-config --encoding --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --no-playlist --age-limit --download-archive --include-ads --youtube-include-dash-manifest --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --load-info --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --print-traffic --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --xattrs --prefer-avconv --prefer-ffmpeg"
     keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
     fileopts="-a|--batch-file|--download-archive|--cookies|--load-info"
     diropts="--cache-dir"
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
deleted file mode 100755 (executable)
index 672ef9e..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-# Legacy file for backwards compatibility, use youtube_dl.extractor instead!
-
-from .extractor.common import InfoExtractor, SearchInfoExtractor
-from .extractor import gen_extractors, get_info_extractor
old mode 100644 (file)
new mode 100755 (executable)
index 42cbcf6..dc0ba98
@@ -4,9 +4,11 @@
 from __future__ import absolute_import, unicode_literals
 
 import collections
+import datetime
 import errno
 import io
 import json
+import locale
 import os
 import platform
 import re
@@ -29,6 +31,7 @@ from .utils import (
     ContentTooShortError,
     date_from_str,
     DateRange,
+    DEFAULT_OUTTMPL,
     determine_ext,
     DownloadError,
     encodeFilename,
@@ -93,6 +96,7 @@ class YoutubeDL(object):
     usenetrc:          Use netrc for authentication instead.
     verbose:           Print additional info to stdout.
     quiet:             Do not print messages to stdout.
+    no_warnings:       Do not print out anything for warnings.
     forceurl:          Force printing final URL.
     forcetitle:        Force printing title.
     forceid:           Force printing ID.
@@ -147,6 +151,8 @@ class YoutubeDL(object):
                        again.
     cookiefile:        File name where cookies should be read from and dumped to.
     nocheckcertificate:Do not verify SSL certificates
+    prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
+                       At the moment, this is only supported by YouTube.
     proxy:             URL of the proxy server to use
     socket_timeout:    Time to wait for unresponsive hosts, in seconds
     bidi_workaround:   Work around buggy terminals without bidirectional text
@@ -155,6 +161,7 @@ class YoutubeDL(object):
     include_ads:       Download ads as well
     default_search:    Prepend this string if an input url is not valid.
                        'auto' for elaborate guessing
+    encoding:          Use this encoding instead of the system-specified.
 
     The following parameters are not used by YoutubeDL itself, they are used by
     the FileDownloader:
@@ -280,6 +287,9 @@ class YoutubeDL(object):
         """Print message to stdout if not in quiet mode."""
         return self.to_stdout(message, skip_eol, check_quiet=True)
 
+    def _write_string(self, s, out=None):
+        write_string(s, out=out, encoding=self.params.get('encoding'))
+
     def to_stdout(self, message, skip_eol=False, check_quiet=False):
         """Print message to stdout if not in quiet mode."""
         if self.params.get('logger'):
@@ -289,7 +299,7 @@ class YoutubeDL(object):
             terminator = ['\n', ''][skip_eol]
             output = message + terminator
 
-            write_string(output, self._screen_file)
+            self._write_string(output, self._screen_file)
 
     def to_stderr(self, message):
         """Print message to stderr."""
@@ -299,7 +309,7 @@ class YoutubeDL(object):
         else:
             message = self._bidi_workaround(message)
             output = message + '\n'
-            write_string(output, self._err_file)
+            self._write_string(output, self._err_file)
 
     def to_console_title(self, message):
         if not self.params.get('consoletitle', False):
@@ -309,21 +319,21 @@ class YoutubeDL(object):
             # already of type unicode()
             ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
         elif 'TERM' in os.environ:
-            write_string('\033]0;%s\007' % message, self._screen_file)
+            self._write_string('\033]0;%s\007' % message, self._screen_file)
 
     def save_console_title(self):
         if not self.params.get('consoletitle', False):
             return
         if 'TERM' in os.environ:
             # Save the title on stack
-            write_string('\033[22;0t', self._screen_file)
+            self._write_string('\033[22;0t', self._screen_file)
 
     def restore_console_title(self):
         if not self.params.get('consoletitle', False):
             return
         if 'TERM' in os.environ:
             # Restore the title from stack
-            write_string('\033[23;0t', self._screen_file)
+            self._write_string('\033[23;0t', self._screen_file)
 
     def __enter__(self):
         self.save_console_title()
@@ -370,12 +380,17 @@ class YoutubeDL(object):
         Print the message to stderr, it will be prefixed with 'WARNING:'
         If stderr is a tty file the 'WARNING:' will be colored
         '''
-        if self._err_file.isatty() and os.name != 'nt':
-            _msg_header = '\033[0;33mWARNING:\033[0m'
+        if self.params.get('logger') is not None:
+            self.params['logger'].warning(message)
         else:
-            _msg_header = 'WARNING:'
-        warning_message = '%s %s' % (_msg_header, message)
-        self.to_stderr(warning_message)
+            if self.params.get('no_warnings'):
+                return
+            if self._err_file.isatty() and os.name != 'nt':
+                _msg_header = '\033[0;33mWARNING:\033[0m'
+            else:
+                _msg_header = 'WARNING:'
+            warning_message = '%s %s' % (_msg_header, message)
+            self.to_stderr(warning_message)
 
     def report_error(self, message, tb=None):
         '''
@@ -409,6 +424,13 @@ class YoutubeDL(object):
             template_dict['autonumber'] = autonumber_templ % self._num_downloads
             if template_dict.get('playlist_index') is not None:
                 template_dict['playlist_index'] = '%05d' % template_dict['playlist_index']
+            if template_dict.get('resolution') is None:
+                if template_dict.get('width') and template_dict.get('height'):
+                    template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
+                elif template_dict.get('height'):
+                    template_dict['resolution'] = '%sp' % template_dict['height']
+                elif template_dict.get('width'):
+                    template_dict['resolution'] = '?x%d' % template_dict['width']
 
             sanitize = lambda k, v: sanitize_filename(
                 compat_str(v),
@@ -419,7 +441,8 @@ class YoutubeDL(object):
                                  if v is not None)
             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 
-            tmpl = os.path.expanduser(self.params['outtmpl'])
+            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
+            tmpl = os.path.expanduser(outtmpl)
             filename = tmpl % template_dict
             return filename
         except ValueError as err:
@@ -499,13 +522,7 @@ class YoutubeDL(object):
                         '_type': 'compat_list',
                         'entries': ie_result,
                     }
-                self.add_extra_info(ie_result,
-                    {
-                        'extractor': ie.IE_NAME,
-                        'webpage_url': url,
-                        'webpage_url_basename': url_basename(url),
-                        'extractor_key': ie.ie_key(),
-                    })
+                self.add_default_extra_info(ie_result, ie, url)
                 if process:
                     return self.process_ie_result(ie_result, download, extra_info)
                 else:
@@ -522,7 +539,15 @@ class YoutubeDL(object):
                 else:
                     raise
         else:
-            self.report_error('no suitable InfoExtractor: %s' % url)
+            self.report_error('no suitable InfoExtractor for URL %s' % url)
+
+    def add_default_extra_info(self, ie_result, ie, url):
+        self.add_extra_info(ie_result, {
+            'extractor': ie.IE_NAME,
+            'webpage_url': url,
+            'webpage_url_basename': url_basename(url),
+            'extractor_key': ie.ie_key(),
+        })
 
     def process_ie_result(self, ie_result, download=True, extra_info={}):
         """
@@ -656,6 +681,18 @@ class YoutubeDL(object):
                 if f.get('vcodec') == 'none']
             if audio_formats:
                 return audio_formats[0]
+        elif format_spec == 'bestvideo':
+            video_formats = [
+                f for f in available_formats
+                if f.get('acodec') == 'none']
+            if video_formats:
+                return video_formats[-1]
+        elif format_spec == 'worstvideo':
+            video_formats = [
+                f for f in available_formats
+                if f.get('acodec') == 'none']
+            if video_formats:
+                return video_formats[0]
         else:
             extensions = ['mp4', 'flv', 'webm', '3gp']
             if format_spec in extensions:
@@ -670,11 +707,35 @@ class YoutubeDL(object):
     def process_video_result(self, info_dict, download=True):
         assert info_dict.get('_type', 'video') == 'video'
 
+        if 'id' not in info_dict:
+            raise ExtractorError('Missing "id" field in extractor result')
+        if 'title' not in info_dict:
+            raise ExtractorError('Missing "title" field in extractor result')
+
         if 'playlist' not in info_dict:
             # It isn't part of a playlist
             info_dict['playlist'] = None
             info_dict['playlist_index'] = None
 
+        thumbnails = info_dict.get('thumbnails')
+        if thumbnails:
+            thumbnails.sort(key=lambda t: (
+                t.get('width'), t.get('height'), t.get('url')))
+            for t in thumbnails:
+                if 'width' in t and 'height' in t:
+                    t['resolution'] = '%dx%d' % (t['width'], t['height'])
+
+        if thumbnails and 'thumbnail' not in info_dict:
+            info_dict['thumbnail'] = thumbnails[-1]['url']
+
+        if 'display_id' not in info_dict and 'id' in info_dict:
+            info_dict['display_id'] = info_dict['id']
+
+        if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
+            upload_date = datetime.datetime.utcfromtimestamp(
+                info_dict['timestamp'])
+            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
+
         # This extractors handle format selection themselves
         if info_dict['extractor'] in ['Youku']:
             if download:
@@ -688,8 +749,14 @@ class YoutubeDL(object):
         else:
             formats = info_dict['formats']
 
+        if not formats:
+            raise ExtractorError('No video formats found!')
+
         # We check that all the formats have the format and format_id fields
-        for (i, format) in enumerate(formats):
+        for i, format in enumerate(formats):
+            if 'url' not in format:
+                raise ExtractorError('Missing "url" key in result (index %d)' % i)
+
             if format.get('format_id') is None:
                 format['format_id'] = compat_str(i)
             if format.get('format') is None:
@@ -700,7 +767,7 @@ class YoutubeDL(object):
                 )
             # Automatically determine file extension if missing
             if 'ext' not in format:
-                format['ext'] = determine_ext(format['url'])
+                format['ext'] = determine_ext(format['url']).lower()
 
         format_limit = self.params.get('format_limit', None)
         if format_limit:
@@ -825,7 +892,7 @@ class YoutubeDL(object):
 
         try:
             dn = os.path.dirname(encodeFilename(filename))
-            if dn != '' and not os.path.exists(dn):
+            if dn and not os.path.exists(dn):
                 os.makedirs(dn)
         except (OSError, IOError) as err:
             self.report_error('unable to create directory ' + compat_str(err))
@@ -882,7 +949,7 @@ class YoutubeDL(object):
                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                                 subfile.write(sub)
                 except (OSError, IOError):
-                    self.report_error('Cannot write subtitles file ' + descfn)
+                    self.report_error('Cannot write subtitles file ' + sub_filename)
                     return
 
         if self.params.get('writeinfojson', False):
@@ -908,7 +975,7 @@ class YoutubeDL(object):
                     self.to_screen('[%s] %s: Downloading thumbnail ...' %
                                    (info_dict['extractor'], info_dict['id']))
                     try:
-                        uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
+                        uf = self.urlopen(info_dict['thumbnail'])
                         with open(thumb_filename, 'wb') as thumbf:
                             shutil.copyfileobj(uf, thumbf)
                         self.to_screen('[%s] %s: Writing thumbnail to: %s' %
@@ -971,10 +1038,11 @@ class YoutubeDL(object):
 
     def download(self, url_list):
         """Download a given list of URLs."""
+        outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
         if (len(url_list) > 1 and
-                '%' not in self.params['outtmpl']
+                '%' not in outtmpl
                 and self.params.get('max_downloads') != 1):
-            raise SameFileError(self.params['outtmpl'])
+            raise SameFileError(outtmpl)
 
         for url in url_list:
             try:
@@ -1085,57 +1153,57 @@ class YoutubeDL(object):
             res = default
         return res
 
-    def list_formats(self, info_dict):
-        def format_note(fdict):
-            res = ''
-            if fdict.get('ext') in ['f4f', 'f4m']:
-                res += '(unsupported) '
-            if fdict.get('format_note') is not None:
-                res += fdict['format_note'] + ' '
-            if fdict.get('tbr') is not None:
-                res += '%4dk ' % fdict['tbr']
-            if fdict.get('container') is not None:
-                if res:
-                    res += ', '
-                res += '%s container' % fdict['container']
-            if (fdict.get('vcodec') is not None and
-                    fdict.get('vcodec') != 'none'):
-                if res:
-                    res += ', '
-                res += fdict['vcodec']
-                if fdict.get('vbr') is not None:
-                    res += '@'
-            elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
-                res += 'video@'
+    def _format_note(self, fdict):
+        res = ''
+        if fdict.get('ext') in ['f4f', 'f4m']:
+            res += '(unsupported) '
+        if fdict.get('format_note') is not None:
+            res += fdict['format_note'] + ' '
+        if fdict.get('tbr') is not None:
+            res += '%4dk ' % fdict['tbr']
+        if fdict.get('container') is not None:
+            if res:
+                res += ', '
+            res += '%s container' % fdict['container']
+        if (fdict.get('vcodec') is not None and
+                fdict.get('vcodec') != 'none'):
+            if res:
+                res += ', '
+            res += fdict['vcodec']
             if fdict.get('vbr') is not None:
-                res += '%4dk' % fdict['vbr']
-            if fdict.get('acodec') is not None:
-                if res:
-                    res += ', '
-                if fdict['acodec'] == 'none':
-                    res += 'video only'
-                else:
-                    res += '%-5s' % fdict['acodec']
-            elif fdict.get('abr') is not None:
-                if res:
-                    res += ', '
-                res += 'audio'
-            if fdict.get('abr') is not None:
-                res += '@%3dk' % fdict['abr']
-            if fdict.get('asr') is not None:
-                res += ' (%5dHz)' % fdict['asr']
-            if fdict.get('filesize') is not None:
-                if res:
-                    res += ', '
-                res += format_bytes(fdict['filesize'])
-            return res
+                res += '@'
+        elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
+            res += 'video@'
+        if fdict.get('vbr') is not None:
+            res += '%4dk' % fdict['vbr']
+        if fdict.get('acodec') is not None:
+            if res:
+                res += ', '
+            if fdict['acodec'] == 'none':
+                res += 'video only'
+            else:
+                res += '%-5s' % fdict['acodec']
+        elif fdict.get('abr') is not None:
+            if res:
+                res += ', '
+            res += 'audio'
+        if fdict.get('abr') is not None:
+            res += '@%3dk' % fdict['abr']
+        if fdict.get('asr') is not None:
+            res += ' (%5dHz)' % fdict['asr']
+        if fdict.get('filesize') is not None:
+            if res:
+                res += ', '
+            res += format_bytes(fdict['filesize'])
+        return res
 
+    def list_formats(self, info_dict):
         def line(format, idlen=20):
             return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % (
                 format['format_id'],
                 format['ext'],
                 self.format_resolution(format),
-                format_note(format),
+                self._format_note(format),
             ))
 
         formats = info_dict.get('formats', [info_dict])
@@ -1143,8 +1211,8 @@ class YoutubeDL(object):
                     max(len(f['format_id']) for f in formats))
         formats_s = [line(f, idlen) for f in formats]
         if len(formats) > 1:
-            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
-            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
+            formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)'
+            formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)'
 
         header_line = line({
             'format_id': 'format code', 'ext': 'extension',
@@ -1154,12 +1222,22 @@ class YoutubeDL(object):
 
     def urlopen(self, req):
         """ Start an HTTP download """
-        return self._opener.open(req)
+        return self._opener.open(req, timeout=self._socket_timeout)
 
     def print_debug_header(self):
         if not self.params.get('verbose'):
             return
-        write_string('[debug] youtube-dl version ' + __version__ + '\n')
+
+        write_string(
+            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
+                locale.getpreferredencoding(),
+                sys.getfilesystemencoding(),
+                sys.stdout.encoding,
+                self.get_encoding()),
+            encoding=None
+        )
+
+        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
         try:
             sp = subprocess.Popen(
                 ['git', 'rev-parse', '--short', 'HEAD'],
@@ -1168,24 +1246,24 @@ class YoutubeDL(object):
             out, err = sp.communicate()
             out = out.decode().strip()
             if re.match('[0-9a-f]+', out):
-                write_string('[debug] Git HEAD: ' + out + '\n')
+                self._write_string('[debug] Git HEAD: ' + out + '\n')
         except:
             try:
                 sys.exc_clear()
             except:
                 pass
-        write_string('[debug] Python version %s - %s' %
+        self._write_string('[debug] Python version %s - %s' %
                      (platform.python_version(), platform_name()) + '\n')
 
         proxy_map = {}
         for handler in self._opener.handlers:
             if hasattr(handler, 'proxies'):
                 proxy_map.update(handler.proxies)
-        write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
+        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')
 
     def _setup_opener(self):
         timeout_val = self.params.get('socket_timeout')
-        timeout = 600 if timeout_val is None else float(timeout_val)
+        self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
 
         opts_cookiefile = self.params.get('cookiefile')
         opts_proxy = self.params.get('proxy')
@@ -1224,6 +1302,18 @@ class YoutubeDL(object):
         opener.addheaders = []
         self._opener = opener
 
-        # TODO remove this global modification
-        compat_urllib_request.install_opener(opener)
-        socket.setdefaulttimeout(timeout)
+    def encode(self, s):
+        if isinstance(s, bytes):
+            return s  # Already encoded
+
+        try:
+            return s.encode(self.get_encoding())
+        except UnicodeEncodeError as err:
+            err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
+            raise
+
+    def get_encoding(self):
+        encoding = self.params.get('encoding')
+        if encoding is None:
+            encoding = preferredencoding()
+        return encoding
index f843036c71e75cde73285351b10d7f06c3f7e542..1e01432d27c92dd92e82794f91870ca0eb2eff68 100644 (file)
@@ -46,12 +46,25 @@ __authors__  = (
     'Andreas Schmitz',
     'Michael Kaiser',
     'Niklas Laxström',
+    'David Triendl',
+    'Anthony Weems',
+    'David Wagner',
+    'Juan C. Olivares',
+    'Mattias Harrysson',
+    'phaer',
+    'Sainyam Kapoor',
+    'Nicolas Évrard',
+    'Jason Normore',
+    'Hoje Lee',
+    'Adam Thalhammer',
+    'Georg Jähnig',
+    'Ralf Haring',
 )
 
 __license__ = 'Public Domain'
 
 import codecs
-import getpass
+import io
 import locale
 import optparse
 import os
@@ -62,14 +75,17 @@ import sys
 
 
 from .utils import (
+    compat_getpass,
     compat_print,
     DateRange,
+    DEFAULT_OUTTMPL,
     decodeOption,
     get_term_width,
     DownloadError,
     get_cachedir,
     MaxDownloadsReached,
     preferredencoding,
+    read_batch_urls,
     SameFileError,
     setproctitle,
     std_headers,
@@ -83,6 +99,8 @@ from .extractor import gen_extractors
 from .version import __version__
 from .YoutubeDL import YoutubeDL
 from .postprocessor import (
+    AtomicParsleyPP,
+    FFmpegAudioFixPP,
     FFmpegMetadataPP,
     FFmpegVideoConvertor,
     FFmpegExtractAudioPP,
@@ -208,7 +226,7 @@ def parseOpts(overrideArguments=None):
     general.add_option('-U', '--update',
             action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
     general.add_option('-i', '--ignore-errors',
-            action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False)
+            action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False)
     general.add_option('--abort-on-error',
             action='store_false', dest='ignoreerrors',
             help='Abort downloading of further videos (in the playlist or the command line) if an error occurs')
@@ -220,6 +238,9 @@ def parseOpts(overrideArguments=None):
     general.add_option('--referer',
             dest='referer', help='specify a custom referer, use if the video access is restricted to one domain',
             metavar='REF', default=None)
+    general.add_option('--add-header',
+            dest='headers', help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', action="append",
+            metavar='FIELD:VALUE')
     general.add_option('--list-extractors',
             action='store_true', dest='list_extractors',
             help='List all supported extractors and the URLs they would handle', default=False)
@@ -230,6 +251,9 @@ def parseOpts(overrideArguments=None):
         '--proxy', dest='proxy', default=None, metavar='URL',
         help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
     general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
+    general.add_option(
+        '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure',
+        help='Use an unencrypted connection to retrieve information about the video. (Currently supported only for YouTube)')
     general.add_option(
         '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
         help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.')
@@ -242,14 +266,17 @@ def parseOpts(overrideArguments=None):
     general.add_option(
         '--bidi-workaround', dest='bidi_workaround', action='store_true',
         help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
-    general.add_option('--default-search',
-            dest='default_search', metavar='PREFIX',
-            help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
+    general.add_option(
+        '--default-search',
+        dest='default_search', metavar='PREFIX',
+        help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for  youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.')
     general.add_option(
         '--ignore-config',
         action='store_true',
         help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)')
-
+    general.add_option(
+        '--encoding', dest='encoding', metavar='ENCODING',
+        help='Force the specified encoding (experimental)')
 
     selection.add_option(
         '--playlist-start',
@@ -309,7 +336,7 @@ def parseOpts(overrideArguments=None):
 
     video_format.add_option('-f', '--format',
             action='store', dest='format', metavar='FORMAT', default=None,
-            help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestaudio", "worst", and "worstaudio". By default, youtube-dl will pick the best quality.')
+            help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.')
     video_format.add_option('--all-formats',
             action='store_const', dest='format', help='download all available video formats', const='all')
     video_format.add_option('--prefer-free-formats',
@@ -352,6 +379,10 @@ def parseOpts(overrideArguments=None):
 
     verbosity.add_option('-q', '--quiet',
             action='store_true', dest='quiet', help='activates quiet mode', default=False)
+    verbosity.add_option(
+        '--no-warnings',
+        dest='no_warnings', action='store_true', default=False,
+        help='Ignore warnings')
     verbosity.add_option('-s', '--simulate',
             action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
     verbosity.add_option('--skip-download',
@@ -379,7 +410,7 @@ def parseOpts(overrideArguments=None):
             help='simulate, quiet but print output format', default=False)
     verbosity.add_option('-j', '--dump-json',
             action='store_true', dest='dumpjson',
-            help='simulate, quiet but print JSON information', default=False)
+            help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False)
     verbosity.add_option('--newline',
             action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
     verbosity.add_option('--no-progress',
@@ -424,6 +455,8 @@ def parseOpts(overrideArguments=None):
                   '%(extractor)s for the provider (youtube, metacafe, etc), '
                   '%(id)s for the video id, %(playlist)s for the playlist the video is in, '
                   '%(playlist_index)s for the position in the playlist and %% for a literal percent. '
+                  '%(height)s and %(width)s for the width and height of the video format. '
+                  '%(resolution)s for a textual description of the resolution of the video format. '
                   'Use - to output to stdout. Can also be used to download to a different directory, '
                   'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .'))
     filesystem.add_option('--autonumber-size',
@@ -479,6 +512,8 @@ def parseOpts(overrideArguments=None):
             help='do not overwrite post-processed files; the post-processed files are overwritten by default')
     postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False,
             help='embed subtitles in the video (only for mp4 videos)')
+    postproc.add_option('--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False,
+            help='embed thumbnail in the audio as cover art')
     postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False,
             help='write metadata to the video file')
     postproc.add_option('--xattrs', action='store_true', dest='xattrs', default=False,
@@ -521,8 +556,6 @@ def parseOpts(overrideArguments=None):
             write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n')
             write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n')
             write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n')
-            write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' %
-                         (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding()))
 
     return parser, opts, args
 
@@ -545,27 +578,35 @@ def _real_main(argv=None):
     if opts.referer is not None:
         std_headers['Referer'] = opts.referer
 
+    # Custom HTTP headers
+    if opts.headers is not None:
+        for h in opts.headers:
+            if h.find(':', 1) < 0:
+                parser.error(u'wrong header formatting, it should be key:value, not "%s"'%h)
+            key, value = h.split(':', 2)
+            if opts.verbose:
+                write_string(u'[debug] Adding header from command line option %s:%s\n'%(key, value))
+            std_headers[key] = value
+
     # Dump user agent
     if opts.dump_user_agent:
         compat_print(std_headers['User-Agent'])
         sys.exit(0)
 
     # Batch file verification
-    batchurls = []
+    batch_urls = []
     if opts.batchfile is not None:
         try:
             if opts.batchfile == '-':
                 batchfd = sys.stdin
             else:
-                batchfd = open(opts.batchfile, 'r')
-            batchurls = batchfd.readlines()
-            batchurls = [x.strip() for x in batchurls]
-            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
+                batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore')
+            batch_urls = read_batch_urls(batchfd)
             if opts.verbose:
-                write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
+                write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n')
         except IOError:
             sys.exit(u'ERROR: batch file could not be read')
-    all_urls = batchurls + args
+    all_urls = batch_urls + args
     all_urls = [url.strip() for url in all_urls]
     _enc = preferredencoding()
     all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls]
@@ -604,7 +645,7 @@ def _real_main(argv=None):
     if opts.usetitle and opts.useid:
         parser.error(u'using title conflicts with using video ID')
     if opts.username is not None and opts.password is None:
-        opts.password = getpass.getpass(u'Type account password and press return:')
+        opts.password = compat_getpass(u'Type account password and press [Return]: ')
     if opts.ratelimit is not None:
         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
         if numeric_limit is None:
@@ -642,13 +683,13 @@ def _real_main(argv=None):
         if not opts.audioquality.isdigit():
             parser.error(u'invalid audio quality specified')
     if opts.recodevideo is not None:
-        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg']:
+        if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']:
             parser.error(u'invalid video recode format specified')
     if opts.date is not None:
         date = DateRange.day(opts.date)
     else:
         date = DateRange(opts.dateafter, opts.datebefore)
-    if opts.default_search not in ('auto', None) and ':' not in opts.default_search:
+    if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search:
         parser.error(u'--default-search invalid; did you forget a colon (:) at the end?')
 
     # Do not download videos when there are audio-only formats
@@ -671,7 +712,7 @@ def _real_main(argv=None):
             or (opts.usetitle and u'%(title)s-%(id)s.%(ext)s')
             or (opts.useid and u'%(id)s.%(ext)s')
             or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
-            or u'%(title)s-%(id)s.%(ext)s')
+            or DEFAULT_OUTTMPL)
     if not os.path.splitext(outtmpl)[1] and opts.extractaudio:
         parser.error(u'Cannot download a video and extract audio into the same'
                      u' file! Use "{0}.%(ext)s" instead of "{0}" as the output'
@@ -686,6 +727,7 @@ def _real_main(argv=None):
         'password': opts.password,
         'videopassword': opts.videopassword,
         'quiet': (opts.quiet or any_printing),
+        'no_warnings': opts.no_warnings,
         'forceurl': opts.geturl,
         'forcetitle': opts.gettitle,
         'forceid': opts.getid,
@@ -749,6 +791,7 @@ def _real_main(argv=None):
         'download_archive': download_archive_fn,
         'cookiefile': opts.cookiefile,
         'nocheckcertificate': opts.no_check_certificate,
+        'prefer_insecure': opts.prefer_insecure,
         'proxy': opts.proxy,
         'socket_timeout': opts.socket_timeout,
         'bidi_workaround': opts.bidi_workaround,
@@ -757,6 +800,7 @@ def _real_main(argv=None):
         'include_ads': opts.include_ads,
         'default_search': opts.default_search,
         'youtube_include_dash_manifest': opts.youtube_include_dash_manifest,
+        'encoding': opts.encoding,
     }
 
     with YoutubeDL(ydl_opts) as ydl:
@@ -775,6 +819,10 @@ def _real_main(argv=None):
             ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
         if opts.xattrs:
             ydl.add_post_processor(XAttrMetadataPP())
+        if opts.embedthumbnail:
+            if not opts.addmetadata:
+                ydl.add_post_processor(FFmpegAudioFixPP())
+            ydl.add_post_processor(AtomicParsleyPP())
 
         # Update version
         if opts.update_self:
index 5a068aa8b8063c8503eb4dc8170fd19303779a02..917f3450e63c62b95551081109c5d3f55f49aeba 100644 (file)
@@ -4,9 +4,10 @@ import sys
 import time
 
 from ..utils import (
+    compat_str,
     encodeFilename,
-    timeconvert,
     format_bytes,
+    timeconvert,
 )
 
 
@@ -173,7 +174,7 @@ class FileDownloader(object):
                 return
             os.rename(encodeFilename(old_filename), encodeFilename(new_filename))
         except (IOError, OSError) as err:
-            self.report_error(u'unable to rename file: %s' % str(err))
+            self.report_error(u'unable to rename file: %s' % compat_str(err))
 
     def try_utime(self, filename, last_modified_hdr):
         """Try to set the last-modified time of the given file."""
index 2a870a758fa32dd475eaf725f69dedbf3c45903b..e6be6ae6c878c9ede7cd2cf3b6be663e22bb8be1 100644 (file)
@@ -12,7 +12,6 @@ from .http import HttpFD
 from ..utils import (
     struct_pack,
     struct_unpack,
-    compat_urllib_request,
     compat_urlparse,
     format_bytes,
     encodeFilename,
@@ -117,8 +116,8 @@ class FlvReader(io.BytesIO):
         self.read_unsigned_char()
         # flags
         self.read(3)
-        # BootstrapinfoVersion
-        bootstrap_info_version = self.read_unsigned_int()
+
+        self.read_unsigned_int()  # BootstrapinfoVersion
         # Profile,Live,Update,Reserved
         self.read(1)
         # time scale
@@ -127,15 +126,15 @@ class FlvReader(io.BytesIO):
         self.read_unsigned_long_long()
         # SmpteTimeCodeOffset
         self.read_unsigned_long_long()
-        # MovieIdentifier
-        movie_identifier = self.read_string()
+
+        self.read_string()  # MovieIdentifier
         server_count = self.read_unsigned_char()
         # ServerEntryTable
         for i in range(server_count):
             self.read_string()
         quality_count = self.read_unsigned_char()
         # QualityEntryTable
-        for i in range(server_count):
+        for i in range(quality_count):
             self.read_string()
         # DrmData
         self.read_string()
@@ -298,6 +297,7 @@ class F4mFD(FileDownloader):
                         break
             frags_filenames.append(frag_filename)
 
+        dest_stream.close()
         self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start)
 
         self.try_rename(tmpfilename, filename)
index fa983462babeb6a6bccc2f22992411dd6aea8a5b..9d407fe6eb81683b19c5671ef7b92050e0a690c3 100644 (file)
@@ -13,8 +13,10 @@ class HlsFD(FileDownloader):
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
 
-        args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy',
-            '-bsf:a', 'aac_adtstoasc', tmpfilename]
+        args = [
+            '-y', '-i', url, '-f', 'mp4', '-c', 'copy',
+            '-bsf:a', 'aac_adtstoasc',
+            encodeFilename(tmpfilename, for_subprocess=True)]
 
         for program in ['avconv', 'ffmpeg']:
             try:
index 748f9f3adfe0555024cce272b8eb872c55558d6c..f79e6a99587cdc7d13ba210e528512a1e572e4ea 100644 (file)
@@ -14,6 +14,8 @@ from ..utils import (
 
 
 class HttpFD(FileDownloader):
+    _TEST_FILE_SIZE = 10241
+
     def real_download(self, filename, info_dict):
         url = info_dict['url']
         tmpfilename = self.temp_name(filename)
@@ -23,11 +25,15 @@ class HttpFD(FileDownloader):
         headers = {'Youtubedl-no-compression': 'True'}
         if 'user_agent' in info_dict:
             headers['Youtubedl-user-agent'] = info_dict['user_agent']
+        if 'http_referer' in info_dict:
+            headers['Referer'] = info_dict['http_referer']
         basic_request = compat_urllib_request.Request(url, None, headers)
         request = compat_urllib_request.Request(url, None, headers)
 
-        if self.params.get('test', False):
-            request.add_header('Range', 'bytes=0-10240')
+        is_test = self.params.get('test', False)
+
+        if is_test:
+            request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1))
 
         # Establish possible resume length
         if os.path.isfile(encodeFilename(tmpfilename)):
@@ -49,7 +55,7 @@ class HttpFD(FileDownloader):
         while count <= retries:
             # Establish connection
             try:
-                data = compat_urllib_request.urlopen(request)
+                data = self.ydl.urlopen(request)
                 break
             except (compat_urllib_error.HTTPError, ) as err:
                 if (err.code < 500 or err.code >= 600) and err.code != 416:
@@ -59,7 +65,7 @@ class HttpFD(FileDownloader):
                     # Unable to resume (requested range not satisfiable)
                     try:
                         # Open the connection again without the range header
-                        data = compat_urllib_request.urlopen(basic_request)
+                        data = self.ydl.urlopen(basic_request)
                         content_length = data.info()['Content-Length']
                     except (compat_urllib_error.HTTPError, ) as err:
                         if err.code < 500 or err.code >= 600:
@@ -85,6 +91,7 @@ class HttpFD(FileDownloader):
                         else:
                             # The length does not match, we start the download over
                             self.report_unable_to_resume()
+                            resume_len = 0
                             open_mode = 'wb'
                             break
             # Retry
@@ -97,6 +104,15 @@ class HttpFD(FileDownloader):
             return False
 
         data_len = data.info().get('Content-length', None)
+
+        # Range HTTP header may be ignored/unsupported by a webserver
+        # (e.g. extractor/scivee.py, extractor/bambuser.py).
+        # However, for a test we still would like to download just a piece of a file.
+        # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
+        # block size when downloading a file.
+        if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+            data_len = self._TEST_FILE_SIZE
+
         if data_len is not None:
             data_len = int(data_len) + resume_len
             min_data_len = self.params.get("min_filesize", None)
@@ -115,7 +131,7 @@ class HttpFD(FileDownloader):
         while True:
             # Download and write
             before = time.time()
-            data_block = data.read(block_size)
+            data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
             after = time.time()
             if len(data_block) == 0:
                 break
@@ -159,6 +175,9 @@ class HttpFD(FileDownloader):
                 'speed': speed,
             })
 
+            if is_test and byte_counter == data_len:
+                break
+
             # Apply rate limit
             self.slow_down(start, byte_counter - resume_len)
 
index e93c28d6482857d6f84cf3acd6b5b1651168d660..cc6a84106b4ccc1221b74da313eb619544c4a8ef 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import os
 import re
 import subprocess
@@ -8,6 +10,7 @@ from .common import FileDownloader
 from ..utils import (
     encodeFilename,
     format_bytes,
+    compat_str,
 )
 
 
@@ -22,7 +25,7 @@ class RtmpFD(FileDownloader):
             proc_stderr_closed = False
             while not proc_stderr_closed:
                 # read line from stderr
-                line = u''
+                line = ''
                 while True:
                     char = proc.stderr.read(1)
                     if not char:
@@ -46,7 +49,7 @@ class RtmpFD(FileDownloader):
                     data_len = None
                     if percent > 0:
                         data_len = int(downloaded_data_len * 100 / percent)
-                    data_len_str = u'~' + format_bytes(data_len)
+                    data_len_str = '~' + format_bytes(data_len)
                     self.report_progress(percent, data_len_str, speed, eta)
                     cursor_in_new_line = False
                     self._hook_progress({
@@ -76,12 +79,12 @@ class RtmpFD(FileDownloader):
                         })
                     elif self.params.get('verbose', False):
                         if not cursor_in_new_line:
-                            self.to_screen(u'')
+                            self.to_screen('')
                         cursor_in_new_line = True
-                        self.to_screen(u'[rtmpdump] '+line)
+                        self.to_screen('[rtmpdump] '+line)
             proc.wait()
             if not cursor_in_new_line:
-                self.to_screen(u'')
+                self.to_screen('')
             return proc.returncode
 
         url = info_dict['url']
@@ -93,6 +96,7 @@ class RtmpFD(FileDownloader):
         flash_version = info_dict.get('flash_version', None)
         live = info_dict.get('rtmp_live', False)
         conn = info_dict.get('rtmp_conn', None)
+        protocol = info_dict.get('rtmp_protocol', None)
 
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
@@ -102,7 +106,7 @@ class RtmpFD(FileDownloader):
         try:
             subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
         except (OSError, IOError):
-            self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
+            self.report_error('RTMP download detected but "rtmpdump" could not be run')
             return False
 
         # Download using rtmpdump. rtmpdump returns exit code 2 when
@@ -125,9 +129,14 @@ class RtmpFD(FileDownloader):
             basic_args += ['--flashVer', flash_version]
         if live:
             basic_args += ['--live']
-        if conn:
+        if isinstance(conn, list):
+            for entry in conn:
+                basic_args += ['--conn', entry]
+        elif isinstance(conn, compat_str):
             basic_args += ['--conn', conn]
-        args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
+        if protocol is not None:
+            basic_args += ['--protocol', protocol]
+        args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)]
 
         if sys.platform == 'win32' and sys.version_info < (3, 0):
             # Windows subprocess module does not actually support Unicode
@@ -150,26 +159,35 @@ class RtmpFD(FileDownloader):
                 shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
             except ImportError:
                 shell_quote = repr
-            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
+            self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args))
+
+        RD_SUCCESS = 0
+        RD_FAILED = 1
+        RD_INCOMPLETE = 2
+        RD_NO_CONNECT = 3
 
         retval = run_rtmpdump(args)
 
-        while (retval == 2 or retval == 1) and not test:
+        if retval == RD_NO_CONNECT:
+            self.report_error('[rtmpdump] Could not connect to RTMP server.')
+            return False
+
+        while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
             prevsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
+            self.to_screen('[rtmpdump] %s bytes' % prevsize)
             time.sleep(5.0) # This seems to be needed
-            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
+            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED])
             cursize = os.path.getsize(encodeFilename(tmpfilename))
-            if prevsize == cursize and retval == 1:
+            if prevsize == cursize and retval == RD_FAILED:
                 break
              # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
-            if prevsize == cursize and retval == 2 and cursize > 1024:
-                self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
-                retval = 0
+            if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024:
+                self.to_screen('[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
+                retval = RD_SUCCESS
                 break
-        if retval == 0 or (test and retval == 2):
+        if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE):
             fsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen(u'[rtmpdump] %s bytes' % fsize)
+            self.to_screen('[rtmpdump] %s bytes' % fsize)
             self.try_rename(tmpfilename, filename)
             self._hook_progress({
                 'downloaded_bytes': fsize,
@@ -179,6 +197,6 @@ class RtmpFD(FileDownloader):
             })
             return True
         else:
-            self.to_stderr(u"\n")
-            self.report_error(u'rtmpdump exited with code %d' % retval)
+            self.to_stderr('\n')
+            self.report_error('rtmpdump exited with code %d' % retval)
             return False
index 72537188326160e82085a17faf7c4e805fd72532..15a42ce44246f708a6b06027c366a4b41eab5c7b 100644 (file)
@@ -1,6 +1,8 @@
 from .academicearth import AcademicEarthCourseIE
 from .addanime import AddAnimeIE
+from .aftonbladet import AftonbladetIE
 from .anitube import AnitubeIE
+from .aol import AolIE
 from .aparat import AparatIE
 from .appletrailers import AppleTrailersIE
 from .archiveorg import ArchiveOrgIE
@@ -9,29 +11,39 @@ from .arte import (
     ArteTvIE,
     ArteTVPlus7IE,
     ArteTVCreativeIE,
+    ArteTVConcertIE,
     ArteTVFutureIE,
     ArteTVDDCIE,
+    ArteTVEmbedIE,
 )
 from .auengine import AUEngineIE
 from .bambuser import BambuserIE, BambuserChannelIE
 from .bandcamp import BandcampIE, BandcampAlbumIE
 from .bbccouk import BBCCoUkIE
+from .bilibili import BiliBiliIE
 from .blinkx import BlinkxIE
 from .bliptv import BlipTVIE, BlipTVUserIE
 from .bloomberg import BloombergIE
+from .br import BRIE
 from .breakcom import BreakIE
 from .brightcove import BrightcoveIE
+from .byutv import BYUtvIE
 from .c56 import C56IE
+from .canal13cl import Canal13clIE
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
 from .cbs import CBSIE
+from .cbsnews import CBSNewsIE
+from .ceskatelevize import CeskaTelevizeIE
 from .channel9 import Channel9IE
 from .chilloutzone import ChilloutzoneIE
 from .cinemassacre import CinemassacreIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
+from .clubic import ClubicIE
 from .cmt import CMTIE
+from .cnet import CNETIE
 from .cnn import (
     CNNIE,
     CNNBlogsIE,
@@ -49,31 +61,36 @@ from .dailymotion import (
     DailymotionUserIE,
 )
 from .daum import DaumIE
-from .depositfiles import DepositFilesIE
 from .dotsub import DotsubIE
 from .dreisat import DreiSatIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
+from .divxstage import DivxStageIE
 from .dropbox import DropboxIE
 from .ebaumsworld import EbaumsWorldIE
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .eitb import EitbIE
 from .elpais import ElPaisIE
+from .empflix import EmpflixIE
+from .engadget import EngadgetIE
 from .escapist import EscapistIE
 from .everyonesmixtape import EveryonesMixtapeIE
 from .exfm import ExfmIE
 from .extremetube import ExtremeTubeIE
 from .facebook import FacebookIE
 from .faz import FazIE
+from .fc2 import FC2IE
 from .firstpost import FirstpostIE
 from .firsttv import FirstTVIE
+from .fivemin import FiveMinIE
 from .fktv import (
     FKTVIE,
     FKTVPosteckeIE,
 )
 from .flickr import FlickrIE
 from .fourtube import FourTubeIE
+from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
     PluzzIE,
@@ -88,15 +105,18 @@ from .funnyordie import FunnyOrDieIE
 from .gamekings import GamekingsIE
 from .gamespot import GameSpotIE
 from .gametrailers import GametrailersIE
+from .gdcvault import GDCVaultIE
 from .generic import GenericIE
 from .googleplus import GooglePlusIE
 from .googlesearch import GoogleSearchIE
 from .hark import HarkIE
 from .helsinki import HelsinkiIE
+from .hentaistigma import HentaiStigmaIE
 from .hotnewhiphop import HotNewHipHopIE
 from .howcast import HowcastIE
 from .huffpost import HuffPostIE
 from .hypem import HypemIE
+from .iconosquare import IconosquareIE
 from .ign import IGNIE, OneUPIE
 from .imdb import (
     ImdbIE,
@@ -104,7 +124,7 @@ from .imdb import (
 )
 from .ina import InaIE
 from .infoq import InfoQIE
-from .instagram import InstagramIE
+from .instagram import InstagramIE, InstagramUserIE
 from .internetvideoarchive import InternetVideoArchiveIE
 from .iprima import IPrimaIE
 from .ivi import (
@@ -122,6 +142,7 @@ from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
 from .keek import KeekIE
 from .kontrtube import KontrTubeIE
+from .ku6 import Ku6IE
 from .la7 import LA7IE
 from .lifenews import LifeNewsIE
 from .liveleak import LiveLeakIE
@@ -132,45 +153,67 @@ from .lynda import (
 )
 from .m6 import M6IE
 from .macgamestore import MacGameStoreIE
+from .mailru import MailRuIE
 from .malemotion import MalemotionIE
 from .mdr import MDRIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
-from .mit import TechTVMITIE, MITIE
+from .mit import TechTVMITIE, MITIE, OCWMITIE
 from .mixcloud import MixcloudIE
 from .mpora import MporaIE
 from .mofosex import MofosexIE
 from .mooshare import MooshareIE
+from .morningstar import MorningstarIE
+from .motorsport import MotorsportIE
+from .moviezine import MoviezineIE
+from .movshare import MovShareIE
 from .mtv import (
     MTVIE,
     MTVIggyIE,
 )
+from .musicplayon import MusicPlayOnIE
 from .muzu import MuzuTVIE
 from .myspace import MySpaceIE
 from .myspass import MySpassIE
 from .myvideo import MyVideoIE
 from .naver import NaverIE
 from .nba import NBAIE
-from .nbc import NBCNewsIE
+from .nbc import (
+    NBCIE,
+    NBCNewsIE,
+)
 from .ndr import NDRIE
 from .ndtv import NDTVIE
 from .newgrounds import NewgroundsIE
+from .newstube import NewstubeIE
 from .nfb import NFBIE
 from .nhl import NHLIE, NHLVideocenterIE
 from .niconico import NiconicoIE
 from .ninegag import NineGagIE
+from .noco import NocoIE
 from .normalboots import NormalbootsIE
-from .novamov import NovamovIE
+from .novamov import NovaMovIE
 from .nowness import NownessIE
 from .nowvideo import NowVideoIE
+from .nrk import (
+    NRKIE,
+    NRKTVIE,
+)
+from .ntv import NTVIE
+from .nytimes import NYTimesIE
+from .nuvid import NuvidIE
+from .oe1 import OE1IE
 from .ooyala import OoyalaIE
 from .orf import ORFIE
+from .parliamentliveuk import ParliamentLiveUKIE
 from .pbs import PBSIE
 from .photobucket import PhotobucketIE
+from .playvid import PlayvidIE
 from .podomatic import PodomaticIE
 from .pornhd import PornHdIE
 from .pornhub import PornHubIE
 from .pornotube import PornotubeIE
+from .prosiebensat1 import ProSiebenSat1IE
 from .pyvideo import PyvideoIE
 from .radiofrance import RadioFranceIE
 from .rbmaradio import RBMARadioIE
@@ -179,17 +222,23 @@ from .ringtv import RingTVIE
 from .ro220 import Ro220IE
 from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
+from .rtbf import RTBFIE
 from .rtlnow import RTLnowIE
+from .rts import RTSIE
+from .rtve import RTVEALaCartaIE
 from .rutube import (
     RutubeIE,
     RutubeChannelIE,
     RutubeMovieIE,
     RutubePersonIE,
 )
+from .rutv import RUTVIE
+from .savefrom import SaveFromIE
+from .scivee import SciVeeIE
 from .servingsys import ServingSysIE
 from .sina import SinaIE
-from .slashdot import SlashdotIE
 from .slideshare import SlideshareIE
+from .slutload import SlutloadIE
 from .smotri import (
     SmotriIE,
     SmotriCommunityIE,
@@ -197,7 +246,12 @@ from .smotri import (
     SmotriBroadcastIE,
 )
 from .sohu import SohuIE
-from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
+from .soundcloud import (
+    SoundcloudIE,
+    SoundcloudSetIE,
+    SoundcloudUserIE,
+    SoundcloudPlaylistIE
+)
 from .southparkstudios import (
     SouthParkStudiosIE,
     SouthparkDeIE,
@@ -205,41 +259,62 @@ from .southparkstudios import (
 from .space import SpaceIE
 from .spankwire import SpankwireIE
 from .spiegel import SpiegelIE
+from .spiegeltv import SpiegeltvIE
 from .spike import SpikeIE
 from .stanfordoc import StanfordOpenClassroomIE
-from .statigram import StatigramIE
 from .steam import SteamIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
+from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
 from .sztvhu import SztvHuIE
+from .tagesschau import TagesschauIE
+from .teachertube import (
+    TeacherTubeIE,
+    TeacherTubeClassroomIE,
+)
+from .teachingchannel import TeachingChannelIE
 from .teamcoco import TeamcocoIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
+from .testurl import TestURLIE
 from .tf1 import TF1IE
 from .theplatform import ThePlatformIE
 from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
+from .tlc import TlcIE, TlcDeIE
 from .toutv import TouTvIE
+from .toypics import ToypicsUserIE, ToypicsIE
 from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
+from .trutube import TruTubeIE
 from .tube8 import Tube8IE
 from .tudou import TudouIE
 from .tumblr import TumblrIE
 from .tutv import TutvIE
+from .tvigle import TvigleIE
 from .tvp import TvpIE
+from .udemy import (
+    UdemyIE,
+    UdemyCourseIE
+)
 from .unistra import UnistraIE
+from .urort import UrortIE
 from .ustream import UstreamIE, UstreamChannelIE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
 from .vesti import VestiIE
 from .vevo import VevoIE
-from .vice import ViceIE
+from .vh1 import VH1IE
 from .viddler import ViddlerIE
+from .videobam import VideoBamIE
 from .videodetective import VideoDetectiveIE
+from .videolecturesnet import VideoLecturesNetIE
 from .videofyme import VideofyMeIE
 from .videopremium import VideoPremiumIE
+from .videott import VideoTtIE
+from .videoweed import VideoWeedIE
 from .vimeo import (
     VimeoIE,
     VimeoChannelIE,
@@ -247,20 +322,32 @@ from .vimeo import (
     VimeoAlbumIE,
     VimeoGroupsIE,
     VimeoReviewIE,
+    VimeoWatchLaterIE,
+)
+from .vine import (
+    VineIE,
+    VineUserIE,
 )
-from .vine import VineIE
 from .viki import VikiIE
 from .vk import VKIE
 from .vube import VubeIE
+from .vuclip import VuClipIE
+from .washingtonpost import WashingtonPostIE
 from .wat import WatIE
+from .wdr import (
+    WDRIE,
+    WDRMobileIE,
+    WDRMausIE,
+)
 from .weibo import WeiboIE
 from .wimp import WimpIE
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
+from .xbef import XBefIE
 from .xhamster import XHamsterIE
 from .xnxx import XNXXIE
 from .xvideos import XVideosIE
-from .xtube import XTubeIE
+from .xtube import XTubeUserIE, XTubeIE
 from .yahoo import (
     YahooIE,
     YahooNewsIE,
@@ -271,19 +358,20 @@ from .youku import YoukuIE
 from .youporn import YouPornIE
 from .youtube import (
     YoutubeIE,
+    YoutubeChannelIE,
+    YoutubeFavouritesIE,
+    YoutubeHistoryIE,
     YoutubePlaylistIE,
-    YoutubeSearchIE,
+    YoutubeRecommendedIE,
     YoutubeSearchDateIE,
-    YoutubeUserIE,
-    YoutubeChannelIE,
+    YoutubeSearchIE,
+    YoutubeSearchURLIE,
     YoutubeShowIE,
     YoutubeSubscriptionsIE,
-    YoutubeRecommendedIE,
+    YoutubeTopListIE,
     YoutubeTruncatedURLIE,
+    YoutubeUserIE,
     YoutubeWatchLaterIE,
-    YoutubeFavouritesIE,
-    YoutubeHistoryIE,
-    YoutubeTopListIE,
 )
 from .zdf import ZDFIE
 
index 72f81d01a4976767033ea236eff4d8a5e2e43d33..59d3bbba413c3c256a3f77917708fb171e337b14 100644 (file)
@@ -5,7 +5,7 @@ from .common import InfoExtractor
 
 
 class AcademicEarthCourseIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P<id>[^?#/]+)'
+    _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P<id>[^?#/]+)'
     IE_NAME = 'AcademicEarth:Course'
 
     def _real_extract(self, url):
@@ -14,12 +14,12 @@ class AcademicEarthCourseIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
         title = self._html_search_regex(
-            r'<h1 class="playlist-name">(.*?)</h1>', webpage, u'title')
+            r'<h1 class="playlist-name"[^>]*?>(.*?)</h1>', webpage, u'title')
         description = self._html_search_regex(
-            r'<p class="excerpt">(.*?)</p>',
+            r'<p class="excerpt"[^>]*?>(.*?)</p>',
             webpage, u'description', fatal=False)
         urls = re.findall(
-            r'<h3 class="lecture-title"><a target="_blank" href="([^"]+)">',
+            r'<li class="lecture-preview">\s*?<a target="_blank" href="([^"]+)">',
             webpage)
         entries = [self.url_result(u) for u in urls]
 
index a3a1b999df25da791617c46a793843b2fd6ddc99..fcf296057cc807edbdca5ca1effbc9ad50153400 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -14,14 +16,14 @@ from ..utils import (
 class AddAnimeIE(InfoExtractor):
 
     _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P<video_id>[\w_]+)(?:.*)'
-    IE_NAME = u'AddAnime'
     _TEST = {
-        u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
-        u'file': u'24MR3YO5SAS9.mp4',
-        u'md5': u'72954ea10bc979ab5e2eb288b21425a0',
-        u'info_dict': {
-            u"description": u"One Piece 606",
-            u"title": u"One Piece 606"
+        'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
+        'md5': '72954ea10bc979ab5e2eb288b21425a0',
+        'info_dict': {
+            'id': '24MR3YO5SAS9',
+            'ext': 'mp4',
+            'description': 'One Piece 606',
+            'title': 'One Piece 606',
         }
     }
 
@@ -38,10 +40,10 @@ class AddAnimeIE(InfoExtractor):
             redir_webpage = ee.cause.read().decode('utf-8')
             action = self._search_regex(
                 r'<form id="challenge-form" action="([^"]+)"',
-                redir_webpage, u'Redirect form')
+                redir_webpage, 'Redirect form')
             vc = self._search_regex(
                 r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
-                redir_webpage, u'redirect vc value')
+                redir_webpage, 'redirect vc value')
             av = re.search(
                 r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
                 redir_webpage)
@@ -52,19 +54,19 @@ class AddAnimeIE(InfoExtractor):
             parsed_url = compat_urllib_parse_urlparse(url)
             av_val = av_res + len(parsed_url.netloc)
             confirm_url = (
-                parsed_url.scheme + u'://' + parsed_url.netloc +
+                parsed_url.scheme + '://' + parsed_url.netloc +
                 action + '?' +
                 compat_urllib_parse.urlencode({
                     'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
             self._download_webpage(
                 confirm_url, video_id,
-                note=u'Confirming after redirect')
+                note='Confirming after redirect')
             webpage = self._download_webpage(url, video_id)
 
         formats = []
         for format_id in ('normal', 'hq'):
             rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
-            video_url = self._search_regex(rex, webpage, u'video file URLx',
+            video_url = self._search_regex(rex, webpage, 'video file URLx',
                                            fatal=False)
             if not video_url:
                 continue
@@ -72,14 +74,13 @@ class AddAnimeIE(InfoExtractor):
                 'format_id': format_id,
                 'url': video_url,
             })
-        if not formats:
-            raise ExtractorError(u'Cannot find any video format!')
+        self._sort_formats(formats)
         video_title = self._og_search_title(webpage)
         video_description = self._og_search_description(webpage)
 
         return {
             '_type': 'video',
-            'id':  video_id,
+            'id': video_id,
             'formats': formats,
             'title': video_title,
             'description': video_description
diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py
new file mode 100644 (file)
index 0000000..cfc7370
--- /dev/null
@@ -0,0 +1,66 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class AftonbladetIE(InfoExtractor):
+    _VALID_URL = r'^http://tv\.aftonbladet\.se/webbtv.+?(?P<video_id>article[0-9]+)\.ab(?:$|[?#])'
+    _TEST = {
+        'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab',
+        'info_dict': {
+            'id': 'article36015',
+            'ext': 'mp4',
+            'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna',
+            'description': 'Jupiters måne mest aktiv av alla himlakroppar',
+            'timestamp': 1394142732,
+            'upload_date': '20140306',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.search(self._VALID_URL, url)
+
+        video_id = mobj.group('video_id')
+        webpage = self._download_webpage(url, video_id)
+
+        # find internal video meta data
+        meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json'
+        internal_meta_id = self._html_search_regex(
+            r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id')
+        internal_meta_url = meta_url % internal_meta_id
+        internal_meta_json = self._download_json(
+            internal_meta_url, video_id, 'Downloading video meta data')
+
+        # find internal video formats
+        format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s'
+        internal_video_id = internal_meta_json['videoId']
+        internal_formats_url = format_url % internal_video_id
+        internal_formats_json = self._download_json(
+            internal_formats_url, video_id, 'Downloading video formats')
+
+        formats = []
+        for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']:
+            p = fmt['paths'][0]
+            formats.append({
+                'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']),
+                'ext': 'mp4',
+                'width': fmt['width'],
+                'height': fmt['height'],
+                'tbr': fmt['bitrate'],
+                'protocol': 'http',
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': internal_meta_json['title'],
+            'formats': formats,
+            'thumbnail': internal_meta_json['imageUrl'],
+            'description': internal_meta_json['shortPreamble'],
+            'timestamp': internal_meta_json['timePublished'],
+            'duration': internal_meta_json['duration'],
+            'view_count': internal_meta_json['views'],
+        }
diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py
new file mode 100644 (file)
index 0000000..a7bfe5a
--- /dev/null
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .fivemin import FiveMinIE
+
+
+class AolIE(InfoExtractor):
+    IE_NAME = 'on.aol.com'
+    _VALID_URL = r'''(?x)
+        (?:
+            aol-video:|
+            http://on\.aol\.com/
+            (?:
+                video/.*-|
+                playlist/(?P<playlist_display_id>[^/?#]+?)-(?P<playlist_id>[0-9]+)[?#].*_videoid=
+            )
+        )
+        (?P<id>[0-9]+)
+        (?:$|\?)
+    '''
+
+    _TEST = {
+        'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img',
+        'md5': '18ef68f48740e86ae94b98da815eec42',
+        'info_dict': {
+            'id': '518167793',
+            'ext': 'mp4',
+            'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam',
+        },
+        'add_ie': ['FiveMin'],
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        playlist_id = mobj.group('playlist_id')
+        if playlist_id and not self._downloader.params.get('noplaylist'):
+            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+            webpage = self._download_webpage(url, playlist_id)
+            title = self._html_search_regex(
+                r'<h1 class="video-title[^"]*">(.+?)</h1>', webpage, 'title')
+            playlist_html = self._search_regex(
+                r"(?s)<ul\s+class='video-related[^']*'>(.*?)</ul>", webpage,
+                'playlist HTML')
+            entries = [{
+                '_type': 'url',
+                'url': 'aol-video:%s' % m.group('id'),
+                'ie_key': 'Aol',
+            } for m in re.finditer(
+                r"<a\s+href='.*videoid=(?P<id>[0-9]+)'\s+class='video-thumb'>",
+                playlist_html)]
+
+            return {
+                '_type': 'playlist',
+                'id': playlist_id,
+                'display_id': mobj.group('playlist_display_id'),
+                'title': title,
+                'entries': entries,
+            }
+
+        return FiveMinIE._build_result(video_id)
index 922cede056690bac963cdb2f896eb7b9254680af..dc8657b67c9850c1676af737f319cb4c06bad6d6 100644 (file)
@@ -6,7 +6,6 @@ import json
 from .common import InfoExtractor
 from ..utils import (
     compat_urlparse,
-    determine_ext,
 )
 
 
@@ -16,9 +15,10 @@ class AppleTrailersIE(InfoExtractor):
         "url": "http://trailers.apple.com/trailers/wb/manofsteel/",
         "playlist": [
             {
-                "file": "manofsteel-trailer4.mov",
                 "md5": "d97a8e575432dbcb81b7c3acb741f8a8",
                 "info_dict": {
+                    "id": "manofsteel-trailer4",
+                    "ext": "mov",
                     "duration": 111,
                     "title": "Trailer 4",
                     "upload_date": "20130523",
@@ -26,9 +26,10 @@ class AppleTrailersIE(InfoExtractor):
                 },
             },
             {
-                "file": "manofsteel-trailer3.mov",
                 "md5": "b8017b7131b721fb4e8d6f49e1df908c",
                 "info_dict": {
+                    "id": "manofsteel-trailer3",
+                    "ext": "mov",
                     "duration": 182,
                     "title": "Trailer 3",
                     "upload_date": "20130417",
@@ -36,9 +37,10 @@ class AppleTrailersIE(InfoExtractor):
                 },
             },
             {
-                "file": "manofsteel-trailer.mov",
                 "md5": "d0f1e1150989b9924679b441f3404d48",
                 "info_dict": {
+                    "id": "manofsteel-trailer",
+                    "ext": "mov",
                     "duration": 148,
                     "title": "Trailer",
                     "upload_date": "20121212",
@@ -46,15 +48,16 @@ class AppleTrailersIE(InfoExtractor):
                 },
             },
             {
-                "file": "manofsteel-teaser.mov",
                 "md5": "5fe08795b943eb2e757fa95cb6def1cb",
                 "info_dict": {
+                    "id": "manofsteel-teaser",
+                    "ext": "mov",
                     "duration": 93,
                     "title": "Teaser",
                     "upload_date": "20120721",
                     "uploader_id": "wb",
                 },
-            }
+            },
         ]
     }
 
@@ -65,16 +68,16 @@ class AppleTrailersIE(InfoExtractor):
         movie = mobj.group('movie')
         uploader_id = mobj.group('company')
 
-        playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
+        playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
         def fix_html(s):
-            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
             s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
             # The ' in the onClick attributes are not escaped, it couldn't be parsed
             # like: http://trailers.apple.com/trailers/wb/gravity/
             def _clean_json(m):
-                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+                return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
             s = re.sub(self._JSON_RE, _clean_json, s)
-            s = u'<html>' + s + u'</html>'
+            s = '<html>' + s + u'</html>'
             return s
         doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 
@@ -82,7 +85,7 @@ class AppleTrailersIE(InfoExtractor):
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']
             trailer_info_json = self._search_regex(self._JSON_RE,
-                on_click, u'trailer info')
+                on_click, 'trailer info')
             trailer_info = json.loads(trailer_info_json)
             title = trailer_info['title']
             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@@ -98,8 +101,7 @@ class AppleTrailersIE(InfoExtractor):
             first_url = trailer_info['url']
             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
-            settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
-            settings = json.loads(settings_json)
+            settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 
             formats = []
             for format in settings['metadata']['sizes']:
@@ -107,7 +109,6 @@ class AppleTrailersIE(InfoExtractor):
                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
                 formats.append({
                     'url': format_url,
-                    'ext': determine_ext(format_url),
                     'format': format['type'],
                     'width': format['width'],
                     'height': int(format['height']),
index b88f71bc40b9803fb4ed0dea134738a7e1e07201..c6d22c029ef1c8dcdef44df172fe3e9391fea6eb 100644 (file)
@@ -38,15 +38,19 @@ class ARDIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         title = self._html_search_regex(
-            r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>', webpage, 'title')
+            [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
+             r'<meta name="dcterms.title" content="(.*?)"/>',
+             r'<h4 class="headline">(.*?)</h4>'],
+            webpage, 'title')
         description = self._html_search_meta(
             'dcterms.abstract', webpage, 'description')
         thumbnail = self._og_search_thumbnail(webpage)
 
-        streams = [
-            mo.groupdict()
-            for mo in re.finditer(
-                r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)', webpage)]
+
+        media_info = self._download_json(
+            'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
+        # The second element of the _mediaArray contains the standard http urls
+        streams = media_info['_mediaArray'][1]['_mediaStreamArray']
         if not streams:
             if '"fsk"' in webpage:
                 raise ExtractorError('This video is only available after 20:00')
@@ -54,21 +58,12 @@ class ARDIE(InfoExtractor):
         formats = []
         for s in streams:
             format = {
-                'quality': int(s['quality']),
+                'quality': s['_quality'],
+                'url': s['_stream'],
             }
-            if s.get('rtmp_url'):
-                format['protocol'] = 'rtmp'
-                format['url'] = s['rtmp_url']
-                format['playpath'] = s['video_url']
-            else:
-                format['url'] = s['video_url']
-
-            quality_name = self._search_regex(
-                r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'],
-                'quality name', default='NA')
-            format['format_id'] = '%s-%s-%s-%s' % (
-                determine_ext(format['url']), quality_name, s['media_type'],
-                s['quality'])
+
+            format['format_id'] = '%s-%s' % (
+                determine_ext(format['url']), format['quality'])
 
             formats.append(format)
 
index 7cf3785ac687a4fae01a53589af26fb11054463d..b528a9ec50ca6c2dac1a52fe66de2cd66194dd23 100644 (file)
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import (
@@ -19,115 +18,46 @@ from ..utils import (
 # is different for each one. The videos usually expire in 7 days, so we can't
 # add tests.
 
-class ArteTvIE(InfoExtractor):
-    _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
-    _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)'
-    _LIVE_URL = r'index-[0-9]+\.html$'
 
+class ArteTvIE(InfoExtractor):
+    _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de)/.*-(?P<id>.*?)\.html'
     IE_NAME = 'arte.tv'
 
-    @classmethod
-    def suitable(cls, url):
-        return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL))
-
-    # TODO implement Live Stream
-    # from ..utils import compat_urllib_parse
-    # def extractLiveStream(self, url):
-    #     video_lang = url.split('/')[-4]
-    #     info = self.grep_webpage(
-    #         url,
-    #         r'src="(.*?/videothek_js.*?\.js)',
-    #         0,
-    #         [
-    #             (1, 'url', 'Invalid URL: %s' % url)
-    #         ]
-    #     )
-    #     http_host = url.split('/')[2]
-    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
-    #     info = self.grep_webpage(
-    #         next_url,
-    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
-    #             '(http://.*?\.swf).*?' +
-    #             '(rtmp://.*?)\'',
-    #         re.DOTALL,
-    #         [
-    #             (1, 'path',   'could not extract video path: %s' % url),
-    #             (2, 'player', 'could not extract video player: %s' % url),
-    #             (3, 'url',    'could not extract video url: %s' % url)
-    #         ]
-    #     )
-    #     video_url = '%s/%s' % (info.get('url'), info.get('path'))
-
     def _real_extract(self, url):
-        mobj = re.match(self._VIDEOS_URL, url)
-        if mobj is not None:
-            id = mobj.group('id')
-            lang = mobj.group('lang')
-            return self._extract_video(url, id, lang)
-
-        mobj = re.match(self._LIVEWEB_URL, url)
-        if mobj is not None:
-            name = mobj.group('name')
-            lang = mobj.group('lang')
-            return self._extract_liveweb(url, name, lang)
-
-        if re.search(self._LIVE_URL, url) is not None:
-            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
-            # self.extractLiveStream(url)
-            # return
-
-    def _extract_video(self, url, video_id, lang):
-        """Extract from videos.arte.tv"""
+        mobj = re.match(self._VALID_URL, url)
+        lang = mobj.group('lang')
+        video_id = mobj.group('id')
+
         ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
         ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
-        ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
+        ref_xml_doc = self._download_xml(
+            ref_xml_url, video_id, note='Downloading metadata')
         config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
         config_xml_url = config_node.attrib['ref']
-        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
-
-        video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
-        def _key(m):
-            quality = m.group('quality')
-            if quality == 'hd':
-                return 2
-            else:
-                return 1
-        # We pick the best quality
-        video_urls = sorted(video_urls, key=_key)
-        video_url = list(video_urls)[-1].group('url')
-        
-        title = self._html_search_regex(r'<name>(.*?)</name>', config_xml, 'title')
-        thumbnail = self._html_search_regex(r'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>',
-                                            config_xml, 'thumbnail')
-        return {'id': video_id,
-                'title': title,
-                'thumbnail': thumbnail,
-                'url': video_url,
-                'ext': 'flv',
-                }
-
-    def _extract_liveweb(self, url, name, lang):
-        """Extract form http://liveweb.arte.tv/"""
-        webpage = self._download_webpage(url, name)
-        video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id')
-        config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
-                                            video_id, 'Downloading information')
-        event_doc = config_doc.find('event')
-        url_node = event_doc.find('video').find('urlHd')
-        if url_node is None:
-            url_node = event_doc.find('urlSd')
-
-        return {'id': video_id,
-                'title': event_doc.find('name%s' % lang.capitalize()).text,
-                'url': url_node.text.replace('MP4', 'mp4'),
-                'ext': 'flv',
-                'thumbnail': self._og_search_thumbnail(webpage),
-                }
+        config = self._download_xml(
+            config_xml_url, video_id, note='Downloading configuration')
+
+        formats = [{
+            'forma_id': q.attrib['quality'],
+            'url': q.text,
+            'ext': 'flv',
+            'quality': 2 if q.attrib['quality'] == 'hd' else 1,
+        } for q in config.findall('./urls/url')]
+        self._sort_formats(formats)
+
+        title = config.find('.//name').text
+        thumbnail = config.find('.//firstThumbnailUrl').text
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
 
 
 class ArteTVPlus7IE(InfoExtractor):
     IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
 
     @classmethod
     def _extract_url_info(cls, url):
@@ -144,13 +74,12 @@ class ArteTVPlus7IE(InfoExtractor):
         return self._extract_from_webpage(webpage, video_id, lang)
 
     def _extract_from_webpage(self, webpage, video_id, lang):
-        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
+        json_url = self._html_search_regex(
+            r'arte_vp_url="(.*?)"', webpage, 'json vp url')
         return self._extract_from_json_url(json_url, video_id, lang)
 
     def _extract_from_json_url(self, json_url, video_id, lang):
-        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
-        self.report_extraction(video_id)
-        info = json.loads(json_info)
+        info = self._download_json(json_url, video_id)
         player_info = info['videoJsonPlayer']
 
         info_dict = {
@@ -172,6 +101,8 @@ class ArteTVPlus7IE(InfoExtractor):
                 l = 'F'
             elif lang == 'de':
                 l = 'A'
+            else:
+                l = lang
             regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
             return any(re.match(r, f['versionCode']) for r in regexes)
         # Some formats may not be in the same language as the url
@@ -190,14 +121,19 @@ class ArteTVPlus7IE(InfoExtractor):
                 return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality'])
         else:
             def sort_key(f):
+                versionCode = f.get('versionCode')
+                if versionCode is None:
+                    versionCode = ''
                 return (
                     # Sort first by quality
-                    int(f.get('height',-1)),
-                    int(f.get('bitrate',-1)),
+                    int(f.get('height', -1)),
+                    int(f.get('bitrate', -1)),
                     # The original version with subtitles has lower relevance
-                    re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None,
+                    re.match(r'VO-ST(F|A)', versionCode) is None,
                     # The version with sourds/mal subtitles has also lower relevance
-                    re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None,
+                    re.match(r'VO?(F|A)-STM\1', versionCode) is None,
+                    # Prefer http downloads over m3u8
+                    0 if f['url'].endswith('m3u8') else 1,
                 )
         formats = sorted(formats, key=sort_key)
         def _format(format_info):
@@ -238,8 +174,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE):
 
     _TEST = {
         'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design',
-        'file': '050489-002.mp4',
         'info_dict': {
+            'id': '050489-002',
+            'ext': 'mp4',
             'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design',
         },
     }
@@ -251,8 +188,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
 
     _TEST = {
         'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081',
-        'file': '050940-003.mp4',
         'info_dict': {
+            'id': '050940-003',
+            'ext': 'mp4',
             'title': 'Les champignons au secours de la planète',
         },
     }
@@ -266,7 +204,7 @@ class ArteTVFutureIE(ArteTVPlus7IE):
 
 class ArteTVDDCIE(ArteTVPlus7IE):
     IE_NAME = 'arte.tv:ddc'
-    _VALID_URL = r'http?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
+    _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>.+)'
 
     def _real_extract(self, url):
         video_id, lang = self._extract_url_info(url)
@@ -280,3 +218,39 @@ class ArteTVDDCIE(ArteTVPlus7IE):
         javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
         json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
         return self._extract_from_json_url(json_url, video_id, lang)
+
+
+class ArteTVConcertIE(ArteTVPlus7IE):
+    IE_NAME = 'arte.tv:concert'
+    _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>de|fr)/(?P<id>.+)'
+
+    _TEST = {
+        'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
+        'md5': '9ea035b7bd69696b67aa2ccaaa218161',
+        'info_dict': {
+            'id': '186',
+            'ext': 'mp4',
+            'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"',
+            'upload_date': '20140128',
+            'description': 'md5:486eb08f991552ade77439fe6d82c305',
+        },
+    }
+
+
+class ArteTVEmbedIE(ArteTVPlus7IE):
+    IE_NAME = 'arte.tv:embed'
+    _VALID_URL = r'''(?x)
+        http://www\.arte\.tv
+        /playerv2/embed\.php\?json_url=
+        (?P<json_url>
+            http://arte\.tv/papi/tvguide/videos/stream/player/
+            (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]*
+        )
+    '''
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        lang = mobj.group('lang')
+        json_url = mobj.group('json_url')
+        return self._extract_from_json_url(json_url, video_id, lang)
index c6f30e62616c09f50f8826d4d30cc6069cf3367f..20bf12550d4b4493982ca3ea6f31578368e31aba 100644 (file)
@@ -11,22 +11,24 @@ from ..utils import (
 
 
 class AUEngineIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?auengine\.com/embed\.php\?.*?file=(?P<id>[^&]+).*?'
+
     _TEST = {
         'url': 'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
-        'file': 'lfvlytY6.mp4',
         'md5': '48972bdbcf1a3a2f5533e62425b41d4f',
         'info_dict': {
+            'id': 'lfvlytY6',
+            'ext': 'mp4',
             'title': '[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]'
         }
     }
-    _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?'
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(1)
+        video_id = mobj.group('id')
+
         webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
-                webpage, 'title')
+        title = self._html_search_regex(r'<title>(?P<title>.+?)</title>', webpage, 'title')
         title = title.strip()
         links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
         links = map(compat_urllib_parse.unquote, links)
@@ -39,14 +41,15 @@ class AUEngineIE(InfoExtractor):
             elif '/videos/' in link:
                 video_url = link
         if not video_url:
-            raise ExtractorError(u'Could not find video URL')
+            raise ExtractorError('Could not find video URL')
         ext = '.' + determine_ext(video_url)
         if ext == title[-len(ext):]:
             title = title[:-len(ext)]
 
         return {
-            'id':        video_id,
-            'url':       video_url,
-            'title':     title,
+            'id': video_id,
+            'url': video_url,
+            'title': title,
             'thumbnail': thumbnail,
+            'http_referer': 'http://www.auengine.com/flowplayer/flowplayer.commercial-3.2.14.swf',
         }
index 886b0dfabb7537ebca8a13ce7e4d6d68b9966a4c..dcbbdef4346c36c789e49531df1dc602bc35255b 100644 (file)
@@ -12,14 +12,14 @@ from ..utils import (
 
 
 class BandcampIE(InfoExtractor):
-    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
+    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
     _TESTS = [{
         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'file': '1812978515.mp3',
         'md5': 'c557841d5e50261777a6585648adf439',
         'info_dict': {
             "title": "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
-            "duration": 10,
+            "duration": 9.8485,
         },
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }]
@@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         title = mobj.group('title')
         webpage = self._download_webpage(url, title)
-        # We get the link to the free download page
         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
-        if m_download is None:
+        if not m_download:
             m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
             if m_trackinfo:
                 json_code = m_trackinfo.group(1)
-                data = json.loads(json_code)
-                d = data[0]
+                data = json.loads(json_code)[0]
 
-                duration = int(round(d['duration']))
                 formats = []
-                for format_id, format_url in d['file'].items():
-                    ext, _, abr_str = format_id.partition('-')
-
+                for format_id, format_url in data['file'].items():
+                    ext, abr_str = format_id.split('-', 1)
                     formats.append({
                         'format_id': format_id,
                         'url': format_url,
-                        'ext': format_id.partition('-')[0],
+                        'ext': ext,
                         'vcodec': 'none',
-                        'acodec': format_id.partition('-')[0],
-                        'abr': int(format_id.partition('-')[2]),
+                        'acodec': ext,
+                        'abr': int(abr_str),
                     })
 
                 self._sort_formats(formats)
 
                 return {
-                    'id': compat_str(d['id']),
-                    'title': d['title'],
+                    'id': compat_str(data['id']),
+                    'title': data['title'],
                     'formats': formats,
-                    'duration': duration,
+                    'duration': float(data['duration']),
                 }
             else:
                 raise ExtractorError('No free songs found')
@@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor):
             r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
             webpage, re.MULTILINE | re.DOTALL).group('id')
 
-        download_webpage = self._download_webpage(download_link, video_id,
-                                                  'Downloading free downloads page')
-        # We get the dictionary of the track from some javascrip code
-        info = re.search(r'items: (.*?),$',
-                         download_webpage, re.MULTILINE).group(1)
+        download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page')
+        # We get the dictionary of the track from some javascript code
+        info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)
         info = json.loads(info)[0]
         # We pick mp3-320 for now, until format selection can be easily implemented.
         mp3_info = info['downloads']['mp3-320']
@@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor):
 
 class BandcampAlbumIE(InfoExtractor):
     IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))'
 
     _TEST = {
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -123,13 +117,15 @@ class BandcampAlbumIE(InfoExtractor):
         'params': {
             'playlistend': 2
         },
-        'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+        'skip': 'Bandcamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('subdomain')
         title = mobj.group('title')
-        webpage = self._download_webpage(url, title)
+        display_id = title or playlist_id
+        webpage = self._download_webpage(url, display_id)
         tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
         if not tracks_paths:
             raise ExtractorError('The page doesn\'t contain any tracks')
@@ -139,6 +135,8 @@ class BandcampAlbumIE(InfoExtractor):
         title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title')
         return {
             '_type': 'playlist',
+            'id': playlist_id,
+            'display_id': display_id,
             'title': title,
             'entries': entries,
         }
index 6d785c0bf08a778bd91dbe1cf956e6cf4caeb560..75e608f99de4ff3cc14234ab370f370b1ae83940 100644 (file)
@@ -13,13 +13,13 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
 
     _TESTS = [
         {
-            'url': 'http://www.bbc.co.uk/programmes/p01q7wz1',
+            'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
             'info_dict': {
-                'id': 'p01q7wz4',
+                'id': 'b039d07m',
                 'ext': 'flv',
-                'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix',
-                'description': 'Blu Mar Ten deliver a Guest Mix for Friction.',
-                'duration': 1936,
+                'title': 'Kaleidoscope: Leonard Cohen',
+                'description': 'md5:db4755d7a665ae72343779f7dacb402c',
+                'duration': 1740,
             },
             'params': {
                 # rtmp download
@@ -38,7 +38,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
             'params': {
                 # rtmp download
                 'skip_download': True,
-            }
+            },
+            'skip': 'Episode is no longer available on BBC iPlayer Radio',
         },
         {
             'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
@@ -161,6 +162,11 @@ class BBCCoUkIE(SubtitlesInfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         group_id = mobj.group('id')
 
+        webpage = self._download_webpage(url, group_id, 'Downloading video page')
+        if re.search(r'id="emp-error" class="notinuk">', webpage):
+            raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only',
+                expected=True)
+
         playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
             'Downloading playlist XML')
 
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
new file mode 100644 (file)
index 0000000..45067b9
--- /dev/null
@@ -0,0 +1,106 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_parse_qs,
+    ExtractorError,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class BiliBiliIE(InfoExtractor):
+    _VALID_URL = r'http://www\.bilibili\.tv/video/av(?P<id>[0-9]+)/'
+
+    _TEST = {
+        'url': 'http://www.bilibili.tv/video/av1074402/',
+        'md5': '2c301e4dab317596e837c3e7633e7d86',
+        'info_dict': {
+            'id': '1074402',
+            'ext': 'flv',
+            'title': '【金坷垃】金泡沫',
+            'duration': 308,
+            'upload_date': '20140420',
+            'thumbnail': 're:^https?://.+\.jpg',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        video_code = self._search_regex(
+            r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
+
+        title = self._html_search_meta(
+            'media:title', video_code, 'title', fatal=True)
+        duration_str = self._html_search_meta(
+            'duration', video_code, 'duration')
+        if duration_str is None:
+            duration = None
+        else:
+            duration_mobj = re.match(
+                r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$',
+                duration_str)
+            duration = (
+                int_or_none(duration_mobj.group('hours'), default=0) * 3600 +
+                int(duration_mobj.group('minutes')) * 60 +
+                int(duration_mobj.group('seconds')))
+        upload_date = unified_strdate(self._html_search_meta(
+            'uploadDate', video_code, fatal=False))
+        thumbnail = self._html_search_meta(
+            'thumbnailUrl', video_code, 'thumbnail', fatal=False)
+
+        player_params = compat_parse_qs(self._html_search_regex(
+            r'<iframe .*?class="player" src="https://secure.bilibili.tv/secure,([^"]+)"',
+            webpage, 'player params'))
+
+        if 'cid' in player_params:
+            cid = player_params['cid'][0]
+
+            lq_doc = self._download_xml(
+                'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid,
+                video_id,
+                note='Downloading LQ video info'
+            )
+            lq_durl = lq_doc.find('.//durl')
+            formats = [{
+                'format_id': 'lq',
+                'quality': 1,
+                'url': lq_durl.find('./url').text,
+                'filesize': int_or_none(
+                    lq_durl.find('./size'), get_attr='text'),
+            }]
+
+            hq_doc = self._download_xml(
+                'http://interface.bilibili.cn/playurl?cid=%s' % cid,
+                video_id,
+                note='Downloading HQ video info',
+                fatal=False,
+            )
+            if hq_doc is not False:
+                hq_durl = hq_doc.find('.//durl')
+                formats.append({
+                    'format_id': 'hq',
+                    'quality': 2,
+                    'ext': 'flv',
+                    'url': hq_durl.find('./url').text,
+                    'filesize': int_or_none(
+                        hq_durl.find('./size'), get_attr='text'),
+                })
+        else:
+            raise ExtractorError('Unsupported player parameters: %r' % (player_params,))
+
+        self._sort_formats(formats)
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'duration': duration,
+            'upload_date': upload_date,
+            'thumbnail': thumbnail,
+        }
index 96408e4e093ba6b27b4da54248d436d162d3c40c..38ccd957f3eb61a761950bb9a70cdbbeec6bea6d 100644 (file)
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-import datetime
 import json
 import re
 
@@ -19,15 +18,16 @@ class BlinkxIE(InfoExtractor):
         'file': '8aQUy7GV.mp4',
         'md5': '2e9a07364af40163a908edbf10bb2492',
         'info_dict': {
-            "title": "Police Car Rolls Away",
-            "uploader": "stupidvideos.com",
-            "upload_date": "20131215",
-            "description": "A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!",
-            "duration": 14.886,
-            "thumbnails": [{
-                "width": 100,
-                "height": 76,
-                "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg",
+            'title': 'Police Car Rolls Away',
+            'uploader': 'stupidvideos.com',
+            'upload_date': '20131215',
+            'timestamp': 1387068000,
+            'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!',
+            'duration': 14.886,
+            'thumbnails': [{
+                'width': 100,
+                'height': 76,
+                'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg',
             }],
         },
     }
@@ -41,9 +41,6 @@ class BlinkxIE(InfoExtractor):
                    'video=%s' % video_id)
         data_json = self._download_webpage(api_url, display_id)
         data = json.loads(data_json)['api']['results'][0]
-        dt = datetime.datetime.fromtimestamp(data['pubdate_epoch'])
-        pload_date = dt.strftime('%Y%m%d')
-
         duration = None
         thumbnails = []
         formats = []
@@ -64,10 +61,7 @@ class BlinkxIE(InfoExtractor):
                 vcodec = remove_start(m['vcodec'], 'ff')
                 acodec = remove_start(m['acodec'], 'ff')
                 tbr = (int(m['vbr']) + int(m['abr'])) // 1000
-                format_id = (u'%s-%sk-%s' %
-                             (vcodec,
-                              tbr,
-                              m['w']))
+                format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w'])
                 formats.append({
                     'format_id': format_id,
                     'url': m['link'],
@@ -88,7 +82,7 @@ class BlinkxIE(InfoExtractor):
             'title': data['title'],
             'formats': formats,
             'uploader': data['channel_name'],
-            'upload_date': pload_date,
+            'timestamp': data['pubdate_epoch'],
             'description': data.get('description'),
             'thumbnails': thumbnails,
             'duration': duration,
index a26001bb37401d8f0b92dcc5a4ef130be31b3e5e..d4da08991d937823fefba19da6a2e64a705e9434 100644 (file)
 from __future__ import unicode_literals
 
-import datetime
 import re
 
 from .common import InfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
-    compat_str,
     compat_urllib_request,
-
     unescapeHTML,
+    parse_iso8601,
+    compat_urlparse,
+    clean_html,
+    compat_str,
 )
 
 
 class BlipTVIE(SubtitlesInfoExtractor):
-    """Information extractor for blip.tv"""
-
-    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(?P<presumptive_id>.+)$'
-
-    _TESTS = [{
-        'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
-        'md5': 'c6934ad0b6acf2bd920720ec888eb812',
-        'info_dict': {
-            'id': '5779306',
-            'ext': 'mov',
-            'upload_date': '20111205',
-            'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
-            'uploader': 'Comic Book Resources - CBR TV',
-            'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
-        }
-    }, {
-        # https://github.com/rg3/youtube-dl/pull/2274
-        'note': 'Video with subtitles',
-        'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
-        'md5': '309f9d25b820b086ca163ffac8031806',
-        'info_dict': {
-            'id': '6586561',
-            'ext': 'mp4',
-            'uploader': 'Red vs. Blue',
-            'description': 'One-Zero-One',
-            'upload_date': '20130614',
-            'title': 'Red vs. Blue Season 11 Episode 1',
+    _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z]+)))'
+
+    _TESTS = [
+        {
+            'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
+            'md5': 'c6934ad0b6acf2bd920720ec888eb812',
+            'info_dict': {
+                'id': '5779306',
+                'ext': 'mov',
+                'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3',
+                'description': 'md5:9bc31f227219cde65e47eeec8d2dc596',
+                'timestamp': 1323138843,
+                'upload_date': '20111206',
+                'uploader': 'cbr',
+                'uploader_id': '679425',
+                'duration': 81,
+            }
+        },
+        {
+            # https://github.com/rg3/youtube-dl/pull/2274
+            'note': 'Video with subtitles',
+            'url': 'http://blip.tv/play/h6Uag5OEVgI.html',
+            'md5': '309f9d25b820b086ca163ffac8031806',
+            'info_dict': {
+                'id': '6586561',
+                'ext': 'mp4',
+                'title': 'Red vs. Blue Season 11 Episode 1',
+                'description': 'One-Zero-One',
+                'timestamp': 1371261608,
+                'upload_date': '20130615',
+                'uploader': 'redvsblue',
+                'uploader_id': '792887',
+                'duration': 279,
+            }
         }
-    }]
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        presumptive_id = mobj.group('presumptive_id')
+        lookup_id = mobj.group('lookup_id')
 
         # See https://github.com/rg3/youtube-dl/issues/857
-        embed_mobj = re.match(r'https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url)
-        if embed_mobj:
-            info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1)
-            info_page = self._download_webpage(info_url, embed_mobj.group(1))
-            video_id = self._search_regex(
-                r'data-episode-id="([0-9]+)', info_page, 'video_id')
-            return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV')
-        
-        cchar = '&' if '?' in url else '?'
-        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
-        request = compat_urllib_request.Request(json_url)
-        request.add_header('User-Agent', 'iTunes/10.6.1')
-
-        json_data = self._download_json(request, video_id=presumptive_id)
-
-        if 'Post' in json_data:
-            data = json_data['Post']
+        if lookup_id:
+            info_page = self._download_webpage(
+                'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id')
+            video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id')
         else:
-            data = json_data
+            video_id = mobj.group('id')
+
+        rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS')
+
+        def blip(s):
+            return '{http://blip.tv/dtd/blip/1.0}%s' % s
+
+        def media(s):
+            return '{http://search.yahoo.com/mrss/}%s' % s
+
+        def itunes(s):
+            return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s
+
+        item = rss.find('channel/item')
+
+        video_id = item.find(blip('item_id')).text
+        title = item.find('./title').text
+        description = clean_html(compat_str(item.find(blip('puredescription')).text))
+        timestamp = parse_iso8601(item.find(blip('datestamp')).text)
+        uploader = item.find(blip('user')).text
+        uploader_id = item.find(blip('userid')).text
+        duration = int(item.find(blip('runtime')).text)
+        media_thumbnail = item.find(media('thumbnail'))
+        thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text
+        categories = [category.text for category in item.findall('category')]
 
-        video_id = compat_str(data['item_id'])
-        upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
-        subtitles = {}
         formats = []
-        if 'additionalMedia' in data:
-            for f in data['additionalMedia']:
-                if f.get('file_type_srt') == 1:
-                    LANGS = {
-                        'english': 'en',
-                    }
-                    lang = f['role'].rpartition('-')[-1].strip().lower()
-                    langcode = LANGS.get(lang, lang)
-                    subtitles[langcode] = f['url']
-                    continue
-                if not int(f['media_width']):  # filter m3u8
-                    continue
+        subtitles = {}
+
+        media_group = item.find(media('group'))
+        for media_content in media_group.findall(media('content')):
+            url = media_content.get('url')
+            role = media_content.get(blip('role'))
+            msg = self._download_webpage(
+                url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url',
+                video_id, 'Resolving URL for %s' % role)
+            real_url = compat_urlparse.parse_qs(msg)['message'][0]
+
+            media_type = media_content.get('type')
+            if media_type == 'text/srt' or url.endswith('.srt'):
+                LANGS = {
+                    'english': 'en',
+                }
+                lang = role.rpartition('-')[-1].strip().lower()
+                langcode = LANGS.get(lang, lang)
+                subtitles[langcode] = url
+            elif media_type.startswith('video/'):
                 formats.append({
-                    'url': f['url'],
-                    'format_id': f['role'],
-                    'width': int(f['media_width']),
-                    'height': int(f['media_height']),
+                    'url': real_url,
+                    'format_id': role,
+                    'format_note': media_type,
+                    'vcodec': media_content.get(blip('vcodec')),
+                    'acodec': media_content.get(blip('acodec')),
+                    'filesize': media_content.get('filesize'),
+                    'width': int(media_content.get('width')),
+                    'height': int(media_content.get('height')),
                 })
-        else:
-            formats.append({
-                'url': data['media']['url'],
-                'width': int(data['media']['width']),
-                'height': int(data['media']['height']),
-            })
         self._sort_formats(formats)
 
         # subtitles
@@ -107,12 +129,14 @@ class BlipTVIE(SubtitlesInfoExtractor):
 
         return {
             'id': video_id,
-            'uploader': data['display_name'],
-            'upload_date': upload_date,
-            'title': data['title'],
-            'thumbnail': data['thumbnailUrl'],
-            'description': data['description'],
-            'user_agent': 'iTunes/10.6.1',
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'categories': categories,
             'formats': formats,
             'subtitles': video_subtitles,
         }
index 2415ce4030521940a066af58796e438f0adc955a..25fb79e146b18f50962ba506d01560fbd845dbf2 100644 (file)
@@ -1,22 +1,21 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
-from .ooyala import OoyalaIE
 
 
 class BloombergIE(InfoExtractor):
     _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html'
 
     _TEST = {
-        u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
-        u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4',
-        u'info_dict': {
-            u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies',
-            u'description': u'md5:abc86e5236f9f0e4866c59ad36736686',
-        },
-        u'params': {
-            # Requires ffmpeg (m3u8 manifest)
-            u'skip_download': True,
+        'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html',
+        'md5': '7bf08858ff7c203c870e8a6190e221e5',
+        'info_dict': {
+            'id': 'qurhIVlJSB6hzkVi229d8g',
+            'ext': 'flv',
+            'title': 'Shah\'s Presentation on Foreign-Exchange Strategies',
+            'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88',
         },
     }
 
@@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         name = mobj.group('name')
         webpage = self._download_webpage(url, name)
-        embed_code = self._search_regex(
-            r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage,
-            'embed code')
-        return OoyalaIE._build_url_result(embed_code)
+        f4m_url = self._search_regex(
+            r'<source src="(https?://[^"]+\.f4m.*?)"', webpage,
+            'f4m url')
+        title = re.sub(': Video$', '', self._og_search_title(webpage))
+
+        return {
+            'id': name.split('-')[-1],
+            'title': title,
+            'url': f4m_url,
+            'ext': 'flv',
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
new file mode 100644 (file)
index 0000000..b5b56ff
--- /dev/null
@@ -0,0 +1,139 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class BRIE(InfoExtractor):
+    IE_DESC = 'Bayerischer Rundfunk Mediathek'
+    _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-]+/)+(?P<id>[a-z0-9\-]+)\.html'
+    _BASE_URL = 'http://www.br.de'
+
+    _TESTS = [
+        {
+            'url': 'http://www.br.de/mediathek/video/anselm-gruen-114.html',
+            'md5': 'c4f83cf0f023ba5875aba0bf46860df2',
+            'info_dict': {
+                'id': '2c8d81c5-6fb7-4a74-88d4-e768e5856532',
+                'ext': 'mp4',
+                'title': 'Feiern und Verzichten',
+                'description': 'Anselm Grün: Feiern und Verzichten',
+                'uploader': 'BR/Birgit Baier',
+                'upload_date': '20140301',
+            }
+        },
+        {
+            'url': 'http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html',
+            'md5': 'ab451b09d861dbed7d7cc9ab0be19ebe',
+            'info_dict': {
+                'id': '2c060e69-3a27-4e13-b0f0-668fac17d812',
+                'ext': 'mp4',
+                'title': 'Über den Pass',
+                'description': 'Die Eroberung der Alpen: Über den Pass',
+            }
+        },
+        {
+            'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html',
+            'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820',
+            'info_dict': {
+                'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab',
+                'ext': 'aac',
+                'title': '"Keine neuen Schulden im nächsten Jahr"',
+                'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"',
+            }
+        },
+        {
+            'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html',
+            'md5': 'dbab0aef2e047060ea7a21fc1ce1078a',
+            'info_dict': {
+                'id': '6ba73750-d405-45d3-861d-1ce8c524e059',
+                'ext': 'mp4',
+                'title': 'Umweltbewusster Häuslebauer',
+                'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer',
+            }
+        },
+        {
+            'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html',
+            'md5': '23bca295f1650d698f94fc570977dae3',
+            'info_dict': {
+                'id': 'd982c9ce-8648-4753-b358-98abb8aec43d',
+                'ext': 'mp4',
+                'title': 'Folge 1 - Metaphysik',
+                'description': 'Kant für Anfänger: Folge 1 - Metaphysik',
+                'uploader': 'Eva Maria Steimle',
+                'upload_date': '20140117',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+        page = self._download_webpage(url, display_id)
+        xml_url = self._search_regex(
+            r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL')
+        xml = self._download_xml(self._BASE_URL + xml_url, None)
+
+        medias = []
+
+        for xml_media in xml.findall('video') + xml.findall('audio'):
+            media = {
+                'id': xml_media.get('externalId'),
+                'title': xml_media.find('title').text,
+                'formats': self._extract_formats(xml_media.find('assets')),
+                'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')),
+                'description': ' '.join(xml_media.find('shareTitle').text.splitlines()),
+                'webpage_url': xml_media.find('permalink').text
+            }
+            if xml_media.find('author').text:
+                media['uploader'] = xml_media.find('author').text
+            if xml_media.find('broadcastDate').text:
+                media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.')))
+            medias.append(media)
+
+        if len(medias) > 1:
+            self._downloader.report_warning(
+                'found multiple medias; please '
+                'report this with the video URL to http://yt-dl.org/bug')
+        if not medias:
+            raise ExtractorError('No media entries found')
+        return medias[0]
+
+    def _extract_formats(self, assets):
+
+        def text_or_none(asset, tag):
+            elem = asset.find(tag)
+            return None if elem is None else elem.text
+
+        formats = [{
+            'url': text_or_none(asset, 'downloadUrl'),
+            'ext': text_or_none(asset, 'mediaType'),
+            'format_id': asset.get('type'),
+            'width': int_or_none(text_or_none(asset, 'frameWidth')),
+            'height': int_or_none(text_or_none(asset, 'frameHeight')),
+            'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')),
+            'abr': int_or_none(text_or_none(asset, 'bitrateAudio')),
+            'vcodec': text_or_none(asset, 'codecVideo'),
+            'acodec': text_or_none(asset, 'codecAudio'),
+            'container': text_or_none(asset, 'mediaType'),
+            'filesize': int_or_none(text_or_none(asset, 'size')),
+        } for asset in assets.findall('asset')
+            if asset.find('downloadUrl') is not None]
+
+        self._sort_formats(formats)
+        return formats
+
+    def _extract_thumbnails(self, variants):
+        thumbnails = [{
+            'url': self._BASE_URL + variant.find('url').text,
+            'width': int_or_none(variant.find('width').text),
+            'height': int_or_none(variant.find('height').text),
+        } for variant in variants.findall('variant')]
+        thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True)
+        return thumbnails
index 8ec6dda490c9c886463502c795bce4b361b9c326..1bfc9f35bbd5c7c929c8f21a20f7b9642d00bcb2 100644 (file)
@@ -23,13 +23,14 @@ class BreakIE(InfoExtractor):
         video_id = mobj.group(1).split("-")[-1]
         embed_url = 'http://www.break.com/embed/%s' % video_id
         webpage = self._download_webpage(embed_url, video_id)
-        info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
-                                       'info json', flags=re.DOTALL)
+        info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>',
+            webpage, 'info json', flags=re.DOTALL)
         info = json.loads(info_json)
         video_url = info['videoUri']
-        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
-        if m_youtube is not None:
-            return self.url_result(m_youtube.group(1), 'Youtube')
+        youtube_id = info.get('youtubeId')
+        if youtube_id:
+            return self.url_result(youtube_id, 'Youtube')
+
         final_url = video_url + '?' + info['AuthToken']
         return {
             'id': video_id,
index 83eec84d3cd446b75854accd8dd8c2c754ba4349..3c02c297a58a32cf536e8ccc972dea68021f650b 100644 (file)
@@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor):
         object_str = object_str.replace('<--', '<!--')
         object_str = fix_xml_ampersands(object_str)
 
-        object_doc = xml.etree.ElementTree.fromstring(object_str)
+        object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8'))
 
         fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars')
         if fv_el is not None:
@@ -140,7 +140,11 @@ class BrightcoveIE(InfoExtractor):
 
         url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage)
         if url_m:
-            return [unescapeHTML(url_m.group(1))]
+            url = unescapeHTML(url_m.group(1))
+            # Some sites don't add it, we can't download with this url, for example:
+            # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/
+            if 'playerKey' in url:
+                return [url]
 
         matches = re.findall(
             r'''(?sx)<object
diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py
new file mode 100644 (file)
index 0000000..cf19b7b
--- /dev/null
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class BYUtvIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking',
+        'info_dict': {
+            'id': 'granite-flats-talking',
+            'ext': 'mp4',
+            'description': 'md5:4e9a7ce60f209a33eca0ac65b4918e1c',
+            'title': 'Talking',
+            'thumbnail': 're:^https?://.*promo.*'
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+        episode_code = self._search_regex(
+            r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information')
+        episode_json = re.sub(
+            r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code)
+        ep = json.loads(episode_json)
+
+        if ep['providerType'] == 'Ooyala':
+            return {
+                '_type': 'url_transparent',
+                'ie_key': 'Ooyala',
+                'url': 'ooyala:%s' % ep['providerId'],
+                'id': video_id,
+                'title': ep['title'],
+                'description': ep.get('description'),
+                'thumbnail': ep.get('imageThumbnail'),
+            }
+        else:
+            raise ExtractorError('Unsupported provider %s' % ep['provider'])
index 690bc7c25fe2574faa473b122e8427137599c3cd..cb96c3876b7cbf02220d06ad86a44414d69c9fa8 100644 (file)
@@ -2,39 +2,46 @@
 from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 
 
 class C56IE(InfoExtractor):
-    _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)'
+    _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)'
     IE_NAME = '56.com'
     _TEST = {
         'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html',
-        'file': '93440716.flv',
         'md5': 'e59995ac63d0457783ea05f93f12a866',
         'info_dict': {
+            'id': '93440716',
+            'ext': 'flv',
             'title': '网事知多少 第32期:车怒',
+            'duration': 283.813,
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
         text_id = mobj.group('textid')
-        info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id,
-                                           text_id, 'Downloading video info')
-        info = json.loads(info_page)['info']
-        formats = [{
-            'format_id': f['type'],
-            'filesize': int(f['filesize']),
-            'url': f['url']
-        } for f in info['rfiles']]
+
+        page = self._download_json(
+            'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info')
+
+        info = page['info']
+
+        formats = [
+            {
+                'format_id': f['type'],
+                'filesize': int(f['filesize']),
+                'url': f['url']
+            } for f in info['rfiles']
+        ]
         self._sort_formats(formats)
 
         return {
             'id': info['vid'],
             'title': info['Subject'],
+            'duration': int(info['duration']) / 1000.0,
             'formats': formats,
             'thumbnail': info.get('bimg') or info.get('img'),
         }
diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py
new file mode 100644 (file)
index 0000000..93241fe
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class Canal13clIE(InfoExtractor):
+    _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)'
+    _TEST = {
+        'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+        'md5': '4cb1fa38adcad8fea88487a078831755',
+        'info_dict': {
+            'id': '1403022125',
+            'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda',
+            'ext': 'mp4',
+            'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda',
+            'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._html_search_meta(
+            'twitter:title', webpage, 'title', fatal=True)
+        description = self._html_search_meta(
+            'twitter:description', webpage, 'description')
+        url = self._html_search_regex(
+            r'articuloVideo = \"(.*?)\"', webpage, 'url')
+        real_id = self._search_regex(
+            r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id)
+        thumbnail = self._html_search_regex(
+            r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail')
+
+        return {
+            'id': real_id,
+            'display_id': display_id,
+            'url': url,
+            'title': title,
+            'description': description,
+            'ext': 'mp4',
+            'thumbnail': thumbnail,
+        }
index 3d8d7f9d2dee4713b467e47ab79bbb55edccf147..c4fefefe43b250c13c3a711cf397e1a3caa046a7 100644 (file)
@@ -1,4 +1,6 @@
 # coding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -9,11 +11,12 @@ class Canalc2IE(InfoExtractor):
     _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
-        u'file': u'12163.mp4',
-        u'md5': u'060158428b650f896c542dfbb3d6487f',
-        u'info_dict': {
-            u'title': u'Terrasses du Numérique'
+        'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+        'md5': '060158428b650f896c542dfbb3d6487f',
+        'info_dict': {
+            'id': '12163',
+            'ext': 'mp4',
+            'title': 'Terrasses du Numérique'
         }
     }
 
@@ -28,10 +31,11 @@ class Canalc2IE(InfoExtractor):
         video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
 
         title = self._html_search_regex(
-            r'class="evenement8">(.*?)</a>', webpage, u'title')
-        
-        return {'id': video_id,
-                'ext': 'mp4',
-                'url': video_url,
-                'title': title,
-                }
+            r'class="evenement8">(.*?)</a>', webpage, 'title')
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'url': video_url,
+            'title': title,
+        }
index 7cdcd8399a8cabcd17ef7af8d89ba9052e9f8901..0202078b0cdcef31f300e1419eed6fc38fa7b424 100644 (file)
@@ -1,53 +1,72 @@
 # encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+    unified_strdate,
+    url_basename,
+)
 
 
 class CanalplusIE(InfoExtractor):
-    _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
+    _VALID_URL = r'https?://(?:www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))'
     _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
-    IE_NAME = u'canalplus.fr'
+    IE_NAME = 'canalplus.fr'
 
     _TEST = {
-        u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
-        u'file': u'922470.flv',
-        u'info_dict': {
-            u'title': u'Zapping - 26/08/13',
-            u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
-            u'upload_date': u'20130826',
-        },
-        u'params': {
-            u'skip_download': True,
+        'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470',
+        'md5': '3db39fb48b9685438ecf33a1078023e4',
+        'info_dict': {
+            'id': '922470',
+            'ext': 'flv',
+            'title': 'Zapping - 26/08/13',
+            'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013',
+            'upload_date': '20130826',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.groupdict().get('id')
+
+        # Beware, some subclasses do not define an id group
+        display_id = url_basename(mobj.group('path'))
+
         if video_id is None:
-            webpage = self._download_webpage(url, mobj.group('path'))
-            video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
+            webpage = self._download_webpage(url, display_id)
+            video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, 'video id')
+
         info_url = self._VIDEO_INFO_TEMPLATE % video_id
-        doc = self._download_xml(info_url,video_id, 
-                                           u'Downloading video info')
+        doc = self._download_xml(info_url, video_id, 'Downloading video XML')
 
-        self.report_extraction(video_id)
         video_info = [video for video in doc if video.find('ID').text == video_id][0]
-        infos = video_info.find('INFOS')
         media = video_info.find('MEDIA')
-        formats = [media.find('VIDEOS/%s' % format)
-            for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']]
-        video_url = [format.text for format in formats if format is not None][-1]
-
-        return {'id': video_id,
-                'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
-                                       infos.find('TITRAGE/SOUS_TITRE').text),
-                'url': video_url,
-                'ext': 'flv',
-                'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
-                'thumbnail': media.find('IMAGES/GRAND').text,
-                'description': infos.find('DESCRIPTION').text,
-                'view_count': int(infos.find('NB_VUES').text),
-                }
+        infos = video_info.find('INFOS')
+
+        preferences = ['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS']
+
+        formats = [
+            {
+                'url': fmt.text + '?hdcore=2.11.3' if fmt.tag == 'HDS' else fmt.text,
+                'format_id': fmt.tag,
+                'ext': 'mp4' if fmt.tag == 'HLS' else 'flv',
+                'preference': preferences.index(fmt.tag) if fmt.tag in preferences else -1,
+            } for fmt in media.find('VIDEOS') if fmt.text
+        ]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text,
+                                  infos.find('TITRAGE/SOUS_TITRE').text),
+            'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
+            'thumbnail': media.find('IMAGES/GRAND').text,
+            'description': infos.find('DESCRIPTION').text,
+            'view_count': int(infos.find('NB_VUES').text),
+            'like_count': int(infos.find('NB_LIKES').text),
+            'comment_count': int(infos.find('NB_COMMENTS').text),
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
new file mode 100644 (file)
index 0000000..0bce793
--- /dev/null
@@ -0,0 +1,87 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class CBSNewsIE(InfoExtractor):
+    IE_DESC = 'CBS News'
+    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/',
+            'info_dict': {
+                'id': 'tesla-and-spacex-elon-musks-industrial-empire',
+                'ext': 'flv',
+                'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire',
+                'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg',
+                'duration': 791,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
+            'info_dict': {
+                'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack',
+                'ext': 'flv',
+                'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
+                'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg',
+                'duration': 205,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_info = json.loads(self._html_search_regex(
+            r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
+            webpage, 'video JSON info'))
+
+        item = video_info['item'] if 'item' in video_info else video_info
+        title = item.get('articleTitle') or item.get('hed')
+        duration = item.get('duration')
+        thumbnail = item.get('mediaImage') or item.get('thumbnail')
+
+        formats = []
+        for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']:
+            uri = item.get('media' + format_id + 'URI')
+            if not uri:
+                continue
+            fmt = {
+                'url': uri,
+                'format_id': format_id,
+            }
+            if uri.startswith('rtmp'):
+                fmt.update({
+                    'app': 'ondemand?auth=cbs',
+                    'play_path': 'mp4:' + uri.split('<break>')[-1],
+                    'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf',
+                    'page_url': 'http://www.cbsnews.com',
+                    'ext': 'flv',
+                })
+            elif uri.endswith('.m3u8'):
+                fmt['ext'] = 'mp4'
+            formats.append(fmt)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
new file mode 100644 (file)
index 0000000..90a3ddd
--- /dev/null
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+    compat_urllib_parse,
+    compat_urllib_parse_urlparse,
+    ExtractorError,
+)
+
+
+class CeskaTelevizeIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka',
+            'info_dict': {
+                'id': '213512120230004',
+                'ext': 'flv',
+                'title': 'První republika: Španělská chřipka',
+                'duration': 3107.4,
+            },
+            'params': {
+                'skip_download': True,  # requires rtmpdump
+            },
+            'skip': 'Works only from Czech Republic.',
+        },
+        {
+            'url': 'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt',
+            'info_dict': {
+                'id': '20138143440',
+                'ext': 'flv',
+                'title': 'Tsatsiki, maminka a policajt',
+                'duration': 6754.1,
+            },
+            'params': {
+                'skip_download': True,  # requires rtmpdump
+            },
+            'skip': 'Works only from Czech Republic.',
+        },
+        {
+            'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina',
+            'info_dict': {
+                'id': '14716',
+                'ext': 'flv',
+                'title': 'První republika: Zpěvačka z Dupárny Bobina',
+                'duration': 90,
+            },
+            'params': {
+                'skip_download': True,  # requires rtmpdump
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
+
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.'
+        if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
+            raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
+
+        typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
+        episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
+
+        data = {
+            'playlist[0][type]': typ,
+            'playlist[0][id]': episode_id,
+            'requestUrl': compat_urllib_parse_urlparse(url).path,
+            'requestSource': 'iVysilani',
+        }
+
+        req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url',
+                                            data=compat_urllib_parse.urlencode(data))
+
+        req.add_header('Content-type', 'application/x-www-form-urlencoded')
+        req.add_header('x-addr', '127.0.0.1')
+        req.add_header('X-Requested-With', 'XMLHttpRequest')
+        req.add_header('Referer', url)
+
+        playlistpage = self._download_json(req, video_id)
+
+        req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url']))
+        req.add_header('Referer', url)
+
+        playlist = self._download_xml(req, video_id)
+        
+        formats = []
+        for i in playlist.find('smilRoot/body'):
+            if 'AD' not in i.attrib['id']:
+                base_url = i.attrib['base']
+                parsedurl = compat_urllib_parse_urlparse(base_url)
+                duration = i.attrib['duration']
+
+                for video in i.findall('video'):
+                    if video.attrib['label'] != 'AD':
+                        format_id = video.attrib['label']
+                        play_path = video.attrib['src']
+                        vbr = int(video.attrib['system-bitrate'])
+
+                        formats.append({
+                            'format_id': format_id,
+                            'url': base_url,
+                            'vbr': vbr,
+                            'play_path': play_path,
+                            'app': parsedurl.path[1:] + '?' + parsedurl.query,
+                            'rtmp_live': True,
+                            'ext': 'flv',
+                        })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': episode_id,
+            'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize</title>', webpage, 'title'),
+            'duration': float(duration),
+            'formats': formats,
+        }
index f0d08cebfce87b006b339508f655eba95a4bc1ef..496271be4e5f7170ad3d814ec5e2c0b99d15538d 100644 (file)
@@ -1,84 +1,94 @@
 # encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    int_or_none,
 )
 
 
 class CinemassacreIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?'
-    _TESTS = [{
-        u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
-        u'file': u'19911.flv',
-        u'info_dict': {
-            u'upload_date': u'20121110',
-            u'title': u'“Angry Video Game Nerd: The Movie” – Trailer',
-            u'description': u'md5:fb87405fcb42a331742a0dce2708560b',
-        },
-        u'params': {
-            # rtmp download
-            u'skip_download': True,
-        },
-    },
-    {
-        u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
-        u'file': u'521be8ef82b16.flv',
-        u'info_dict': {
-            u'upload_date': u'20131002',
-            u'title': u'The Mummy’s Hand (1940)',
-        },
-        u'params': {
-            # rtmp download
-            u'skip_download': True,
+    _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)'
+    _TESTS = [
+        {
+            'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/',
+            'md5': 'fde81fbafaee331785f58cd6c0d46190',
+            'info_dict': {
+                'id': '19911',
+                'ext': 'mp4',
+                'upload_date': '20121110',
+                'title': '“Angry Video Game Nerd: The Movie” – Trailer',
+                'description': 'md5:fb87405fcb42a331742a0dce2708560b',
+            },
         },
-    }]
+        {
+            'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940',
+            'md5': 'd72f10cd39eac4215048f62ab477a511',
+            'info_dict': {
+                'id': '521be8ef82b16',
+                'ext': 'mp4',
+                'upload_date': '20131002',
+                'title': 'The Mummy’s Hand (1940)',
+            },
+        }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('display_id')
 
-        webpage_url = u'http://' + mobj.group('url')
-        webpage = self._download_webpage(webpage_url, None) # Don't know video id yet
+        webpage = self._download_webpage(url, display_id)
         video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d')
         mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage)
         if not mobj:
-            raise ExtractorError(u'Can\'t extract embed url and video id')
-        playerdata_url = mobj.group(u'embed_url')
-        video_id = mobj.group(u'video_id')
+            raise ExtractorError('Can\'t extract embed url and video id')
+        playerdata_url = mobj.group('embed_url')
+        video_id = mobj.group('video_id')
 
-        video_title = self._html_search_regex(r'<title>(?P<title>.+?)\|',
-            webpage, u'title')
-        video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>',
-            webpage, u'description', flags=re.DOTALL, fatal=False)
-        if len(video_description) == 0:
-            video_description = None
+        video_title = self._html_search_regex(
+            r'<title>(?P<title>.+?)\|', webpage, 'title')
+        video_description = self._html_search_regex(
+            r'<div class="entry-content">(?P<description>.+?)</div>',
+            webpage, 'description', flags=re.DOTALL, fatal=False)
 
-        playerdata = self._download_webpage(playerdata_url, video_id)
-        url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url')
+        playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage')
+        video_thumbnail = self._search_regex(
+            r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False)
+        sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file')
+        videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url')
 
-        sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file')
-        hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file')
-        video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False)
+        videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML')
 
-        formats = [
-            {
-                'url': url,
-                'play_path': 'mp4:' + sd_file,
-                'rtmp_live': True, # workaround
-                'ext': 'flv',
-                'format': 'sd',
-                'format_id': 'sd',
-            },
-            {
-                'url': url,
-                'play_path': 'mp4:' + hd_file,
-                'rtmp_live': True, # workaround
-                'ext': 'flv',
-                'format': 'hd',
-                'format_id': 'hd',
-            },
-        ]
+        formats = []
+        baseurl = sd_url[:sd_url.rfind('/')+1]
+        for video in videolist.findall('.//video'):
+            src = video.get('src')
+            if not src:
+                continue
+            file_ = src.partition(':')[-1]
+            width = int_or_none(video.get('width'))
+            height = int_or_none(video.get('height'))
+            bitrate = int_or_none(video.get('system-bitrate'))
+            format = {
+                'url': baseurl + file_,
+                'format_id': src.rpartition('.')[0].rpartition('_')[-1],
+            }
+            if width or height:
+                format.update({
+                    'tbr': bitrate // 1000 if bitrate else None,
+                    'width': width,
+                    'height': height,
+                })
+            else:
+                format.update({
+                    'abr': bitrate // 1000 if bitrate else None,
+                    'vcodec': 'none',
+                })
+            formats.append(format)
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
index 43efb08bfc33accf5661bf0afa3d59aeb1bb0c0e..669919a2cc9039ffb91ae052b96d5531665341e0 100644 (file)
@@ -1,22 +1,28 @@
+from __future__ import unicode_literals
+
 import re
 import time
 import xml.etree.ElementTree
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    parse_duration,
+)
 
 
 class ClipfishIE(InfoExtractor):
-    IE_NAME = u'clipfish'
+    IE_NAME = 'clipfish'
 
     _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
     _TEST = {
-        u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
-        u'file': u'3966754.mp4',
-        u'md5': u'2521cd644e862936cf2e698206e47385',
-        u'info_dict': {
-            u'title': u'FIFA 14 - E3 2013 Trailer',
-            u'duration': 82,
+        'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+        'md5': '2521cd644e862936cf2e698206e47385',
+        'info_dict': {
+            'id': '3966754',
+            'ext': 'mp4',
+            'title': 'FIFA 14 - E3 2013 Trailer',
+            'duration': 82,
         },
         u'skip': 'Blocked in the US'
     }
@@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor):
         video_url = doc.find('filename').text
         if video_url is None:
             xml_bytes = xml.etree.ElementTree.tostring(doc)
-            raise ExtractorError(u'Cannot find video URL in document %r' %
+            raise ExtractorError('Cannot find video URL in document %r' %
                                  xml_bytes)
         thumbnail = doc.find('imageurl').text
-        duration_str = doc.find('duration').text
-        m = re.match(
-            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
-            duration_str)
-        if m:
-            duration = (
-                (int(m.group('hours')) * 60 * 60) +
-                (int(m.group('minutes')) * 60) +
-                (int(m.group('seconds')))
-            )
-        else:
-            duration = None
+        duration = parse_duration(doc.find('duration').text)
 
         return {
             'id': video_id,
index 9ab6a4ab69726c5c2a7ad0df7de5933f1f882d33..02a1667fa3fbf7cbe1a822db7b82f9c087864249 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor):
     _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
-        u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
-        u'info_dict': {
-            u'id': u'4629301',
-            u'ext': u'mp4',
-            u'title': u'Brick Briscoe',
-            u'duration': 612,
+        'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+        'md5': '4d7d549451bad625e0ff3d7bd56d776c',
+        'info_dict': {
+            'id': '4629301',
+            'ext': 'mp4',
+            'title': 'Brick Briscoe',
+            'duration': 612,
+            'thumbnail': 're:^https?://.+\.jpg',
         },
     }
 
@@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor):
         video_id = mobj.group('id')
         js_player = self._download_webpage(
             'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
-            video_id, u'Downlaoding player')
+            video_id, 'Downlaoding player')
         # it includes a required token
-        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
+        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars')
 
         pdoc = self._download_xml(
             'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
-            video_id, u'Downloading video info',
+            video_id, 'Downloading video info',
             transform_source=fix_xml_ampersands)
 
         track_doc = pdoc.find('trackList/track')
diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py
new file mode 100644 (file)
index 0000000..14f215c
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    qualities,
+)
+
+
+class ClubicIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html'
+
+    _TEST = {
+        'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html',
+        'md5': '1592b694ba586036efac1776b0b43cd3',
+        'info_dict': {
+            'id': '448474',
+            'ext': 'mp4',
+            'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité',
+            'description': 're:Gueule de bois chez Nokia. Le constructeur a indiqué cette.*',
+            'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id
+        player_page = self._download_webpage(player_url, video_id)
+
+        config_json = self._search_regex(
+            r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page,
+            'configuration')
+        config = json.loads(config_json)
+
+        video_info = config['videoInfo']
+        sources = config['sources']
+        quality_order = qualities(['sd', 'hq'])
+
+        formats = [{
+            'format_id': src['streamQuality'],
+            'url': src['src'],
+            'quality': quality_order(src['streamQuality']),
+        } for src in sources]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_info['title'],
+            'formats': formats,
+            'description': clean_html(video_info.get('description')),
+            'thumbnail': config.get('poster'),
+        }
index 88e0e9aba9150cea2ccac5a6ea4be06b9d0700ba..e96c59f718a5dc412a2ce7eaa962d6bdca98e187 100644 (file)
@@ -1,19 +1,19 @@
+from __future__ import unicode_literals
 from .mtv import MTVIE
 
+
 class CMTIE(MTVIE):
-    IE_NAME = u'cmt.com'
+    IE_NAME = 'cmt.com'
     _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml'
     _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/'
 
-    _TESTS = [
-        {
-            u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
-            u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2',
-            u'info_dict': {
-                u'id': u'989124',
-                u'ext': u'mp4',
-                u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
-                u'description': u'Blame It All On My Roots',
-            },
+    _TESTS = [{
+        'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061',
+        'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2',
+        'info_dict': {
+            'id': '989124',
+            'ext': 'mp4',
+            'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
+            'description': 'Blame It All On My Roots',
         },
-    ]
+    }]
diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py
new file mode 100644 (file)
index 0000000..a94f425
--- /dev/null
@@ -0,0 +1,75 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+)
+
+
+class CNETIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/'
+    _TEST = {
+        'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/',
+        'md5': '041233212a0d06b179c87cbcca1577b8',
+        'info_dict': {
+            'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60',
+            'ext': 'mp4',
+            'title': 'Hands-on with Microsoft Windows 8.1 Update',
+            'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.',
+            'thumbnail': 're:^http://.*/flmswindows8.jpg$',
+            'uploader_id': 'sarah.mitroff@cbsinteractive.com',
+            'uploader': 'Sarah Mitroff',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, display_id)
+        data_json = self._html_search_regex(
+            r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'",
+            webpage, 'data json')
+        data = json.loads(data_json)
+        vdata = data['video']
+        if not vdata:
+            vdata = data['videos'][0]
+        if not vdata:
+            raise ExtractorError('Cannot find video data')
+
+        video_id = vdata['id']
+        title = vdata['headline']
+        description = vdata.get('dek')
+        thumbnail = vdata.get('image', {}).get('path')
+        author = vdata.get('author')
+        if author:
+            uploader = '%s %s' % (author['firstName'], author['lastName'])
+            uploader_id = author.get('email')
+        else:
+            uploader = None
+            uploader_id = None
+
+        formats = [{
+            'format_id': '%s-%s-%s' % (
+                f['type'], f['format'],
+                int_or_none(f.get('bitrate'), 1000, default='')),
+            'url': f['uri'],
+            'tbr': int_or_none(f.get('bitrate'), 1000),
+        } for f in vdata['files']['data']]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'thumbnail': thumbnail,
+        }
index b32cb898010a0ad0e02e12f3b3a55c3769cc3979..dae40c136bae20fd54cae401e711b9233c750e14 100644 (file)
@@ -79,8 +79,11 @@ class CNNIE(InfoExtractor):
 
         self._sort_formats(formats)
 
-        thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')])
-        thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails]
+        thumbnails = [{
+            'height': int(t.attrib['height']),
+            'width': int(t.attrib['width']),
+            'url': t.text,
+        } for t in info.findall('images/image')]
 
         metas_el = info.find('metas')
         upload_date = (
@@ -93,8 +96,7 @@ class CNNIE(InfoExtractor):
             'id': info.attrib['id'],
             'title': info.find('headline').text,
             'formats': formats,
-            'thumbnail': thumbnails[-1][1],
-            'thumbnails': thumbs_dict,
+            'thumbnails': thumbnails,
             'description': info.find('description').text,
             'duration': duration,
             'upload_date': upload_date,
index 10c925dfe1c7d44d7d1f6abe6a72b79d12230abf..6f866e7fcee7f401362b24d69db2285e22cfa6a4 100644 (file)
@@ -17,8 +17,9 @@ class CollegeHumorIE(InfoExtractor):
             'id': '6902724',
             'ext': 'mp4',
             'title': 'Comic-Con Cosplay Catastrophe',
-            'description': 'Fans get creative this year',
+            'description': "Fans get creative this year at San Diego.  Too creative.  And yes, that's really Joss Whedon.",
             'age_limit': 13,
+            'duration': 187,
         },
     },
     {
@@ -28,22 +29,22 @@ class CollegeHumorIE(InfoExtractor):
             'id': '3505939',
             'ext': 'mp4',
             'title': 'Font Conference',
-            'description': 'This video wasn\'t long enough,',
+            'description': "This video wasn't long enough, so we made it double-spaced.",
             'age_limit': 10,
             'duration': 179,
         },
     },
     # embedded youtube video
     {
-        'url': 'http://www.collegehumor.com/embed/6950457',
+        'url': 'http://www.collegehumor.com/embed/6950306',
         'info_dict': {
-            'id': 'W5gMp3ZjYg4',
+            'id': 'Z-bao9fg6Yc',
             'ext': 'mp4',
-            'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]',
-            'uploader': 'Funnyplox TV',
-            'uploader_id': 'funnyploxtv',
-            'description': 'md5:7ded37421526d54afdf005e25bc2b7a3',
-            'upload_date': '20140128',
+            'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!',
+            'uploader': 'Mark Dice',
+            'uploader_id': 'MarkDice',
+            'description': 'md5:62c3dab9351fac7bb44b53b69511d87f',
+            'upload_date': '20140127',
         },
         'params': {
             'skip_download': True,
@@ -87,6 +88,7 @@ class CollegeHumorIE(InfoExtractor):
         self._sort_formats(formats)
 
         duration = int_or_none(vdata.get('duration'), 1000)
+        like_count = int_or_none(vdata.get('likes'))
 
         return {
             'id': video_id,
@@ -96,4 +98,5 @@ class CollegeHumorIE(InfoExtractor):
             'formats': formats,
             'age_limit': age_limit,
             'duration': duration,
+            'like_count': like_count,
         }
index ed3986f313a149f0db4a69dc92762730297ced1a..ba4d73ab8bf3ff893fdb2c07fc57f0cbc009ec44 100644 (file)
@@ -7,21 +7,21 @@ from .mtv import MTVServicesInfoExtractor
 from ..utils import (
     compat_str,
     compat_urllib_parse,
-
     ExtractorError,
+    float_or_none,
     unified_strdate,
 )
 
 
 class ComedyCentralIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/
+    _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/
         (video-clips|episodes|cc-studios|video-collections)
         /(?P<title>.*)'''
     _FEED_URL = 'http://comedycentral.com/feeds/mrss/'
 
     _TEST = {
         'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
-        'md5': '4167875aae411f903b751a21f357f1ee',
+        'md5': 'c4f48e9eda1b16dd10add0744344b6d8',
         'info_dict': {
             'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
             'ext': 'mp4',
@@ -32,31 +32,34 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
 
 
 class ComedyCentralShowsIE(InfoExtractor):
-    IE_DESC = 'The Daily Show / Colbert Report'
+    IE_DESC = 'The Daily Show / The Colbert Report'
     # urls can be abbreviations like :thedailyshow or :colbert
     # urls for episodes like:
     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
-    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
-                      |(https?://)?(www\.)?
-                          (?P<showname>thedailyshow|colbertnation)\.com/
-                         (full-episodes/(?P<episode>.*)|
+    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
+                      |https?://(:www\.)?
+                          (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
+                         ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
                           (?P<clip>
-                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
-                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
+                              (?:(?:guests/[^/]+|videos|video-playlists|special-editions)/[^/]+/(?P<videotitle>[^/?#]+))
+                              |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
+                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
+                          )|
                           (?P<interview>
-                              extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
-                     $"""
+                              extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?)))
+                     (?:[?#].*|$)'''
     _TEST = {
-        'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
-        'file': '422212.mp4',
+        'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
         'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
         'info_dict': {
-            "upload_date": "20121214",
-            "description": "Kristen Stewart",
-            "uploader": "thedailyshow",
-            "title": "thedailyshow-kristen-stewart part 1"
+            'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55',
+            'ext': 'mp4',
+            'upload_date': '20121213',
+            'description': 'Kristen Stewart learns to let loose in "On the Road."',
+            'uploader': 'thedailyshow',
+            'title': 'thedailyshow kristen-stewart part 1',
         }
     }
 
@@ -79,11 +82,6 @@ class ComedyCentralShowsIE(InfoExtractor):
         '400': (384, 216),
     }
 
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
-
     @staticmethod
     def _transform_rtmp_url(rtmp_video_url):
         m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url)
@@ -99,14 +97,16 @@ class ComedyCentralShowsIE(InfoExtractor):
 
         if mobj.group('shortname'):
             if mobj.group('shortname') in ('tds', 'thedailyshow'):
-                url = 'http://www.thedailyshow.com/full-episodes/'
+                url = 'http://thedailyshow.cc.com/full-episodes/'
             else:
-                url = 'http://www.colbertnation.com/full-episodes/'
+                url = 'http://thecolbertreport.cc.com/full-episodes/'
             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
             assert mobj is not None
 
         if mobj.group('clip'):
-            if mobj.group('showname') == 'thedailyshow':
+            if mobj.group('videotitle'):
+                epTitle = mobj.group('videotitle')
+            elif mobj.group('showname') == 'thedailyshow':
                 epTitle = mobj.group('tdstitle')
             else:
                 epTitle = mobj.group('cntitle')
@@ -120,9 +120,9 @@ class ComedyCentralShowsIE(InfoExtractor):
                 epTitle = mobj.group('showname')
             else:
                 epTitle = mobj.group('episode')
+        show_name = mobj.group('showname')
 
-        self.report_extraction(epTitle)
-        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
+        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
         if dlNewest:
             url = htmlHandle.geturl()
             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -130,71 +130,86 @@ class ComedyCentralShowsIE(InfoExtractor):
                 raise ExtractorError('Invalid redirected URL: ' + url)
             if mobj.group('episode') == '':
                 raise ExtractorError('Redirected URL is still not specific: ' + url)
-            epTitle = mobj.group('episode')
+            epTitle = mobj.group('episode').rpartition('/')[-1]
 
         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
-
         if len(mMovieParams) == 0:
             # The Colbert Report embeds the information in a without
             # a URL prefix; so extract the alternate reference
             # and then add the URL prefix manually.
 
-            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
+            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage)
             if len(altMovieParams) == 0:
                 raise ExtractorError('unable to find Flash URL in webpage ' + url)
             else:
                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
 
         uri = mMovieParams[0][1]
-        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
-        idoc = self._download_xml(indexUrl, epTitle,
-                                          'Downloading show index',
-                                          'unable to download episode index')
-
-        results = []
-
-        itemEls = idoc.findall('.//item')
-        for partNum,itemEl in enumerate(itemEls):
-            mediaId = itemEl.findall('./guid')[0].text
-            shortMediaId = mediaId.split(':')[-1]
-            showId = mediaId.split(':')[-2].replace('.com', '')
-            officialTitle = itemEl.findall('./title')[0].text
-            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
-
-            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
-                        compat_urllib_parse.urlencode({'uri': mediaId}))
-            cdoc = self._download_xml(configUrl, epTitle,
-                                               'Downloading configuration for %s' % shortMediaId)
+        # Correct cc.com in uri
+        uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri)
+
+        index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri}))
+        idoc = self._download_xml(
+            index_url, epTitle,
+            'Downloading show index', 'Unable to download episode index')
+
+        title = idoc.find('./channel/title').text
+        description = idoc.find('./channel/description').text
+
+        entries = []
+        item_els = idoc.findall('.//item')
+        for part_num, itemEl in enumerate(item_els):
+            upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text)
+            thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url')
+
+            content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
+            duration = float_or_none(content.attrib.get('duration'))
+            mediagen_url = content.attrib['url']
+            guid = itemEl.find('./guid').text.rpartition(':')[-1]
+
+            cdoc = self._download_xml(
+                mediagen_url, epTitle,
+                'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els)))
 
             turls = []
             for rendition in cdoc.findall('.//rendition'):
                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                 turls.append(finfo)
 
-            if len(turls) == 0:
-                self._downloader.report_error('unable to download ' + mediaId + ': No videos found')
-                continue
-
             formats = []
             for format, rtmp_video_url in turls:
                 w, h = self._video_dimensions.get(format, (None, None))
                 formats.append({
+                    'format_id': 'vhttp-%s' % format,
                     'url': self._transform_rtmp_url(rtmp_video_url),
                     'ext': self._video_extensions.get(format, 'mp4'),
-                    'format_id': format,
                     'height': h,
                     'width': w,
                 })
+                formats.append({
+                    'format_id': 'rtmp-%s' % format,
+                    'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'),
+                    'ext': self._video_extensions.get(format, 'mp4'),
+                    'height': h,
+                    'width': w,
+                })
+                self._sort_formats(formats)
 
-            effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1)
-            results.append({
-                'id': shortMediaId,
+            virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
+            entries.append({
+                'id': guid,
+                'title': virtual_id,
                 'formats': formats,
-                'uploader': showId,
-                'upload_date': officialDate,
-                'title': effTitle,
-                'thumbnail': None,
-                'description': compat_str(officialTitle),
+                'uploader': show_name,
+                'upload_date': upload_date,
+                'duration': duration,
+                'thumbnail': thumbnail,
+                'description': description,
             })
 
-        return results
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': show_name + ' ' + title,
+            'description': description,
+        }
index 84fca8ba0b2577696877c117a13fcc0a5ce40735..49e75405e8b079eef83191f9429ebd34a6c0bc26 100644 (file)
@@ -74,7 +74,7 @@ class InfoExtractor(object):
                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
                     * preference Order number of this format. If this field is
                                  present and not None, the formats get sorted
-                                 by this field.
+                                 by this field, regardless of all other values.
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
                     * quality    Order number of the video quality of this
@@ -88,12 +88,22 @@ class InfoExtractor(object):
 
     The following fields are optional:
 
-    thumbnails:     A list of dictionaries (with the entries "resolution" and
-                    "url") for the varying thumbnails
+    display_id      An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats"
+    thumbnails:     A list of dictionaries, with the following entries:
+                        * "url"
+                        * "width" (optional, int)
+                        * "height" (optional, int)
+                        * "resolution" (optional, string "{width}x{height"},
+                                        deprecated)
     thumbnail:      Full URL to a video thumbnail image.
     description:    One-line video description.
     uploader:       Full name of the video uploader.
+    timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
+                    If not explicitly set, calculated from timestamp.
     uploader_id:    Nickname or id of the video uploader.
     location:       Physical location of the video.
     subtitles:      The subtitle file contents as a dictionary in the format
@@ -107,6 +117,8 @@ class InfoExtractor(object):
     webpage_url:    The url to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
 
     Unless mentioned otherwise, the fields should be Unicode strings.
 
@@ -114,9 +126,6 @@ class InfoExtractor(object):
     _real_extract() methods and define a _VALID_URL regexp.
     Probably, they should also be added to the list of extractors.
 
-    _real_extract() must return a *list* of information dictionaries as
-    described above.
-
     Finally, the _WORKING attribute should be set to False for broken IEs
     in order to warn the users and skip the tests.
     """
@@ -239,16 +248,31 @@ class InfoExtractor(object):
                 url = url_or_request.get_full_url()
             except AttributeError:
                 url = url_or_request
-            if len(url) > 200:
-                h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest()
-                url = url[:200 - len(h)] + h
-            raw_filename = ('%s_%s.dump' % (video_id, url))
+            basen = '%s_%s' % (video_id, url)
+            if len(basen) > 240:
+                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+                basen = basen[:240 - len(h)] + h
+            raw_filename = basen + '.dump'
             filename = sanitize_filename(raw_filename, restricted=True)
             self.to_screen(u'Saving request to ' + filename)
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
 
-        content = webpage_bytes.decode(encoding, 'replace')
+        try:
+            content = webpage_bytes.decode(encoding, 'replace')
+        except LookupError:
+            content = webpage_bytes.decode('utf-8', 'replace')
+
+        if (u'<title>Access to this site is blocked</title>' in content and
+                u'Websense' in content[:512]):
+            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                u'Websense information URL', default=None)
+            if blocked_iframe:
+                msg += u' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+
         return (content, urlh)
 
     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
@@ -262,9 +286,12 @@ class InfoExtractor(object):
 
     def _download_xml(self, url_or_request, video_id,
                       note=u'Downloading XML', errnote=u'Unable to download XML',
-                      transform_source=None):
+                      transform_source=None, fatal=True):
         """Return the xml as an xml.etree.ElementTree.Element"""
-        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        xml_string = self._download_webpage(
+            url_or_request, video_id, note, errnote, fatal=fatal)
+        if xml_string is False:
+            return xml_string
         if transform_source:
             xml_string = transform_source(xml_string)
         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
@@ -432,14 +459,14 @@ class InfoExtractor(object):
         if secure: regexes = self._og_regexes('video:secure_url') + regexes
         return self._html_search_regex(regexes, html, name, **kargs)
 
-    def _html_search_meta(self, name, html, display_name=None):
+    def _html_search_meta(self, name, html, display_name=None, fatal=False):
         if display_name is None:
             display_name = name
         return self._html_search_regex(
             r'''(?ix)<meta
                     (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=False)
+            html, display_name, fatal=fatal)
 
     def _dc_search_uploader(self, html):
         return self._html_search_meta('dc.creator', html, 'uploader')
@@ -528,6 +555,23 @@ class InfoExtractor(object):
             )
         formats.sort(key=_formats_key)
 
+    def http_scheme(self):
+        """ Either "https:" or "https:", depending on the user's preferences """
+        return (
+            'http:'
+            if self._downloader.params.get('prefer_insecure', False)
+            else 'https:')
+
+    def _proto_relative_url(self, url, scheme=None):
+        if url is None:
+            return url
+        if url.startswith('//'):
+            if scheme is None:
+                scheme = self.http_scheme()
+            return scheme + url
+        else:
+            return url
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
@@ -571,3 +615,4 @@ class SearchInfoExtractor(InfoExtractor):
     @property
     def SEARCH_KEY(self):
         return self._SEARCH_KEY
+
index 91c1c1348f587798131459676ffe5444727c5c3b..ffbe4903b807faf0442057ebbee27bc9ed838c12 100644 (file)
@@ -28,16 +28,18 @@ class CondeNastIE(InfoExtractor):
         'glamour': 'Glamour',
         'wmagazine': 'W Magazine',
         'vanityfair': 'Vanity Fair',
+        'cnevids': 'Condé Nast',
     }
 
-    _VALID_URL = r'http://(video|www)\.(?P<site>%s)\.com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys())
+    _VALID_URL = r'http://(video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
     IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
 
     _TEST = {
         'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
-        'file': '5171b343c2b4c00dd0c1ccb3.mp4',
         'md5': '1921f713ed48aabd715691f774c451f7',
         'info_dict': {
+            'id': '5171b343c2b4c00dd0c1ccb3',
+            'ext': 'mp4',
             'title': '3D Printed Speakers Lit With LED',
             'description': 'Check out these beautiful 3D printed LED speakers.  You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
         }
@@ -55,12 +57,16 @@ class CondeNastIE(InfoExtractor):
         entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
         return self.playlist_result(entries, playlist_title=title)
 
-    def _extract_video(self, webpage):
-        description = self._html_search_regex([r'<div class="cne-video-description">(.+?)</div>',
-                                               r'<div class="video-post-content">(.+?)</div>',
-                                               ],
-                                              webpage, 'description',
-                                              fatal=False, flags=re.DOTALL)
+    def _extract_video(self, webpage, url_type):
+        if url_type != 'embed':
+            description = self._html_search_regex(
+                [
+                    r'<div class="cne-video-description">(.+?)</div>',
+                    r'<div class="video-post-content">(.+?)</div>',
+                ],
+                webpage, 'description', fatal=False, flags=re.DOTALL)
+        else:
+            description = None
         params = self._search_regex(r'var params = {(.+?)}[;,]', webpage,
                                     'player params', flags=re.DOTALL)
         video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id')
@@ -99,12 +105,12 @@ class CondeNastIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         site = mobj.group('site')
         url_type = mobj.group('type')
-        id = mobj.group('id')
+        item_id = mobj.group('id')
 
-        self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site])
-        webpage = self._download_webpage(url, id)
+        self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site])
+        webpage = self._download_webpage(url, item_id)
 
         if url_type == 'series':
             return self._extract_series(url, webpage)
         else:
-            return self._extract_video(webpage)
+            return self._extract_video(webpage, url_type)
index 920728e01f1c2b2a6e6b75c3b6740e84b3043897..026a9177e754de7d606961e6e4793af86da49fe2 100644 (file)
@@ -1,7 +1,11 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re, base64, zlib
+import re
+import json
+import base64
+import zlib
+
 from hashlib import sha1
 from math import pow, sqrt, floor
 from .common import InfoExtractor
@@ -19,13 +23,15 @@ from ..aes import (
     inc,
 )
 
+
 class CrunchyrollIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
-    _TESTS = [{
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)'
+    _TEST = {
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
-        'file': '645513.flv',
         #'md5': 'b1639fd6ddfaa43788c85f6d1dddd412',
         'info_dict': {
+            'id': '645513',
+            'ext': 'flv',
             'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
             'description': 'md5:2d17137920c64f2f49981a7797d275ef',
             'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
@@ -36,7 +42,7 @@ class CrunchyrollIE(InfoExtractor):
             # rtmp
             'skip_download': True,
         },
-    }]
+    }
 
     _FORMAT_IDS = {
         '360': ('60', '106'),
@@ -68,7 +74,7 @@ class CrunchyrollIE(InfoExtractor):
             shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
             # Extend 160 Bit hash to 256 Bit
             return shaHash + [0] * 12
-        
+
         key = obfuscate_key(id)
         class Counter:
             __value = iv
@@ -80,9 +86,8 @@ class CrunchyrollIE(InfoExtractor):
         return zlib.decompress(decrypted_data)
 
     def _convert_subtitles_to_srt(self, subtitles):
-        i=1
         output = ''
-        for start, end, text in re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles):
+        for i, (start, end, text) in enumerate(re.findall(r'<event [^>]*?start="([^"]+)" [^>]*?end="([^"]+)" [^>]*?text="([^"]+)"[^>]*?>', subtitles), 1):
             start = start.replace('.', ',')
             end = end.replace('.', ',')
             text = clean_html(text)
@@ -90,7 +95,6 @@ class CrunchyrollIE(InfoExtractor):
             if not text:
                 continue
             output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
-            i+=1
         return output
 
     def _real_extract(self,url):
@@ -108,6 +112,12 @@ class CrunchyrollIE(InfoExtractor):
         if note_m:
             raise ExtractorError(note_m)
 
+        mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
+        if mobj:
+            msg = json.loads(mobj.group('msg'))
+            if msg.get('type') == 'error':
+                raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
+
         video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
         video_title = re.sub(r' {2,}', ' ', video_title)
         video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
@@ -123,7 +133,7 @@ class CrunchyrollIE(InfoExtractor):
         playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
         playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
         playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
-        
+
         stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
         video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
 
@@ -161,7 +171,7 @@ class CrunchyrollIE(InfoExtractor):
             data = base64.b64decode(data)
 
             subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
-            lang_code = self._search_regex(r'lang_code=\'([^\']+)', subtitle, 'subtitle_lang_code', fatal=False)
+            lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
             if not lang_code:
                 continue
             subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle)
index d65046f588d0bf4481ec6f8d8de7e031e6bdb2f9..b6552c542411c2abf639e71c955c66c34db2b007 100644 (file)
@@ -4,15 +4,16 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    int_or_none,
     unescapeHTML,
     find_xpath_attr,
 )
 
 
 class CSpanIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>\d+)'
+    _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)'
     IE_DESC = 'C-SPAN'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.c-span.org/video/?313572-1/HolderonV',
         'md5': '8e44ce11f0f725527daccc453f553eb0',
         'info_dict': {
@@ -22,13 +23,24 @@ class CSpanIE(InfoExtractor):
             'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.',
         },
         'skip': 'Regularly fails on travis, for unknown reasons',
-    }
+    }, {
+        'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models',
+        # For whatever reason, the served video alternates between
+        # two different ones
+        #'md5': 'dbb0f047376d457f2ab8b3929cbb2d0c',
+        'info_dict': {
+            'id': '340723',
+            'ext': 'mp4',
+            'title': 'International Health Care Models',
+            'description': 'md5:7a985a2d595dba00af3d9c9f0783c967',
+        }
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_id = mobj.group('id')
         webpage = self._download_webpage(url, page_id)
-        video_id = self._search_regex(r'data-progid=\'(\d+)\'>', webpage, 'video id')
+        video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id')
 
         description = self._html_search_regex(
             [
@@ -43,18 +55,29 @@ class CSpanIE(InfoExtractor):
         info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id
         data = self._download_json(info_url, video_id)
 
-        url = unescapeHTML(data['video']['files'][0]['path']['#text'])
-
-        doc = self._download_xml('http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
+        doc = self._download_xml(
+            'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id,
             video_id)
 
-        def find_string(s):
-            return find_xpath_attr(doc, './/string', 'name', s).text
+        title = find_xpath_attr(doc, './/string', 'name', 'title').text
+        thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text
+
+        files = data['video']['files']
+
+        entries = [{
+            'id': '%s_%d' % (video_id, partnum + 1),
+            'title': (
+                title if len(files) == 1 else
+                '%s part %d' % (title, partnum + 1)),
+            'url': unescapeHTML(f['path']['#text']),
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': int_or_none(f.get('length', {}).get('#text')),
+        } for partnum, f in enumerate(files)]
 
         return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': title,
             'id': video_id,
-            'title': find_string('title'),
-            'url': url,
-            'description': description,
-            'thumbnail': find_string('poster'),
         }
index 6685c94a3d6b283e0b7f2240ebfcf35ce462edc2..55216201fe7f137747ad4ac24137b8fe54494d72 100644 (file)
@@ -8,12 +8,11 @@ from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
     compat_urllib_request,
     compat_str,
-    get_element_by_attribute,
-    get_element_by_id,
     orderedSet,
     str_to_int,
-
+    int_or_none,
     ExtractorError,
+    unescapeHTML,
 )
 
 class DailymotionBaseInfoExtractor(InfoExtractor):
@@ -124,7 +123,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
             if video_url is not None:
                 m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
                 if m_size is not None:
-                    width, height = m_size.group(1), m_size.group(2)
+                    width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
                 else:
                     width, height = None, None
                 formats.append({
@@ -179,7 +178,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
 class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
     IE_NAME = u'dailymotion:playlist'
     _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
-    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
+    _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
     _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
 
     def _extract_entries(self, id):
@@ -189,10 +188,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
             webpage = self._download_webpage(request,
                                              id, u'Downloading page %s' % pagenum)
 
-            playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
-            video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
+            video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage))
 
-            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+            if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
                 break
         return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
                    for video_id in orderedSet(video_ids)]
@@ -202,26 +200,26 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
         playlist_id = mobj.group('id')
         webpage = self._download_webpage(url, playlist_id)
 
-        return {'_type': 'playlist',
-                'id': playlist_id,
-                'title': get_element_by_id(u'playlist_name', webpage),
-                'entries': self._extract_entries(playlist_id),
-                }
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': self._og_search_title(webpage),
+            'entries': self._extract_entries(playlist_id),
+        }
 
 
 class DailymotionUserIE(DailymotionPlaylistIE):
     IE_NAME = u'dailymotion:user'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
-    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>'
+    _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
     _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         user = mobj.group('user')
         webpage = self._download_webpage(url, user)
-        full_user = self._html_search_regex(
-            r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user),
-            webpage, u'user', flags=re.DOTALL)
+        full_user = unescapeHTML(self._html_search_regex(
+            r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
+            webpage, u'user', flags=re.DOTALL))
 
         return {
             '_type': 'playlist',
index 4876ecb4812710e2509eec8fc19f00dac60d2fde..6033cd94a1b251d66e7a3f80034bc58b79fa4b55 100644 (file)
@@ -1,25 +1,28 @@
 # encoding: utf-8
+
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
-    determine_ext,
 )
 
 
 class DaumIE(InfoExtractor):
     _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)'
-    IE_NAME = u'daum.net'
+    IE_NAME = 'daum.net'
 
     _TEST = {
-        u'url': u'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
-        u'file': u'52554690.mp4',
-        u'info_dict': {
-            u'title': u'DOTA 2GETHER 시즌2 6회 - 2부',
-            u'description': u'DOTA 2GETHER 시즌2 6회 - 2부',
-            u'upload_date': u'20130831',
-            u'duration': 3868,
+        'url': 'http://tvpot.daum.net/clip/ClipView.do?clipid=52554690',
+        'info_dict': {
+            'id': '52554690',
+            'ext': 'mp4',
+            'title': 'DOTA 2GETHER 시즌2 6회 - 2부',
+            'description': 'DOTA 2GETHER 시즌2 6회 - 2부',
+            'upload_date': '20130831',
+            'duration': 3868,
         },
     }
 
@@ -30,14 +33,14 @@ class DaumIE(InfoExtractor):
         webpage = self._download_webpage(canonical_url, video_id)
         full_id = self._search_regex(
             r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
-            webpage, u'full id')
+            webpage, 'full id')
         query = compat_urllib_parse.urlencode({'vid': full_id})
         info = self._download_xml(
             'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
-            u'Downloading video info')
+            'Downloading video info')
         urls = self._download_xml(
             'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
-            video_id, u'Downloading video formats info')
+            video_id, 'Downloading video formats info')
 
         self.to_screen(u'%s: Getting video urls' % video_id)
         formats = []
@@ -53,7 +56,6 @@ class DaumIE(InfoExtractor):
             format_url = url_doc.find('result/url').text
             formats.append({
                 'url': format_url,
-                'ext': determine_ext(format_url),
                 'format_id': profile,
             })
 
diff --git a/youtube_dl/extractor/depositfiles.py b/youtube_dl/extractor/depositfiles.py
deleted file mode 100644 (file)
index 2c9fb5f..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-import re
-import os
-import socket
-
-from .common import InfoExtractor
-from ..utils import (
-    compat_http_client,
-    compat_str,
-    compat_urllib_error,
-    compat_urllib_parse,
-    compat_urllib_request,
-
-    ExtractorError,
-)
-
-
-class DepositFilesIE(InfoExtractor):
-    """Information extractor for depositfiles.com"""
-
-    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
-
-    def _real_extract(self, url):
-        file_id = url.split('/')[-1]
-        # Rebuild url in english locale
-        url = 'http://depositfiles.com/en/files/' + file_id
-
-        # Retrieve file webpage with 'Free download' button pressed
-        free_download_indication = {'gateway_result' : '1'}
-        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
-        try:
-            self.report_download_webpage(file_id)
-            webpage = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
-
-        # Search for the real file URL
-        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
-        if (mobj is None) or (mobj.group(1) is None):
-            # Try to figure out reason of the error.
-            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
-            if (mobj is not None) and (mobj.group(1) is not None):
-                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
-                raise ExtractorError(u'%s' % restriction_message)
-            else:
-                raise ExtractorError(u'Unable to extract download URL from: %s' % url)
-
-        file_url = mobj.group(1)
-        file_extension = os.path.splitext(file_url)[1][1:]
-
-        # Search for file title
-        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
-
-        return [{
-            'id':       file_id.decode('utf-8'),
-            'url':      file_url.decode('utf-8'),
-            'uploader': None,
-            'upload_date':  None,
-            'title':    file_title,
-            'ext':      file_extension.decode('utf-8'),
-        }]
index 885944c5e8bf95e7af78c5c3ed4eb5f69cc7a35f..2ae6ecc12e7d5a5d546c5e4f56ef2a37b6fcb27f 100644 (file)
@@ -10,9 +10,10 @@ class DiscoveryIE(InfoExtractor):
     _VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
     _TEST = {
         'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
-        'file': '614784.mp4',
         'md5': 'e12614f9ee303a6ccef415cb0793eba2',
         'info_dict': {
+            'id': '614784',
+            'ext': 'mp4',
             'title': 'MythBusters: Mission Impossible Outtakes',
             'description': ('Watch Jamie Hyneman and Adam Savage practice being'
                 ' each other -- to the point of confusing Jamie\'s dog -- and '
@@ -34,7 +35,7 @@ class DiscoveryIE(InfoExtractor):
         formats = []
         for f in info['mp4']:
             formats.append(
-                {'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])})
+                {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])})
 
         return {
             'id': info['contentId'],
diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py
new file mode 100644 (file)
index 0000000..4ca3f37
--- /dev/null
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from .novamov import NovaMovIE
+
+
+class DivxStageIE(NovaMovIE):
+    IE_NAME = 'divxstage'
+    IE_DESC = 'DivxStage'
+
+    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag)'}
+
+    _HOST = 'www.divxstage.eu'
+
+    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+    _TITLE_REGEX = r'<div class="video_det">\s*<strong>([^<]+)</strong>'
+    _DESCRIPTION_REGEX = r'<div class="video_det">\s*<strong>[^<]+</strong>\s*<p>([^<]+)</p>'
+
+    _TEST = {
+        'url': 'http://www.divxstage.eu/video/57f238e2e5e01',
+        'md5': '63969f6eb26533a1968c4d325be63e72',
+        'info_dict': {
+            'id': '57f238e2e5e01',
+            'ext': 'flv',
+            'title': 'youtubedl test video',
+            'description': 'This is a test video for youtubedl.',
+        }
+    }
\ No newline at end of file
index 2bb77aec6cb0d9ae2a7b4c6301c6deefc4548c57..f8f49a013503cc853c2bf79e345b360af3db7fee 100644 (file)
@@ -1,23 +1,25 @@
+from __future__ import unicode_literals
+
 import re
 
 from ..utils import (
     compat_urllib_parse,
-    determine_ext
 )
 from .common import InfoExtractor
 
 
 class EHowIE(InfoExtractor):
-    IE_NAME = u'eHow'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
+    IE_NAME = 'eHow'
+    _VALID_URL = r'https?://(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
     _TEST = {
-        u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
-        u'file': u'12245069.flv',
-        u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
-        u'info_dict': {
-            u"title": u"Hardwood Flooring Basics",
-            u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
-                       u"uploader": u"Erick Nathan"
+        'url': 'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
+        'md5': '9809b4e3f115ae2088440bcb4efbf371',
+        'info_dict': {
+            'id': '12245069',
+            'ext': 'flv',
+            'title': 'Hardwood Flooring Basics',
+            'description': 'Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...',
+            'uploader': 'Erick Nathan',
         }
     }
 
@@ -26,21 +28,16 @@ class EHowIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
-            webpage, u'video URL')
-        final_url = compat_urllib_parse.unquote(video_url)        
-        uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
-            webpage, u'uploader')
+            webpage, 'video URL')
+        final_url = compat_urllib_parse.unquote(video_url)
+        uploader = self._html_search_meta('uploader', webpage)
         title = self._og_search_title(webpage).replace(' | eHow', '')
-        ext = determine_ext(final_url)
 
         return {
-            '_type':       'video',
-            'id':          video_id,
-            'url':         final_url,
-            'ext':         ext,
-            'title':       title,
-            'thumbnail':   self._og_search_thumbnail(webpage),
+            'id': video_id,
+            'url': final_url,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
             'description': self._og_search_description(webpage),
-            'uploader':    uploader,
+            'uploader': uploader,
         }
-
diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
new file mode 100644 (file)
index 0000000..e695258
--- /dev/null
@@ -0,0 +1,54 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class EmpflixIE(InfoExtractor):
+    _VALID_URL = r'^https?://www\.empflix\.com/videos/.*?-(?P<id>[0-9]+)\.html'
+    _TEST = {
+        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+        'info_dict': {
+            'id': '33051',
+            'ext': 'mp4',
+            'title': 'Amateur Finger Fuck',
+            'description': 'Amateur solo finger fucking.',
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        age_limit = self._rta_search(webpage)
+
+        video_title = self._html_search_regex(
+            r'name="title" value="(?P<title>[^"]*)"', webpage, 'title')
+        video_description = self._html_search_regex(
+            r'name="description" value="([^"]*)"', webpage, 'description', fatal=False)
+
+        cfg_url = self._html_search_regex(
+            r'flashvars\.config = escape\("([^"]+)"',
+            webpage, 'flashvars.config')
+
+        cfg_xml = self._download_xml(
+            cfg_url, video_id, note='Downloading metadata')
+
+        formats = [
+            {
+                'url': item.find('videoLink').text,
+                'format_id': item.find('res').text,
+            } for item in cfg_xml.findall('./quality/item')
+        ]
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'description': video_description,
+            'formats': formats,
+            'age_limit': age_limit,
+        }
diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py
new file mode 100644 (file)
index 0000000..92ada81
--- /dev/null
@@ -0,0 +1,43 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .fivemin import FiveMinIE
+from ..utils import (
+    url_basename,
+)
+
+
+class EngadgetIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://www.engadget.com/
+        (?:video/5min/(?P<id>\d+)|
+            [\d/]+/.*?)
+        '''
+
+    _TEST = {
+        'url': 'http://www.engadget.com/video/5min/518153925/',
+        'md5': 'c6820d4828a5064447a4d9fc73f312c9',
+        'info_dict': {
+            'id': '518153925',
+            'ext': 'mp4',
+            'title': 'Samsung Galaxy Tab Pro 8.4 Review',
+        },
+        'add_ie': ['FiveMin'],
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        if video_id is not None:
+            return FiveMinIE._build_result(video_id)
+        else:
+            title = url_basename(url)
+            webpage = self._download_webpage(url, title)
+            ids = re.findall(r'<iframe[^>]+?playList=(\d+)', webpage)
+            return {
+                '_type': 'playlist',
+                'title': title,
+                'entries': [FiveMinIE._build_result(id) for id in ids]
+            }
index 1c20e43644b917d5b73b712d1630740f1a7e7e81..14a196ffc63336ae7d016b035cfb28cc7f7d28a0 100644 (file)
@@ -1,4 +1,5 @@
-import os
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -8,18 +9,23 @@ from ..utils import (
     compat_urllib_parse,
 )
 
+
 class ExtremeTubeIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
-    _TEST = {
-        u'url': u'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
-        u'file': u'652431.mp4',
-        u'md5': u'1fb9228f5e3332ec8c057d6ac36f33e0',
-        u'info_dict': {
-            u"title": u"Music Video 14 british euro brit european cumshots swallow",
-            u"uploader": u"unknown",
-            u"age_limit": 18,
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+    _TESTS = [{
+        'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+        'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
+        'info_dict': {
+            'id': '652431',
+            'ext': 'mp4',
+            'title': 'Music Video 14 british euro brit european cumshots swallow',
+            'uploader': 'unknown',
+            'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://www.extremetube.com/gay/video/abcde-1234',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -30,11 +36,14 @@ class ExtremeTubeIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        video_title = self._html_search_regex(r'<h1 [^>]*?title="([^"]+)"[^>]*>\1<', webpage, u'title')
-        uploader = self._html_search_regex(r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, u'uploader', fatal=False)
-        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
+        video_title = self._html_search_regex(
+            r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title')
+        uploader = self._html_search_regex(
+            r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader',
+            fatal=False)
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(
+            r'video_url=(.+?)&amp;', webpage, 'video_url'))
         path = compat_urllib_parse_urlparse(video_url).path
-        extension = os.path.splitext(path)[1][1:]
         format = path.split('/')[5].split('_')[:2]
         format = "-".join(format)
 
@@ -43,7 +52,6 @@ class ExtremeTubeIE(InfoExtractor):
             'title': video_title,
             'uploader': uploader,
             'url': video_url,
-            'ext': extension,
             'format': format,
             'format_id': format,
             'age_limit': 18,
index 8f9154c0e6864d04ef6c4e0f441ef4a68e6d30d2..f0cd8f1565b7e7b1b5220ba0f68d0b9225d953e6 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import json
 import re
 import socket
@@ -9,16 +11,15 @@ from ..utils import (
     compat_urllib_error,
     compat_urllib_parse,
     compat_urllib_request,
+    urlencode_postdata,
 
     ExtractorError,
 )
 
 
 class FacebookIE(InfoExtractor):
-    """Information Extractor for Facebook"""
-
     _VALID_URL = r'''(?x)
-        (?:https?://)?(?:\w+\.)?facebook\.com/
+        https?://(?:\w+\.)?facebook\.com/
         (?:[^#?]*\#!/)?
         (?:video/video\.php|photo\.php|video/embed)\?(?:.*?)
         (?:v|video_id)=(?P<id>[0-9]+)
@@ -26,21 +27,18 @@ class FacebookIE(InfoExtractor):
     _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
     _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
     _NETRC_MACHINE = 'facebook'
-    IE_NAME = u'facebook'
+    IE_NAME = 'facebook'
     _TEST = {
-        u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
-        u'file': u'120708114770723.mp4',
-        u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
-        u'info_dict': {
-            u"duration": 279,
-            u"title": u"PEOPLE ARE AWESOME 2013"
+        'url': 'https://www.facebook.com/photo.php?v=120708114770723',
+        'md5': '48975a41ccc4b7a581abd68651c1a5a8',
+        'info_dict': {
+            'id': '120708114770723',
+            'ext': 'mp4',
+            'duration': 279,
+            'title': 'PEOPLE ARE AWESOME 2013',
         }
     }
 
-    def report_login(self):
-        """Report attempt to log in."""
-        self.to_screen(u'Logging in')
-
     def _login(self):
         (useremail, password) = self._get_login_info()
         if useremail is None:
@@ -48,11 +46,13 @@ class FacebookIE(InfoExtractor):
 
         login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
         login_page_req.add_header('Cookie', 'locale=en_US')
-        self.report_login()
-        login_page = self._download_webpage(login_page_req, None, note=False,
-            errnote=u'Unable to download login page')
-        lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
-        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
+        login_page = self._download_webpage(login_page_req, None,
+            note='Downloading login page',
+            errnote='Unable to download login page')
+        lsd = self._search_regex(
+            r'<input type="hidden" name="lsd" value="([^"]*)"',
+            login_page, 'lsd')
+        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')
 
         login_form = {
             'email': useremail,
@@ -65,27 +65,28 @@ class FacebookIE(InfoExtractor):
             'timezone': '-60',
             'trynum': '1',
             }
-        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+        request = compat_urllib_request.Request(self._LOGIN_URL, urlencode_postdata(login_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         try:
-            login_results = compat_urllib_request.urlopen(request).read()
+            login_results = self._download_webpage(request, None,
+                note='Logging in', errnote='unable to fetch login page')
             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
-                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
+                self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                 return
 
             check_form = {
-                'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
-                'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
+                'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'),
+                'h': self._search_regex(r'name="h" value="(\w*?)"', login_results, 'h'),
                 'name_action_selected': 'dont_save',
-                'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
             }
-            check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
+            check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
             check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            check_response = compat_urllib_request.urlopen(check_req).read()
+            check_response = self._download_webpage(check_req, None,
+                note='Confirming login')
             if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
-                self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
+                self._downloader.report_warning('Unable to confirm login, you have to login in your brower and authorize the login.')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+            self._downloader.report_warning('unable to log in: %s' % compat_str(err))
             return
 
     def _real_initialize(self):
@@ -93,8 +94,6 @@ class FacebookIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('id')
 
         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
@@ -107,10 +106,10 @@ class FacebookIE(InfoExtractor):
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
             if m_msg is not None:
                 raise ExtractorError(
-                    u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+                    'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                     expected=True)
             else:
-                raise ExtractorError(u'Cannot parse data')
+                raise ExtractorError('Cannot parse data')
         data = dict(json.loads(m.group(1)))
         params_raw = compat_urllib_parse.unquote(data['params'])
         params = json.loads(params_raw)
@@ -119,19 +118,15 @@ class FacebookIE(InfoExtractor):
         if not video_url:
             video_url = video_data['sd_src']
         if not video_url:
-            raise ExtractorError(u'Cannot find video URL')
-        video_duration = int(video_data['video_duration'])
-        thumbnail = video_data['thumbnail_src']
+            raise ExtractorError('Cannot find video URL')
 
         video_title = self._html_search_regex(
-            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
+            r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, 'title')
 
-        info = {
+        return {
             'id': video_id,
             'title': video_title,
             'url': video_url,
-            'ext': 'mp4',
-            'duration': video_duration,
-            'thumbnail': thumbnail,
+            'duration': int(video_data['video_duration']),
+            'thumbnail': video_data['thumbnail_src'],
         }
-        return [info]
diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py
new file mode 100644 (file)
index 0000000..18f91ef
--- /dev/null
@@ -0,0 +1,60 @@
+#! -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+import hashlib
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    compat_urllib_request,
+    compat_urlparse,
+)
+
+
+class FC2IE(InfoExtractor):
+    _VALID_URL = r'^http://video\.fc2\.com/((?P<lang>[^/]+)/)?content/(?P<id>[^/]+)'
+    IE_NAME = 'fc2'
+    _TEST = {
+        'url': 'http://video.fc2.com/en/content/20121103kUan1KHs',
+        'md5': 'a6ebe8ebe0396518689d963774a54eb7',
+        'info_dict': {
+            'id': '20121103kUan1KHs',
+            'ext': 'flv',
+            'title': 'Boxing again with Puff',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        self._downloader.cookiejar.clear_session_cookies()  # must clear
+
+        title = self._og_search_title(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+        refer = url.replace('/content/', '/a/content/')
+
+        mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest()
+
+        info_url = (
+            "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&".
+            format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.','%2E')))
+
+        info_webpage = self._download_webpage(
+            info_url, video_id, note='Downloading info page')
+        info = compat_urlparse.parse_qs(info_webpage)
+
+        if 'err_code' in info:
+            raise ExtractorError('Error code: %s' % info['err_code'][0])
+
+        video_url = info['filepath'][0] + '?mid=' + info['mid'][0]
+
+        return {
+            'id': video_id,
+            'title': info['title'][0],
+            'url': video_url,
+            'ext': 'flv',
+            'thumbnail': thumbnail,
+        }
index 7e3d1afd215bfc8ef39c6751e116def05bf858ed..eccd8dde9e007583b9f73f63df45a38b89c21286 100644 (file)
@@ -6,7 +6,6 @@ from .common import InfoExtractor
 
 
 class FirstpostIE(InfoExtractor):
-    IE_NAME = 'Firstpost.com'
     _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html'
 
     _TEST = {
@@ -16,7 +15,6 @@ class FirstpostIE(InfoExtractor):
             'id': '1025403',
             'ext': 'mp4',
             'title': 'India to launch indigenous aircraft carrier INS Vikrant today',
-            'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.',
         }
     }
 
@@ -24,15 +22,26 @@ class FirstpostIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
-        webpage = self._download_webpage(url, video_id)
-        video_url = self._html_search_regex(
-            r'<div.*?name="div_video".*?flashvars="([^"]+)">',
-            webpage, 'video URL')
+        data = self._download_xml(
+            'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id,
+            'Downloading video XML')
+
+        item = data.find('./playlist/item')
+        thumbnail = item.find('./image').text
+        title = item.find('./title').text
+
+        formats = [
+            {
+                'url': details.find('./file').text,
+                'format_id': details.find('./label').text.strip(),
+                'width': int(details.find('./width').text.strip()),
+                'height': int(details.find('./height').text.strip()),
+            } for details in item.findall('./source/file_details') if details.find('./file').text
+        ]
 
         return {
             'id': video_id,
-            'url': video_url,
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
         }
diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py
new file mode 100644 (file)
index 0000000..3a50bab
--- /dev/null
@@ -0,0 +1,88 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_str,
+    compat_urllib_parse,
+    ExtractorError,
+)
+
+
+class FiveMinIE(InfoExtractor):
+    IE_NAME = '5min'
+    _VALID_URL = r'''(?x)
+        (?:https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(.*?&)?playList=|
+            5min:)
+        (?P<id>\d+)
+        '''
+
+    _TESTS = [
+        {
+            # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/
+            'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791',
+            'md5': '4f7b0b79bf1a470e5004f7112385941d',
+            'info_dict': {
+                'id': '518013791',
+                'ext': 'mp4',
+                'title': 'iPad Mini with Retina Display Review',
+            },
+        },
+        {
+            # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247
+            'url': '5min:518086247',
+            'md5': 'e539a9dd682c288ef5a498898009f69e',
+            'info_dict': {
+                'id': '518086247',
+                'ext': 'mp4',
+                'title': 'How to Make a Next-Level Fruit Salad',
+            },
+        },
+    ]
+
+    @classmethod
+    def _build_result(cls, video_id):
+        return cls.url_result('5min:%s' % video_id, cls.ie_key())
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
+        embed_page = self._download_webpage(embed_url, video_id,
+            'Downloading embed page')
+        sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
+        query = compat_urllib_parse.urlencode({
+            'func': 'GetResults',
+            'playlist': video_id,
+            'sid': sid,
+            'isPlayerSeed': 'true',
+            'url': embed_url,
+        })
+        response = self._download_json(
+            'https://syn.5min.com/handlers/SenseHandler.ashx?' + query,
+            video_id)
+        if not response['success']:
+            err_msg = response['errorMessage']
+            if err_msg == 'ErrorVideoUserNotGeo':
+                msg = 'Video not available from your location'
+            else:
+                msg = 'Aol said: %s' % err_msg
+            raise ExtractorError(msg, expected=True, video_id=video_id)
+        info = response['binding'][0]
+
+        second_id = compat_str(int(video_id[:-2]) + 1)
+        formats = []
+        for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]:
+            if any(r['ID'] == quality for r in info['Renditions']):
+                formats.append({
+                    'format_id': compat_str(quality),
+                    'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality),
+                    'height': height,
+                })
+
+        return {
+            'id': video_id,
+            'title': info['Title'],
+            'formats': formats,
+        }
index 8db7fc6cba1c13b4a1b6ded218e41e0e30dc8ae4..7d56b9be93a0332e70381c3a46b748c6d39e5b6a 100644 (file)
@@ -8,8 +8,8 @@ from ..utils import (
     unified_strdate,
     str_to_int,
     parse_duration,
+    clean_html,
 )
-from youtube_dl.utils import clean_html
 
 
 class FourTubeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py
new file mode 100644 (file)
index 0000000..898e0dd
--- /dev/null
@@ -0,0 +1,77 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_parse_qs,
+    compat_urlparse,
+)
+
+
+class FranceCultureIE(InfoExtractor):
+    _VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
+        'info_dict': {
+            'id': '4795174',
+            'ext': 'mp3',
+            'title': 'Rendez-vous au pays des geeks',
+            'vcodec': 'none',
+            'uploader': 'Colette Fellous',
+            'upload_date': '20140301',
+            'duration': 3601,
+            'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
+            'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        baseurl = mobj.group('baseurl')
+
+        webpage = self._download_webpage(url, video_id)
+        params_code = self._search_regex(
+            r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
+            webpage, 'parameter code')
+        params = compat_parse_qs(params_code)
+        video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])
+
+        title = self._html_search_regex(
+            r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
+        uploader = self._html_search_regex(
+            r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
+            webpage, 'uploader', fatal=False)
+        thumbnail_part = self._html_search_regex(
+            r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
+            'thumbnail', fatal=False)
+        if thumbnail_part is None:
+            thumbnail = None
+        else:
+            thumbnail = compat_urlparse.urljoin(baseurl, thumbnail_part)
+        description = self._html_search_regex(
+            r'(?s)<p class="desc">(.*?)</p>', webpage, 'description')
+
+        info = json.loads(params['infoData'][0])[0]
+        duration = info.get('media_length')
+        upload_date_candidate = info.get('media_section5')
+        upload_date = (
+            upload_date_candidate
+            if (upload_date_candidate is not None and
+                re.match(r'[0-9]{8}$', upload_date_candidate))
+            else None)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
+            'duration': duration,
+            'uploader': uploader,
+            'upload_date': upload_date,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
index 51eb97b2f6290d11c9a2a7d9c4bf8a785f54239b..f3e0f38b7200a70c897dd561b45a275cf42f7193 100644 (file)
@@ -48,24 +48,36 @@ class PluzzIE(FranceTVBaseInfoExtractor):
 
 class FranceTvInfoIE(FranceTVBaseInfoExtractor):
     IE_NAME = 'francetvinfo.fr'
-    _VALID_URL = r'https?://www\.francetvinfo\.fr/replay.*/(?P<title>.+)\.html'
+    _VALID_URL = r'https?://www\.francetvinfo\.fr/.*/(?P<title>.+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
-        'file': '84981923.mp4',
         'info_dict': {
+            'id': '84981923',
+            'ext': 'mp4',
             'title': 'Soir 3',
         },
         'params': {
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html',
+        'info_dict': {
+            'id': 'EV_20019',
+            'ext': 'mp4',
+            'title': 'Débat des candidats à la Commission européenne',
+            'description': 'Débat des candidats à la Commission européenne',
+        },
+        'params': {
+            'skip_download': 'HLS (reqires ffmpeg)'
+        }
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
-        video_id = self._search_regex(r'id-video=(\d+?)[@"]', webpage, 'video id')
+        video_id = self._search_regex(r'id-video=((?:[^0-9]*?_)?[0-9]+)[@"]', webpage, 'video id')
         return self._extract_video(video_id)
 
 
index 7c40e675398f7738bda09827031cfb99c620a17a..6e6b6666003d0837bffd6c25ddd12fe1ce892e50 100644 (file)
@@ -1,24 +1,35 @@
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
+from ..utils import ExtractorError
 
 
 class FunnyOrDieIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
+    _TESTS = [{
         'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
-        'file': '0732f586d7.mp4',
-        'md5': 'f647e9e90064b53b6e046e75d0241fbd',
+        'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9',
         'info_dict': {
-            'description': ('Lyrics changed to match the video. Spoken cameo '
-                'by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a '
-                'concept by Dustin McLean (DustFilms.com). Performed, edited, '
-                'and written by David A. Scott.'),
+            'id': '0732f586d7',
+            'ext': 'mp4',
             'title': 'Heart-Shaped Box: Literal Video Version',
+            'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
+            'thumbnail': 're:^http:.*\.jpg$',
         },
-    }
+    }, {
+        'url': 'http://www.funnyordie.com/embed/e402820827',
+        'md5': 'ff4d83318f89776ed0250634cfaa8d36',
+        'info_dict': {
+            'id': 'e402820827',
+            'ext': 'mp4',
+            'title': 'Please Use This Song (Jon Lajoie)',
+            'description': 'md5:2ed27d364f5a805a6dba199faaf6681d',
+            'thumbnail': 're:^http:.*\.jpg$',
+        },
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -26,14 +37,34 @@ class FunnyOrDieIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        video_url = self._search_regex(
-            [r'type="video/mp4" src="(.*?)"', r'src="([^>]*?)" type=\'video/mp4\''],
-            webpage, 'video URL', flags=re.DOTALL)
+        links = re.findall(r'<source src="([^"]+/v)\d+\.([^"]+)" type=\'video', webpage)
+        if not links:
+            raise ExtractorError('No media links available for %s' % video_id)
+
+        links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
+
+        bitrates = self._html_search_regex(r'<source src="[^"]+/v,((?:\d+,)+)\.mp4\.csmil', webpage, 'video bitrates')
+        bitrates = [int(b) for b in bitrates.rstrip(',').split(',')]
+        bitrates.sort()
+
+        formats = []
+
+        for bitrate in bitrates:
+            for link in links:
+                formats.append({
+                    'url': '%s%d.%s' % (link[0], bitrate, link[1]),
+                    'format_id': '%s-%d' % (link[1], bitrate),
+                    'vbr': bitrate,
+                })
+
+        post_json = self._search_regex(
+            r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
+        post = json.loads(post_json)
 
         return {
             'id': video_id,
-            'url': video_url,
-            'ext': 'mp4',
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
+            'title': post['name'],
+            'description': post.get('description'),
+            'thumbnail': post.get('picture'),
+            'formats': formats,
         }
index a3a5251fe5711173ccb3986c263994d560345bf8..11fee3d31e88833b8074a1b59cff885eeffa46d3 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -6,13 +8,14 @@ from .common import InfoExtractor
 class GamekingsIE(InfoExtractor):
     _VALID_URL = r'http://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
     _TEST = {
-        u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
-        u'file': u'20130811.mp4',
+        'url': 'http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/',
         # MD5 is flaky, seems to change regularly
-        #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
+        # 'md5': '2f32b1f7b80fdc5cb616efb4f387f8a3',
         u'info_dict': {
-            u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
-            u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
+            'id': '20130811',
+            'ext': 'mp4',
+            'title': 'Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review',
+            'description': 'md5:36fd701e57e8c15ac8682a2374c99731',
         }
     }
 
index c9598ad3aa68eb940a8995fb0f739dbfb9d33fdf..3d67b9d60242760ff3e32c9fbbbcab39542f01da 100644 (file)
@@ -15,11 +15,12 @@ from ..utils import (
 class GameSpotIE(InfoExtractor):
     _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
     _TEST = {
-        "url": "http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
-        "file": "gs-2300-6410818.mp4",
-        "md5": "b2a30deaa8654fcccd43713a6b6a4825",
-        "info_dict": {
-            "title": "Arma 3 - Community Guide: SITREP I",
+        'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
+        'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
+        'info_dict': {
+            'id': 'gs-2300-6410818',
+            'ext': 'mp4',
+            'title': 'Arma 3 - Community Guide: SITREP I',
             'description': 'Check out this video where some of the basics of Arma 3 is explained.',
         }
     }
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
new file mode 100644 (file)
index 0000000..89d5994
--- /dev/null
@@ -0,0 +1,134 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+
+class GDCVaultIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
+    _TESTS = [
+        {
+            'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
+            'md5': '7ce8388f544c88b7ac11c7ab1b593704',
+            'info_dict': {
+                'id': '1019721',
+                'ext': 'mp4',
+                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
+            }
+        },
+        {
+            'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
+            'info_dict': {
+                'id': '1015683',
+                'ext': 'flv',
+                'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
+            },
+            'params': {
+                'skip_download': True,  # Requires rtmpdump
+            }
+        },
+    ]
+
+    def _parse_mp4(self, xml_description):
+        video_formats = []
+        mp4_video = xml_description.find('./metadata/mp4video')
+        if mp4_video is None:
+            return None
+
+        mobj = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text)
+        video_root = mobj.group('root')
+        formats = xml_description.findall('./metadata/MBRVideos/MBRVideo')
+        for format in formats:
+            mobj = re.match(r'mp4\:(?P<path>.*)', format.find('streamName').text)
+            url = video_root + mobj.group('path')
+            vbr = format.find('bitrate').text
+            video_formats.append({
+                'url': url,
+                'vbr': int(vbr),
+            })
+        return video_formats
+
+    def _parse_flv(self, xml_description):
+        video_formats = []
+        akami_url = xml_description.find('./metadata/akamaiHost').text
+        slide_video_path = xml_description.find('./metadata/slideVideo').text
+        video_formats.append({
+            'url': 'rtmp://' + akami_url + '/' + slide_video_path,
+            'format_note': 'slide deck video',
+            'quality': -2,
+            'preference': -2,
+            'format_id': 'slides',
+        })
+        speaker_video_path = xml_description.find('./metadata/speakerVideo').text
+        video_formats.append({
+            'url': 'rtmp://' + akami_url + '/' + speaker_video_path,
+            'format_note': 'speaker video',
+            'quality': -1,
+            'preference': -1,
+            'format_id': 'speaker',
+        })
+        return video_formats
+
+    def _login(self, webpage_url, video_id):
+        (username, password) = self._get_login_info()
+        if username is None or password is None:
+            self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
+            return None
+
+        mobj = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url)
+        login_url = mobj.group('root_url') + 'api/login.php'
+        logout_url = mobj.group('root_url') + 'logout'
+
+        login_form = {
+            'email': username,
+            'password': password,
+        }
+
+        request = compat_urllib_request.Request(login_url, compat_urllib_parse.urlencode(login_form))
+        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        self._download_webpage(request, video_id, 'Logging in')
+        start_page = self._download_webpage(webpage_url, video_id, 'Getting authenticated video page')
+        self._download_webpage(logout_url, video_id, 'Logging out')
+
+        return start_page
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        video_id = mobj.group('id')
+        webpage_url = 'http://www.gdcvault.com/play/' + video_id
+        start_page = self._download_webpage(webpage_url, video_id)
+
+        xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root', None, False)
+
+        if xml_root is None:
+            # Probably need to authenticate
+            start_page = self._login(webpage_url, video_id)
+            if start_page is None:
+                self.report_warning('Could not login.')
+            else:
+                # Grab the url from the authenticated page
+                xml_root = self._html_search_regex(r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>', start_page, 'xml root')
+
+        xml_name = self._html_search_regex(r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename', None, False)
+        if xml_name is None:
+            # Fallback to the older format
+            xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename')
+
+        xml_decription_url = xml_root + 'xml/' + xml_name
+        xml_description = self._download_xml(xml_decription_url, video_id)
+
+        video_title = xml_description.find('./metadata/title').text
+        video_formats = self._parse_mp4(xml_description)
+        if video_formats is None:
+            video_formats = self._parse_flv(xml_description)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': video_formats,
+        }
index 5bcc78bf79734ddd47ee93b2215c31bdc28e59d3..38a357d3b0406906144e25cbbc45fbe74d2f6c2c 100644 (file)
@@ -12,9 +12,11 @@ from ..utils import (
     compat_urllib_parse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xml_parse_error,
 
     ExtractorError,
     HEADRequest,
+    parse_xml,
     smuggle_url,
     unescapeHTML,
     unified_strdate,
@@ -22,6 +24,8 @@ from ..utils import (
 )
 from .brightcove import BrightcoveIE
 from .ooyala import OoyalaIE
+from .rutv import RUTVIE
+from .smotri import SmotriIE
 
 
 class GenericIE(InfoExtractor):
@@ -31,9 +35,10 @@ class GenericIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
-            'file': '13601338388002.mp4',
-            'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd',
+            'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
             'info_dict': {
+                'id': '13601338388002',
+                'ext': 'mp4',
                 'uploader': 'www.hodiho.fr',
                 'title': 'R\u00e9gis plante sa Jeep',
             }
@@ -42,8 +47,9 @@ class GenericIE(InfoExtractor):
         {
             'add_ie': ['Bandcamp'],
             'url': 'http://bronyrock.com/track/the-pony-mash',
-            'file': '3235767654.mp3',
             'info_dict': {
+                'id': '3235767654',
+                'ext': 'mp3',
                 'title': 'The Pony Mash',
                 'uploader': 'M_Pallante',
             },
@@ -69,22 +75,34 @@ class GenericIE(InfoExtractor):
         {
             # https://github.com/rg3/youtube-dl/issues/2253
             'url': 'http://bcove.me/i6nfkrc3',
-            'file': '3101154703001.mp4',
             'md5': '0ba9446db037002366bab3b3eb30c88c',
             'info_dict': {
+                'id': '3101154703001',
+                'ext': 'mp4',
                 'title': 'Still no power',
                 'uploader': 'thestar.com',
                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
             },
             'add_ie': ['Brightcove'],
         },
+        {
+            'url': 'http://www.championat.com/video/football/v/87/87499.html',
+            'md5': 'fb973ecf6e4a78a67453647444222983',
+            'info_dict': {
+                'id': '3414141473001',
+                'ext': 'mp4',
+                'title': 'Видео. Удаление Дзагоева (ЦСКА)',
+                'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
+                'uploader': 'Championat',
+            },
+        },
         # Direct link to a video
         {
             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
-            'file': 'trailer.mp4',
             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
             'info_dict': {
                 'id': 'trailer',
+                'ext': 'mp4',
                 'title': 'trailer',
                 'upload_date': '20100513',
             }
@@ -92,7 +110,6 @@ class GenericIE(InfoExtractor):
         # ooyala video
         {
             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
-            'file': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4',
             'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
             'info_dict': {
                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
@@ -100,6 +117,150 @@ class GenericIE(InfoExtractor):
                 'title': '2cc213299525360.mov',  # that's what we get
             },
         },
+        # google redirect
+        {
+            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
+            'info_dict': {
+                'id': 'cmQHVoWB5FY',
+                'ext': 'mp4',
+                'upload_date': '20130224',
+                'uploader_id': 'TheVerge',
+                'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+                'uploader': 'The Verge',
+                'title': 'First Firefox OS phones side-by-side',
+            },
+            'params': {
+                'skip_download': False,
+            }
+        },
+        # embed.ly video
+        {
+            'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
+            'info_dict': {
+                'id': '9ODmcdjQcHQ',
+                'ext': 'mp4',
+                'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
+                'upload_date': '20140225',
+                'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
+                'uploader': 'Tested',
+                'uploader_id': 'testedcom',
+            },
+            # No need to test YoutubeIE here
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # funnyordie embed
+        {
+            'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
+            'md5': '7cf780be104d40fea7bae52eed4a470e',
+            'info_dict': {
+                'id': '18e820ec3f',
+                'ext': 'mp4',
+                'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
+                'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
+            },
+        },
+        # RUTV embed
+        {
+            'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
+            'info_dict': {
+                'id': '776940',
+                'ext': 'mp4',
+                'title': 'Охотское море стало целиком российским',
+                'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        # Embedded TED video
+        {
+            'url': 'http://en.support.wordpress.com/videos/ted-talks/',
+            'md5': 'deeeabcc1085eb2ba205474e7235a3d5',
+            'info_dict': {
+                'id': '981',
+                'ext': 'mp4',
+                'title': 'My web playroom',
+                'uploader': 'Ze Frank',
+                'description': 'md5:ddb2a40ecd6b6a147e400e535874947b',
+            }
+        },
+        # Embeded Ustream video
+        {
+            'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
+            'md5': '27b99cdb639c9b12a79bca876a073417',
+            'info_dict': {
+                'id': '45734260',
+                'ext': 'flv',
+                'uploader': 'AU SPA:  The NSA and Privacy',
+                'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
+            }
+        },
+        # nowvideo embed hidden behind percent encoding
+        {
+            'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
+            'md5': '2baf4ddd70f697d94b1c18cf796d5107',
+            'info_dict': {
+                'id': '06e53103ca9aa',
+                'ext': 'flv',
+                'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
+                'description': 'No description',
+            },
+        },
+        # arte embed
+        {
+            'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
+            'md5': '7653032cbb25bf6c80d80f217055fa43',
+            'info_dict': {
+                'id': '048195-004_PLUS7-F',
+                'ext': 'flv',
+                'title': 'X:enius',
+                'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
+                'upload_date': '20140320',
+            },
+            'params': {
+                'skip_download': 'Requires rtmpdump'
+            }
+        },
+        # smotri embed
+        {
+            'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml',
+            'md5': 'ec40048448e9284c9a1de77bb188108b',
+            'info_dict': {
+                'id': 'v27008541fad',
+                'ext': 'mp4',
+                'title': 'Крым и Севастополь вошли в состав России',
+                'description': 'md5:fae01b61f68984c7bd2fa741e11c3175',
+                'duration': 900,
+                'upload_date': '20140318',
+                'uploader': 'rbctv_2012_4',
+                'uploader_id': 'rbctv_2012_4',
+            },
+        },
+        # Condé Nast embed
+        {
+            'url': 'http://www.wired.com/2014/04/honda-asimo/',
+            'md5': 'ba0dfe966fa007657bd1443ee672db0f',
+            'info_dict': {
+                'id': '53501be369702d3275860000',
+                'ext': 'mp4',
+                'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
+            }
+        },
+        # Dailymotion embed
+        {
+            'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
+            'md5': '441aeeb82eb72c422c7f14ec533999cd',
+            'info_dict': {
+                'id': 'k2mm4bCdJ6CQ2i7c8o2',
+                'ext': 'mp4',
+                'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
+                'uploader': 'Spi0n',
+            },
+            'add_ie': ['Dailymotion'],
+        }
     ]
 
     def report_download_webpage(self, video_id):
@@ -125,9 +286,14 @@ class GenericIE(InfoExtractor):
                     newurl = newurl.replace(' ', '%20')
                     newheaders = dict((k,v) for k,v in req.headers.items()
                                       if k.lower() not in ("content-length", "content-type"))
+                    try:
+                        # This function was deprecated in python 3.3 and removed in 3.4
+                        origin_req_host = req.get_origin_req_host()
+                    except AttributeError:
+                        origin_req_host = req.origin_req_host
                     return HEADRequest(newurl,
                                        headers=newheaders,
-                                       origin_req_host=req.get_origin_req_host(),
+                                       origin_req_host=origin_req_host,
                                        unverifiable=True)
                 else:
                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
@@ -159,23 +325,56 @@ class GenericIE(InfoExtractor):
             raise ExtractorError('Invalid URL protocol')
         return response
 
+    def _extract_rss(self, url, video_id, doc):
+        playlist_title = doc.find('./channel/title').text
+        playlist_desc_el = doc.find('./channel/description')
+        playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+
+        entries = [{
+            '_type': 'url',
+            'url': e.find('link').text,
+            'title': e.find('title').text,
+        } for e in doc.findall('./channel/item')]
+
+        return {
+            '_type': 'playlist',
+            'id': url,
+            'title': playlist_title,
+            'description': playlist_desc,
+            'entries': entries,
+        }
+
     def _real_extract(self, url):
+        if url.startswith('//'):
+            return {
+                '_type': 'url',
+                'url': self.http_scheme() + url,
+            }
+
         parsed_url = compat_urlparse.urlparse(url)
         if not parsed_url.scheme:
             default_search = self._downloader.params.get('default_search')
             if default_search is None:
-                default_search = 'auto'
+                default_search = 'auto_warning'
 
-            if default_search == 'auto':
+            if default_search in ('auto', 'auto_warning'):
                 if '/' in url:
                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
                     return self.url_result('http://' + url)
                 else:
+                    if default_search == 'auto_warning':
+                        if re.match(r'^(?:url|URL)$', url):
+                            raise ExtractorError(
+                                'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
+                                expected=True)
+                        else:
+                            self._downloader.report_warning(
+                                'Falling back to youtube search for  %s . Set --default-search to "auto" to suppress this warning.' % url)
                     return self.url_result('ytsearch:' + url)
             else:
                 assert ':' in default_search
                 return self.url_result(default_search + url)
-        video_id = os.path.splitext(url.split('/')[-1])[0]
+        video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
 
         self.to_screen('%s: Requesting header' % video_id)
 
@@ -219,6 +418,19 @@ class GenericIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
+        # Is it an RSS feed?
+        try:
+            doc = parse_xml(webpage)
+            if doc.tag == 'rss':
+                return self._extract_rss(url, video_id, doc)
+        except compat_xml_parse_error:
+            pass
+
+        # Sometimes embedded video player is hidden behind percent encoding
+        # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
+        # Unescaping the whole page allows to handle those cases in a generic way
+        webpage = compat_urllib_parse.unquote(webpage)
+
         # it's tempting to parse this further, but you would
         # have to take into account all the variations like
         #   Video Title - Site Name
@@ -252,9 +464,9 @@ class GenericIE(InfoExtractor):
 
         # Look for embedded (iframe) Vimeo player
         mobj = re.search(
-            r'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
         if mobj:
-            player_url = unescapeHTML(mobj.group(1))
+            player_url = unescapeHTML(mobj.group('url'))
             surl = smuggle_url(player_url, {'Referer': url})
             return self.url_result(surl, 'Vimeo')
 
@@ -280,7 +492,7 @@ class GenericIE(InfoExtractor):
         matches = re.findall(
             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
         if matches:
-            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]))
                      for tuppl in matches]
             return self.playlist_result(
                 urlrs, playlist_id=video_id, playlist_title=video_title)
@@ -306,6 +518,22 @@ class GenericIE(InfoExtractor):
         if mobj:
             return self.url_result(mobj.group(1), 'BlipTV')
 
+        # Look for embedded condenast player
+        matches = re.findall(
+            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
+            webpage)
+        if matches:
+            return {
+                '_type': 'playlist',
+                'entries': [{
+                    '_type': 'url',
+                    'ie_key': 'CondeNast',
+                    'url': ma,
+                } for ma in matches],
+                'title': video_title,
+                'id': video_id,
+            }
+
         # Look for Bandcamp pages with custom domain
         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
         if mobj is not None:
@@ -320,12 +548,13 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
 
         # Look for Ooyala videos
-        mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage)
+        mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+             re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
         if mobj is not None:
-            return OoyalaIE._build_url_result(mobj.group(1))
+            return OoyalaIE._build_url_result(mobj.group('ec'))
 
         # Look for Aparat videos
-        mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
+        mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
         if mobj is not None:
             return self.url_result(mobj.group(1), 'Aparat')
 
@@ -334,11 +563,18 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group(1), 'Mpora')
 
-        # Look for embedded Novamov player
+        # Look for embedded NovaMov-based player
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)
+            r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
+                    (?P<url>http://(?:(?:embed|www)\.)?
+                        (?:novamov\.com|
+                           nowvideo\.(?:ch|sx|eu|at|ag|co)|
+                           videoweed\.(?:es|com)|
+                           movshare\.(?:net|sx|ag)|
+                           divxstage\.(?:eu|net|ch|co|at|ag))
+                        /embed\.php.+?)\1''', webpage)
         if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Novamov')
+            return self.url_result(mobj.group('url'))
 
         # Look for embedded Facebook player
         mobj = re.search(
@@ -346,58 +582,142 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Facebook')
 
+        # Look for embedded VK player
+        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'VK')
+
         # Look for embedded Huffington Post player
         mobj = re.search(
             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'HuffPost')
 
+        # Look for embed.ly
+        mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'))
+        mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
+        if mobj is not None:
+            return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
+
+        # Look for funnyordie embed
+        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(eurl), 'FunnyOrDie')
+                     for eurl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
+        # Look for embedded RUTV player
+        rutv_url = RUTVIE._extract_url(webpage)
+        if rutv_url:
+            return self.url_result(rutv_url, 'RUTV')
+
+        # Look for embedded TED player
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'TED')
+
+        # Look for embedded Ustream videos
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'Ustream')
+
+        # Look for embedded arte.tv player
+        mobj = re.search(
+            r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
+            webpage)
+        if mobj is not None:
+            return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+
+        # Look for embedded smotri.com player
+        smotri_url = SmotriIE._extract_url(webpage)
+        if smotri_url:
+            return self.url_result(smotri_url, 'Smotri')
+
+        # Look for embeded soundcloud player
+        mobj = re.search(
+            r'<iframe src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
+            webpage)
+        if mobj is not None:
+            url = unescapeHTML(mobj.group('url'))
+            return self.url_result(url)
+
         # Start with something easy: JW Player in SWFObject
-        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
-        if mobj is None:
+        found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
+        if not found:
             # Look for gorilla-vid style embedding
-            mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*["\'](.*?)["\']', webpage)
-        if mobj is None:
+            found = re.findall(r'''(?sx)
+                (?:
+                    jw_plugins|
+                    JWPlayerOptions|
+                    jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
+                )
+                .*?file\s*:\s*["\'](.*?)["\']''', webpage)
+        if not found:
             # Broaden the search a little bit
-            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
-        if mobj is None:
-            # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
-        if mobj is None:
+            found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if not found:
+            # Broaden the findall a little bit: JWPlayer JS loader
+            found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)
+        if not found:
             # Try to find twitter cards info
-            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
-        if mobj is None:
+            found = re.findall(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
+        if not found:
             # We look for Open Graph info:
             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
-            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
+            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
             if m_video_type is not None:
-                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
-        if mobj is None:
+                found = re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
+        if not found:
             # HTML5 video
-            mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
-        if mobj is None:
+            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage)
+        if not found:
+            found = re.search(
+                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
+                r'(?:[a-z-]+="[^"]+"\s+)*?content="[0-9]{,2};url=\'([^\']+)\'"',
+                webpage)
+            if found:
+                new_url = found.group(1)
+                self.report_following_redirect(new_url)
+                return {
+                    '_type': 'url',
+                    'url': new_url,
+                }
+        if not found:
             raise ExtractorError('Unsupported URL: %s' % url)
 
-        # It's possible that one of the regexes
-        # matched, but returned an empty group:
-        if mobj.group(1) is None:
-            raise ExtractorError('Did not find a valid video URL at %s' % url)
+        entries = []
+        for video_url in found:
+            video_url = compat_urlparse.urljoin(url, video_url)
+            video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
 
-        video_url = mobj.group(1)
-        video_url = compat_urlparse.urljoin(url, video_url)
-        video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
+            # Sometimes, jwplayer extraction will result in a YouTube URL
+            if YoutubeIE.suitable(video_url):
+                entries.append(self.url_result(video_url, 'Youtube'))
+                continue
 
-        # Sometimes, jwplayer extraction will result in a YouTube URL
-        if YoutubeIE.suitable(video_url):
-            return self.url_result(video_url, 'Youtube')
+            # here's a fun little line of code for you:
+            video_id = os.path.splitext(video_id)[0]
+
+            entries.append({
+                'id': video_id,
+                'url': video_url,
+                'uploader': video_uploader,
+                'title': video_title,
+            })
 
-        # here's a fun little line of code for you:
-        video_id = os.path.splitext(video_id)[0]
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            for num, e in enumerate(entries, start=1):
+                e['title'] = '%s (%d)' % (e['title'], num)
+            return {
+                '_type': 'playlist',
+                'entries': entries,
+            }
 
-        return {
-            'id': video_id,
-            'url': video_url,
-            'uploader': video_uploader,
-            'title': video_title,
-        }
index 5c25642702993f1ec344ce9a0d4967fffedc760a..383032d81b1c9bd965be9caee8adefd79547b208 100644 (file)
@@ -46,6 +46,6 @@ class GoogleSearchIE(SearchInfoExtractor):
                     'url': mobj.group(1)
                 })
 
-            if (len(entries) >= n) or not re.search(r'class="pn" id="pnnext"', webpage):
+            if (len(entries) >= n) or not re.search(r'id="pnnext"', webpage):
                 res['entries'] = entries[:n]
                 return res
diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py
new file mode 100644 (file)
index 0000000..63d87b7
--- /dev/null
@@ -0,0 +1,42 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class HentaiStigmaIE(InfoExtractor):
+    _VALID_URL = r'^https?://hentai\.animestigma\.com/(?P<id>[^/]+)'
+    _TEST = {
+        'url': 'http://hentai.animestigma.com/inyouchuu-etsu-bonus/',
+        'md5': '4e3d07422a68a4cc363d8f57c8bf0d23',
+        'info_dict': {
+            'id': 'inyouchuu-etsu-bonus',
+            'ext': 'mp4',
+            "title": "Inyouchuu Etsu Bonus",
+            "age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_regex(
+            r'<h2 class="posttitle"><a[^>]*>([^<]+)</a>',
+            webpage, 'title')
+        wrap_url = self._html_search_regex(
+            r'<iframe src="([^"]+mp4)"', webpage, 'wrapper url')
+        wrap_webpage = self._download_webpage(wrap_url, video_id)
+
+        video_url = self._html_search_regex(
+            r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'age_limit': 18,
+        }
index 0d1ea6802503d60c5ec05033b7d0f3fefa638fbf..94e7cf79008aa0b2426f70a26ba70218f916d731 100644 (file)
@@ -21,9 +21,10 @@ class HuffPostIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677',
-        'file': '52dd3e4b02a7602131000677.mp4',
         'md5': '55f5e8981c1c80a64706a44b74833de8',
         'info_dict': {
+            'id': '52dd3e4b02a7602131000677',
+            'ext': 'mp4',
             'title': 'Legalese It! with @MikeSacksHP',
             'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more.  ',
             'duration': 1549,
similarity index 76%
rename from youtube_dl/extractor/statigram.py
rename to youtube_dl/extractor/iconosquare.py
index d602e817a076cbfbdfb5c50da8e3b1f23583b8d0..1d5a10a3b6349d95387aee00d4be5e6deac618b8 100644 (file)
@@ -5,8 +5,8 @@ import re
 from .common import InfoExtractor
 
 
-class StatigramIE(InfoExtractor):
-    _VALID_URL = r'https?://(www\.)?statigr\.am/p/(?P<id>[^/]+)'
+class IconosquareIE(InfoExtractor):
+    _VALID_URL = r'https?://(www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
     _TEST = {
         'url': 'http://statigr.am/p/522207370455279102_24101272',
         'md5': '6eb93b882a3ded7c378ee1d6884b1814',
@@ -15,6 +15,7 @@ class StatigramIE(InfoExtractor):
             'ext': 'mp4',
             'uploader_id': 'aguynamedpatrick',
             'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
+            'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
         },
     }
 
@@ -25,7 +26,7 @@ class StatigramIE(InfoExtractor):
         html_title = self._html_search_regex(
             r'<title>(.+?)</title>',
             webpage, 'title')
-        title = re.sub(r'(?: *\(Videos?\))? \| Statigram$', '', html_title)
+        title = re.sub(r'(?: *\(Videos?\))? \| (?:Iconosquare|Statigram)$', '', html_title)
         uploader_id = self._html_search_regex(
             r'@([^ ]+)', title, 'uploader name', fatal=False)
 
@@ -33,6 +34,7 @@ class StatigramIE(InfoExtractor):
             'id': video_id,
             'url': self._og_search_video_url(webpage),
             'title': title,
+            'description': self._og_search_description(webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
             'uploader_id': uploader_id
         }
index 381af91e42d4c9f642b35643107f5dafd026aad9..1f42c6d3a957674aa7bb2ee4ee3d56dac43cd2f8 100644 (file)
@@ -1,10 +1,8 @@
+from __future__ import unicode_literals
+
 import re
-import json
 
 from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-)
 
 
 class IGNIE(InfoExtractor):
@@ -14,52 +12,57 @@ class IGNIE(InfoExtractor):
     """
 
     _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)'
-    IE_NAME = u'ign.com'
+    IE_NAME = 'ign.com'
 
     _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
-    _DESCRIPTION_RE = [r'<span class="page-object-description">(.+?)</span>',
-                       r'id="my_show_video">.*?<p>(.*?)</p>',
-                       ]
+    _DESCRIPTION_RE = [
+        r'<span class="page-object-description">(.+?)</span>',
+        r'id="my_show_video">.*?<p>(.*?)</p>',
+    ]
 
     _TESTS = [
         {
-            u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
-            u'file': u'8f862beef863986b2785559b9e1aa599.mp4',
-            u'md5': u'eac8bdc1890980122c3b66f14bdd02e9',
-            u'info_dict': {
-                u'title': u'The Last of Us Review',
-                u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c',
+            'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+            'md5': 'eac8bdc1890980122c3b66f14bdd02e9',
+            'info_dict': {
+                'id': '8f862beef863986b2785559b9e1aa599',
+                'ext': 'mp4',
+                'title': 'The Last of Us Review',
+                'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c',
             }
         },
         {
-            u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
-            u'playlist': [
+            'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind',
+            'playlist': [
                 {
-                    u'file': u'5ebbd138523268b93c9141af17bec937.mp4',
-                    u'info_dict': {
-                        u'title': u'GTA 5 Video Review',
-                        u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
+                    'info_dict': {
+                        'id': '5ebbd138523268b93c9141af17bec937',
+                        'ext': 'mp4',
+                        'title': 'GTA 5 Video Review',
+                        'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.',
                     },
                 },
                 {
-                    u'file': u'638672ee848ae4ff108df2a296418ee2.mp4',
-                    u'info_dict': {
-                        u'title': u'26 Twisted Moments from GTA 5 in Slow Motion',
-                        u'description': u'The twisted beauty of GTA 5 in stunning slow motion.',
+                    'info_dict': {
+                        'id': '638672ee848ae4ff108df2a296418ee2',
+                        'ext': 'mp4',
+                        'title': '26 Twisted Moments from GTA 5 in Slow Motion',
+                        'description': 'The twisted beauty of GTA 5 in stunning slow motion.',
                     },
                 },
             ],
-            u'params': {
-                u'skip_download': True,
+            'params': {
+                'skip_download': True,
             },
         },
     ]
 
     def _find_video_id(self, webpage):
-        res_id = [r'data-video-id="(.+?)"',
-                  r'<object id="vid_(.+?)"',
-                  r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
-                  ]
+        res_id = [
+            r'data-video-id="(.+?)"',
+            r'<object id="vid_(.+?)"',
+            r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+        ]
         return self._search_regex(res_id, webpage, 'video id')
 
     def _real_extract(self, url):
@@ -68,7 +71,7 @@ class IGNIE(InfoExtractor):
         page_type = mobj.group('type')
         webpage = self._download_webpage(url, name_or_id)
         if page_type == 'articles':
-            video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, u'video url')
+            video_url = self._search_regex(r'var videoUrl = "(.+?)"', webpage, 'video url')
             return self.url_result(video_url, ie='IGN')
         elif page_type != 'video':
             multiple_urls = re.findall(
@@ -80,50 +83,42 @@ class IGNIE(InfoExtractor):
         video_id = self._find_video_id(webpage)
         result = self._get_video_info(video_id)
         description = self._html_search_regex(self._DESCRIPTION_RE,
-                                              webpage, 'video description',
-                                              flags=re.DOTALL)
+            webpage, 'video description', flags=re.DOTALL)
         result['description'] = description
         return result
 
     def _get_video_info(self, video_id):
         config_url = self._CONFIG_URL_TEMPLATE % video_id
-        config = json.loads(self._download_webpage(config_url, video_id,
-                            u'Downloading video info'))
+        config = self._download_json(config_url, video_id)
         media = config['playlist']['media']
-        video_url = media['url']
 
-        return {'id': media['metadata']['videoId'],
-                'url': video_url,
-                'ext': determine_ext(video_url),
-                'title': media['metadata']['title'],
-                'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
-                }
+        return {
+            'id': media['metadata']['videoId'],
+            'url': media['url'],
+            'title': media['metadata']['title'],
+            'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
+        }
 
 
 class OneUPIE(IGNIE):
-    """Extractor for 1up.com, it uses the ign videos system."""
-
     _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)'
     IE_NAME = '1up.com'
 
     _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
 
-    _TEST = {
-        u'url': u'http://gamevideos.1up.com/video/id/34976',
-        u'file': u'34976.mp4',
-        u'md5': u'68a54ce4ebc772e4b71e3123d413163d',
-        u'info_dict': {
-            u'title': u'Sniper Elite V2 - Trailer',
-            u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf',
+    _TESTS = [{
+        'url': 'http://gamevideos.1up.com/video/id/34976',
+        'md5': '68a54ce4ebc772e4b71e3123d413163d',
+        'info_dict': {
+            'id': '34976',
+            'ext': 'mp4',
+            'title': 'Sniper Elite V2 - Trailer',
+            'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf',
         }
-    }
-
-    # Override IGN tests
-    _TESTS = []
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        id = mobj.group('name_or_id')
         result = super(OneUPIE, self)._real_extract(url)
-        result['id'] = id
+        result['id'] = mobj.group('name_or_id')
         return result
index ed32373a1be5e3a20a0f7a1ca04c2da091ee68b6..e76dd222d1ee81dc0e0b2d5b1b3c28ef22e1bd83 100644 (file)
@@ -11,16 +11,15 @@ from ..utils import (
 
 class InfoQIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
+
     _TEST = {
-        "name": "InfoQ",
-        "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
-        "file": "12-jan-pythonthings.mp4",
-        "info_dict": {
-            "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.",
-            "title": "A Few of My Favorite [Python] Things",
-        },
-        "params": {
-            "skip_download": True,
+        'url': 'http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things',
+        'md5': 'b5ca0e0a8c1fed93b0e65e48e462f9a2',
+        'info_dict': {
+            'id': '12-jan-pythonthings',
+            'ext': 'mp4',
+            'description': 'Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.',
+            'title': 'A Few of My Favorite [Python] Things',
         },
     }
 
@@ -30,26 +29,39 @@ class InfoQIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
+        video_title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
+        video_description = self._html_search_meta('description', webpage, 'description')
+
+        # The server URL is hardcoded
+        video_url = 'rtmpe://video.infoq.com/cfx/st/'
+
         # Extract video URL
-        encoded_id = self._search_regex(r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id')
+        encoded_id = self._search_regex(
+            r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id')
         real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8'))
-        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
+        playpath = 'mp4:' + real_id
 
-        # Extract title
-        video_title = self._search_regex(r'contentTitle = "(.*?)";',
-            webpage, 'title')
+        video_filename = playpath.split('/')[-1]
+        video_id, extension = video_filename.split('.')
 
-        # Extract description
-        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
-            webpage, 'description', fatal=False)
+        http_base = self._search_regex(
+            r'EXPRESSINSTALL_SWF\s*=\s*"(https?://[^/"]+/)', webpage,
+            'HTTP base URL')
 
-        video_filename = video_url.split('/')[-1]
-        video_id, extension = video_filename.split('.')
+        formats = [{
+            'format_id': 'rtmp',
+            'url': video_url,
+            'ext': extension,
+            'play_path': playpath,
+        }, {
+            'format_id': 'http',
+            'url': http_base + real_id,
+        }]
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'url': video_url,
             'title': video_title,
-            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
             'description': video_description,
+            'formats': formats,
         }
index 63141af272ac077ed97dcd5baf4c5a0dcb7d3b47..b5372bf7a24e48a347127a1dc76c9dc672b32b64 100644 (file)
@@ -3,6 +3,9 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+)
 
 
 class InstagramIE(InfoExtractor):
@@ -37,3 +40,68 @@ class InstagramIE(InfoExtractor):
             'uploader_id': uploader_id,
             'description': desc,
         }
+
+
+class InstagramUserIE(InfoExtractor):
+    _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])'
+    IE_DESC = 'Instagram user profile'
+    IE_NAME = 'instagram:user'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        uploader_id = mobj.group('username')
+
+        entries = []
+        page_count = 0
+        media_url = 'http://instagram.com/%s/media' % uploader_id
+        while True:
+            page = self._download_json(
+                media_url, uploader_id,
+                note='Downloading page %d ' % (page_count + 1),
+            )
+            page_count += 1
+
+            for it in page['items']:
+                if it.get('type') != 'video':
+                    continue
+                like_count = int_or_none(it.get('likes', {}).get('count'))
+                user = it.get('user', {})
+
+                formats = [{
+                    'format_id': k,
+                    'height': v.get('height'),
+                    'width': v.get('width'),
+                    'url': v['url'],
+                } for k, v in it['videos'].items()]
+                self._sort_formats(formats)
+
+                thumbnails_el = it.get('images', {})
+                thumbnail = thumbnails_el.get('thumbnail', {}).get('url')
+
+                title = it.get('caption', {}).get('text', it['id'])
+
+                entries.append({
+                    'id': it['id'],
+                    'title': title,
+                    'formats': formats,
+                    'thumbnail': thumbnail,
+                    'webpage_url': it.get('link'),
+                    'uploader': user.get('full_name'),
+                    'uploader_id': user.get('username'),
+                    'like_count': like_count,
+                    'timestamp': int_or_none(it.get('created_time')),
+                })
+
+            if not page['items']:
+                break
+            max_id = page['items'][-1]['id']
+            media_url = (
+                'http://instagram.com/%s/media?max_id=%s' % (
+                    uploader_id, max_id))
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': uploader_id,
+            'title': uploader_id,
+        }
index dde4829981a7adab396a040ccc1566277c39194b..d1defd363c5fe9c86330236f53b8aa21bfe65a38 100644 (file)
@@ -6,11 +6,14 @@ from random import random
 from math import floor
 
 from .common import InfoExtractor
-from ..utils import compat_urllib_request
+from ..utils import (
+    compat_urllib_request,
+    ExtractorError,
+)
 
 
 class IPrimaIE(InfoExtractor):
-    _VALID_URL = r'https?://play\.iprima\.cz/(?P<videogroup>.+)/(?P<videoid>.+)'
+    _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)'
 
     _TESTS = [{
         'url': 'http://play.iprima.cz/particka/particka-92',
@@ -22,20 +25,37 @@ class IPrimaIE(InfoExtractor):
             'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg',
         },
         'params': {
-            'skip_download': True,
+            'skip_download': True,  # requires rtmpdump
         },
-    },
-    ]
+    }, {
+        'url': 'http://play.iprima.cz/particka/tchibo-particka-jarni-moda',
+        'info_dict': {
+            'id': '9718337',
+            'ext': 'flv',
+            'title': 'Tchibo Partička - Jarní móda',
+            'description': 'md5:589f8f59f414220621ff8882eb3ce7be',
+            'thumbnail': 're:^http:.*\.jpg$',
+        },
+        'params': {
+            'skip_download': True,  # requires rtmpdump
+        },
+        'skip': 'Do not have permission to access this page',
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('videoid')
+        video_id = mobj.group('id')
 
         webpage = self._download_webpage(url, video_id)
 
-        player_url = 'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' % (
-                         floor(random()*1073741824),
-                         floor(random()*1073741824))
+        if re.search(r'Nemáte oprávnění přistupovat na tuto stránku\.\s*</div>', webpage):
+            raise ExtractorError(
+                '%s said: You do not have permission to access this page' % self.IE_NAME, expected=True)
+
+        player_url = (
+            'http://embed.livebox.cz/iprimaplay/player-embed-v2.js?__tok%s__=%s' %
+            (floor(random()*1073741824), floor(random()*1073741824))
+        )
 
         req = compat_urllib_request.Request(player_url)
         req.add_header('Referer', url)
@@ -44,18 +64,20 @@ class IPrimaIE(InfoExtractor):
         base_url = ''.join(re.findall(r"embed\['stream'\] = '(.+?)'.+'(\?auth=)'.+'(.+?)';", playerpage)[1])
 
         zoneGEO = self._html_search_regex(r'"zoneGEO":(.+?),', webpage, 'zoneGEO')
-
         if zoneGEO != '0':
-            base_url = base_url.replace('token', 'token_'+zoneGEO)
+            base_url = base_url.replace('token', 'token_' + zoneGEO)
 
         formats = []
         for format_id in ['lq', 'hq', 'hd']:
-            filename = self._html_search_regex(r'"%s_id":(.+?),' % format_id, webpage, 'filename')
+            filename = self._html_search_regex(
+                r'"%s_id":(.+?),' % format_id, webpage, 'filename')
 
             if filename == 'null':
                 continue
 
-            real_id = self._search_regex(r'Prima-[0-9]{10}-([0-9]+)_', filename, 'real video id')
+            real_id = self._search_regex(
+                r'Prima-(?:[0-9]{10}|WEB)-([0-9]+)[-_]',
+                filename, 'real video id')
 
             if format_id == 'lq':
                 quality = 0
@@ -63,13 +85,13 @@ class IPrimaIE(InfoExtractor):
                 quality = 1
             elif format_id == 'hd':
                 quality = 2
-                filename = 'hq/'+filename
+                filename = 'hq/' + filename
 
             formats.append({
                 'format_id': format_id,
                 'url': base_url,
                 'quality': quality,
-                'play_path': 'mp4:'+filename.replace('"', '')[:-4],
+                'play_path': 'mp4:' + filename.replace('"', '')[:-4],
                 'rtmp_live': True,
                 'ext': 'flv',
             })
index 1ba4966c724ee15637dc0f2d08d3029dec16f4e3..528be1524ae645f7bb8b36ee2ac2378fd91561be 100644 (file)
@@ -33,14 +33,14 @@ class IviIE(InfoExtractor):
         },
         # Serial's serie
         {
-            'url': 'http://www.ivi.ru/watch/dezhurnyi_angel/74791',
-            'md5': '3e6cc9a848c1d2ebcc6476444967baa9',
+            'url': 'http://www.ivi.ru/watch/dvoe_iz_lartsa/9549',
+            'md5': '221f56b35e3ed815fde2df71032f4b3e',
             'info_dict': {
-                'id': '74791',
+                'id': '9549',
                 'ext': 'mp4',
-                'title': 'Ð\94ежÑ\83Ñ\80нÑ\8bй Ð°Ð½Ð³ÐµÐ» - 1 Ñ\81еÑ\80иÑ\8f',
-                'duration': 2490,
-                'thumbnail': 'http://thumbs.ivi.ru/f7.vcp.digitalaccess.ru/contents/8/e/bc2f6c2b6e5d291152fdd32c059141.jpg',
+                'title': 'Ð\94вое Ð¸Ð· Ð»Ð°Ñ\80Ñ\86а - Ð¡ÐµÑ\80иÑ\8f 1',
+                'duration': 2655,
+                'thumbnail': 'http://thumbs.ivi.ru/f15.vcp.digitalaccess.ru/contents/8/4/0068dc0677041f3336b7c2baad8fc0.jpg',
             },
             'skip': 'Only works from Russia',
          }
index 592c64e1de0a47299770ef838095abf1f0988bcc..9b553b9fa52873739b0d4ecb4e3927e1beff929e 100644 (file)
@@ -1,56 +1,61 @@
-# coding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    RegexNotFoundError,
     unescapeHTML,
 )
 
+
 class JukeboxIE(InfoExtractor):
     _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<video_id>[a-z0-9\-]+)\.html'
-    _IFRAME = r'<iframe .*src="(?P<iframe>[^"]*)".*>'
-    _VIDEO_URL = r'"config":{"file":"(?P<video_url>http:[^"]+[.](?P<video_ext>[^.?]+)[?]mdtk=[0-9]+)"'
-    _TITLE = r'<h1 class="inline">(?P<title>[^<]+)</h1>.*<span id="infos_article_artist">(?P<artist>[^<]+)</span>'
-    _IS_YOUTUBE = r'config":{"file":"(?P<youtube_url>http:[\\][/][\\][/]www[.]youtube[.]com[\\][/]watch[?]v=[^"]+)"'
+    _TEST = {
+        'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html',
+        'md5': '1574e9b4d6438446d5b7dbcdf2786276',
+        'info_dict': {
+            'id': 'r303r',
+            'ext': 'flv',
+            'title': 'Kosheen-En Vivo Pride',
+            'uploader': 'Kosheen',
+        },
+    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('video_id')
 
         html = self._download_webpage(url, video_id)
-
-        mobj = re.search(self._IFRAME, html)
-        if mobj is None:
-            raise ExtractorError(u'Cannot extract iframe url')
-        iframe_url = unescapeHTML(mobj.group('iframe'))
+        iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url'))
 
         iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe')
-        mobj = re.search(r'class="jkb_waiting"', iframe_html)
-        if mobj is not None:
-            raise ExtractorError(u'Video is not available(in your country?)!')
+        if re.search(r'class="jkb_waiting"', iframe_html) is not None:
+            raise ExtractorError('Video is not available(in your country?)!')
 
         self.report_extraction(video_id)
 
-        mobj = re.search(self._VIDEO_URL, iframe_html)
-        if mobj is None:
-            mobj = re.search(self._IS_YOUTUBE, iframe_html)
-            if mobj is None:
-                raise ExtractorError(u'Cannot extract video url')
-            youtube_url = unescapeHTML(mobj.group('youtube_url')).replace('\/','/')
-            self.to_screen(u'Youtube video detected')
-            return self.url_result(youtube_url,ie='Youtube')
-        video_url = unescapeHTML(mobj.group('video_url')).replace('\/','/')
-        video_ext = unescapeHTML(mobj.group('video_ext'))
-
-        mobj = re.search(self._TITLE, html)
-        if mobj is None:
-            raise ExtractorError(u'Cannot extract title')
-        title = unescapeHTML(mobj.group('title'))
-        artist = unescapeHTML(mobj.group('artist'))
-
-        return [{'id': video_id,
-                 'url': video_url,
-                 'title': artist + '-' + title,
-                 'ext': video_ext
-                 }]
+        try:
+            video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"',
+                iframe_html, 'video url')
+            video_url = unescapeHTML(video_url).replace('\/', '/')
+        except RegexNotFoundError:
+            youtube_url = self._search_regex(
+                r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"',
+                iframe_html, 'youtube url')
+            youtube_url = unescapeHTML(youtube_url).replace('\/', '/')
+            self.to_screen('Youtube video detected')
+            return self.url_result(youtube_url, ie='Youtube')
+
+        title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>',
+            html, 'title')
+        artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>',
+            html, 'artist')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': artist + '-' + title,
+            'uploader': artist,
+        }
index e9bde0c186a76e0546f97cdb08bf69e2b80b3e93..7083db12ea012720f5dfda7039fdad9e21c12cc9 100644 (file)
@@ -1,9 +1,12 @@
+from __future__ import unicode_literals
+
 import json
 import os
 import re
 
 from .common import InfoExtractor
 from ..utils import (
+    compat_str,
     ExtractorError,
     formatSeconds,
 )
@@ -24,34 +27,31 @@ class JustinTVIE(InfoExtractor):
         /?(?:\#.*)?$
         """
     _JUSTIN_PAGE_LIMIT = 100
-    IE_NAME = u'justin.tv'
+    IE_NAME = 'justin.tv'
+    IE_DESC = 'justin.tv and twitch.tv'
     _TEST = {
-        u'url': u'http://www.twitch.tv/thegamedevhub/b/296128360',
-        u'file': u'296128360.flv',
-        u'md5': u'ecaa8a790c22a40770901460af191c9a',
-        u'info_dict': {
-            u"upload_date": u"20110927", 
-            u"uploader_id": 25114803, 
-            u"uploader": u"thegamedevhub", 
-            u"title": u"Beginner Series - Scripting With Python Pt.1"
+        'url': 'http://www.twitch.tv/thegamedevhub/b/296128360',
+        'md5': 'ecaa8a790c22a40770901460af191c9a',
+        'info_dict': {
+            'id': '296128360',
+            'ext': 'flv',
+            'upload_date': '20110927',
+            'uploader_id': 25114803,
+            'uploader': 'thegamedevhub',
+            'title': 'Beginner Series - Scripting With Python Pt.1'
         }
     }
 
-    def report_download_page(self, channel, offset):
-        """Report attempt to download a single page of videos."""
-        self.to_screen(u'%s: Downloading video information from %d to %d' %
-                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
-
     # Return count of items, list of *valid* items
     def _parse_page(self, url, video_id):
         info_json = self._download_webpage(url, video_id,
-                                           u'Downloading video info JSON',
-                                           u'unable to download video info JSON')
+                                           'Downloading video info JSON',
+                                           'unable to download video info JSON')
 
         response = json.loads(info_json)
         if type(response) != list:
             error_text = response.get('error', 'unknown error')
-            raise ExtractorError(u'Justin.tv API: %s' % error_text)
+            raise ExtractorError('Justin.tv API: %s' % error_text)
         info = []
         for clip in response:
             video_url = clip['video_file_url']
@@ -62,7 +62,7 @@ class JustinTVIE(InfoExtractor):
                 video_id = clip['id']
                 video_title = clip.get('title', video_id)
                 info.append({
-                    'id': video_id,
+                    'id': compat_str(video_id),
                     'url': video_url,
                     'title': video_title,
                     'uploader': clip.get('channel_name', video_uploader_id),
@@ -74,8 +74,6 @@ class JustinTVIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'invalid URL: %s' % url)
 
         api_base = 'http://api.justin.tv'
         paged = False
@@ -89,40 +87,41 @@ class JustinTVIE(InfoExtractor):
             webpage = self._download_webpage(url, chapter_id)
             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
             if not m:
-                raise ExtractorError(u'Cannot find archive of a chapter')
+                raise ExtractorError('Cannot find archive of a chapter')
             archive_id = m.group(1)
 
             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
-            doc = self._download_xml(api, chapter_id,
-                                             note=u'Downloading chapter information',
-                                             errnote=u'Chapter information download failed')
+            doc = self._download_xml(
+                api, chapter_id,
+                note='Downloading chapter information',
+                errnote='Chapter information download failed')
             for a in doc.findall('.//archive'):
                 if archive_id == a.find('./id').text:
                     break
             else:
-                raise ExtractorError(u'Could not find chapter in chapter information')
+                raise ExtractorError('Could not find chapter in chapter information')
 
             video_url = a.find('./video_file_url').text
-            video_ext = video_url.rpartition('.')[2] or u'flv'
+            video_ext = video_url.rpartition('.')[2] or 'flv'
 
-            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
-            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
-                                   note='Downloading chapter metadata',
-                                   errnote='Download of chapter metadata failed')
-            chapter_info = json.loads(chapter_info_json)
+            chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
+            chapter_info = self._download_json(
+                chapter_api_url, 'c' + chapter_id,
+                note='Downloading chapter metadata',
+                errnote='Download of chapter metadata failed')
 
             bracket_start = int(doc.find('.//bracket_start').text)
             bracket_end = int(doc.find('.//bracket_end').text)
 
             # TODO determine start (and probably fix up file)
             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
-            #video_url += u'?start=' + TODO:start_timestamp
+            #video_url += '?start=' + TODO:start_timestamp
             # bracket_start is 13290, but we want 51670615
-            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
-                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
+            self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
+                                            'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 
             info = {
-                'id': u'c' + chapter_id,
+                'id': 'c' + chapter_id,
                 'url': video_url,
                 'ext': video_ext,
                 'title': chapter_info['title'],
@@ -131,14 +130,12 @@ class JustinTVIE(InfoExtractor):
                 'uploader': chapter_info['channel']['display_name'],
                 'uploader_id': chapter_info['channel']['name'],
             }
-            return [info]
+            return info
         else:
             video_id = mobj.group('videoid')
             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 
-        self.report_extraction(video_id)
-
-        info = []
+        entries = []
         offset = 0
         limit = self._JUSTIN_PAGE_LIMIT
         while True:
@@ -146,8 +143,12 @@ class JustinTVIE(InfoExtractor):
                 self.report_download_page(video_id, offset)
             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
             page_count, page_info = self._parse_page(page_url, video_id)
-            info.extend(page_info)
+            entries.extend(page_info)
             if not paged or page_count != limit:
                 break
             offset += limit
-        return info
+        return {
+            '_type': 'playlist',
+            'id': video_id,
+            'entries': entries,
+        }
index 29658a7d63c6396d33c370a35b07a4636d265d99..75b63cffb5961f33ea2d2f5ae37803dfb0fe37fc 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import os
 import re
 
@@ -11,22 +13,22 @@ from ..aes import (
     aes_decrypt_text
 )
 
+
 class KeezMoviesIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>keezmovies\.com/video/.+?(?P<videoid>[0-9]+))(?:[/?&]|$)'
+    _VALID_URL = r'^https?://(?:www\.)?keezmovies\.com/video/.+?(?P<videoid>[0-9]+)(?:[/?&]|$)'
     _TEST = {
-        u'url': u'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
-        u'file': u'1214711.mp4',
-        u'md5': u'6e297b7e789329923fcf83abb67c9289',
-        u'info_dict': {
-            u"title": u"Petite Asian Lady Mai Playing In Bathtub",
-            u"age_limit": 18,
+        'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
+        'file': '1214711.mp4',
+        'md5': '6e297b7e789329923fcf83abb67c9289',
+        'info_dict': {
+            'title': 'Petite Asian Lady Mai Playing In Bathtub',
+            'age_limit': 18,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
-        url = 'http://www.' + mobj.group('url')
 
         req = compat_urllib_request.Request(url)
         req.add_header('Cookie', 'age_verified=1')
@@ -38,10 +40,10 @@ class KeezMoviesIE(InfoExtractor):
             embedded_url = mobj.group(1)
             return self.url_result(embedded_url)
 
-        video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, u'title')
-        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, u'video_url'))
-        if webpage.find('encrypted=true')!=-1:
-            password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, u'password')
+        video_title = self._html_search_regex(r'<h1 [^>]*>([^<]+)', webpage, 'title')
+        video_url = compat_urllib_parse.unquote(self._html_search_regex(r'video_url=(.+?)&amp;', webpage, 'video_url'))
+        if 'encrypted=true' in webpage:
+            password = self._html_search_regex(r'video_title=(.+?)&amp;', webpage, 'password')
             video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
         path = compat_urllib_parse_urlparse(video_url).path
         extension = os.path.splitext(path)[1][1:]
index 50bc883ef4c2a0aafae84f8e31c995f833d7d7ee..961dd1aa6459380c60b1b32e39a2e58dd3cb9a52 100644 (file)
@@ -1,37 +1,39 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 
 
 class KickStarterIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>\d*)/.*'
+    _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P<id>[^/]*)/.*'
     _TEST = {
-        u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location",
-        u"file": u"1404461844.mp4",
-        u"md5": u"c81addca81327ffa66c642b5d8b08cab",
-        u"info_dict": {
-            u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling",
+        'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location',
+        'md5': 'c81addca81327ffa66c642b5d8b08cab',
+        'info_dict': {
+            'id': '1404461844',
+            'ext': 'mp4',
+            'title': 'Intersection: The Story of Josh Grant by Kyle Cowling',
+            'description': 'A unique motocross documentary that examines the '
+                'life and mind of one of sports most elite athletes: Josh Grant.',
         },
     }
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('id')
-        webpage_src = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, video_id)
 
-        video_url = self._search_regex(r'data-video="(.*?)">',
-            webpage_src, u'video URL')
-        if 'mp4' in video_url:
-            ext = 'mp4'
-        else:
-            ext = 'flv'
-        video_title = self._html_search_regex(r"<title>(.*?)</title>",
-            webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip()
+        video_url = self._search_regex(r'data-video-url="(.*?)"',
+            webpage, 'video URL')
+        video_title = self._html_search_regex(r'<title>(.*?)</title>',
+            webpage, 'title').rpartition('— Kickstarter')[0].strip()
 
-        results = [{
-                    'id': video_id,
-                    'url': video_url,
-                    'title': video_title,
-                    'ext': ext,
-                    }]
-        return results
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
index 1b45b67b0579d9fb06462f587651bc8f83e4751d..5341ac773f79fe237626bdfe3243bd1561d8003d 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import int_or_none
 
 
 class KontrTubeIE(InfoExtractor):
@@ -32,27 +33,26 @@ class KontrTubeIE(InfoExtractor):
 
         video_url = self._html_search_regex(r"video_url: '(.+?)/?',", webpage, 'video URL')
         thumbnail = self._html_search_regex(r"preview_url: '(.+?)/?',", webpage, 'video thumbnail', fatal=False)
-        title = self._html_search_regex(r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage,
-            'video title')
+        title = self._html_search_regex(
+            r'<title>(.+?) - Труба зовёт - Интересный видеохостинг</title>', webpage, 'video title')
         description = self._html_search_meta('description', webpage, 'video description')
 
-        mobj = re.search(r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>',
-            webpage)
+        mobj = re.search(
+            r'<div class="col_2">Длительность: <span>(?P<minutes>\d+)м:(?P<seconds>\d+)с</span></div>', webpage)
         duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
 
-        view_count = self._html_search_regex(r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage,
-            'view count', fatal=False)
-        view_count = int(view_count) if view_count is not None else None
+        view_count = self._html_search_regex(
+            r'<div class="col_2">Просмотров: <span>(\d+)</span></div>', webpage, 'view count', fatal=False)
 
         comment_count = None
-        comment_str = self._html_search_regex(r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count',
-            fatal=False)
+        comment_str = self._html_search_regex(
+            r'Комментарии: <span>([^<]+)</span>', webpage, 'comment count', fatal=False)
         if comment_str.startswith('комментариев нет'):
             comment_count = 0
         else:
             mobj = re.search(r'\d+ из (?P<total>\d+) комментариев', comment_str)
             if mobj:
-                comment_count = int(mobj.group('total'))
+                comment_count = mobj.group('total')
 
         return {
             'id': video_id,
@@ -61,6 +61,6 @@ class KontrTubeIE(InfoExtractor):
             'title': title,
             'description': description,
             'duration': duration,
-            'view_count': view_count,
-            'comment_count': comment_count,
+            'view_count': int_or_none(view_count),
+            'comment_count': int_or_none(comment_count),
         }
\ No newline at end of file
diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py
new file mode 100644 (file)
index 0000000..484239b
--- /dev/null
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class Ku6IE(InfoExtractor):
+    _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html'
+    _TEST = {
+        'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html',
+        'md5': '01203549b9efbb45f4b87d55bdea1ed1',
+        'info_dict': {
+            'id': 'JG-8yS14xzBr4bCn1pu0xw',
+            'ext': 'f4v',
+            'title': 'techniques test',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._search_regex(r'<h1 title=.*>(.*?)</h1>', webpage, 'title')
+        dataUrl = 'http://v.ku6.com/fetchVideo4Player/%s.html' % video_id
+        jsonData = self._download_json(dataUrl, video_id)
+        downloadUrl = jsonData['data']['f']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': downloadUrl
+        }
+
index 7b7185f9adb69f37dee1e4c4b468de8a5a95a556..7a431a274abc5b189af8ee8779f6024f430704c7 100644 (file)
@@ -6,7 +6,8 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
-    unified_strdate
+    unified_strdate,
+    ExtractorError,
 )
 
 
@@ -32,13 +33,11 @@ class LifeNewsIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
-        webpage = self._download_webpage('http://lifenews.ru/mobile/news/%s' % video_id, video_id, 'Downloading page')
+        webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page')
 
-        video_url = self._html_search_regex(
-            r'<video.*?src="([^"]+)".*?></video>', webpage, 'video URL')
-
-        thumbnail = self._html_search_regex(
-            r'<video.*?poster="([^"]+)".*?"></video>', webpage, 'video thumbnail')
+        videos = re.findall(r'<video.*?poster="(?P<poster>[^"]+)".*?src="(?P<video>[^"]+)".*?></video>', webpage)
+        if not videos:
+            raise ExtractorError('No media links available for %s' % video_id)
 
         title = self._og_search_title(webpage)
         TITLE_SUFFIX = ' - Первый по срочным новостям — LIFE | NEWS'
@@ -50,20 +49,26 @@ class LifeNewsIE(InfoExtractor):
         view_count = self._html_search_regex(
             r'<div class=\'views\'>(\d+)</div>', webpage, 'view count', fatal=False)
         comment_count = self._html_search_regex(
-            r'<div class=\'comments\'>(\d+)</div>', webpage, 'comment count', fatal=False)
+            r'<div class=\'comments\'>\s*<span class=\'counter\'>(\d+)</span>', webpage, 'comment count', fatal=False)
 
         upload_date = self._html_search_regex(
             r'<time datetime=\'([^\']+)\'>', webpage, 'upload date',fatal=False)
         if upload_date is not None:
             upload_date = unified_strdate(upload_date)
 
-        return {
-            'id': video_id,
-            'url': video_url,
-            'thumbnail': thumbnail,
-            'title': title,
-            'description': description,
-            'view_count': int_or_none(view_count),
-            'comment_count': int_or_none(comment_count),
-            'upload_date': upload_date,
-        }
\ No newline at end of file
+        def make_entry(video_id, media, video_number=None):
+            return {
+                'id': video_id,
+                'url': media[1],
+                'thumbnail': media[0],
+                'title': title if video_number is None else '%s-video%s' % (title, video_number),
+                'description': description,
+                'view_count': int_or_none(view_count),
+                'comment_count': int_or_none(comment_count),
+                'upload_date': upload_date,
+            }
+
+        if len(videos) == 1:
+            return make_entry(video_id, videos[0])
+        else:
+            return [make_entry(video_id, media, video_number+1) for video_number, media in enumerate(videos)]
\ No newline at end of file
index 0a700d663215df27f3553100862e9805e2c9f6ea..8e50e8f79adee2e21c7801da13da60897fcb61ec 100644 (file)
@@ -4,15 +4,17 @@ import json
 import re
 
 from .common import InfoExtractor
+from ..utils import int_or_none
 
 
 class LiveLeakIE(InfoExtractor):
     _VALID_URL = r'^(?:http://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
     _TESTS = [{
         'url': 'http://www.liveleak.com/view?i=757_1364311680',
-        'file': '757_1364311680.mp4',
         'md5': '0813c2430bea7a46bf13acf3406992f4',
         'info_dict': {
+            'id': '757_1364311680',
+            'ext': 'mp4',
             'description': 'extremely bad day for this guy..!',
             'uploader': 'ljfriel2',
             'title': 'Most unlucky car accident'
@@ -20,25 +22,62 @@ class LiveLeakIE(InfoExtractor):
     },
     {
         'url': 'http://www.liveleak.com/view?i=f93_1390833151',
-        'file': 'f93_1390833151.mp4',
         'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
         'info_dict': {
+            'id': 'f93_1390833151',
+            'ext': 'mp4',
             'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
             'uploader': 'ARD_Stinkt',
             'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
         }
+    },
+    {
+        'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
+        'md5': '42c6d97d54f1db107958760788c5f48f',
+        'info_dict': {
+            'id': '4f7_1392687779',
+            'ext': 'mp4',
+            'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing...  I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.",
+            'uploader': 'CapObveus',
+            'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
+            'age_limit': 18,
+        }
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-
         video_id = mobj.group('video_id')
         webpage = self._download_webpage(url, video_id)
+
+        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
+        video_description = self._og_search_description(webpage)
+        video_uploader = self._html_search_regex(
+            r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
+        age_limit = int_or_none(self._search_regex(
+            r'you confirm that you are ([0-9]+) years and over.',
+            webpage, 'age limit', default=None))
+
         sources_raw = self._search_regex(
             r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
         if sources_raw is None:
-            sources_raw = '[{ %s}]' % (
-                self._search_regex(r'(file: ".*?"),', webpage, 'video URL'))
+            alt_source = self._search_regex(
+                r'(file: ".*?"),', webpage, 'video URL', default=None)
+            if alt_source:
+                sources_raw = '[{ %s}]' % alt_source
+            else:
+                # Maybe an embed?
+                embed_url = self._search_regex(
+                    r'<iframe[^>]+src="(http://www.prochan.com/embed\?[^"]+)"',
+                    webpage, 'embed URL')
+                return {
+                    '_type': 'url_transparent',
+                    'url': embed_url,
+                    'id': video_id,
+                    'title': video_title,
+                    'description': video_description,
+                    'uploader': video_uploader,
+                    'age_limit': age_limit,
+                }
 
         sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
         sources = json.loads(sources_json)
@@ -49,15 +88,11 @@ class LiveLeakIE(InfoExtractor):
         } for s in sources]
         self._sort_formats(formats)
 
-        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
-        video_description = self._og_search_description(webpage)
-        video_uploader = self._html_search_regex(
-            r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
-
         return {
             'id': video_id,
             'title': video_title,
             'description': video_description,
             'uploader': video_uploader,
             'formats': formats,
+            'age_limit': age_limit,
         }
index 6deed27d73f999a2e245e0c1ac19025391d474c4..33f34f4e9bdda2aa034dd4f46ef3299478f181ec 100644 (file)
@@ -8,7 +8,9 @@ from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
     compat_urllib_request,
-    ExtractorError
+    ExtractorError,
+    int_or_none,
+    compat_str,
 )
 
 
@@ -19,16 +21,17 @@ class LyndaIE(SubtitlesInfoExtractor):
     _LOGIN_URL = 'https://www.lynda.com/login/login.aspx'
     _NETRC_MACHINE = 'lynda'
 
-    _SUCCESSFUL_LOGIN_REGEX = r'<a href="https://www.lynda.com/home/userAccount/ChangeContactInfo.aspx" data-qa="eyebrow_account_menu">My account'
+    _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true'
     _TIMECODE_REGEX = r'\[(?P<timecode>\d+:\d+:\d+[\.,]\d+)\]'
 
     ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
 
     _TEST = {
         'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
-        'file': '114408.mp4',
         'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
         'info_dict': {
+            'id': '114408',
+            'ext': 'mp4',
             'title': 'Using the exercise files',
             'duration': 68
         }
@@ -41,27 +44,44 @@ class LyndaIE(SubtitlesInfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(1)
 
-        page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id,
-                                      video_id, 'Downloading video JSON')
+        page = self._download_webpage('http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id,
+            'Downloading video JSON')
         video_json = json.loads(page)
 
         if 'Status' in video_json:
             raise ExtractorError('lynda returned error: %s' % video_json['Message'], expected=True)
 
         if video_json['HasAccess'] is False:
-            raise ExtractorError('Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
+            raise ExtractorError(
+                'Video %s is only available for members. ' % video_id + self.ACCOUNT_CREDENTIALS_HINT, expected=True)
 
-        video_id = video_json['ID']
+        video_id = compat_str(video_json['ID'])
         duration = video_json['DurationInSeconds']
         title = video_json['Title']
 
-        formats = [{'url': fmt['Url'],
+        formats = []
+
+        fmts = video_json.get('Formats')
+        if fmts:
+            formats.extend([
+                {
+                    'url': fmt['Url'],
                     'ext': fmt['Extension'],
                     'width': fmt['Width'],
                     'height': fmt['Height'],
                     'filesize': fmt['FileSize'],
                     'format_id': str(fmt['Resolution'])
-                    } for fmt in video_json['Formats']]
+                } for fmt in fmts])
+
+        prioritized_streams = video_json.get('PrioritizedStreams')
+        if prioritized_streams:
+            formats.extend([
+                {
+                    'url': video_url,
+                    'width': int_or_none(format_id),
+                    'format_id': format_id,
+                } for format_id, video_url in prioritized_streams['0'].items()
+            ])
 
         self._sort_formats(formats)
 
@@ -91,7 +111,7 @@ class LyndaIE(SubtitlesInfoExtractor):
             'stayPut': 'false'
         }        
         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
-        login_page = self._download_webpage(request, None, note='Logging in as %s' % username)
+        login_page = self._download_webpage(request, None, 'Logging in as %s' % username)
 
         # Not (yet) logged in
         m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page)
@@ -116,7 +136,7 @@ class LyndaIE(SubtitlesInfoExtractor):
                     'stayPut': 'false',
                 }
                 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(confirm_form))
-                login_page = self._download_webpage(request, None, note='Confirming log in and log out from another device')
+                login_page = self._download_webpage(request, None, 'Confirming log in and log out from another device')
 
         if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
             raise ExtractorError('Unable to log in')
@@ -150,7 +170,7 @@ class LyndaIE(SubtitlesInfoExtractor):
 
     def _get_available_subtitles(self, video_id, webpage):
         url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id
-        sub = self._download_webpage(url, None, note=False)
+        sub = self._download_webpage(url, None, False)
         sub_json = json.loads(sub)
         return {'en': url} if len(sub_json) > 0 else {}
 
@@ -179,6 +199,9 @@ class LyndaCourseIE(InfoExtractor):
         videos = []
         (username, _) = self._get_login_info()
 
+        # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided
+        # by single video API anymore
+
         for chapter in course_json['Chapters']:
             for video in chapter['Videos']:
                 if username is None and video['HasAccess'] is False:
diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py
new file mode 100644 (file)
index 0000000..7460d81
--- /dev/null
@@ -0,0 +1,86 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MailRuIE(InfoExtractor):
+    IE_NAME = 'mailru'
+    IE_DESC = 'Видео@Mail.Ru'
+    _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)'
+
+    _TESTS = [
+        {
+            'url': 'http://my.mail.ru/video/top#video=/mail/sonypicturesrus/75/76',
+            'md5': 'dea205f03120046894db4ebb6159879a',
+            'info_dict': {
+                'id': '46301138',
+                'ext': 'mp4',
+                'title': 'Новый Человек-Паук. Высокое напряжение. Восстание Электро',
+                'timestamp': 1393232740,
+                'upload_date': '20140224',
+                'uploader': 'sonypicturesrus',
+                'uploader_id': 'sonypicturesrus@mail.ru',
+                'duration': 184,
+            },
+        },
+        {
+            'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html',
+            'md5': '00a91a58c3402204dcced523777b475f',
+            'info_dict': {
+                'id': '46843144',
+                'ext': 'mp4',
+                'title': 'Samsung Galaxy S5 Hammer Smash Fail Battery Explosion',
+                'timestamp': 1397217632,
+                'upload_date': '20140411',
+                'uploader': 'hitech',
+                'uploader_id': 'hitech@corp.mail.ru',
+                'duration': 245,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('idv1')
+
+        if not video_id:
+            video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix')
+
+        video_data = self._download_json(
+            'http://api.video.mail.ru/videos/%s.json?new=1' % video_id, video_id, 'Downloading video JSON')
+
+        author = video_data['author']
+        uploader = author['name']
+        uploader_id = author['id']
+
+        movie = video_data['movie']
+        content_id = str(movie['contentId'])
+        title = movie['title']
+        if title.endswith('.mp4'):
+            title = title[:-4]
+        thumbnail = movie['poster']
+        duration = movie['duration']
+
+        view_count = video_data['views_count']
+
+        formats = [
+            {
+                'url': video['url'],
+                'format_id': video['name'],
+            } for video in video_data['videos']
+        ]
+
+        return {
+            'id': content_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'timestamp': video_data['timestamp'],
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
\ No newline at end of file
index 7aa0080d735fe811d6babf110156f4ab895edbdd..1b8c4a32edf5d269b1e9bb9db366487ef2fba981 100644 (file)
@@ -1,15 +1,18 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
 
 
 class MDRIE(InfoExtractor):
-    _VALID_URL = r'^(?P<domain>(?:https?://)?(?:www\.)?mdr\.de)/mediathek/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)_.*'
+    _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
     
     # No tests, MDR regularily deletes its videos
+    _TEST = {
+        'url': 'http://www.mdr.de/fakt/video189002.html',
+        'only_matching': True,
+    }
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
@@ -19,9 +22,9 @@ class MDRIE(InfoExtractor):
         # determine title and media streams from webpage
         html = self._download_webpage(url, video_id)
 
-        title = self._html_search_regex(r'<h2>(.*?)</h2>', html, u'title')
+        title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
         xmlurl = self._search_regex(
-            r'(/mediathek/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, u'XML URL')
+            r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
 
         doc = self._download_xml(domain + xmlurl, video_id)
         formats = []
@@ -41,7 +44,7 @@ class MDRIE(InfoExtractor):
             if vbr_el is None:
                 format.update({
                     'vcodec': 'none',
-                    'format_id': u'%s-%d' % (media_type, abr),
+                    'format_id': '%s-%d' % (media_type, abr),
                 })
             else:
                 vbr = int(vbr_el.text) // 1000
@@ -49,12 +52,9 @@ class MDRIE(InfoExtractor):
                     'vbr': vbr,
                     'width': int(a.find('frameWidth').text),
                     'height': int(a.find('frameHeight').text),
-                    'format_id': u'%s-%d' % (media_type, vbr),
+                    'format_id': '%s-%d' % (media_type, vbr),
                 })
             formats.append(format)
-        if not formats:
-            raise ExtractorError(u'Could not find any valid formats')
-
         self._sort_formats(formats)
 
         return {
index 99d3c83a5e4c3a31d71e9f487e13930af498dc1f..6436c05a3cd8e3f25499b9ff911de837a6c98207 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -9,104 +11,103 @@ from ..utils import (
     ExtractorError,
 )
 
-class MetacafeIE(InfoExtractor):
-    """Information Extractor for metacafe.com."""
 
-    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+class MetacafeIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
-    IE_NAME = u'metacafe'
+    IE_NAME = 'metacafe'
     _TESTS = [
-    # Youtube video
-    {
-        u"add_ie": ["Youtube"],
-        u"url":  u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
-        u"file":  u"_aUehQsCQtM.mp4",
-        u"info_dict": {
-            u"upload_date": u"20090102",
-            u"title": u"The Electric Company | \"Short I\" | PBS KIDS GO!",
-            u"description": u"md5:2439a8ef6d5a70e380c22f5ad323e5a8",
-            u"uploader": u"PBS",
-            u"uploader_id": u"PBS"
-        }
-    },
-    # Normal metacafe video
-    {
-        u'url': u'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
-        u'md5': u'6e0bca200eaad2552e6915ed6fd4d9ad',
-        u'info_dict': {
-            u'id': u'11121940',
-            u'ext': u'mp4',
-            u'title': u'News: Stuff You Won\'t Do with Your PlayStation 4',
-            u'uploader': u'ign',
-            u'description': u'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+        # Youtube video
+        {
+            'add_ie': ['Youtube'],
+            'url':  'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
+            'info_dict': {
+                'id': '_aUehQsCQtM',
+                'ext': 'mp4',
+                'upload_date': '20090102',
+                'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
+                'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
+                'uploader': 'PBS',
+                'uploader_id': 'PBS'
+            }
         },
-    },
-    # AnyClip video
-    {
-        u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
-        u"file": u"an-dVVXnuY7Jh77J.mp4",
-        u"info_dict": {
-            u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
-            u"uploader": u"anyclip",
-            u"description": u"md5:38c711dd98f5bb87acf973d573442e67",
+        # Normal metacafe video
+        {
+            'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
+            'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
+            'info_dict': {
+                'id': '11121940',
+                'ext': 'mp4',
+                'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
+                'uploader': 'ign',
+                'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
+            },
         },
-    },
-    # age-restricted video
-    {
-        u'url': u'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
-        u'md5': u'98dde7c1a35d02178e8ab7560fe8bd09',
-        u'info_dict': {
-            u'id': u'5186653',
-            u'ext': u'mp4',
-            u'title': u'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
-            u'uploader': u'Dwayne Pipe',
-            u'description': u'md5:950bf4c581e2c059911fa3ffbe377e4b',
-            u'age_limit': 18,
+        # AnyClip video
+        {
+            'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
+            'info_dict': {
+                'id': 'an-dVVXnuY7Jh77J',
+                'ext': 'mp4',
+                'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
+                'uploader': 'anyclip',
+                'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+            },
         },
-    },
-    # cbs video
-    {
-        u'url': u'http://www.metacafe.com/watch/cb-0rOxMBabDXN6/samsung_galaxy_note_2_samsungs_next_generation_phablet/',
-        u'info_dict': {
-            u'id': u'0rOxMBabDXN6',
-            u'ext': u'flv',
-            u'title': u'Samsung Galaxy Note 2: Samsung\'s next-generation phablet',
-            u'description': u'md5:54d49fac53d26d5a0aaeccd061ada09d',
-            u'duration': 129,
+        # age-restricted video
+        {
+            'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
+            'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
+            'info_dict': {
+                'id': '5186653',
+                'ext': 'mp4',
+                'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
+                'uploader': 'Dwayne Pipe',
+                'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
+                'age_limit': 18,
+            },
         },
-        u'params': {
-            # rtmp download
-            u'skip_download': True,
+        # cbs video
+        {
+            'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
+            'info_dict': {
+                'id': '8VD4r_Zws8VP',
+                'ext': 'flv',
+                'title': 'Open: This is Face the Nation, February 9',
+                'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
+                'duration': 96,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
         },
-    },
     ]
 
-
     def report_disclaimer(self):
-        """Report disclaimer retrieval."""
-        self.to_screen(u'Retrieving disclaimer')
+        self.to_screen('Retrieving disclaimer')
 
     def _real_initialize(self):
         # Retrieve disclaimer
         self.report_disclaimer()
-        self._download_webpage(self._DISCLAIMER, None, False, u'Unable to retrieve disclaimer')
+        self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
 
         # Confirm age
         disclaimer_form = {
             'filters': '0',
             'submit': "Continue - I'm over 18",
-            }
+        }
         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
         request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         self.report_age_confirmation()
-        self._download_webpage(request, None, False, u'Unable to confirm age')
+        self._download_webpage(request, None, False, 'Unable to confirm age')
 
     def _real_extract(self, url):
         # Extract id and simplified title from URL
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError('Invalid URL: %s' % url)
 
         video_id = mobj.group(1)
 
@@ -153,22 +154,24 @@ class MetacafeIE(InfoExtractor):
             else:
                 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                 if mobj is None:
-                    raise ExtractorError(u'Unable to extract media URL')
+                    raise ExtractorError('Unable to extract media URL')
                 vardict = compat_parse_qs(mobj.group(1))
                 if 'mediaData' not in vardict:
-                    raise ExtractorError(u'Unable to extract media URL')
-                mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
+                    raise ExtractorError('Unable to extract media URL')
+                mobj = re.search(
+                    r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                 if mobj is None:
-                    raise ExtractorError(u'Unable to extract media URL')
+                    raise ExtractorError('Unable to extract media URL')
                 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
                 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
                 video_ext = determine_ext(video_url)
 
-        video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
+        video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, 'title')
         description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
         video_uploader = self._html_search_regex(
                 r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
-                webpage, u'uploader nickname', fatal=False)
+                webpage, 'uploader nickname', fatal=False)
 
         if re.search(r'"contentRating":"restricted"', webpage) is not None:
             age_limit = 18
@@ -176,13 +179,12 @@ class MetacafeIE(InfoExtractor):
             age_limit = 0
 
         return {
-            '_type':    'video',
-            'id':       video_id,
-            'url':      video_url,
+            'id': video_id,
+            'url': video_url,
             'description': description,
             'uploader': video_uploader,
-            'upload_date':  None,
-            'title':    video_title,
-            'ext':      video_ext,
+            'title': video_title,
+            'thumbnail':thumbnail,
+            'ext': video_ext,
             'age_limit': age_limit,
         }
index 465ac4916a4596e247f957cac636522227da7f78..07f072924a6dadb2838230fd29a6a830ff99bb64 100644 (file)
@@ -13,8 +13,9 @@ class MetacriticIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222',
-        'file': '3698222.mp4',
         'info_dict': {
+            'id': '3698222',
+            'ext': 'mp4',
             'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors',
             'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.',
             'duration': 221,
index 76b717fe5dbac08b8b103a1e44192a6fbf6d2a55..807b1dc89b608333e06c1fbab2e9d806fb7d090f 100644 (file)
@@ -1,24 +1,30 @@
+from __future__ import unicode_literals
+
 import re
 import json
 
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 from ..utils import (
+    compat_urlparse,
     clean_html,
+    ExtractorError,
     get_element_by_id,
 )
 
 
 class TechTVMITIE(InfoExtractor):
-    IE_NAME = u'techtv.mit.edu'
+    IE_NAME = 'techtv.mit.edu'
     _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
-        u'file': u'25418.mp4',
-        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
-        u'info_dict': {
-            u'title': u'MIT DNA Learning Center Set',
-            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
+        'url': 'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
+        'md5': '1f8cb3e170d41fd74add04d3c9330e5f',
+        'info_dict': {
+            'id': '25418',
+            'ext': 'mp4',
+            'title': 'MIT DNA Learning Center Set',
+            'description': 'md5:82313335e8a8a3f243351ba55bc1b474',
         },
     }
 
@@ -27,12 +33,12 @@ class TechTVMITIE(InfoExtractor):
         video_id = mobj.group('id')
         raw_page = self._download_webpage(
             'http://techtv.mit.edu/videos/%s' % video_id, video_id)
-        clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
+        clean_page = re.compile(r'<!--.*?-->', re.S).sub('', raw_page)
 
-        base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
-            raw_page, u'base url')
-        formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
-            u'video formats')
+        base_url = self._search_regex(
+            r'ipadUrl: \'(.+?cloudfront.net/)', raw_page, 'base url')
+        formats_json = self._search_regex(
+            r'bitrates: (\[.+?\])', raw_page, 'video formats')
         formats_mit = json.loads(formats_json)
         formats = [
             {
@@ -48,28 +54,31 @@ class TechTVMITIE(InfoExtractor):
 
         title = get_element_by_id('edit-title', clean_page)
         description = clean_html(get_element_by_id('edit-description', clean_page))
-        thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
-            raw_page, u'thumbnail', flags=re.DOTALL)
+        thumbnail = self._search_regex(
+            r'playlist:.*?url: \'(.+?)\'',
+            raw_page, 'thumbnail', flags=re.DOTALL)
 
-        return {'id': video_id,
-                'title': title,
-                'formats': formats,
-                'description': description,
-                'thumbnail': thumbnail,
-                }
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'thumbnail': thumbnail,
+        }
 
 
 class MITIE(TechTVMITIE):
-    IE_NAME = u'video.mit.edu'
+    IE_NAME = 'video.mit.edu'
     _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'
 
     _TEST = {
-        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
-        u'file': u'21783.mp4',
-        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
-        u'info_dict': {
-            u'title': u'The Government is Profiling You',
-            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
+        'url': 'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
+        'md5': '7db01d5ccc1895fc5010e9c9e13648da',
+        'info_dict': {
+            'id': '21783',
+            'ext': 'mp4',
+            'title': 'The Government is Profiling You',
+            'description': 'md5:ad5795fe1e1623b73620dbfd47df9afd',
         },
     }
 
@@ -77,7 +86,73 @@ class MITIE(TechTVMITIE):
         mobj = re.match(self._VALID_URL, url)
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
-        self.to_screen('%s: Extracting %s url' % (page_title, TechTVMITIE.IE_NAME))
-        embed_url = self._search_regex(r'<iframe .*?src="(.+?)"', webpage,
-            u'embed url')
+        embed_url = self._search_regex(
+            r'<iframe .*?src="(.+?)"', webpage, 'embed url')
         return self.url_result(embed_url, ie='TechTVMIT')
+
+
+class OCWMITIE(InfoExtractor):
+    IE_NAME = 'ocw.mit.edu'
+    _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)'
+    _BASE_URL = 'http://ocw.mit.edu/'
+
+    _TESTS = [
+        {
+            'url': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/',
+            'info_dict': {
+                'id': 'EObHWIEKGjA',
+                'ext': 'mp4',
+                'title': 'Lecture 7: Multiple Discrete Random Variables: Expectations, Conditioning, Independence',
+                'description': 'In this lecture, the professor discussed multiple random variables, expectations, and binomial distribution.',
+                #'subtitles': 'http://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-041-probabilistic-systems-analysis-and-applied-probability-fall-2010/video-lectures/lecture-7-multiple-variables-expectations-independence/MIT6_041F11_lec07_300k.mp4.srt'
+            }
+        },
+        {
+            'url': 'http://ocw.mit.edu/courses/mathematics/18-01sc-single-variable-calculus-fall-2010/1.-differentiation/part-a-definition-and-basic-rules/session-1-introduction-to-derivatives/',
+            'info_dict': {
+                'id': '7K1sB05pE0A',
+                'ext': 'mp4',
+                'title': 'Session 1: Introduction to Derivatives',
+                'description': 'This section contains lecture video excerpts, lecture notes, an interactive mathlet with supporting documents, and problem solving videos.',
+                #'subtitles': 'http://ocw.mit.edu//courses/mathematics/18-01sc-single-variable-calculus-fall-2010/ocw-18.01-f07-lec01_300k.SRT'
+            }
+        }
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        topic = mobj.group('topic')
+
+        webpage = self._download_webpage(url, topic)
+        title = self._html_search_meta('WT.cg_s', webpage)
+        description = self._html_search_meta('Description', webpage)
+
+        # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, start, stop, captions_file)
+        embed_chapter_media = re.search(r'ocw_embed_chapter_media\((.+?)\)', webpage)
+        if embed_chapter_media:
+            metadata = re.sub(r'[\'"]', '', embed_chapter_media.group(1))
+            metadata = re.split(r', ?', metadata)
+            yt = metadata[1]
+            subs = compat_urlparse.urljoin(self._BASE_URL, metadata[7])
+        else:
+            # search for call to ocw_embed_chapter_media(container_id, media_url, provider, page_url, image_url, captions_file)
+            embed_media = re.search(r'ocw_embed_media\((.+?)\)', webpage)
+            if embed_media:
+                metadata = re.sub(r'[\'"]', '', embed_media.group(1))
+                metadata = re.split(r', ?', metadata)
+                yt = metadata[1]
+                subs = compat_urlparse.urljoin(self._BASE_URL, metadata[5])
+            else:
+                raise ExtractorError('Unable to find embedded YouTube video.')
+        video_id = YoutubeIE.extract_id(yt)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'url': yt,
+            'url_transparent'
+            'subtitles': subs,
+            'ie_key': 'Youtube',
+        }
index f3356db50ebf8941ac58e9a229778ba864c57be0..5f64e7bd0d98b74aea2a4350a51f057b4d0280ba 100644 (file)
@@ -4,24 +4,31 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    unified_strdate,
+    compat_urllib_parse,
     ExtractorError,
+    int_or_none,
+    parse_iso8601,
 )
 
 
 class MixcloudIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)'
     IE_NAME = 'mixcloud'
 
     _TEST = {
         'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
-        'file': 'dholbach-cryptkeeper.mp3',
         'info_dict': {
+            'id': 'dholbach-cryptkeeper',
+            'ext': 'mp3',
             'title': 'Cryptkeeper',
             'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
             'uploader': 'Daniel Holbach',
             'uploader_id': 'dholbach',
             'upload_date': '20111115',
+            'timestamp': 1321359578,
+            'thumbnail': 're:https?://.*\.jpg',
+            'view_count': int,
+            'like_count': int,
         },
     }
 
@@ -45,14 +52,10 @@ class MixcloudIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         uploader = mobj.group(1)
         cloudcast_name = mobj.group(2)
-        track_id = '-'.join((uploader, cloudcast_name))
+        track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name)))
 
         webpage = self._download_webpage(url, track_id)
 
-        api_url = 'http://api.mixcloud.com/%s/%s/' % (uploader, cloudcast_name)
-        info = self._download_json(
-            api_url, track_id, 'Downloading cloudcast info')
-
         preview_url = self._search_regex(
             r'\s(?:data-preview-url|m-preview)="(.+?)"', webpage, 'preview url')
         song_url = preview_url.replace('/previews/', '/c/originals/')
@@ -63,16 +66,41 @@ class MixcloudIE(InfoExtractor):
             template_url = template_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/')
             final_song_url = self._get_url(template_url)
         if final_song_url is None:
-            raise ExtractorError(u'Unable to extract track url')
+            raise ExtractorError('Unable to extract track url')
+
+        PREFIX = (
+            r'<div class="cloudcast-play-button-container"'
+            r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+')
+        title = self._html_search_regex(
+            PREFIX + r'm-title="([^"]+)"', webpage, 'title')
+        thumbnail = self._proto_relative_url(self._html_search_regex(
+            PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail',
+            fatal=False))
+        uploader = self._html_search_regex(
+            PREFIX + r'm-owner-name="([^"]+)"',
+            webpage, 'uploader', fatal=False)
+        uploader_id = self._search_regex(
+            r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
+        description = self._og_search_description(webpage)
+        like_count = int_or_none(self._search_regex(
+            r'<meta itemprop="interactionCount" content="UserLikes:([0-9]+)"',
+            webpage, 'like count', fatal=False))
+        view_count = int_or_none(self._search_regex(
+            r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
+            webpage, 'play count', fatal=False))
+        timestamp = parse_iso8601(self._search_regex(
+            r'<time itemprop="dateCreated" datetime="([^"]+)">',
+            webpage, 'upload date'))
 
         return {
             'id': track_id,
-            'title': info['name'],
+            'title': title,
             'url': final_song_url,
-            'description': info.get('description'),
-            'thumbnail': info['pictures'].get('extra_large'),
-            'uploader': info['user']['name'],
-            'uploader_id': info['user']['username'],
-            'upload_date': unified_strdate(info['created_time']),
-            'view_count': info['play_count'],
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'timestamp': timestamp,
+            'view_count': view_count,
+            'like_count': like_count,
         }
index f1875add5ff0626750827276935d861c25e4d7e3..7d21ea18f1bec57a83a49478d43f000b8039041f 100644 (file)
@@ -14,7 +14,7 @@ from ..utils import (
 class MooshareIE(InfoExtractor):
     IE_NAME = 'mooshare'
     IE_DESC = 'Mooshare.biz'
-    _VALID_URL = r'http://mooshare\.biz/(?P<id>[\da-z]{12})'
+    _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})'
 
     _TESTS = [
         {
diff --git a/youtube_dl/extractor/morningstar.py b/youtube_dl/extractor/morningstar.py
new file mode 100644 (file)
index 0000000..320d27b
--- /dev/null
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MorningstarIE(InfoExtractor):
+    IE_DESC = 'morningstar.com'
+    _VALID_URL = r'https?://(?:www\.)?morningstar\.com/[cC]over/video[cC]enter\.aspx\?id=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.morningstar.com/cover/videocenter.aspx?id=615869',
+        'md5': '6c0acface7a787aadc8391e4bbf7b0f5',
+        'info_dict': {
+            'id': '615869',
+            'ext': 'mp4',
+            'title': 'Get Ahead of the Curve on 2013 Taxes',
+            'description': "Vanguard's Joel Dickson on managing higher tax rates for high-income earners and fund capital-gain distributions in 2013.",
+            'thumbnail': r're:^https?://.*m(?:orning)?star\.com/.+thumb\.jpg$'
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(
+            r'<h1 id="titleLink">(.*?)</h1>', webpage, 'title')
+        video_url = self._html_search_regex(
+            r'<input type="hidden" id="hidVideoUrl" value="([^"]+)"',
+            webpage, 'video URL')
+        thumbnail = self._html_search_regex(
+            r'<input type="hidden" id="hidSnapshot" value="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+        description = self._html_search_regex(
+            r'<div id="mstarDeck".*?>(.*?)</div>',
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py
new file mode 100644 (file)
index 0000000..7c0ec6a
--- /dev/null
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import json
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_parse_qs,
+    compat_str,
+    int_or_none,
+)
+
+
+class MotorsportIE(InfoExtractor):
+    IE_DESC = 'motorsport.com'
+    _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/(?:$|[?#])'
+    _TEST = {
+        'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/',
+        'md5': '5592cb7c5005d9b2c163df5ac3dc04e4',
+        'info_dict': {
+            'id': '7063',
+            'ext': 'mp4',
+            'title': 'Red Bull Racing: 2014 Rules Explained',
+            'duration': 207,
+            'description': 'A new clip from Red Bull sees Daniel Ricciardo and Sebastian Vettel explain the 2014 Formula One regulations – which are arguably the most complex the sport has ever seen.',
+            'uploader': 'rainiere',
+            'thumbnail': r're:^http://.*motorsport\.com/.+\.jpg$'
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, display_id)
+        flashvars_code = self._html_search_regex(
+            r'<embed id="player".*?flashvars="([^"]+)"', webpage, 'flashvars')
+        flashvars = compat_parse_qs(flashvars_code)
+        params = json.loads(flashvars['parameters'][0])
+
+        e = compat_str(int(time.time()) + 24 * 60 * 60)
+        base_video_url = params['location'] + '?e=' + e
+        s = 'h3hg713fh32'
+        h = hashlib.md5((s + base_video_url).encode('utf-8')).hexdigest()
+        video_url = base_video_url + '&h=' + h
+
+        uploader = self._html_search_regex(
+            r'(?s)<span class="label">Video by: </span>(.*?)</a>', webpage,
+            'uploader', fatal=False)
+
+        return {
+            'id': params['video_id'],
+            'display_id': display_id,
+            'title': params['title'],
+            'url': video_url,
+            'description': params.get('description'),
+            'thumbnail': params.get('main_thumb'),
+            'duration': int_or_none(params.get('duration')),
+            'uploader': uploader,
+        }
diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py
new file mode 100644 (file)
index 0000000..4314618
--- /dev/null
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class MoviezineIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.moviezine\.se/video/(?P<id>[^?#]+)'
+
+    _TEST = {
+        'url': 'http://www.moviezine.se/video/205866',
+        'info_dict': {
+            'id': '205866',
+            'ext': 'mp4',
+            'title': 'Oculus - Trailer 1',
+            'description': 'md5:40cc6790fc81d931850ca9249b40e8a4',
+            'thumbnail': 're:http://.*\.jpg',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        jsplayer = self._download_webpage('http://www.moviezine.se/api/player.js?video=%s' % video_id, video_id, 'Downloading js api player')
+
+        formats =[{
+            'format_id': 'sd',
+            'url': self._html_search_regex(r'file: "(.+?)",', jsplayer, 'file'),
+            'quality': 0,
+            'ext': 'mp4',
+        }]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'),
+            'thumbnail': self._search_regex(r'image: "(.+?)",', jsplayer, 'image'),
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/movshare.py b/youtube_dl/extractor/movshare.py
new file mode 100644 (file)
index 0000000..4191cf7
--- /dev/null
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from .novamov import NovaMovIE
+
+
+class MovShareIE(NovaMovIE):
+    IE_NAME = 'movshare'
+    IE_DESC = 'MovShare'
+
+    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'movshare\.(?:net|sx|ag)'}
+
+    _HOST = 'www.movshare.net'
+
+    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+    _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>'
+    _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>'
+
+    _TEST = {
+        'url': 'http://www.movshare.net/video/559e28be54d96',
+        'md5': 'abd31a2132947262c50429e1d16c1bfd',
+        'info_dict': {
+            'id': '559e28be54d96',
+            'ext': 'flv',
+            'title': 'dissapeared image',
+            'description': 'optical illusion  dissapeared image  magic illusion',
+        }
+    }
\ No newline at end of file
index 6a8e2cc442c25327d9d91acebb47d4e109750731..39d6feb98d171f16b2ae5d69d71cde67b3a21372 100644 (file)
@@ -4,9 +4,7 @@ import json
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class MporaIE(InfoExtractor):
@@ -20,7 +18,7 @@ class MporaIE(InfoExtractor):
         'info_dict': {
             'title': 'Katy Curd -  Winter in the Forest',
             'duration': 416,
-            'uploader': 'petenewman',
+            'uploader': 'Peter Newman Media',
         },
     }
 
index 5447b6c0cab098b895eda0e9f2b3b266fb65a7b0..e5ca41b4091698ad2a180f9d2ca00b4b96218c1e 100644 (file)
@@ -5,9 +5,12 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
+    compat_urllib_request,
     ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
+    HEADRequest,
+    unescapeHTML,
     url_basename,
     RegexNotFoundError,
 )
@@ -18,6 +21,7 @@ def _media_xml_tag(tag):
 
 
 class MTVServicesInfoExtractor(InfoExtractor):
+    _MOBILE_TEMPLATE = None
     @staticmethod
     def _id_from_uri(uri):
         return uri.split(':')[-1]
@@ -39,9 +43,29 @@ class MTVServicesInfoExtractor(InfoExtractor):
         else:
             return thumb_node.attrib['url']
 
-    def _extract_video_formats(self, mdoc):
-        if re.match(r'.*/error_country_block\.swf$', mdoc.find('.//src').text) is not None:
-            raise ExtractorError('This video is not available from your country.', expected=True)
+    def _extract_mobile_video_formats(self, mtvn_id):
+        webpage_url = self._MOBILE_TEMPLATE % mtvn_id
+        req = compat_urllib_request.Request(webpage_url)
+        # Otherwise we get a webpage that would execute some javascript
+        req.add_header('Youtubedl-user-agent', 'curl/7')
+        webpage = self._download_webpage(req, mtvn_id,
+            'Downloading mobile page')
+        metrics_url = unescapeHTML(self._search_regex(r'<a href="(http://metrics.+?)"', webpage, 'url'))
+        req = HEADRequest(metrics_url)
+        response = self._request_webpage(req, mtvn_id, 'Resolving url')
+        url = response.geturl()
+        # Transform the url to get the best quality:
+        url = re.sub(r'.+pxE=mp4', 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4', url, 1)
+        return [{'url': url,'ext': 'mp4'}]
+
+    def _extract_video_formats(self, mdoc, mtvn_id):
+        if re.match(r'.*/(error_country_block\.swf|geoblock\.mp4)$', mdoc.find('.//src').text) is not None:
+            if mtvn_id is not None and self._MOBILE_TEMPLATE is not None:
+                self.to_screen('The normal version is not available from your '
+                    'country, trying with the mobile version')
+                return self._extract_mobile_video_formats(mtvn_id)
+            raise ExtractorError('This video is not available from your country.',
+                expected=True)
 
         formats = []
         for rendition in mdoc.findall('.//rendition'):
@@ -56,6 +80,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
                                 })
             except (KeyError, TypeError):
                 raise ExtractorError('Invalid rendition field.')
+        self._sort_formats(formats)
         return formats
 
     def _get_video_info(self, itemdoc):
@@ -94,9 +119,16 @@ class MTVServicesInfoExtractor(InfoExtractor):
             raise ExtractorError('Could not find video title')
         title = title.strip()
 
+        # This a short id that's used in the webpage urls
+        mtvn_id = None
+        mtvn_id_node = find_xpath_attr(itemdoc, './/{http://search.yahoo.com/mrss/}category',
+                'scheme', 'urn:mtvn:id')
+        if mtvn_id_node is not None:
+            mtvn_id = mtvn_id_node.text
+
         return {
             'title': title,
-            'formats': self._extract_video_formats(mediagen_doc),
+            'formats': self._extract_video_formats(mediagen_doc, mtvn_id),
             'id': video_id,
             'thumbnail': self._get_thumbnail_url(uri, itemdoc),
             'description': description,
diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py
new file mode 100644 (file)
index 0000000..42d7a82
--- /dev/null
@@ -0,0 +1,75 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class MusicPlayOnIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://en.musicplayon.com/play?v=433377',
+        'info_dict': {
+            'id': '433377',
+            'ext': 'mp4',
+            'title': 'Rick Ross - Interview On Chelsea Lately (2014)',
+            'description': 'Rick Ross Interview On Chelsea Lately',
+            'duration': 342,
+            'uploader': 'ultrafish',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(page)
+        description = self._og_search_description(page)
+        thumbnail = self._og_search_thumbnail(page)
+        duration = self._html_search_meta('video:duration', page, 'duration', fatal=False)
+        view_count = self._og_search_property('count', page, fatal=False)
+        uploader = self._html_search_regex(
+            r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)
+
+        formats = [
+            {
+                'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id,
+                'ext': 'mp4',
+            }
+        ]
+
+        manifest = self._download_webpage(
+            'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest')
+
+        for entry in manifest.split('#')[1:]:
+            if entry.startswith('EXT-X-STREAM-INF:'):
+                meta, url, _ = entry.split('\n')
+                params = dict(param.split('=') for param in meta.split(',')[1:])
+                formats.append({
+                    'url': url,
+                    'ext': 'mp4',
+                    'tbr': int(params['BANDWIDTH']),
+                    'width': int(params['RESOLUTION'].split('x')[1]),
+                    'height': int(params['RESOLUTION'].split('x')[-1]),
+                    'format_note': params['NAME'].replace('"', '').strip(),
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': int_or_none(duration),
+            'view_count': int_or_none(view_count),
+            'formats': formats,
+        }
\ No newline at end of file
index 6d35c7861f38f844385454fb7e7d9e8d5ff029a9..ccb5959c4046e73f1263ecee19e829b418c6d506 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import binascii
 import base64
 import hashlib
@@ -14,18 +16,16 @@ from ..utils import (
 )
 
 
-
 class MyVideoIE(InfoExtractor):
-    """Information Extractor for myvideo.de."""
-
-    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/([0-9]+)/([^?/]+).*'
-    IE_NAME = u'myvideo'
+    _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*'
+    IE_NAME = 'myvideo'
     _TEST = {
-        u'url': u'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
-        u'file': u'8229274.flv',
-        u'md5': u'2d2753e8130479ba2cb7e0a37002053e',
-        u'info_dict': {
-            u"title": u"bowling-fail-or-win"
+        'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win',
+        'md5': '2d2753e8130479ba2cb7e0a37002053e',
+        'info_dict': {
+            'id': '8229274',
+            'ext': 'flv',
+            'title': 'bowling-fail-or-win',
         }
     }
 
@@ -53,10 +53,7 @@ class MyVideoIE(InfoExtractor):
 
     def _real_extract(self,url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'invalid URL: %s' % url)
-
-        video_id = mobj.group(1)
+        video_id = mobj.group('id')
 
         GK = (
           b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
@@ -74,37 +71,33 @@ class MyVideoIE(InfoExtractor):
             video_url = mobj.group(1) + '.flv'
 
             video_title = self._html_search_regex('<title>([^<]+)</title>',
-                webpage, u'title')
-
-            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
+                webpage, 'title')
 
-            return [{
-                'id':       video_id,
-                'url':      video_url,
-                'uploader': None,
-                'upload_date':  None,
-                'title':    video_title,
-                'ext':      video_ext,
-            }]
+            return {
+                'id': video_id,
+                'url': video_url,
+                'title': video_title,
+            }
 
         mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage)
         if mobj is not None:
             request = compat_urllib_request.Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '')
             response = self._download_webpage(request, video_id,
-                                              u'Downloading video info')
+                                              'Downloading video info')
             info = json.loads(base64.b64decode(response).decode('utf-8'))
-            return {'id': video_id,
-                    'title': info['title'],
-                    'url': info['streaming_url'].replace('rtmpe', 'rtmpt'),
-                    'play_path': info['filename'],
-                    'ext': 'flv',
-                    'thumbnail': info['thumbnail'][0]['url'],
-                    }
+            return {
+                'id': video_id,
+                'title': info['title'],
+                'url': info['streaming_url'].replace('rtmpe', 'rtmpt'),
+                'play_path': info['filename'],
+                'ext': 'flv',
+                'thumbnail': info['thumbnail'][0]['url'],
+            }
 
         # try encxml
         mobj = re.search('var flashvars={(.+?)}', webpage)
         if mobj is None:
-            raise ExtractorError(u'Unable to extract video')
+            raise ExtractorError('Unable to extract video')
 
         params = {}
         encxml = ''
@@ -118,7 +111,7 @@ class MyVideoIE(InfoExtractor):
             params['domain'] = 'www.myvideo.de'
         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
         if 'flash_playertype=MTV' in xmldata_url:
-            self._downloader.report_warning(u'avoiding MTV player')
+            self._downloader.report_warning('avoiding MTV player')
             xmldata_url = (
                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
@@ -144,7 +137,7 @@ class MyVideoIE(InfoExtractor):
             video_url = compat_urllib_parse.unquote(mobj.group(1))
             if 'myvideo2flash' in video_url:
                 self.report_warning(
-                    u'Rewriting URL to use unencrypted rtmp:// ...',
+                    'Rewriting URL to use unencrypted rtmp:// ...',
                     video_id)
                 video_url = video_url.replace('rtmpe://', 'rtmp://')
 
@@ -152,39 +145,31 @@ class MyVideoIE(InfoExtractor):
             # extract non rtmp videos
             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
             if mobj is None:
-                raise ExtractorError(u'unable to extract url')
+                raise ExtractorError('unable to extract url')
             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
 
-        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
+        video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file')
         video_file = compat_urllib_parse.unquote(video_file)
 
         if not video_file.endswith('f4m'):
             ppath, prefix = video_file.split('.')
             video_playpath = '%s:%s' % (prefix, ppath)
-            video_hls_playlist = ''
         else:
             video_playpath = ''
-            video_hls_playlist = (
-                video_file
-            ).replace('.f4m', '.m3u8')
 
-        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
+        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj')
         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
 
         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
-            webpage, u'title')
-
-        return [{
-            'id':                 video_id,
-            'url':                video_url,
-            'tc_url':             video_url,
-            'uploader':           None,
-            'upload_date':        None,
-            'title':              video_title,
-            'ext':                u'flv',
-            'play_path':          video_playpath,
-            'video_file':         video_file,
-            'video_hls_playlist': video_hls_playlist,
-            'player_url':         video_swfobj,
-        }]
+            webpage, 'title')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'tc_url': video_url,
+            'title': video_title,
+            'ext': 'flv',
+            'play_path': video_playpath,
+            'player_url': video_swfobj,
+        }
 
index 4cab30631956b903682fc2de7aa5dd551bcdd4a3..c0231c197b12b86c669e9cff4b34a5c2ac1639bf 100644 (file)
@@ -1,4 +1,6 @@
 # encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -12,12 +14,13 @@ class NaverIE(InfoExtractor):
     _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://tvcast.naver.com/v/81652',
-        u'file': u'81652.mp4',
-        u'info_dict': {
-            u'title': u'[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
-            u'description': u'합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
-            u'upload_date': u'20130903',
+        'url': 'http://tvcast.naver.com/v/81652',
+        'info_dict': {
+            'id': '81652',
+            'ext': 'mp4',
+            'title': '[9월 모의고사 해설강의][수학_김상희] 수학 A형 16~20번',
+            'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.',
+            'upload_date': '20130903',
         },
     }
 
@@ -28,7 +31,7 @@ class NaverIE(InfoExtractor):
         m_id = re.search(r'var rmcPlayer = new nhn.rmcnmv.RMCVideoPlayer\("(.+?)", "(.+?)"',
             webpage)
         if m_id is None:
-            raise ExtractorError(u'couldn\'t extract vid and key')
+            raise ExtractorError('couldn\'t extract vid and key')
         vid = m_id.group(1)
         key = m_id.group(2)
         query = compat_urllib_parse.urlencode({'vid': vid, 'inKey': key,})
@@ -39,22 +42,27 @@ class NaverIE(InfoExtractor):
         })
         info = self._download_xml(
             'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
-            video_id, u'Downloading video info')
+            video_id, 'Downloading video info')
         urls = self._download_xml(
             'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
-            video_id, u'Downloading video formats info')
+            video_id, 'Downloading video formats info')
 
         formats = []
         for format_el in urls.findall('EncodingOptions/EncodingOption'):
             domain = format_el.find('Domain').text
-            if domain.startswith('rtmp'):
-                continue
-            formats.append({
+            f = {
                 'url': domain + format_el.find('uri').text,
                 'ext': 'mp4',
                 'width': int(format_el.find('width').text),
                 'height': int(format_el.find('height').text),
-            })
+            }
+            if domain.startswith('rtmp'):
+                f.update({
+                    'ext': 'flv',
+                    'rtmp_protocol': '1', # rtmpt
+                })
+            formats.append(f)
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
index 7e421610eb6f47054aa1ea1720fc4b2cb4404d49..633b42f728489c6e9f9c61a98b8b0b4d38e57be1 100644 (file)
@@ -6,12 +6,13 @@ from .common import InfoExtractor
 
 
 class NBAIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
+    _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
     _TEST = {
         'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
-        'file': u'0021200253-okc-bkn-recap.nba.mp4',
         'md5': u'c0edcfc37607344e2ff8f13c378c88a4',
         'info_dict': {
+            'id': '0021200253-okc-bkn-recap.nba',
+            'ext': 'mp4',
             'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
             'title': 'Thunder vs. Nets',
         },
@@ -19,7 +20,7 @@ class NBAIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(1)
+        video_id = mobj.group('id')
 
         webpage = self._download_webpage(url, video_id)
 
@@ -33,7 +34,6 @@ class NBAIE(InfoExtractor):
         return {
             'id': shortened_video_id,
             'url': video_url,
-            'ext': 'mp4',
             'title': title,
             'description': description,
         }
index e8bbfff7bd59eeaca0e04c83ee1599baff8af088..aa34665d1669f32ab31a02618c58ef9c4b130fe2 100644 (file)
@@ -1,32 +1,99 @@
+from __future__ import unicode_literals
+
 import re
+import json
 
 from .common import InfoExtractor
 from ..utils import find_xpath_attr, compat_str
 
 
-class NBCNewsIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.nbcnews\.com/video/.+?/(?P<id>\d+)'
+class NBCIE(InfoExtractor):
+    _VALID_URL = r'http://www\.nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+)'
 
     _TEST = {
-        u'url': u'http://www.nbcnews.com/video/nbc-news/52753292',
-        u'file': u'52753292.flv',
-        u'md5': u'47abaac93c6eaf9ad37ee6c4463a5179',
-        u'info_dict': {
-            u'title': u'Crew emerges after four-month Mars food study',
-            u'description': u'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+        'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188',
+        'md5': '54d0fbc33e0b853a65d7b4de5c06d64e',
+        'info_dict': {
+            'id': 'u1RInQZRN7QJ',
+            'ext': 'flv',
+            'title': 'I Am a Firefighter',
+            'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
-        info = all_info.find('video')
+        webpage = self._download_webpage(url, video_id)
+        theplatform_url = self._search_regex('class="video-player video-player-full" data-mpx-url="(.*?)"', webpage, 'theplatform url')
+        if theplatform_url.startswith('//'):
+            theplatform_url = 'http:' + theplatform_url
+        return self.url_result(theplatform_url)
+
+
+class NBCNewsIE(InfoExtractor):
+    _VALID_URL = r'''(?x)https?://www\.nbcnews\.com/
+        ((video/.+?/(?P<id>\d+))|
+        (feature/[^/]+/(?P<title>.+)))
+        '''
 
-        return {'id': video_id,
+    _TESTS = [
+        {
+            'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
+            'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
+            'info_dict': {
+                'id': '52753292',
+                'ext': 'flv',
+                'title': 'Crew emerges after four-month Mars food study',
+                'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
+            },
+        },
+        {
+            'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
+            'md5': 'b2421750c9f260783721d898f4c42063',
+            'info_dict': {
+                'id': 'I1wpAI_zmhsQ',
+                'ext': 'flv',
+                'title': 'How Twitter Reacted To The Snowden Interview',
+                'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
+            },
+            'add_ie': ['ThePlatform'],
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        if video_id is not None:
+            all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+            info = all_info.find('video')
+
+            return {
+                'id': video_id,
                 'title': info.find('headline').text,
                 'ext': 'flv',
                 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
                 'description': compat_str(info.find('caption').text),
                 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
-                }
+            }
+        else:
+            # "feature" pages use theplatform.com
+            title = mobj.group('title')
+            webpage = self._download_webpage(url, title)
+            bootstrap_json = self._search_regex(
+                r'var bootstrapJson = ({.+})\s*$', webpage, 'bootstrap json',
+                flags=re.MULTILINE)
+            bootstrap = json.loads(bootstrap_json)
+            info = bootstrap['results'][0]['video']
+            playlist_url = info['fallbackPlaylistUrl'] + '?form=MPXNBCNewsAPI'
+            mpxid = info['mpxId']
+            all_videos = self._download_json(playlist_url, title)['videos']
+            # The response contains additional videos
+            info = next(v for v in all_videos if v['mpxId'] == mpxid)
+
+            return {
+                '_type': 'url',
+                # We get the best quality video
+                'url': info['videoAssets'][-1]['publicUrl'],
+                'ie_key': 'ThePlatform',
+            }
index 0650f956481c9011032a278fc1a9375b98e26539..3d6096e46fbe6df0f6885fbdae483f05ac07cf6f 100644 (file)
@@ -4,7 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    qualities,
+)
 
 
 class NDRIE(InfoExtractor):
@@ -45,17 +49,16 @@ class NDRIE(InfoExtractor):
 
         page = self._download_webpage(url, video_id, 'Downloading page')
 
-        title = self._og_search_title(page)
+        title = self._og_search_title(page).strip()
         description = self._og_search_description(page)
+        if description:
+            description = description.strip()
 
-        mobj = re.search(
-            r'<div class="duration"><span class="min">(?P<minutes>\d+)</span>:<span class="sec">(?P<seconds>\d+)</span></div>',
-            page)
-        duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None
+        duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False))
 
         formats = []
 
-        mp3_url = re.search(r'''{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
+        mp3_url = re.search(r'''\{src:'(?P<audio>[^']+)', type:"audio/mp3"},''', page)
         if mp3_url:
             formats.append({
                 'url': mp3_url.group('audio'),
@@ -64,13 +67,15 @@ class NDRIE(InfoExtractor):
 
         thumbnail = None
 
-        video_url = re.search(r'''3: {src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
+        video_url = re.search(r'''3: \{src:'(?P<video>.+?)\.hi\.mp4', type:"video/mp4"},''', page)
         if video_url:
-            thumbnail = self._html_search_regex(r'(?m)title: "NDR PLAYER",\s*poster: "([^"]+)",',
-                page, 'thumbnail', fatal=False)
-            if thumbnail:
-                thumbnail = 'http://www.ndr.de' + thumbnail
-            for format_id in ['lo', 'hi', 'hq']:
+            thumbnails = re.findall(r'''\d+: \{src: "([^"]+)"(?: \|\| '[^']+')?, quality: '([^']+)'}''', page)
+            if thumbnails:
+                quality_key = qualities(['xs', 's', 'm', 'l', 'xl'])
+                largest = max(thumbnails, key=lambda thumb: quality_key(thumb[1]))
+                thumbnail = 'http://www.ndr.de' + largest[0]
+
+            for format_id in 'lo', 'hi', 'hq':
                 formats.append({
                     'url': '%s.%s.mp4' % (video_url.group('video'), format_id),
                     'format_id': format_id,
diff --git a/youtube_dl/extractor/newstube.py b/youtube_dl/extractor/newstube.py
new file mode 100644 (file)
index 0000000..2fd5b8f
--- /dev/null
@@ -0,0 +1,87 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class NewstubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?newstube\.ru/media/(?P<id>.+)'
+    _TEST = {
+        'url': 'http://newstube.ru/media/na-korable-progress-prodolzhaetsya-testirovanie-sistemy-kurs',
+        'info_dict': {
+            'id': 'd156a237-a6e9-4111-a682-039995f721f1',
+            'ext': 'flv',
+            'title': 'На корабле «Прогресс» продолжается тестирование системы «Курс»',
+            'description': 'md5:d0cbe7b4a6f600552617e48548d5dc77',
+            'duration': 20.04,
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id, 'Downloading page')
+
+        video_guid = self._html_search_regex(
+            r'<meta property="og:video" content="https?://(?:www\.)?newstube\.ru/freshplayer\.swf\?guid=(?P<guid>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            page, 'video GUID')
+
+        player = self._download_xml(
+            'http://p.newstube.ru/v2/player.asmx/GetAutoPlayInfo6?state=&url=%s&sessionId=&id=%s&placement=profile&location=n2' % (url, video_guid),
+            video_guid, 'Downloading player XML')
+
+        def ns(s):
+            return s.replace('/', '/%(ns)s') % {'ns': '{http://app1.newstube.ru/N2SiteWS/player.asmx}'}
+
+        session_id = player.find(ns('./SessionId')).text
+        media_info = player.find(ns('./Medias/MediaInfo'))
+        title = media_info.find(ns('./Name')).text
+        description = self._og_search_description(page)
+        thumbnail = media_info.find(ns('./KeyFrame')).text
+        duration = int(media_info.find(ns('./Duration')).text) / 1000.0
+
+        formats = []
+
+        for stream_info in media_info.findall(ns('./Streams/StreamInfo')):
+            media_location = stream_info.find(ns('./MediaLocation'))
+            if media_location is None:
+                continue
+
+            server = media_location.find(ns('./Server')).text
+            app = media_location.find(ns('./App')).text
+            media_id = stream_info.find(ns('./Id')).text
+            quality_id = stream_info.find(ns('./QualityId')).text
+            name = stream_info.find(ns('./Name')).text
+            width = int(stream_info.find(ns('./Width')).text)
+            height = int(stream_info.find(ns('./Height')).text)
+
+            formats.append({
+                'url': 'rtmp://%s/%s' % (server, app),
+                'app': app,
+                'play_path': '01/%s' % video_guid.upper(),
+                'rtmp_conn': ['S:%s' % session_id, 'S:%s' % media_id, 'S:n2'],
+                'page_url': url,
+                'ext': 'flv',
+                'format_id': quality_id,
+                'format_note': name,
+                'width': width,
+                'height': height,
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_guid,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
index e88566c693c9139ae725604427fb8cd3894e7e3b..ba7b77a467e8e7ba969fb9d4555e47fad6ed7196 100644 (file)
@@ -73,14 +73,16 @@ class NFBIE(InfoExtractor):
                 title = media.find('title').text
                 description = media.find('description').text
                 # It seems assets always go from lower to better quality, so no need to sort
-                formats = [{
-                    'url': x.find('default/streamerURI').text,
-                    'app': x.find('default/streamerURI').text.split('/', 3)[3],
-                    'play_path': x.find('default/url').text,
-                    'rtmp_live': False,
-                    'ext': 'mp4',
-                    'format_id': x.get('quality'),
-                } for x in media.findall('assets/asset')]
+                for asset in media.findall('assets/asset'):
+                    for x in asset:
+                        formats.append({
+                            'url': x.find('streamerURI').text,
+                            'app': x.find('streamerURI').text.split('/', 3)[3],
+                            'play_path': x.find('url').text,
+                            'rtmp_live': False,
+                            'ext': 'mp4',
+                            'format_id': '%s-%s' % (x.tag, asset.get('quality')),
+                        })
 
         return {
             'id': video_id,
index 46774317c9f6b47a337b20b70100a4d56e7fe022..517a72561bbaf444c54daabde4bae61f341086b3 100644 (file)
@@ -1,12 +1,10 @@
 # encoding: utf-8
+from __future__ import unicode_literals
 
 import re
-import socket
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_http_client,
-    compat_urllib_error,
     compat_urllib_parse,
     compat_urllib_request,
     compat_urlparse,
@@ -18,57 +16,54 @@ from ..utils import (
 
 
 class NiconicoIE(InfoExtractor):
-    IE_NAME = u'niconico'
-    IE_DESC = u'ニコニコ動画'
+    IE_NAME = 'niconico'
+    IE_DESC = 'ニコニコ動画'
 
     _TEST = {
-        u'url': u'http://www.nicovideo.jp/watch/sm22312215',
-        u'file': u'sm22312215.mp4',
-        u'md5': u'd1a75c0823e2f629128c43e1212760f9',
-        u'info_dict': {
-            u'title': u'Big Buck Bunny',
-            u'uploader': u'takuya0301',
-            u'uploader_id': u'2698420',
-            u'upload_date': u'20131123',
-            u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+        'url': 'http://www.nicovideo.jp/watch/sm22312215',
+        'md5': 'd1a75c0823e2f629128c43e1212760f9',
+        'info_dict': {
+            'id': 'sm22312215',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'uploader': 'takuya0301',
+            'uploader_id': '2698420',
+            'upload_date': '20131123',
+            'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
         },
-        u'params': {
-            u'username': u'ydl.niconico@gmail.com',
-            u'password': u'youtube-dl',
+        'params': {
+            'username': 'ydl.niconico@gmail.com',
+            'password': 'youtube-dl',
         },
     }
 
     _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
     _NETRC_MACHINE = 'niconico'
-    # If True it will raise an error if no login info is provided
-    _LOGIN_REQUIRED = True
 
     def _real_initialize(self):
         self._login()
 
     def _login(self):
         (username, password) = self._get_login_info()
-        # No authentication to be performed
         if username is None:
-            if self._LOGIN_REQUIRED:
-                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
-            return False
+            # Login is required
+            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
 
         # Log in
         login_form_strs = {
-            u'mail': username,
-            u'password': password,
+            'mail': username,
+            'password': password,
         }
         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
         # chokes on unicode
-        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
         login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
         request = compat_urllib_request.Request(
-            u'https://secure.nicovideo.jp/secure/login', login_data)
+            'https://secure.nicovideo.jp/secure/login', login_data)
         login_results = self._download_webpage(
-            request, u'', note=u'Logging in', errnote=u'Unable to log in')
+            request, None, note='Logging in', errnote='Unable to log in')
         if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
-            self._downloader.report_warning(u'unable to log in: bad username or password')
+            self._downloader.report_warning('unable to log in: bad username or password')
             return False
         return True
 
@@ -82,12 +77,12 @@ class NiconicoIE(InfoExtractor):
 
         video_info = self._download_xml(
             'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
-            note=u'Downloading video info page')
+            note='Downloading video info page')
 
         # Get flv info
         flv_info_webpage = self._download_webpage(
-            u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
-            video_id, u'Downloading flv info')
+            'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+            video_id, 'Downloading flv info')
         video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
 
         # Start extracting information
@@ -106,22 +101,22 @@ class NiconicoIE(InfoExtractor):
         url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
         try:
             user_info = self._download_xml(
-                url, video_id, note=u'Downloading user information')
+                url, video_id, note='Downloading user information')
             video_uploader = user_info.find('.//nickname').text
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+        except ExtractorError as err:
+            self._downloader.report_warning('Unable to download user info webpage: %s' % compat_str(err))
 
         return {
-            'id':          video_id,
-            'url':         video_real_url,
-            'title':       video_title,
-            'ext':         video_extension,
-            'format':      video_format,
-            'thumbnail':   video_thumbnail,
+            'id': video_id,
+            'url': video_real_url,
+            'title': video_title,
+            'ext': video_extension,
+            'format': video_format,
+            'thumbnail': video_thumbnail,
             'description': video_description,
-            'uploader':    video_uploader,
+            'uploader': video_uploader,
             'upload_date': video_upload_date,
             'uploader_id': video_uploader_id,
-            'view_count':  video_view_count,
+            'view_count': video_view_count,
             'webpage_url': video_webpage_url,
         }
index 2b7236be5c96a4b2e5ae2a6536faa0b9137e2af0..c2e7b67c7e5334c4f906c6a93387d18170a03c26 100644 (file)
@@ -1,45 +1,68 @@
 from __future__ import unicode_literals
 
-import json
 import re
+import json
 
 from .common import InfoExtractor
+from ..utils import str_to_int
 
 
 class NineGagIE(InfoExtractor):
     IE_NAME = '9gag'
-    _VALID_URL = r'^https?://(?:www\.)?9gag\.tv/v/(?P<id>[0-9]+)'
+    _VALID_URL = r'''(?x)^https?://(?:www\.)?9gag\.tv/
+        (?:
+            v/(?P<numid>[0-9]+)|
+            p/(?P<id>[a-zA-Z0-9]+)/(?P<display_id>[^?#/]+)
+        )
+    '''
 
-    _TEST = {
+    _TESTS = [{
         "url": "http://9gag.tv/v/1912",
-        "file": "1912.mp4",
         "info_dict": {
+            "id": "1912",
+            "ext": "mp4",
             "description": "This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)",
-            "title": "\"People Are Awesome 2013\" Is Absolutely Awesome"
+            "title": "\"People Are Awesome 2013\" Is Absolutely Awesome",
+            "view_count": int,
+            "thumbnail": "re:^https?://",
         },
         'add_ie': ['Youtube']
-    }
+    },
+    {
+        'url': 'http://9gag.tv/p/KklwM/alternate-banned-opening-scene-of-gravity?ref=fsidebar',
+        'info_dict': {
+            'id': 'KklwM',
+            'ext': 'mp4',
+            'display_id': 'alternate-banned-opening-scene-of-gravity',
+            "description": "While Gravity was a pretty awesome movie already, YouTuber Krishna Shenoi came up with a way to improve upon it, introducing a much better solution to Sandra Bullock's seemingly endless tumble in space. The ending is priceless.",
+            'title': "Banned Opening Scene Of \"Gravity\" That Changes The Whole Movie",
+        },
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.group('numid') or mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage = self._download_webpage(url, display_id)
 
-        webpage = self._download_webpage(url, video_id)
-        data_json = self._html_search_regex(r'''(?x)
-            <div\s*id="tv-video"\s*data-video-source="youtube"\s*
-                data-video-meta="([^"]+)"''', webpage, 'video metadata')
+        post_view = json.loads(self._html_search_regex(
+            r'var postView = new app\.PostView\({\s*post:\s*({.+?}),', webpage, 'post view'))
 
-        data = json.loads(data_json)
+        youtube_id = post_view['videoExternalId']
+        title = post_view['title']
+        description = post_view['description']
+        view_count = str_to_int(post_view['externalView'])
+        thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
 
         return {
             '_type': 'url_transparent',
-            'url': data['youtubeVideoId'],
+            'url': youtube_id,
             'ie_key': 'Youtube',
             'id': video_id,
-            'title': data['title'],
-            'description': data['description'],
-            'view_count': int(data['view_count']),
-            'like_count': int(data['statistic']['like']),
-            'dislike_count': int(data['statistic']['dislike']),
-            'thumbnail': data['thumbnail_url'],
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'view_count': view_count,
+            'thumbnail': thumbnail,
         }
diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py
new file mode 100644 (file)
index 0000000..d451cd1
--- /dev/null
@@ -0,0 +1,106 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+    compat_str,
+)
+
+
+class NocoIE(InfoExtractor):
+    _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/',
+        'md5': '0a993f0058ddbcd902630b2047ef710e',
+        'info_dict': {
+            'id': '11538',
+            'ext': 'mp4',
+            'title': 'Ami Ami Idol - Hello! France',
+            'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86',
+            'upload_date': '20140412',
+            'uploader': 'Nolife',
+            'uploader_id': 'NOL',
+            'duration': 2851.2,
+        },
+        'skip': 'Requires noco account',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        medias = self._download_json(
+            'http://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON')
+
+        formats = []
+
+        for fmt in medias['fr']['video_list']['default']['quality_list']:
+            format_id = fmt['quality_key']
+
+            file = self._download_json(
+                'http://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id),
+                video_id, 'Downloading %s video JSON' % format_id)
+
+            file_url = file['file']
+            if not file_url:
+                continue
+
+            if file_url == 'forbidden':
+                raise ExtractorError(
+                    '%s returned error: %s - %s' % (
+                        self.IE_NAME, file['popmessage']['title'], file['popmessage']['message']),
+                    expected=True)
+
+            formats.append({
+                'url': file_url,
+                'format_id': format_id,
+                'width': fmt['res_width'],
+                'height': fmt['res_lines'],
+                'abr': fmt['audiobitrate'],
+                'vbr': fmt['videobitrate'],
+                'filesize': fmt['filesize'],
+                'format_note': fmt['quality_name'],
+                'preference': fmt['priority'],
+            })
+
+        self._sort_formats(formats)
+
+        show = self._download_json(
+            'http://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0]
+
+        upload_date = unified_strdate(show['indexed'])
+        uploader = show['partner_name']
+        uploader_id = show['partner_key']
+        duration = show['duration_ms'] / 1000.0
+        thumbnail = show['screenshot']
+
+        episode = show.get('show_TT') or show.get('show_OT')
+        family = show.get('family_TT') or show.get('family_OT')
+        episode_number = show.get('episode_number')
+
+        title = ''
+        if family:
+            title += family
+        if episode_number:
+            title += ' #' + compat_str(episode_number)
+        if episode:
+            title += ' - ' + episode
+
+        description = show.get('show_resume') or show.get('family_resume')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
index 81b7855b0b23680398909eb74e57eeb3d21dc4bc..25e71a56e196d9cf7f9d2423c47293b01e46cd24 100644 (file)
@@ -1,61 +1,51 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 
 from ..utils import (
-    ExtractorError,
     unified_strdate,
 )
 
+
 class NormalbootsIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
+    _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<videoid>[0-9a-z-]*)/?$'
     _TEST = {
-        u'url': u'http://normalboots.com/video/home-alone-games-jontron/',
-        u'file': u'home-alone-games-jontron.mp4',
-        u'md5': u'8bf6de238915dd501105b44ef5f1e0f6',
-        u'info_dict': {
-            u'title': u'Home Alone Games - JonTron - NormalBoots',
-            u'description': u'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for \u2018Tense Battle Theme\u2019:\xa0http://www.youtube.com/Kiamet/',
-            u'uploader': u'JonTron',
-            u'upload_date': u'20140125',
+        'url': 'http://normalboots.com/video/home-alone-games-jontron/',
+        'md5': '8bf6de238915dd501105b44ef5f1e0f6',
+        'info_dict': {
+            'id': 'home-alone-games-jontron',
+            'ext': 'mp4',
+            'title': 'Home Alone Games - JonTron - NormalBoots',
+            'description': 'Jon is late for Christmas. Typical. Thanks to: Paul Ritchey for Co-Writing/Filming: http://www.youtube.com/user/ContinueShow Michael Azzi for Christmas Intro Animation: http://michafrar.tumblr.com/ Jerrod Waters for Christmas Intro Music: http://www.youtube.com/user/xXJerryTerryXx Casey Ormond for ‘Tense Battle Theme’:\xa0http://www.youtube.com/Kiamet/',
+            'uploader': 'JonTron',
+            'upload_date': '20140125',
         }
     }
-    
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('videoid')
-        
-        info = {
-            'id': video_id,
-            'uploader': None,
-            'upload_date': None,
-        }
-        
-        if url[:4] != 'http':
-            url = 'http://' + url
-        
+
         webpage = self._download_webpage(url, video_id)
-        video_title = self._og_search_title(webpage)
-        video_description = self._og_search_description(webpage)
-        video_thumbnail = self._og_search_thumbnail(webpage)
         video_uploader = self._html_search_regex(r'Posted\sby\s<a\shref="[A-Za-z0-9/]*">(?P<uploader>[A-Za-z]*)\s</a>',
             webpage, 'uploader')
-        raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', 
+        raw_upload_date = self._html_search_regex('<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>',
             webpage, 'date')
         video_upload_date = unified_strdate(raw_upload_date)
-        video_upload_date = unified_strdate(raw_upload_date)
-            
+
         player_url = self._html_search_regex(r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', webpage, 'url')
         player_page = self._download_webpage(player_url, video_id)
-        video_url = u'http://player.screenwavemedia.com/' + self._html_search_regex(r"'file':\s'(?P<file>[0-9A-Za-z-_\.]+)'", player_page, 'file')
-        
-        info['url'] = video_url
-        info['title'] = video_title
-        info['description'] = video_description
-        info['thumbnail'] = video_thumbnail
-        info['uploader'] = video_uploader
-        info['upload_date'] = video_upload_date
-        
-        return info
+        video_url = self._html_search_regex(r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+        }
index 6af8d934c8baa18b457fe1bdc3e96f42b5a82588..2e7ab1e4f9ce23c422fddf478b23b1497aac02ae 100644 (file)
@@ -9,14 +9,26 @@ from ..utils import (
 )
 
 
-class NovamovIE(InfoExtractor):
-    _VALID_URL = r'http://(?:(?:www\.)?novamov\.com/video/|(?:(?:embed|www)\.)novamov\.com/embed\.php\?v=)(?P<videoid>[a-z\d]{13})'
+class NovaMovIE(InfoExtractor):
+    IE_NAME = 'novamov'
+    IE_DESC = 'NovaMov'
+
+    _VALID_URL_TEMPLATE = r'http://(?:(?:www\.)?%(host)s/(?:file|video)/|(?:(?:embed|www)\.)%(host)s/embed\.php\?(?:.*?&)?v=)(?P<id>[a-z\d]{13})'
+    _VALID_URL = _VALID_URL_TEMPLATE % {'host': 'novamov\.com'}
+
+    _HOST = 'www.novamov.com'
+
+    _FILE_DELETED_REGEX = r'This file no longer exists on our servers!</h2>'
+    _FILEKEY_REGEX = r'flashvars\.filekey="(?P<filekey>[^"]+)";'
+    _TITLE_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>'
+    _DESCRIPTION_REGEX = r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>'
 
     _TEST = {
         'url': 'http://www.novamov.com/video/4rurhn9x446jj',
-        'file': '4rurhn9x446jj.flv',
         'md5': '7205f346a52bbeba427603ba10d4b935',
         'info_dict': {
+            'id': '4rurhn9x446jj',
+            'ext': 'flv',
             'title': 'search engine optimization',
             'description': 'search engine optimization is used to rank the web page in the google search engine'
         },
@@ -25,33 +37,27 @@ class NovamovIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('videoid')
-
-        page = self._download_webpage('http://www.novamov.com/video/%s' % video_id,
-                                      video_id, 'Downloading video page')
+        video_id = mobj.group('id')
 
-        if re.search(r'This file no longer exists on our servers!</h2>', page) is not None:
-            raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+        page = self._download_webpage(
+            'http://%s/video/%s' % (self._HOST, video_id), video_id, 'Downloading video page')
 
-        filekey = self._search_regex(
-            r'flashvars\.filekey="(?P<filekey>[^"]+)";', page, 'filekey')
+        if re.search(self._FILE_DELETED_REGEX, page) is not None:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
 
-        title = self._html_search_regex(
-            r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>([^<]+)</h3>',
-            page, 'title', fatal=False)
+        filekey = self._search_regex(self._FILEKEY_REGEX, page, 'filekey')
 
-        description = self._html_search_regex(
-            r'(?s)<div class="v_tab blockborder rounded5" id="v_tab1">\s*<h3>[^<]+</h3><p>([^<]+)</p>',
-            page, 'description', fatal=False)
+        title = self._html_search_regex(self._TITLE_REGEX, page, 'title', fatal=False)
+        description = self._html_search_regex(self._DESCRIPTION_REGEX, page, 'description', default='', fatal=False)
 
         api_response = self._download_webpage(
-            'http://www.novamov.com/api/player.api.php?key=%s&file=%s' % (filekey, video_id),
-            video_id, 'Downloading video api response')
+            'http://%s/api/player.api.php?key=%s&file=%s' % (self._HOST, filekey, video_id), video_id,
+            'Downloading video api response')
 
         response = compat_urlparse.parse_qs(api_response)
 
         if 'error_msg' in response:
-            raise ExtractorError('novamov returned error: %s' % response['error_msg'][0], expected=True)
+            raise ExtractorError('%s returned error: %s' % (self.IE_NAME, response['error_msg'][0]), expected=True)
 
         video_url = response['url'][0]
 
@@ -60,4 +66,4 @@ class NovamovIE(InfoExtractor):
             'url': video_url,
             'title': title,
             'description': description
-        }
+        }
\ No newline at end of file
index b1bcb7e54cf3f01989eb17c51160acce680eed2c..1c5e9401f36c72a73a701bdffc89529979a1eaaf 100644 (file)
@@ -4,9 +4,7 @@ import re
 
 from .brightcove import BrightcoveIE
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
+from ..utils import ExtractorError
 
 
 class NownessIE(InfoExtractor):
@@ -14,9 +12,10 @@ class NownessIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.nowness.com/day/2013/6/27/3131/candor--the-art-of-gesticulation',
-        'file': '2520295746001.mp4',
-        'md5': '0ece2f70a7bd252c7b00f3070182d418',
+        'md5': '068bc0202558c2e391924cb8cc470676',
         'info_dict': {
+            'id': '2520295746001',
+            'ext': 'mp4',
             'description': 'Candor: The Art of Gesticulation',
             'uploader': 'Nowness',
             'title': 'Candor: The Art of Gesticulation',
index 168ca8b9fa961f350fd3d3292bda648589be5197..bfba184184c09bfd429698229efc1375c334e617 100644 (file)
@@ -1,46 +1,28 @@
-import re
+from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import compat_urlparse
+from .novamov import NovaMovIE
 
 
-class NowVideoIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:www\.)?nowvideo\.(?:ch|sx)/video/(?P<id>\w+)'
-    _TEST = {
-        u'url': u'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
-        u'file': u'0mw0yow7b6dxa.flv',
-        u'md5': u'f8fbbc8add72bd95b7850c6a02fc8817',
-        u'info_dict': {
-            u"title": u"youtubedl test video _BaW_jenozKc.mp4"
-        }
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        webpage_url = 'http://www.nowvideo.ch/video/' + video_id
-        embed_url = 'http://embed.nowvideo.ch/embed.php?v=' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
-        embed_page = self._download_webpage(embed_url, video_id,
-            u'Downloading embed page')
+class NowVideoIE(NovaMovIE):
+    IE_NAME = 'nowvideo'
+    IE_DESC = 'NowVideo'
 
-        self.report_extraction(video_id)
+    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:ch|sx|eu|at|ag|co)'}
 
-        video_title = self._html_search_regex(r'<h4>(.*)</h4>',
-            webpage, u'video title')
+    _HOST = 'www.nowvideo.ch'
 
-        video_key = self._search_regex(r'var fkzd="(.*)";',
-            embed_page, u'video key')
+    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+    _FILEKEY_REGEX = r'var fkzd="([^"]+)";'
+    _TITLE_REGEX = r'<h4>([^<]+)</h4>'
+    _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>'
 
-        api_call = "http://www.nowvideo.ch/api/player.api.php?file={0}&numOfErrors=0&cid=1&key={1}".format(video_id, video_key)
-        api_response = self._download_webpage(api_call, video_id,
-            u'Downloading API page')
-        video_url = compat_urlparse.parse_qs(api_response)[u'url'][0]
-
-        return [{
-            'id':        video_id,
-            'url':       video_url,
-            'ext':       'flv',
-            'title':     video_title,
-        }]
+    _TEST = {
+        'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa',
+        'md5': 'f8fbbc8add72bd95b7850c6a02fc8817',
+        'info_dict': {
+            'id': '0mw0yow7b6dxa',
+            'ext': 'flv',
+            'title': 'youtubedl test video _BaW_jenozKc.mp4',
+            'description': 'Description',
+        }
+    }
\ No newline at end of file
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
new file mode 100644 (file)
index 0000000..3a6a788
--- /dev/null
@@ -0,0 +1,145 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    unified_strdate,
+)
+
+
+class NRKIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?nrk\.no/(?:video|lyd)/[^/]+/(?P<id>[\dA-F]{16})'
+
+    _TESTS = [
+        {
+            'url': 'http://www.nrk.no/video/dompap_og_andre_fugler_i_piip_show/D0FA54B5C8B6CE59/emne/piipshow/',
+            'md5': 'a6eac35052f3b242bb6bb7f43aed5886',
+            'info_dict': {
+                'id': '150533',
+                'ext': 'flv',
+                'title': 'Dompap og andre fugler i Piip-Show',
+                'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f'
+            }
+        },
+        {
+            'url': 'http://www.nrk.no/lyd/lyd_av_oppleser_for_blinde/AEFDDD5473BA0198/',
+            'md5': '3471f2a51718195164e88f46bf427668',
+            'info_dict': {
+                'id': '154915',
+                'ext': 'flv',
+                'title': 'Slik høres internett ut når du er blind',
+                'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id)
+
+        video_id = self._html_search_regex(r'<div class="nrk-video" data-nrk-id="(\d+)">', page, 'video id')
+
+        data = self._download_json(
+            'http://v7.psapi.nrk.no/mediaelement/%s' % video_id, video_id, 'Downloading media JSON')
+
+        if data['usageRights']['isGeoBlocked']:
+            raise ExtractorError('NRK har ikke rettig-heter til å vise dette programmet utenfor Norge', expected=True)
+
+        video_url = data['mediaUrl'] + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124'
+
+        images = data.get('images')
+        if images:
+            thumbnails = images['webImages']
+            thumbnails.sort(key=lambda image: image['pixelWidth'])
+            thumbnail = thumbnails[-1]['imageUrl']
+        else:
+            thumbnail = None
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'flv',
+            'title': data['title'],
+            'description': data['description'],
+            'thumbnail': thumbnail,
+        }
+
+
+class NRKTVIE(InfoExtractor):
+    _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})'
+
+    _TESTS = [
+        {
+            'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/muhh48000314/23-05-2014',
+            'md5': '7b96112fbae1faf09a6f9ae1aff6cb84',
+            'info_dict': {
+                'id': 'muhh48000314',
+                'ext': 'flv',
+                'title': '20 spørsmål',
+                'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
+                'upload_date': '20140523',
+                'duration': 1741.52,
+            }
+        },
+        {
+            'url': 'http://tv.nrk.no/program/mdfp15000514',
+            'md5': '383650ece2b25ecec996ad7b5bb2a384',
+            'info_dict': {
+                'id': 'mdfp15000514',
+                'ext': 'flv',
+                'title': 'Kunnskapskanalen: Grunnlovsjubiléet - Stor ståhei for ingenting',
+                'description': 'md5:654c12511f035aed1e42bdf5db3b206a',
+                'upload_date': '20140524',
+                'duration': 4605.0,
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta('title', page, 'title')
+        description = self._html_search_meta('description', page, 'description')
+        thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False)
+        upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False))
+        duration = self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)
+        if duration:
+            duration = float(duration)
+
+        formats = []
+
+        f4m_url = re.search(r'data-media="([^"]+)"', page)
+        if f4m_url:
+            formats.append({
+                'url': f4m_url.group(1) + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+                'format_id': 'f4m',
+                'ext': 'flv',
+            })
+
+        m3u8_url = re.search(r'data-hls-media="([^"]+)"', page)
+        if m3u8_url:
+            formats.append({
+                'url': m3u8_url.group(1),
+                'format_id': 'm3u8',
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py
new file mode 100644 (file)
index 0000000..733ed6c
--- /dev/null
@@ -0,0 +1,149 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unescapeHTML
+)
+
+
+class NTVIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.ntv.ru/novosti/863142/',
+            'info_dict': {
+                'id': '746000',
+                'ext': 'flv',
+                'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+                'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+                'duration': 136,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ntv.ru/video/novosti/750370/',
+            'info_dict': {
+                'id': '750370',
+                'ext': 'flv',
+                'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+                'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+                'duration': 172,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
+            'info_dict': {
+                'id': '747480',
+                'ext': 'flv',
+                'title': '«Сегодня». 21 марта 2014 года. 16:00 ',
+                'description': '«Сегодня». 21 марта 2014 года. 16:00 ',
+                'duration': 1496,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ntv.ru/kino/Koma_film',
+            'info_dict': {
+                'id': '758100',
+                'ext': 'flv',
+                'title': 'Остросюжетный фильм «Кома»',
+                'description': 'Остросюжетный фильм «Кома»',
+                'duration': 5592,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
+            'info_dict': {
+                'id': '751482',
+                'ext': 'flv',
+                'title': '«Дело врачей»: «Деревце жизни»',
+                'description': '«Дело врачей»: «Деревце жизни»',
+                'duration': 2590,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+    ]
+
+    _VIDEO_ID_REGEXES = [
+        r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
+        r'<video embed=[^>]+><id>(\d+)</id>',
+        r'<video restriction[^>]+><key>(\d+)</key>',
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id)
+
+        video_id = self._html_search_regex(self._VIDEO_ID_REGEXES, page, 'video id')
+
+        player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML')
+        title = unescapeHTML(player.find('./data/title').text)
+        description = unescapeHTML(player.find('./data/description').text)
+
+        video = player.find('./data/video')
+        video_id = video.find('./id').text
+        thumbnail = video.find('./splash').text
+        duration = int(video.find('./totaltime').text)
+        view_count = int(video.find('./views').text)
+        puid22 = video.find('./puid22').text
+
+        apps = {
+            '4': 'video1',
+            '7': 'video2',
+        }
+
+        app = apps.get(puid22, apps['4'])
+
+        formats = []
+        for format_id in ['', 'hi', 'webm']:
+            file = video.find('./%sfile' % format_id)
+            if file is None:
+                continue
+            size = video.find('./%ssize' % format_id)
+            formats.append({
+                'url': 'rtmp://media.ntv.ru/%s' % app,
+                'app': app,
+                'play_path': file.text,
+                'rtmp_conn': 'B:1',
+                'player_url': 'http://www.ntv.ru/swf/vps1.swf?update=20131128',
+                'page_url': 'http://www.ntv.ru',
+                'flash_ver': 'LNX 11,2,202,341',
+                'rtmp_live': True,
+                'ext': 'flv',
+                'filesize': int(size.text),
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py
new file mode 100644 (file)
index 0000000..e3db9fe
--- /dev/null
@@ -0,0 +1,48 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class NuvidIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www|m)\.nuvid\.com/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://m.nuvid.com/video/1310741/',
+        'md5': 'eab207b7ac4fccfb4e23c86201f11277',
+        'info_dict': {
+            'id': '1310741',
+            'ext': 'mp4',
+            "title": "Horny babes show their awesome bodeis and",
+            "age_limit": 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        murl = url.replace('://www.', '://m.')
+        webpage = self._download_webpage(murl, video_id)
+
+        title = self._html_search_regex(
+            r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>',
+            webpage, 'title').strip()
+
+        url_end = self._html_search_regex(
+            r'href="(/[^"]+)"[^>]*data-link_type="mp4"',
+            webpage, 'video_url')
+        video_url = 'http://m.nuvid.com' + url_end
+
+        thumbnail = self._html_search_regex(
+            r'href="(/thumbs/[^"]+)"[^>]*data-link_type="thumbs"',
+            webpage, 'thumbnail URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': title,
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py
new file mode 100644 (file)
index 0000000..7bf105d
--- /dev/null
@@ -0,0 +1,77 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_iso8601
+
+
+class NYTimesIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/video/(?:[^/]+/)+(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
+        'md5': '18a525a510f942ada2720db5f31644c0',
+        'info_dict': {
+            'id': '100000002847155',
+            'ext': 'mov',
+            'title': 'Verbatim: What Is a Photocopier?',
+            'description': 'md5:93603dada88ddbda9395632fdc5da260',
+            'timestamp': 1398631707,
+            'upload_date': '20140427',
+            'uploader': 'Brett Weiner',
+            'duration': 419,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        video_data = self._download_json(
+            'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
+
+        title = video_data['headline']
+        description = video_data['summary']
+        duration = video_data['duration'] / 1000.0
+
+        uploader = video_data['byline']
+        timestamp = parse_iso8601(video_data['publication_date'][:-8])
+
+        def get_file_size(file_size):
+            if isinstance(file_size, int):
+                return file_size
+            elif isinstance(file_size, dict):
+                return int(file_size.get('value', 0))
+            else:
+                return 0
+
+        formats = [
+            {
+                'url': video['url'],
+                'format_id': video['type'],
+                'vcodec': video['video_codec'],
+                'width': video['width'],
+                'height': video['height'],
+                'filesize': get_file_size(video['fileSize']),
+            } for video in video_data['renditions']
+        ]
+        self._sort_formats(formats)
+
+        thumbnails = [
+            {
+                'url': 'http://www.nytimes.com/%s' % image['url'],
+                'resolution': '%dx%d' % (image['width'], image['height']),
+            } for image in video_data['images']
+        ]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'uploader': uploader,
+            'duration': duration,
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/oe1.py b/youtube_dl/extractor/oe1.py
new file mode 100644 (file)
index 0000000..38971ab
--- /dev/null
@@ -0,0 +1,40 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import calendar
+import datetime
+import re
+
+from .common import InfoExtractor
+
+# audios on oe1.orf.at are only available for 7 days, so we can't
+# add tests.
+
+
+class OE1IE(InfoExtractor):
+    IE_DESC = 'oe1.orf.at'
+    _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        show_id = mobj.group('id')
+
+        data = self._download_json(
+            'http://oe1.orf.at/programm/%s/konsole' % show_id,
+            show_id
+        )
+
+        timestamp = datetime.datetime.strptime('%s %s' % (
+            data['item']['day_label'],
+            data['item']['time']
+        ), '%d.%m.%Y %H:%M')
+        unix_timestamp = calendar.timegm(timestamp.utctimetuple())
+
+        return {
+            'id': show_id,
+            'title': data['item']['title'],
+            'url': data['item']['url_stream'],
+            'ext': 'mp3',
+            'description': data['item'].get('info'),
+            'timestamp': unix_timestamp
+        }
index 44312ba4ecf61220ad21e8d233a40e99960389b2..13f12824c99aa71c357047ff62a866365bbc49fb 100644 (file)
@@ -1,20 +1,23 @@
+from __future__ import unicode_literals
 import re
 import json
 
 from .common import InfoExtractor
 from ..utils import unescapeHTML
 
+
 class OoyalaIE(InfoExtractor):
-    _VALID_URL = r'https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=(?P<id>.+?)(&|$)'
+    _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P<id>.+?)(&|$)'
 
     _TEST = {
         # From http://it.slashdot.org/story/13/04/25/178216/recovering-data-from-broken-hard-drives-and-ssds-video
-        u'url': u'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
-        u'file': u'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8.mp4',
-        u'md5': u'3f5cceb3a7bf461d6c29dc466cf8033c',
-        u'info_dict': {
-            u'title': u'Explaining Data Recovery from Hard Drives and SSDs',
-            u'description': u'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
+        'url': 'http://player.ooyala.com/player.js?embedCode=pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+        'md5': '3f5cceb3a7bf461d6c29dc466cf8033c',
+        'info_dict': {
+            'id': 'pxczE2YjpfHfn1f3M-ykG_AmJRRn0PD8',
+            'ext': 'mp4',
+            'title': 'Explaining Data Recovery from Hard Drives and SSDs',
+            'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.',
         },
     }
 
@@ -28,13 +31,14 @@ class OoyalaIE(InfoExtractor):
             ie=cls.ie_key())
 
     def _extract_result(self, info, more_info):
-        return {'id': info['embedCode'],
-                'ext': 'mp4',
-                'title': unescapeHTML(info['title']),
-                'url': info.get('ipad_url') or info['url'],
-                'description': unescapeHTML(more_info['description']),
-                'thumbnail': more_info['promo'],
-                }
+        return {
+            'id': info['embedCode'],
+            'ext': 'mp4',
+            'title': unescapeHTML(info['title']),
+            'url': info.get('ipad_url') or info['url'],
+            'description': unescapeHTML(more_info['description']),
+            'thumbnail': more_info['promo'],
+        }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -42,22 +46,23 @@ class OoyalaIE(InfoExtractor):
         player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode
         player = self._download_webpage(player_url, embedCode)
         mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="',
-                                        player, u'mobile player url')
+                                        player, 'mobile player url')
         mobile_player = self._download_webpage(mobile_url, embedCode)
         videos_info = self._search_regex(
             r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);',
-            mobile_player, u'info').replace('\\"','"')
-        videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, u'more info').replace('\\"','"')
+            mobile_player, 'info').replace('\\"','"')
+        videos_more_info = self._search_regex(r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"','"')
         videos_info = json.loads(videos_info)
         videos_more_info =json.loads(videos_more_info)
 
         if videos_more_info.get('lineup'):
             videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])]
-            return {'_type': 'playlist',
-                    'id': embedCode,
-                    'title': unescapeHTML(videos_more_info['title']),
-                    'entries': videos,
-                    }
+            return {
+                '_type': 'playlist',
+                'id': embedCode,
+                'title': unescapeHTML(videos_more_info['title']),
+                'entries': videos,
+            }
         else:
             return self._extract_result(videos_info[0], videos_more_info)
         
index 5f5694393765104b45b573c53155d447a45b1e50..03421d1d5c78f2acd712e560ae17fb96d4a323be 100644 (file)
@@ -8,6 +8,7 @@ from .common import InfoExtractor
 from ..utils import (
     HEADRequest,
     unified_strdate,
+    ExtractorError,
 )
 
 
@@ -35,7 +36,15 @@ class ORFIE(InfoExtractor):
         data_json = self._search_regex(
             r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
         all_data = json.loads(data_json)
-        sdata = all_data[0]['values']['segments']
+
+        def get_segments(all_data):
+            for data in all_data:
+                if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
+                    return data['values']['segments']
+
+        sdata = get_segments(all_data)
+        if not sdata:
+            raise ExtractorError('Unable to extract segments')
 
         def quality_to_int(s):
             m = re.search('([0-9]+)', s)
diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py
new file mode 100644 (file)
index 0000000..0a423a0
--- /dev/null
@@ -0,0 +1,53 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class ParliamentLiveUKIE(InfoExtractor):
+    IE_NAME = 'parliamentlive.tv'
+    IE_DESC = 'UK parliament videos'
+    _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia',
+        'info_dict': {
+            'id': '15121',
+            'ext': 'asf',
+            'title': 'hoc home affairs committee, 18 mar 2014.pm',
+            'description': 'md5:033b3acdf83304cd43946b2d5e5798d1',
+        },
+        'params': {
+            'skip_download': True,  # Requires mplayer (mms)
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        asx_url = self._html_search_regex(
+            r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage,
+            'metadata URL')
+        asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata')
+        video_url = asx.find('.//REF').attrib['HREF']
+
+        title = self._search_regex(
+            r'''(?x)player\.setClipDetails\(
+                (?:(?:[0-9]+|"[^"]+"),\s*){2}
+                "([^"]+",\s*"[^"]+)"
+                ''',
+            webpage, 'title').replace('", "', ', ')
+        description = self._html_search_regex(
+            r'(?s)<span id="MainContentPlaceHolder_CaptionsBlock_WitnessInfo">(.*?)</span>',
+            webpage, 'description')
+
+        return {
+            'id': video_id,
+            'ext': 'asf',
+            'url': video_url,
+            'title': title,
+            'description': description,
+        }
index e7e0042fb4e39a77061976078d4662a9cc17f522..64cded70789249746a5e2b6604d86563a6ad499c 100644 (file)
@@ -3,6 +3,9 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import (
+    US_RATINGS,
+)
 
 
 class PBSIE(InfoExtractor):
@@ -13,7 +16,7 @@ class PBSIE(InfoExtractor):
             # Article with embedded player
            (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) |
            # Player
-           video\.pbs\.org/partnerplayer/(?P<player_id>[^/]+)/
+           video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
         )
     '''
 
@@ -57,6 +60,11 @@ class PBSIE(InfoExtractor):
         info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id
         info = self._download_json(info_url, display_id)
 
+        rating_str = info.get('rating')
+        if rating_str is not None:
+            rating_str = rating_str.rpartition('-')[2]
+        age_limit = US_RATINGS.get(rating_str)
+
         return {
             'id': video_id,
             'title': info['title'],
@@ -65,4 +73,5 @@ class PBSIE(InfoExtractor):
             'description': info['program'].get('description'),
             'thumbnail': info.get('image_url'),
             'duration': info.get('duration'),
+            'age_limit': age_limit,
         }
index 305b79773b3eaa25b8b580686d4e356f947b13f7..8aa69c46eb75e9ccfe6fab5b7bff2c9a5778009e 100644 (file)
@@ -1,76 +1,45 @@
-import datetime
+from __future__ import unicode_literals
+
 import json
 import re
 
 from .common import InfoExtractor
+from ..utils import compat_urllib_parse
 
-from ..utils import (
-    ExtractorError,
-)
 
 class PhotobucketIE(InfoExtractor):
-    """Information extractor for photobucket.com."""
-
-    # TODO: the original _VALID_URL was:
-    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
-    # Check if it's necessary to keep the old extracion process
-    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
-    IE_NAME = u'photobucket'
+    _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
     _TEST = {
-        u'url': u'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
-        u'file': u'zpsc0c3b9fa.mp4',
-        u'md5': u'7dabfb92b0a31f6c16cebc0f8e60ff99',
-        u'info_dict': {
-            u"upload_date": u"20130504", 
-            u"uploader": u"rachaneronas", 
-            u"title": u"Tired of Link Building? Try BacklinkMyDomain.com!"
+        'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0',
+        'file': 'zpsc0c3b9fa.mp4',
+        'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99',
+        'info_dict': {
+            'timestamp': 1367669341,
+            'upload_date': '20130504',
+            'uploader': 'rachaneronas',
+            'title': 'Tired of Link Building? Try BacklinkMyDomain.com!',
         }
     }
 
     def _real_extract(self, url):
-        # Extract id from URL
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
         video_id = mobj.group('id')
-
         video_extension = mobj.group('ext')
 
-        # Retrieve video webpage to extract further information
         webpage = self._download_webpage(url, video_id)
 
         # Extract URL, uploader, and title from webpage
         self.report_extraction(video_id)
-        # We try first by looking the javascript code:
-        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
-        if mobj is not None:
-            info = json.loads(mobj.group('json'))
-            return [{
-                'id':       video_id,
-                'url':      info[u'downloadUrl'],
-                'uploader': info[u'username'],
-                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
-                'title':    info[u'title'],
-                'ext':      video_extension,
-                'thumbnail': info[u'thumbUrl'],
-            }]
-
-        # We try looking in other parts of the webpage
-        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
-            webpage, u'video URL')
-
-        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1).decode('utf-8')
-        video_uploader = mobj.group(2).decode('utf-8')
-
-        return [{
-            'id':       video_id.decode('utf-8'),
-            'url':      video_url.decode('utf-8'),
-            'uploader': video_uploader,
-            'upload_date':  None,
-            'title':    video_title,
-            'ext':      video_extension.decode('utf-8'),
-        }]
+        info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);',
+            webpage, 'info json')
+        info = json.loads(info_json)
+        url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url'))
+        return {
+            'id': video_id,
+            'url': url,
+            'uploader': info['username'],
+            'timestamp': info['creationDate'],
+            'title': info['title'],
+            'ext': video_extension,
+            'thumbnail': info['thumbUrl'],
+        }
diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py
new file mode 100644 (file)
index 0000000..b1322f1
--- /dev/null
@@ -0,0 +1,80 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+)
+
+
+class PlayvidIE(InfoExtractor):
+    _VALID_URL = r'^https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
+    _TEST = {
+        'url': 'http://www.playvid.com/watch/agbDDi7WZTV',
+        'md5': '44930f8afa616efdf9482daf4fe53e1e',
+        'info_dict': {
+            'id': 'agbDDi7WZTV',
+            'ext': 'mp4',
+            'title': 'Michelle Lewin in Miami Beach',
+            'duration': 240,
+            'age_limit': 18,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = None
+        duration = None
+        video_thumbnail = None
+        formats = []
+
+        # most of the information is stored in the flashvars
+        flashvars = self._html_search_regex(
+            r'flashvars="(.+?)"', webpage, 'flashvars')
+
+        infos = compat_urllib_parse.unquote(flashvars).split(r'&')
+        for info in infos:
+            videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info)
+            if videovars_match:
+                key = videovars_match.group(1)
+                val = videovars_match.group(2)
+
+                if key == 'title':
+                    video_title = compat_urllib_parse.unquote_plus(val)
+                if key == 'duration':
+                    try:
+                        duration = int(val)
+                    except ValueError:
+                        pass
+                if key == 'big_thumb':
+                    video_thumbnail = val
+
+                videourl_match = re.match(
+                    r'^video_urls\]\[(?P<resolution>[0-9]+)p', key)
+                if videourl_match:
+                    height = int(videourl_match.group('resolution'))
+                    formats.append({
+                        'height': height,
+                        'url': val,
+                    })
+        self._sort_formats(formats)
+
+        # Extract title - should be in the flashvars; if not, look elsewhere
+        if video_title is None:
+            video_title = self._html_search_regex(
+                r'<title>(.*?)</title', webpage, 'title')
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
+            'duration': duration,
+            'description': None,
+            'age_limit': 18
+        }
index 58200971bece7664e18b94eccb52b368ef5a999b..ffafd23800f49de2bc7987f3d63c9b5ab2104b58 100644 (file)
@@ -1,24 +1,41 @@
+from __future__ import unicode_literals
+
 import json
 import re
 
 from .common import InfoExtractor
-
+from ..utils import int_or_none
 
 class PodomaticIE(InfoExtractor):
     IE_NAME = 'podomatic'
     _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
 
-    _TEST = {
-        u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
-        u"file": u"2009-01-02T16_03_35-08_00.mp3",
-        u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
-        u"info_dict": {
-            u"uploader": u"Science Teaching Tips",
-            u"uploader_id": u"scienceteachingtips",
-            u"title": u"64.  When the Moon Hits Your Eye",
-            u"duration": 446,
-        }
-    }
+    _TESTS = [
+        {
+            'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
+            'md5': '84bb855fcf3429e6bf72460e1eed782d',
+            'info_dict': {
+                'id': '2009-01-02T16_03_35-08_00',
+                'ext': 'mp3',
+                'uploader': 'Science Teaching Tips',
+                'uploader_id': 'scienceteachingtips',
+                'title': '64.  When the Moon Hits Your Eye',
+                'duration': 446,
+            }
+        },
+        {
+            'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
+            'md5': 'd2cf443931b6148e27638650e2638297',
+            'info_dict': {
+                'id': '2013-11-15T16_31_21-08_00',
+                'ext': 'mp3',
+                'uploader': 'Ostbahnhof / Techno Mix',
+                'uploader_id': 'ostbahnhof',
+                'title': 'Einunddreizig',
+                'duration': 3799,
+            }
+        },
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -29,14 +46,16 @@ class PodomaticIE(InfoExtractor):
                      '?permalink=true&rtmp=0') %
                     (mobj.group('proto'), channel, video_id))
         data_json = self._download_webpage(
-            json_url, video_id, note=u'Downloading video info')
+            json_url, video_id, 'Downloading video info')
         data = json.loads(data_json)
 
         video_url = data['downloadLink']
+        if not video_url:
+            video_url = '%s/%s' % (data['streamer'].replace('rtmp', 'http'), data['mediaLocation'])
         uploader = data['podcast']
         title = data['title']
         thumbnail = data['imageLocation']
-        duration = int(data['length'] / 1000.0)
+        duration = int_or_none(data.get('length'), 1000)
 
         return {
             'id': video_id,
index 58f9c690e18b4330f7b47e862d0367913e17582a..718fe9aba5fc710ee5efc47c2bbae2b02fc3c117 100644 (file)
@@ -1,44 +1,81 @@
 from __future__ import unicode_literals
 
 import re
+import json
 
 from .common import InfoExtractor
-from ..utils import compat_urllib_parse
+from ..utils import int_or_none
 
 
 class PornHdIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<video_id>[0-9]+)/(?P<video_title>.+)'
+    _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)'
     _TEST = {
         'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
-        'file': '1962.flv',
-        'md5': '35272469887dca97abd30abecc6cdf75',
+        'md5': '956b8ca569f7f4d8ec563e2c41598441',
         'info_dict': {
-            "title": "sierra-day-gets-his-cum-all-over-herself-hd-porn-video",
-            "age_limit": 18,
+            'id': '1962',
+            'ext': 'mp4',
+            'title': 'Sierra loves doing laundry',
+            'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
+            'age_limit': 18,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('video_id')
-        video_title = mobj.group('video_title')
+        video_id = mobj.group('id')
 
         webpage = self._download_webpage(url, video_id)
 
-        next_url = self._html_search_regex(
-            r'&hd=(http.+?)&', webpage, 'video URL')
-        next_url = compat_urllib_parse.unquote(next_url)
+        title = self._og_search_title(webpage)
+        TITLE_SUFFIX = ' porn HD Video | PornHD.com '
+        if title.endswith(TITLE_SUFFIX):
+            title = title[:-len(TITLE_SUFFIX)]
+
+        description = self._html_search_regex(
+            r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False)
+        view_count = int_or_none(self._html_search_regex(
+            r'(\d+) views      </span>', webpage, 'view count', fatal=False))
+
+        formats = [
+            {
+                'url': format_url,
+                'ext': format.lower(),
+                'format_id': '%s-%s' % (format.lower(), quality.lower()),
+                'quality': 1 if quality.lower() == 'high' else 0,
+            } for format, quality, format_url in re.findall(
+                r'var __video([\da-zA-Z]+?)(Low|High)StreamUrl = \'(http://.+?)\?noProxy=1\'', webpage)
+        ]
+
+        mobj = re.search(r'flashVars = (?P<flashvars>{.+?});', webpage)
+        if mobj:
+            flashvars = json.loads(mobj.group('flashvars'))
+            formats.extend([
+                {
+                    'url': flashvars['hashlink'].replace('?noProxy=1', ''),
+                    'ext': 'flv',
+                    'format_id': 'flv-low',
+                    'quality': 0,
+                },
+                {
+                    'url': flashvars['hd'].replace('?noProxy=1', ''),
+                    'ext': 'flv',
+                    'format_id': 'flv-high',
+                    'quality': 1,
+                }
+            ])
+            thumbnail = flashvars['urlWallpaper']
+        else:
+            thumbnail = self._og_search_thumbnail(webpage)
 
-        video_url = self._download_webpage(
-            next_url, video_id, note='Retrieving video URL',
-            errnote='Could not retrieve video URL')
-        age_limit = 18
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'url': video_url,
-            'ext': 'flv',
-            'title': video_title,
-            'age_limit': age_limit,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'view_count': view_count,
+            'formats': formats,
+            'age_limit': 18,
         }
index fdda69f33064fe4bd4e89775c5e3c7d56e4946d2..4118ee9560e03d2fa1eea171766ef4893e274aa5 100644 (file)
@@ -8,6 +8,7 @@ from ..utils import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urllib_parse,
+    str_to_int,
 )
 from ..aes import (
     aes_decrypt_text
@@ -27,6 +28,12 @@ class PornHubIE(InfoExtractor):
         }
     }
 
+    def _extract_count(self, pattern, webpage, name):
+        count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
+        if count:
+            count = str_to_int(count)
+        return count
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
@@ -37,14 +44,22 @@ class PornHubIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
-        video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, 'uploader', fatal=False)
+        video_uploader = self._html_search_regex(
+            r'(?s)From:&nbsp;.+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',
+            webpage, 'uploader', fatal=False)
         thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
         if thumbnail:
             thumbnail = compat_urllib_parse.unquote(thumbnail)
 
+        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        comment_count = self._extract_count(
+            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+
         video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
-            password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password').replace('+', ' ')
+            password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
             video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
 
         formats = []
@@ -77,6 +92,10 @@ class PornHubIE(InfoExtractor):
             'uploader': video_uploader,
             'title': video_title,
             'thumbnail': thumbnail,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
             'formats': formats,
             'age_limit': 18,
         }
diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
new file mode 100644 (file)
index 0000000..e4c4ad7
--- /dev/null
@@ -0,0 +1,286 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from hashlib import sha1
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    unified_strdate,
+)
+
+
+class ProSiebenSat1IE(InfoExtractor):
+    IE_NAME = 'prosiebensat1'
+    IE_DESC = 'ProSiebenSat.1 Digital'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
+            'info_dict': {
+                'id': '2104602',
+                'ext': 'mp4',
+                'title': 'Staffel 2, Episode 18 - Jahresrückblick',
+                'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
+                'upload_date': '20131231',
+                'duration': 5845.04,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html',
+            'info_dict': {
+                'id': '2570327',
+                'ext': 'mp4',
+                'title': 'Lady-Umstyling für Audrina',
+                'description': 'md5:4c16d0c17a3461a0d43ea4084e96319d',
+                'upload_date': '20131014',
+                'duration': 606.76,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Seems to be broken',
+        },
+        {
+            'url': 'http://www.prosiebenmaxx.de/tv/experience/video/144-countdown-fuer-die-autowerkstatt-ganze-folge',
+            'info_dict': {
+                'id': '2429369',
+                'ext': 'mp4',
+                'title': 'Countdown für die Autowerkstatt',
+                'description': 'md5:809fc051a457b5d8666013bc40698817',
+                'upload_date': '20140223',
+                'duration': 2595.04,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
+            'info_dict': {
+                'id': '2904997',
+                'ext': 'mp4',
+                'title': 'Sexy laufen in Ugg Boots',
+                'description': 'md5:edf42b8bd5bc4e5da4db4222c5acb7d6',
+                'upload_date': '20140122',
+                'duration': 245.32,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
+            'info_dict': {
+                'id': '2906572',
+                'ext': 'mp4',
+                'title': 'Im Interview: Kai Wiesinger',
+                'description': 'md5:e4e5370652ec63b95023e914190b4eb9',
+                'upload_date': '20140225',
+                'duration': 522.56,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
+            'info_dict': {
+                'id': '2992323',
+                'ext': 'mp4',
+                'title': 'Jagd auf Fertigkost im Elsthal - Teil 2',
+                'description': 'md5:2669cde3febe9bce13904f701e774eb6',
+                'upload_date': '20140225',
+                'duration': 2410.44,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
+            'info_dict': {
+                'id': '3004256',
+                'ext': 'mp4',
+                'title': 'Schalke: Tönnies möchte Raul zurück',
+                'description': 'md5:4b5b271d9bcde223b54390754c8ece3f',
+                'upload_date': '20140226',
+                'duration': 228.96,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
+            'info_dict': {
+                'id': '2572814',
+                'ext': 'mp4',
+                'title': 'Andreas Kümmert: Rocket Man',
+                'description': 'md5:6ddb02b0781c6adf778afea606652e38',
+                'upload_date': '20131017',
+                'duration': 469.88,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
+            'info_dict': {
+                'id': '2156342',
+                'ext': 'mp4',
+                'title': 'Kurztrips zum Valentinstag',
+                'description': 'md5:8ba6301e70351ae0bedf8da00f7ba528',
+                'upload_date': '20130206',
+                'duration': 307.24,
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+        },
+    ]
+
+    _CLIPID_REGEXES = [
+        r'"clip_id"\s*:\s+"(\d+)"',
+        r'clipid: "(\d+)"',
+        r'clipId=(\d+)',
+    ]
+    _TITLE_REGEXES = [
+        r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>',
+        r'<header class="clearfix">\s*<h3>(.+?)</h3>',
+        r'<!-- start video -->\s*<h1>(.+?)</h1>',
+        r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>',
+    ]
+    _DESCRIPTION_REGEXES = [
+        r'<p itemprop="description">\s*(.+?)</p>',
+        r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
+        r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
+        r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">',
+    ]
+    _UPLOAD_DATE_REGEXES = [
+        r'<meta property="og:published_time" content="(.+?)">',
+        r'<span>\s*(\d{2}\.\d{2}\.\d{4} \d{2}:\d{2}) \|\s*<span itemprop="duration"',
+        r'<footer>\s*(\d{2}\.\d{2}\.\d{4}) \d{2}:\d{2} Uhr',
+        r'<span style="padding-left: 4px;line-height:20px; color:#404040">(\d{2}\.\d{2}\.\d{4})</span>',
+        r'(\d{2}\.\d{2}\.\d{4}) \| \d{2}:\d{2} Min<br/>',
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage(url, video_id, 'Downloading page')
+
+        clip_id = self._html_search_regex(self._CLIPID_REGEXES, page, 'clip id')
+
+        access_token = 'testclient'
+        client_name = 'kolibri-1.2.5'
+        client_location = url
+
+        videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({
+            'access_token': access_token,
+            'client_location': client_location,
+            'client_name': client_name,
+            'ids': clip_id,
+        })
+
+        videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')
+
+        duration = float(videos[0]['duration'])
+        source_ids = [source['id'] for source in videos[0]['sources']]
+        source_ids_str = ','.join(map(str, source_ids))
+
+        g = '01!8d8F_)r9]4s[qeuXfP%'
+
+        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
+                                 .encode('utf-8')).hexdigest()
+
+        sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse.urlencode({
+            'access_token': access_token,
+            'client_id': client_id,
+            'client_location': client_location,
+            'client_name': client_name,
+        }))
+
+        sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+        server_id = sources['server_id']
+
+        client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
+                                          client_location, source_ids_str, g, client_name])
+                                 .encode('utf-8')).hexdigest()
+
+        url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse.urlencode({
+            'access_token': access_token,
+            'client_id': client_id,
+            'client_location': client_location,
+            'client_name': client_name,
+            'server_id': server_id,
+            'source_ids': source_ids_str,
+        }))
+
+        urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
+
+        title = self._html_search_regex(self._TITLE_REGEXES, page, 'title')
+        description = self._html_search_regex(self._DESCRIPTION_REGEXES, page, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(page)
+
+        upload_date = unified_strdate(self._html_search_regex(
+            self._UPLOAD_DATE_REGEXES, page, 'upload date', fatal=False))
+
+        formats = []
+
+        urls_sources = urls['sources']
+        if isinstance(urls_sources, dict):
+            urls_sources = urls_sources.values()
+
+        def fix_bitrate(bitrate):
+            return bitrate / 1000 if bitrate % 1000 == 0 else bitrate
+
+        for source in urls_sources:
+            protocol = source['protocol']
+            if protocol == 'rtmp' or protocol == 'rtmpe':
+                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', source['url'])
+                if not mobj:
+                    continue
+                formats.append({
+                    'url': mobj.group('url'),
+                    'app': mobj.group('app'),
+                    'play_path': mobj.group('playpath'),
+                    'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+                    'page_url': 'http://www.prosieben.de',
+                    'vbr': fix_bitrate(source['bitrate']),
+                    'ext': 'mp4',
+                    'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
+                })
+            else:
+                formats.append({
+                    'url': source['url'],
+                    'vbr': fix_bitrate(source['bitrate']),
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': clip_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
index 33054591b755b383eb0d216623b99d308073c64d..0bc0859b466e533419d5647d7f0250988d2f36db 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 import os
 
@@ -5,45 +7,51 @@ from .common import InfoExtractor
 
 
 class PyvideoIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
-    _TESTS = [{
-        u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
-        u'file': u'24_4WWkSmNo.mp4',
-        u'md5': u'de317418c8bc76b1fd8633e4f32acbc6',
-        u'info_dict': {
-            u"title": u"Become a logging expert in 30 minutes",
-            u"description": u"md5:9665350d466c67fb5b1598de379021f7",
-            u"upload_date": u"20130320",
-            u"uploader": u"NextDayVideo",
-            u"uploader_id": u"NextDayVideo",
+    _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)'
+
+    _TESTS = [
+        {
+            'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes',
+            'md5': 'de317418c8bc76b1fd8633e4f32acbc6',
+            'info_dict': {
+                'id': '24_4WWkSmNo',
+                'ext': 'mp4',
+                'title': 'Become a logging expert in 30 minutes',
+                'description': 'md5:9665350d466c67fb5b1598de379021f7',
+                'upload_date': '20130320',
+                'uploader': 'NextDayVideo',
+                'uploader_id': 'NextDayVideo',
+            },
+            'add_ie': ['Youtube'],
         },
-        u'add_ie': ['Youtube'],
-    },
-    {
-        u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
-        u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12',
-        u'info_dict': {
-            u'id': u'2542',
-            u'ext': u'm4v',
-            u'title': u'Gloriajw-SpotifyWithErikBernhardsson182',
+        {
+            'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v',
+            'md5': '5fe1c7e0a8aa5570330784c847ff6d12',
+            'info_dict': {
+                'id': '2542',
+                'ext': 'm4v',
+                'title': 'Gloriajw-SpotifyWithErikBernhardsson182',
+            },
         },
-    },
     ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+
         webpage = self._download_webpage(url, video_id)
-        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
 
+        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage)
         if m_youtube is not None:
             return self.url_result(m_youtube.group(1), 'Youtube')
 
-        title = self._html_search_regex(r'<div class="section">.*?<h3>([^>]+?)</h3>',
-            webpage, u'title', flags=re.DOTALL)
-        video_url = self._search_regex([r'<source src="(.*?)"',
-            r'<dt>Download</dt>.*?<a href="(.+?)"'],
-            webpage, u'video url', flags=re.DOTALL)
+        title = self._html_search_regex(
+            r'<div class="section">.*?<h3(?:\s+class="[^"]*")?>([^>]+?)</h3>',
+            webpage, 'title', flags=re.DOTALL)
+        video_url = self._search_regex(
+            [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'],
+            webpage, 'video url', flags=re.DOTALL)
+
         return {
             'id': video_id,
             'title': os.path.splitext(title)[0],
index 34652f6c11ca9668bce89cd4c908d0c363221218..09352ed8250819518be78e2d5cf8bb97108913e0 100644 (file)
@@ -1,4 +1,6 @@
 # coding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -6,16 +8,17 @@ from .common import InfoExtractor
 
 class RadioFranceIE(InfoExtractor):
     _VALID_URL = r'^https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
-    IE_NAME = u'radiofrance'
+    IE_NAME = 'radiofrance'
 
     _TEST = {
-        u'url': u'http://maison.radiofrance.fr/radiovisions/one-one',
-        u'file': u'one-one.ogg',
-        u'md5': u'bdbb28ace95ed0e04faab32ba3160daf',
-        u'info_dict': {
-            u"title": u"One to one",
-            u"description": u"Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
-            u"uploader": u"Thomas Hercouët",
+        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
+        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
+        'info_dict': {
+            'id': 'one-one',
+            'ext': 'ogg',
+            "title": "One to one",
+            "description": "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
+            "uploader": "Thomas Hercouët",
         },
     }
 
@@ -24,27 +27,28 @@ class RadioFranceIE(InfoExtractor):
         video_id = m.group('id')
 
         webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, u'title')
+        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
         description = self._html_search_regex(
             r'<div class="bloc_page_wrapper"><div class="text">(.*?)</div>',
-            webpage, u'description', fatal=False)
+            webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
             r'<div class="credit">&nbsp;&nbsp;&copy;&nbsp;(.*?)</div>',
-            webpage, u'uploader', fatal=False)
+            webpage, 'uploader', fatal=False)
 
         formats_str = self._html_search_regex(
             r'class="jp-jplayer[^"]*" data-source="([^"]+)">',
-            webpage, u'audio URLs')
+            webpage, 'audio URLs')
         formats = [
             {
                 'format_id': fm[0],
                 'url': fm[1],
                 'vcodec': 'none',
+                'preference': i,
             }
-            for fm in
-            re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)
+            for i, fm in
+            enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str))
         ]
-        # No sorting, we don't know any more about these formats
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
index 4678f62dfadba9968ff363a919471c869eb35c71..a6ad594659250254b6cee9efd31cdcb8364da89b 100644 (file)
@@ -18,7 +18,7 @@ class Ro220IE(InfoExtractor):
         'md5': '03af18b73a07b4088753930db7a34add',
         'info_dict': {
             "title": "Luati-le Banii sez 4 ep 1",
-            "description": "Iata-ne reveniti dupa o binemeritata vacanta. Va astept si pe Facebook cu pareri si comentarii.",
+            "description": "re:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$",
         }
     }
 
index d339e6cb532223e271a5c762ceb1ff58c6071ee2..41638c1d01e2e76398d60ae5ef869d93845a59bc 100644 (file)
@@ -1,5 +1,6 @@
+from __future__ import unicode_literals
+
 import re
-import json
 
 from .common import InfoExtractor
 from ..utils import unified_strdate, determine_ext
@@ -9,41 +10,44 @@ class RoxwelIE(InfoExtractor):
     _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
 
     _TEST = {
-        u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html',
-        u'file': u'passionpittakeawalklive.flv',
-        u'md5': u'd9dea8360a1e7d485d2206db7fe13035',
-        u'info_dict': {
-            u'title': u'Take A Walk (live)',
-            u'uploader': u'Passion Pit',
-            u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
+        'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html',
+        'info_dict': {
+            'id': 'passionpittakeawalklive',
+            'ext': 'flv',
+            'title': 'Take A Walk (live)',
+            'uploader': 'Passion Pit',
+            'uploader_id': 'passionpit',
+            'upload_date': '20120928',
+            'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
         },
-        u'skip': u'Requires rtmpdump',
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         filename = mobj.group('filename')
         info_url = 'http://www.roxwel.com/api/videos/%s' % filename
-        info_page = self._download_webpage(info_url, filename,
-                                           u'Downloading video info')
+        info = self._download_json(info_url, filename)
 
-        self.report_extraction(filename)
-        info = json.loads(info_page)
         rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')])
         best_rate = rtmp_rates[-1]
         url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
-        rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url')
+        rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url')
         ext = determine_ext(rtmp_url)
         if ext == 'f4v':
             rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
 
-        return {'id': filename,
-                'title': info['title'],
-                'url': rtmp_url,
-                'ext': 'flv',
-                'description': info['description'],
-                'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
-                'uploader': info['artist'],
-                'uploader_id': info['artistname'],
-                'upload_date': unified_strdate(info['dbdate']),
-                }
+        return {
+            'id': filename,
+            'title': info['title'],
+            'url': rtmp_url,
+            'ext': 'flv',
+            'description': info['description'],
+            'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
+            'uploader': info['artist'],
+            'uploader_id': info['artistname'],
+            'upload_date': unified_strdate(info['dbdate']),
+        }
diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
new file mode 100644 (file)
index 0000000..205f8a1
--- /dev/null
@@ -0,0 +1,49 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class RTBFIE(InfoExtractor):
+    _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274',
+        'md5': '799f334ddf2c0a582ba80c44655be570',
+        'info_dict': {
+            'id': '1921274',
+            'ext': 'mp4',
+            'title': 'Les Diables au coeur (épisode 2)',
+            'description': 'Football - Diables Rouges',
+            'duration': 3099,
+            'timestamp': 1398456336,
+            'upload_date': '20140425',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+
+        data = json.loads(self._html_search_regex(
+            r'<div class="js-player-embed" data-video="([^"]+)"', page, 'data video'))['data']
+
+        video_url = data.get('downloadUrl') or data.get('url')
+
+        if data['provider'].lower() == 'youtube':
+            return self.url_result(video_url, 'Youtube')
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': data['title'],
+            'description': data.get('description') or data.get('subtitle'),
+            'thumbnail': data['thumbnail']['large'],
+            'duration': data.get('duration') or data.get('realDuration'),
+            'timestamp': data['created'],
+            'view_count': data['viewCount'],
+        }
index cd50f708d202cc29e1a9a24765f2f91396a0074e..4835ec5ecada755a12d7003fed6355adfd6936a6 100644 (file)
 # encoding: utf-8
-
 from __future__ import unicode_literals
 
 import re
 
 from .common import InfoExtractor
 from ..utils import (
-    clean_html,
     ExtractorError,
+    clean_html,
+    unified_strdate,
+    int_or_none,
 )
 
 
 class RTLnowIE(InfoExtractor):
     """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW"""
-    _VALID_URL = r'(?:http://)?(?P<url>(?P<domain>rtl-now\.rtl\.de|rtl2now\.rtl2\.de|(?:www\.)?voxnow\.de|(?:www\.)?rtlnitronow\.de|(?:www\.)?superrtlnow\.de|(?:www\.)?n-tvnow\.de)/+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?(?:container_id|film_id)=(?P<video_id>[0-9]+)&player=1(?:&season=[0-9]+)?(?:&.*)?)'
-    _TESTS = [{
-        'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
-        'file': '90419.flv',
-        'info_dict': {
-            'upload_date': '20070416',
-            'title': 'Ahornallee - Folge 1 - Der Einzug',
-            'description': 'Folge 1 - Der Einzug',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'Only works from Germany',
-    },
-    {
-        'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
-        'file': '69756.flv',
-        'info_dict': {
-            'upload_date': '20120519',
-            'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit...',
-            'description': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
-            'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
-        },
-        'params': {
-            'skip_download': True,
-        },
-        'skip': 'Only works from Germany',
-    },
-    {
-        'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
-        'file': '13883.flv',
-        'info_dict': {
-            'upload_date': '20090627',
-            'title': 'Voxtours - Südafrika-Reporter II',
-            'description': 'Südafrika-Reporter II',
-        },
-        'params': {
-            'skip_download': True,
+    _VALID_URL = r'''(?x)
+                        (?:https?://)?
+                        (?P<url>
+                            (?P<domain>
+                                rtl-now\.rtl\.de|
+                                rtl2now\.rtl2\.de|
+                                (?:www\.)?voxnow\.de|
+                                (?:www\.)?rtlnitronow\.de|
+                                (?:www\.)?superrtlnow\.de|
+                                (?:www\.)?n-tvnow\.de)
+                            /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\?
+                            (?:container_id|film_id)=(?P<video_id>[0-9]+)&
+                            player=1(?:&season=[0-9]+)?(?:&.*)?
+                        )'''
+
+    _TESTS = [
+        {
+            'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1',
+            'info_dict': {
+                'id': '90419',
+                'ext': 'flv',
+                'title': 'Ahornallee - Folge 1 - Der Einzug',
+                'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de',
+                'upload_date': '20070416',
+                'duration': 1685,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from Germany',
         },
-    },
-    {
-        'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
-        'file': '99205.flv',
-        'info_dict': {
-            'upload_date': '20080928', 
-            'title': 'Medicopter 117 - Angst!',
-            'description': 'Angst!',
-            'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg'
+        {
+            'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5',
+            'info_dict': {
+                'id': '69756',
+                'ext': 'flv',
+                'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.',
+                'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0',
+                'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg',
+                'upload_date': '20120519',
+                'duration': 1245,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'skip': 'Only works from Germany',
         },
-        'params': {
-            'skip_download': True,
+        {
+            'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17',
+            'info_dict': {
+                'id': '13883',
+                'ext': 'flv',
+                'title': 'Voxtours - Südafrika-Reporter II',
+                'description': 'md5:de7f8d56be6fd4fed10f10f57786db00',
+                'upload_date': '20090627',
+                'duration': 1800,
+            },
+            'params': {
+                'skip_download': True,
+            },
         },
-    },
-    {
-        'url': 'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
-        'file': '124903.flv',
-        'info_dict': {
-            'upload_date': '20130101',
-            'title': 'Top Gear vom 01.01.2013',
-            'description': 'Episode 1',
+        {
+            'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1',
+            'info_dict': {
+                'id': '99205',
+                'ext': 'flv',
+                'title': 'Medicopter 117 - Angst!',
+                'description': 'md5:895b1df01639b5f61a04fc305a5cb94d',
+                'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg',
+                'upload_date': '20080928',
+                'duration': 2691,
+            },
+            'params': {
+                'skip_download': True,
+            },
         },
-        'params': {
-            'skip_download': True,
+        {
+            'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0',
+            'info_dict': {
+                'id': '153819',
+                'ext': 'flv',
+                'title': 'Deluxe - Alles was Spaß macht - Thema u.a.: Luxushotel für Vierbeiner',
+                'description': 'md5:c3705e1bb32e1a5b2bcd634fc065c631',
+                'thumbnail': 'http://autoimg.static-fra.de/ntvnow/383157/1500x1500/image2.jpg',
+                'upload_date': '20140221',
+                'duration': 2429,
+            },
+            'skip': 'Only works from Germany',
         },
-        'skip': 'Only works from Germany',
-    }]
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-
-        webpage_url = 'http://' + mobj.group('url')
-        video_page_url = 'http://' + mobj.group('domain') + '/'
+        video_page_url = 'http://%s/' % mobj.group('domain')
         video_id = mobj.group('video_id')
 
-        webpage = self._download_webpage(webpage_url, video_id)
+        webpage = self._download_webpage('http://' + mobj.group('url'), video_id)
 
-        note_m = re.search(r'''(?sx)
-            <div[ ]style="margin-left:[ ]20px;[ ]font-size:[ ]13px;">(.*?)
-            <div[ ]id="playerteaser">''', webpage)
-        if note_m:
-            msg = clean_html(note_m.group(1))
-            raise ExtractorError(msg)
+        mobj = re.search(r'(?s)<div style="margin-left: 20px; font-size: 13px;">(.*?)<div id="playerteaser">', webpage)
+        if mobj:
+            raise ExtractorError(clean_html(mobj.group(1)), expected=True)
 
-        video_title = self._html_search_regex(
-            r'<title>(?P<title>[^<]+?)( \| [^<]*)?</title>',
-            webpage, 'title')
-        playerdata_url = self._html_search_regex(
-            r'\'playerdata\': \'(?P<playerdata_url>[^\']+)\'',
-            webpage, 'playerdata_url')
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage, default=None)
 
-        playerdata = self._download_webpage(playerdata_url, video_id)
-        mobj = re.search(r'<title><!\[CDATA\[(?P<description>.+?)(?:\s+- (?:Sendung )?vom (?P<upload_date_d>[0-9]{2})\.(?P<upload_date_m>[0-9]{2})\.(?:(?P<upload_date_Y>[0-9]{4})|(?P<upload_date_y>[0-9]{2})) [0-9]{2}:[0-9]{2} Uhr)?\]\]></title>', playerdata)
-        if mobj:
-            video_description = mobj.group('description')
-            if mobj.group('upload_date_Y'):
-                video_upload_date = mobj.group('upload_date_Y')
-            elif mobj.group('upload_date_y'):
-                video_upload_date = '20' + mobj.group('upload_date_y')
-            else:
-                video_upload_date = None
-            if video_upload_date:
-                video_upload_date += mobj.group('upload_date_m') + mobj.group('upload_date_d')
-        else:
-            video_description = None
-            video_upload_date = None
-            self._downloader.report_warning('Unable to extract description and upload date')
+        upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date'))
 
-        # Thumbnail: not every video has an thumbnail
-        mobj = re.search(r'<meta property="og:image" content="(?P<thumbnail>[^"]+)">', webpage)
-        if mobj:
-            video_thumbnail = mobj.group('thumbnail')
-        else:
-            video_thumbnail = None
+        mobj = re.search(r'<meta itemprop="duration" content="PT(?P<seconds>\d+)S" />', webpage)
+        duration = int(mobj.group('seconds')) if mobj else None
 
-        mobj = re.search(r'<filename [^>]+><!\[CDATA\[(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>[^\]]+)\]\]></filename>', playerdata)
-        if mobj is None:
-            raise ExtractorError('Unable to extract media URL')
-        video_url = mobj.group('url')
-        video_play_path = 'mp4:' + mobj.group('play_path')
-        video_player_url = video_page_url + 'includes/vodplayer.swf'
+        playerdata_url = self._html_search_regex(
+            r"'playerdata': '(?P<playerdata_url>[^']+)'", webpage, 'playerdata_url')
+
+        playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML')
+
+        videoinfo = playerdata.find('./playlist/videoinfo')
+        
+        formats = []
+        for filename in videoinfo.findall('filename'):
+            mobj = re.search(r'(?P<url>rtmpe://(?:[^/]+/){2})(?P<play_path>.+)', filename.text)
+            if mobj:
+                fmt = {
+                    'url': mobj.group('url'),
+                    'play_path': 'mp4:' + mobj.group('play_path'),
+                    'page_url': video_page_url,
+                    'player_url': video_page_url + 'includes/vodplayer.swf',
+                }
+            else:
+                fmt = {
+                    'url': filename.text,
+                }
+            fmt.update({
+                'width': int_or_none(filename.get('width')),
+                'height': int_or_none(filename.get('height')),
+                'vbr': int_or_none(filename.get('bitrate')),
+                'ext': 'flv',
+            })
+            formats.append(fmt)
 
         return {
             'id': video_id,
-            'url': video_url,
-            'play_path': video_play_path,
-            'page_url': video_page_url,
-            'player_url': video_player_url,
-            'ext': 'flv',
-            'title': video_title,
-            'description': video_description,
-            'upload_date': video_upload_date,
-            'thumbnail': video_thumbnail,
-        }
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py
new file mode 100644 (file)
index 0000000..e8199b1
--- /dev/null
@@ -0,0 +1,154 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_duration,
+    parse_iso8601,
+    unescapeHTML,
+    compat_str,
+)
+
+
+class RTSIE(InfoExtractor):
+    IE_DESC = 'RTS.ch'
+    _VALID_URL = r'^https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-.*?\.html'
+
+    _TESTS = [
+        {
+            'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html',
+            'md5': '753b877968ad8afaeddccc374d4256a5',
+            'info_dict': {
+                'id': '3449373',
+                'ext': 'mp4',
+                'duration': 1488,
+                'title': 'Les Enfants Terribles',
+                'description': 'France Pommier et sa soeur Luce Feral, les deux filles de ce groupe de 5.',
+                'uploader': 'Divers',
+                'upload_date': '19680921',
+                'timestamp': -40280400,
+                'thumbnail': 're:^https?://.*\.image'
+            },
+        },
+        {
+            'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html',
+            'md5': 'c148457a27bdc9e5b1ffe081a7a8337b',
+            'info_dict': {
+                'id': '5624067',
+                'ext': 'mp4',
+                'duration': 3720,
+                'title': 'Les yeux dans les cieux - Mon homard au Canada',
+                'description': 'md5:d22ee46f5cc5bac0912e5a0c6d44a9f7',
+                'uploader': 'Passe-moi les jumelles',
+                'upload_date': '20140404',
+                'timestamp': 1396635300,
+                'thumbnail': 're:^https?://.*\.image'
+            },
+        },
+        {
+            'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html',
+            'md5': 'b4326fecd3eb64a458ba73c73e91299d',
+            'info_dict': {
+                'id': '5745975',
+                'ext': 'mp4',
+                'duration': 48,
+                'title': '1/2, Kloten - Fribourg (5-2): second but pour Gottéron par Kwiatowski',
+                'description': 'Hockey - Playoff',
+                'uploader': 'Hockey',
+                'upload_date': '20140403',
+                'timestamp': 1396556882,
+                'thumbnail': 're:^https?://.*\.image'
+            },
+            'skip': 'Blocked outside Switzerland',
+        },
+        {
+            'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html',
+            'md5': '9bb06503773c07ce83d3cbd793cebb91',
+            'info_dict': {
+                'id': '5745356',
+                'ext': 'mp4',
+                'duration': 33,
+                'title': 'Londres cachée par un épais smog',
+                'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.',
+                'uploader': 'Le Journal en continu',
+                'upload_date': '20140403',
+                'timestamp': 1396537322,
+                'thumbnail': 're:^https?://.*\.image'
+            },
+        },
+        {
+            'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html',
+            'md5': 'dd8ef6a22dff163d063e2a52bc8adcae',
+            'info_dict': {
+                'id': '5706148',
+                'ext': 'mp3',
+                'duration': 123,
+                'title': '"Urban Hippie", de Damien Krisl',
+                'description': 'Des Hippies super glam.',
+                'upload_date': '20140403',
+                'timestamp': 1396551600,
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        def download_json(internal_id):
+            return self._download_json(
+                'http://www.rts.ch/a/%s.html?f=json/article' % internal_id,
+                video_id)
+
+        all_info = download_json(video_id)
+
+        # video_id extracted out of URL is not always a real id
+        if 'video' not in all_info and 'audio' not in all_info:
+            page = self._download_webpage(url, video_id)
+            internal_id = self._html_search_regex(
+                r'<(?:video|audio) data-id="([0-9]+)"', page,
+                'internal video id')
+            all_info = download_json(internal_id)
+
+        info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio']
+
+        upload_timestamp = parse_iso8601(info.get('broadcast_date'))
+        duration = info.get('duration') or info.get('cutout') or info.get('cutduration')
+        if isinstance(duration, compat_str):
+            duration = parse_duration(duration)
+        view_count = info.get('plays')
+        thumbnail = unescapeHTML(info.get('preview_image_url'))
+
+        def extract_bitrate(url):
+            return int_or_none(self._search_regex(
+                r'-([0-9]+)k\.', url, 'bitrate', default=None))
+
+        formats = [{
+            'format_id': fid,
+            'url': furl,
+            'tbr': extract_bitrate(furl),
+        } for fid, furl in info['streams'].items()]
+
+        if 'media' in info:
+            formats.extend([{
+                'format_id': '%s-%sk' % (media['ext'], media['rate']),
+                'url': 'http://download-video.rts.ch/%s' % media['url'],
+                'tbr': media['rate'] or extract_bitrate(media['url']),
+            } for media in info['media'] if media.get('rate')])
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': info['title'],
+            'description': info.get('intro'),
+            'duration': duration,
+            'view_count': view_count,
+            'uploader': info.get('programName'),
+            'timestamp': upload_timestamp,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py
new file mode 100644 (file)
index 0000000..77fd08d
--- /dev/null
@@ -0,0 +1,84 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..utils import (
+    struct_unpack,
+)
+
+
+class RTVEALaCartaIE(InfoExtractor):
+    IE_NAME = 'rtve.es:alacarta'
+    IE_DESC = 'RTVE a la carta'
+    _VALID_URL = r'http://www\.rtve\.es/alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/',
+        'md5': '18fcd45965bdd076efdb12cd7f6d7b9e',
+        'info_dict': {
+            'id': '2491869',
+            'ext': 'mp4',
+            'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia',
+        },
+    }
+
+    def _decrypt_url(self, png):
+        encrypted_data = base64.b64decode(png)
+        text_index = encrypted_data.find(b'tEXt')
+        text_chunk = encrypted_data[text_index-4:]
+        length = struct_unpack('!I', text_chunk[:4])[0]
+        # Use bytearray to get integers when iterating in both python 2.x and 3.x
+        data = bytearray(text_chunk[8:8+length])
+        data = [chr(b) for b in data if b != 0]
+        hash_index = data.index('#')
+        alphabet_data = data[:hash_index]
+        url_data = data[hash_index+1:]
+
+        alphabet = []
+        e = 0
+        d = 0
+        for l in alphabet_data:
+            if d == 0:
+                alphabet.append(l)
+                d = e = (e + 1) % 4
+            else:
+                d -= 1
+        url = ''
+        f = 0
+        e = 3
+        b = 1
+        for letter in url_data:
+            if f == 0:
+                l = int(letter)*10
+                f = 1
+            else:
+                if e == 0:
+                    l += int(letter)
+                    url += alphabet[l]
+                    e = (b + 3) % 4
+                    f = 0
+                    b += 1
+                else:
+                    e -= 1
+
+        return url
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info = self._download_json(
+            'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id,
+            video_id)['page']['items'][0]
+        png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % video_id
+        png = self._download_webpage(png_url, video_id, 'Downloading url information')
+        video_url = self._decrypt_url(png)
+
+        return {
+            'id': video_id,
+            'title': info['title'],
+            'url': video_url,
+            'thumbnail': info['image'],
+        }
index 4922dd764eb119a4993bad607a3a01afccf99f51..357edbbdaf88c6c29395aa7878c18f305c79b216 100644 (file)
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import re
-import json
 import itertools
 
 from .common import InfoExtractor
@@ -20,8 +19,9 @@ class RutubeIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
-        'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4',
         'info_dict': {
+            'id': '3eac3b4561676c17df9132a9a1e62e3e',
+            'ext': 'mp4',
             'title': 'Раненный кенгуру забежал в аптеку',
             'description': 'http://www.ntdtv.ru ',
             'duration': 80,
@@ -38,18 +38,19 @@ class RutubeIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        
-        api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id,
-                                              video_id, 'Downloading video JSON')
-        video = json.loads(api_response)
-        
-        api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id,
-                                              video_id, 'Downloading trackinfo JSON')
-        trackinfo = json.loads(api_response)
-        
+
+        video = self._download_json(
+            'http://rutube.ru/api/video/%s/?format=json' % video_id,
+            video_id, 'Downloading video JSON')
+
         # Some videos don't have the author field
-        author = trackinfo.get('author') or {}
-        m3u8_url = trackinfo['video_balancer'].get('m3u8')
+        author = video.get('author') or {}
+
+        options = self._download_json(
+            'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
+            video_id, 'Downloading options JSON')
+
+        m3u8_url = options['video_balancer'].get('m3u8')
         if m3u8_url is None:
             raise ExtractorError('Couldn\'t find m3u8 manifest url')
 
@@ -79,10 +80,9 @@ class RutubeChannelIE(InfoExtractor):
     def _extract_videos(self, channel_id, channel_title=None):
         entries = []
         for pagenum in itertools.count(1):
-            api_response = self._download_webpage(
+            page = self._download_json(
                 self._PAGE_TEMPLATE % (channel_id, pagenum),
                 channel_id, 'Downloading page %s' % pagenum)
-            page = json.loads(api_response)
             results = page['results']
             if not results:
                 break
@@ -108,10 +108,9 @@ class RutubeMovieIE(RutubeChannelIE):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         movie_id = mobj.group('id')
-        api_response = self._download_webpage(
+        movie = self._download_json(
             self._MOVIE_TEMPLATE % movie_id, movie_id,
             'Downloading movie JSON')
-        movie = json.loads(api_response)
         movie_name = movie['name']
         return self._extract_videos(movie_id, movie_name)
 
diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
new file mode 100644 (file)
index 0000000..6c5f5a6
--- /dev/null
@@ -0,0 +1,194 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none
+)
+
+
+class RUTVIE(InfoExtractor):
+    IE_DESC = 'RUTV.RU'
+    _VALID_URL = r'''(?x)
+        https?://player\.(?:rutv\.ru|vgtrk\.com)/
+            (?P<path>flash2v/container\.swf\?id=
+            |iframe/(?P<type>swf|video|live)/id/
+            |index/iframe/cast_id/)
+            (?P<id>\d+)'''
+
+    _TESTS = [
+        {
+            'url': 'http://player.rutv.ru/flash2v/container.swf?id=774471&sid=kultura&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972347/video_id/978186/brand_id/31724',
+            'info_dict': {
+                'id': '774471',
+                'ext': 'mp4',
+                'title': 'Монологи на все времена',
+                'description': 'md5:18d8b5e6a41fb1faa53819471852d5d5',
+                'duration': 2906,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'https://player.vgtrk.com/flash2v/container.swf?id=774016&sid=russiatv&fbv=true&isPlay=true&ssl=false&i=560&acc_video_id=episode_id/972098/video_id/977760/brand_id/57638',
+            'info_dict': {
+                'id': '774016',
+                'ext': 'mp4',
+                'title': 'Чужой в семье Сталина',
+                'description': '',
+                'duration': 2539,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/swf/id/766888/sid/hitech/?acc_video_id=4000',
+            'info_dict': {
+                'id': '766888',
+                'ext': 'mp4',
+                'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+                'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+                'duration': 279,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/video/id/771852/start_zoom/true/showZoomBtn/false/sid/russiatv/?acc_video_id=episode_id/970443/video_id/975648/brand_id/5169',
+            'info_dict': {
+                'id': '771852',
+                'ext': 'mp4',
+                'title': 'Прямой эфир. Жертвы загадочной болезни: смерть от старости в 17 лет',
+                'description': 'md5:b81c8c55247a4bd996b43ce17395b2d8',
+                'duration': 3096,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://player.rutv.ru/iframe/live/id/51499/showZoomBtn/false/isPlay/true/sid/sochi2014',
+            'info_dict': {
+                'id': '51499',
+                'ext': 'flv',
+                'title': 'Сочи-2014. Биатлон. Индивидуальная гонка. Мужчины ',
+                'description': 'md5:9e0ed5c9d2fa1efbfdfed90c9a6d179c',
+            },
+            'params': {
+                # rtmp download
+                'skip_download': True,
+            },
+            'skip': 'Translation has finished',
+        },
+    ]
+
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+        if mobj:
+            return mobj.group('url')
+
+        mobj = re.search(
+            r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>http://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        video_path = mobj.group('path')
+
+        if video_path.startswith('flash2v'):
+            video_type = 'video'
+        elif video_path.startswith('iframe'):
+            video_type = mobj.group('type')
+            if video_type == 'swf':
+                video_type = 'video'
+        elif video_path.startswith('index/iframe/cast_id'):
+            video_type = 'live'
+
+        json_data = self._download_json(
+            'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
+            video_id, 'Downloading JSON')
+
+        if json_data['errors']:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, json_data['errors']), expected=True)
+
+        playlist = json_data['data']['playlist']
+        medialist = playlist['medialist']
+        media = medialist[0]
+
+        if media['errors']:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True)
+
+        view_count = playlist.get('count_views')
+        priority_transport = playlist['priority_transport']
+
+        thumbnail = media['picture']
+        width = int_or_none(media['width'])
+        height = int_or_none(media['height'])
+        description = media['anons']
+        title = media['title']
+        duration = int_or_none(media.get('duration'))
+
+        formats = []
+
+        for transport, links in media['sources'].items():
+            for quality, url in links.items():
+                if transport == 'rtmp':
+                    mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
+                    if not mobj:
+                        continue
+                    fmt = {
+                        'url': mobj.group('url'),
+                        'play_path': mobj.group('playpath'),
+                        'app': mobj.group('app'),
+                        'page_url': 'http://player.rutv.ru',
+                        'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
+                        'rtmp_live': True,
+                        'ext': 'flv',
+                        'vbr': int(quality),
+                    }
+                elif transport == 'm3u8':
+                    fmt = {
+                        'url': url,
+                        'ext': 'mp4',
+                    }
+                else:
+                    fmt = {
+                        'url': url
+                    }
+                fmt.update({
+                    'width': width,
+                    'height': height,
+                    'format_id': '%s-%s' % (transport, quality),
+                    'preference': -1 if priority_transport == transport else -2,
+                })
+                formats.append(fmt)
+
+        if not formats:
+            raise ExtractorError('No media links available for %s' % video_id)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'view_count': view_count,
+            'duration': duration,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/savefrom.py b/youtube_dl/extractor/savefrom.py
new file mode 100644 (file)
index 0000000..198a08c
--- /dev/null
@@ -0,0 +1,37 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import os.path
+import re
+
+from .common import InfoExtractor
+
+
+class SaveFromIE(InfoExtractor):
+    IE_NAME = 'savefrom.net'
+    _VALID_URL = r'https?://[^.]+\.savefrom\.net/\#url=(?P<url>.*)$'
+
+    _TEST = {
+        'url': 'http://en.savefrom.net/#url=http://youtube.com/watch?v=UlVRAPW2WJY&utm_source=youtube.com&utm_medium=short_domains&utm_campaign=ssyoutube.com',
+        'info_dict': {
+            'id': 'UlVRAPW2WJY',
+            'ext': 'mp4',
+            'title': 'About Team Radical MMA | MMA Fighting',
+            'upload_date': '20120816',
+            'uploader': 'Howcast',
+            'uploader_id': 'Howcast',
+            'description': 'md5:4f0aac94361a12e1ce57d74f85265175',
+        },
+        'params': {
+            'skip_download': True
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = os.path.splitext(url.split('/')[-1])[0]
+        return {
+            '_type': 'url',
+            'id': video_id,
+            'url': mobj.group('url'),
+        }
diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py
new file mode 100644 (file)
index 0000000..55a481c
--- /dev/null
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class SciVeeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?scivee\.tv/node/(?P<id>\d+)'
+
+    _TEST = {
+        'url': 'http://www.scivee.tv/node/62352',
+        'md5': 'b16699b74c9e6a120f6772a44960304f',
+        'info_dict': {
+            'id': '62352',
+            'ext': 'mp4',
+            'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting',
+            'description': 'md5:81f1710638e11a481358fab1b11059d7',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        # annotations XML is malformed
+        annotations = self._download_webpage(
+            'http://www.scivee.tv/assets/annotations/%s' % video_id, video_id, 'Downloading annotations')
+
+        title = self._html_search_regex(r'<title>([^<]+)</title>', annotations, 'title')
+        description = self._html_search_regex(r'<abstract>([^<]+)</abstract>', annotations, 'abstract', fatal=False)
+        filesize = int_or_none(self._html_search_regex(
+            r'<filesize>([^<]+)</filesize>', annotations, 'filesize', fatal=False))
+
+        formats = [
+            {
+                'url': 'http://www.scivee.tv/assets/audio/%s' % video_id,
+                'ext': 'mp3',
+                'format_id': 'audio',
+            },
+            {
+                'url': 'http://www.scivee.tv/assets/video/%s' % video_id,
+                'ext': 'mp4',
+                'format_id': 'video',
+                'filesize': filesize,
+            },
+        ]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': 'http://www.scivee.tv/assets/videothumb/%s' % video_id,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/slashdot.py b/youtube_dl/extractor/slashdot.py
deleted file mode 100644 (file)
index d68646d..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-import re
-
-from .common import InfoExtractor
-
-
-class SlashdotIE(InfoExtractor):
-    _VALID_URL = r'https?://tv\.slashdot\.org/video/\?embed=(?P<id>.*?)(&|$)'
-
-    _TEST = {
-        u'add_ie': ['Ooyala'],
-        u'url': u'http://tv.slashdot.org/video/?embed=JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz',
-        u'file': u'JscHMzZDplD0p-yNLOzTfzC3Q3xzJaUz.mp4',
-        u'md5': u'd2222e7a4a4c1541b3e0cf732fb26735',
-        u'info_dict': {
-            u'title': u' Meet the Stampede Supercomputing Cluster\'s Administrator',
-        },
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        webpage = self._download_webpage(url, video_id)
-        ooyala_url = self._search_regex(r'<script src="(.*?)"', webpage, 'ooyala url')
-        return self.url_result(ooyala_url, 'Ooyala')
index 9c62825cc7f7cab2a4023a74e36307b48c280bd6..53c3c9220374737b88dc516ec810ecb6865b74f7 100644 (file)
@@ -39,7 +39,8 @@ class SlideshareIE(InfoExtractor):
         ext = info['jsplayer']['video_extension']
         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
         description = self._html_search_regex(
-            r'<p class="description.*?"[^>]*>(.*?)</p>', webpage, 'description')
+            r'<p\s+(?:style="[^"]*"\s+)?class="description.*?"[^>]*>(.*?)</p>', webpage,
+            'description', fatal=False)
 
         return {
             '_type': 'video',
diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py
new file mode 100644 (file)
index 0000000..ecc0abf
--- /dev/null
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+)
+
+
+class SlutloadIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
+    _TEST = {
+        'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
+        'md5': '0cf531ae8006b530bd9df947a6a0df77',
+        'info_dict': {
+            'id': 'TD73btpBqSxc',
+            'ext': 'mp4',
+            "title": "virginie baisee en cam",
+            "age_limit": 18,
+            'thumbnail': 're:https?://.*?\.jpg'
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
+            webpage, 'title').strip()
+
+        video_url = self._html_search_regex(
+            r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
+            webpage, 'video URL')
+        thumbnail = self._html_search_regex(
+            r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'thumbnail': thumbnail,
+            'age_limit': 18
+        }
index 540c557039fd5876ad32af816152fdc912a87e69..13e7e71cb37b4d7b5ec2e5ab2c341551e7e05f28 100644 (file)
@@ -13,22 +13,24 @@ from ..utils import (
     compat_urllib_request,
     ExtractorError,
     url_basename,
+    int_or_none,
 )
 
 
 class SmotriIE(InfoExtractor):
     IE_DESC = 'Smotri.com'
     IE_NAME = 'smotri'
-    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
+    _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
     _NETRC_MACHINE = 'smotri'
 
     _TESTS = [
         # real video id 2610366
         {
             'url': 'http://smotri.com/video/view/?id=v261036632ab',
-            'file': 'v261036632ab.mp4',
             'md5': '2a7b08249e6f5636557579c368040eb9',
             'info_dict': {
+                'id': 'v261036632ab',
+                'ext': 'mp4',
                 'title': 'катастрофа с камер видеонаблюдения',
                 'uploader': 'rbc2008',
                 'uploader_id': 'rbc08',
@@ -40,9 +42,10 @@ class SmotriIE(InfoExtractor):
         # real video id 57591
         {
             'url': 'http://smotri.com/video/view/?id=v57591cb20',
-            'file': 'v57591cb20.flv',
             'md5': '830266dfc21f077eac5afd1883091bcd',
             'info_dict': {
+                'id': 'v57591cb20',
+                'ext': 'flv',
                 'title': 'test',
                 'uploader': 'Support Photofile@photofile',
                 'uploader_id': 'support-photofile',
@@ -54,9 +57,10 @@ class SmotriIE(InfoExtractor):
         # video-password
         {
             'url': 'http://smotri.com/video/view/?id=v1390466a13c',
-            'file': 'v1390466a13c.mp4',
             'md5': 'f6331cef33cad65a0815ee482a54440b',
             'info_dict': {
+                'id': 'v1390466a13c',
+                'ext': 'mp4',
                 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
                 'uploader': 'timoxa40',
                 'uploader_id': 'timoxa40',
@@ -71,9 +75,10 @@ class SmotriIE(InfoExtractor):
         # age limit + video-password
         {
             'url': 'http://smotri.com/video/view/?id=v15408898bcf',
-            'file': 'v15408898bcf.flv',
             'md5': '91e909c9f0521adf5ee86fbe073aad70',
             'info_dict': {
+                'id': 'v15408898bcf',
+                'ext': 'flv',
                 'title': 'этот ролик не покажут по ТВ',
                 'uploader': 'zzxxx',
                 'uploader_id': 'ueggb',
@@ -85,7 +90,22 @@ class SmotriIE(InfoExtractor):
             'params': {
                 'videopassword': '333'
             }
-        }
+        },
+        # swf player
+        {
+            'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
+            'md5': '4d47034979d9390d14acdf59c4935bc2',
+            'info_dict': {
+                'id': 'v9188090500',
+                'ext': 'mp4',
+                'title': 'Shakira - Don\'t Bother',
+                'uploader': 'HannahL',
+                'uploader_id': 'lisaha95',
+                'upload_date': '20090331',
+                'description': 'Shakira - Don\'t Bother, видео Shakira - Don\'t Bother',
+                'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
+            },
+        },
     ]
 
     _SUCCESS = 0
@@ -93,6 +113,21 @@ class SmotriIE(InfoExtractor):
     _PASSWORD_DETECTED = 2
     _VIDEO_NOT_FOUND = 3
 
+    @classmethod
+    def _extract_url(cls, webpage):
+        mobj = re.search(
+            r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
+            webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
+        mobj = re.search(
+            r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
+                    <div\s+class="video_image">[^<]+</div>\s*
+                    <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
+        if mobj is not None:
+            return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
+
     def _search_meta(self, name, html, display_name=None):
         if display_name is None:
             display_name = name
@@ -134,7 +169,7 @@ class SmotriIE(InfoExtractor):
 
         # Video JSON does not provide enough meta data
         # We will extract some from the video web page instead
-        video_page_url = 'http://' + mobj.group('url')
+        video_page_url = 'http://smotri.com/video/view/?id=%s' % video_id
         video_page = self._download_webpage(video_page_url, video_id, 'Downloading video page')
 
         # Warning if video is unavailable
@@ -222,7 +257,7 @@ class SmotriIE(InfoExtractor):
             'upload_date': video_upload_date,
             'uploader_id': video_uploader_id,
             'duration': video_duration,
-            'view_count': video_view_count,
+            'view_count': int_or_none(video_view_count),
             'age_limit': 18 if adult_content else 0,
             'video_page_url': video_page_url
         }
index 393b5f17c53d5ed216b53ac0d1ff2941cb1d24f7..25515f0686b0725075005da7f93f17544bd1b1ea 100644 (file)
@@ -12,6 +12,7 @@ from ..utils import (
     compat_urllib_parse,
 
     ExtractorError,
+    int_or_none,
     unified_strdate,
 )
 
@@ -25,7 +26,7 @@ class SoundcloudIE(InfoExtractor):
        of the stream token and uid
      """
 
-    _VALID_URL = r'''^(?:https?://)?
+    _VALID_URL = r'''(?x)^(?:https?://)?
                     (?:(?:(?:www\.|m\.)?soundcloud\.com/
                             (?P<uploader>[\w\d-]+)/
                             (?!sets/)(?P<title>[\w\d-]+)/?
@@ -44,7 +45,8 @@ class SoundcloudIE(InfoExtractor):
                 "upload_date": "20121011",
                 "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd",
                 "uploader": "E.T. ExTerrestrial Music",
-                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1"
+                "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1",
+                "duration": 143,
             }
         },
         # not streamable song
@@ -54,8 +56,10 @@ class SoundcloudIE(InfoExtractor):
                 'id': '47127627',
                 'ext': 'mp3',
                 'title': 'Goldrushed',
+                'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
                 'uploader': 'The Royal Concept',
                 'upload_date': '20120521',
+                'duration': 227,
             },
             'params': {
                 # rtmp
@@ -73,6 +77,7 @@ class SoundcloudIE(InfoExtractor):
                 'uploader': 'jaimeMF',
                 'description': 'test chars:  \"\'/\\ä↭',
                 'upload_date': '20131209',
+                'duration': 9,
             },
         },
         # downloadable song
@@ -86,6 +91,7 @@ class SoundcloudIE(InfoExtractor):
                 'description': 'Vocals',
                 'uploader': 'Sim Gretina',
                 'upload_date': '20130815',
+                #'duration': 42,
             },
         },
     ]
@@ -93,13 +99,9 @@ class SoundcloudIE(InfoExtractor):
     _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
     _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
 
-    @classmethod
-    def suitable(cls, url):
-        return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
-
     def report_resolve(self, video_id):
         """Report information extraction."""
-        self.to_screen(u'%s: Resolving id' % video_id)
+        self.to_screen('%s: Resolving id' % video_id)
 
     @classmethod
     def _resolv_url(cls, url):
@@ -122,46 +124,47 @@ class SoundcloudIE(InfoExtractor):
             'title': info['title'],
             'description': info['description'],
             'thumbnail': thumbnail,
+            'duration': int_or_none(info.get('duration'), 1000),
         }
+        formats = []
         if info.get('downloadable', False):
             # We can build a direct link to the song
             format_url = (
                 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
                     track_id, self._CLIENT_ID))
-            result['formats'] = [{
+            formats.append({
                 'format_id': 'download',
                 'ext': info.get('original_format', 'mp3'),
                 'url': format_url,
                 'vcodec': 'none',
-            }]
-        else:
-            # We have to retrieve the url
-            streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
-                'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
-            stream_json = self._download_webpage(
-                streams_url,
-                track_id, 'Downloading track url')
-
-            formats = []
-            format_dict = json.loads(stream_json)
-            for key, stream_url in format_dict.items():
-                if key.startswith(u'http'):
-                    formats.append({
-                        'format_id': key,
-                        'ext': ext,
-                        'url': stream_url,
-                        'vcodec': 'none',
-                    })
-                elif key.startswith(u'rtmp'):
-                    # The url doesn't have an rtmp app, we have to extract the playpath
-                    url, path = stream_url.split('mp3:', 1)
-                    formats.append({
-                        'format_id': key,
-                        'url': url,
-                        'play_path': 'mp3:' + path,
-                        'ext': ext,
-                        'vcodec': 'none',
-                    })
+                'preference': 10,
+            })
+
+        # We have to retrieve the url
+        streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?'
+            'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token))
+        format_dict = self._download_json(
+            streams_url,
+            track_id, 'Downloading track url')
+
+        for key, stream_url in format_dict.items():
+            if key.startswith('http'):
+                formats.append({
+                    'format_id': key,
+                    'ext': ext,
+                    'url': stream_url,
+                    'vcodec': 'none',
+                })
+            elif key.startswith('rtmp'):
+                # The url doesn't have an rtmp app, we have to extract the playpath
+                url, path = stream_url.split('mp3:', 1)
+                formats.append({
+                    'format_id': key,
+                    'url': url,
+                    'play_path': 'mp3:' + path,
+                    'ext': ext,
+                    'vcodec': 'none',
+                })
 
             if not formats:
                 # We fallback to the stream_url in the original info, this
@@ -187,7 +190,7 @@ class SoundcloudIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
         if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError('Invalid URL: %s' % url)
 
         track_id = mobj.group('track_id')
         token = None
@@ -196,7 +199,7 @@ class SoundcloudIE(InfoExtractor):
             full_title = track_id
         elif mobj.group('player'):
             query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-            return self.url_result(query['url'][0], ie='Soundcloud')
+            return self.url_result(query['url'][0])
         else:
             # extract uploader (which is in the url)
             uploader = mobj.group('uploader')
@@ -211,13 +214,13 @@ class SoundcloudIE(InfoExtractor):
     
             url = 'http://soundcloud.com/%s' % resolve_title
             info_json_url = self._resolv_url(url)
-        info_json = self._download_webpage(info_json_url, full_title, 'Downloading info JSON')
+        info = self._download_json(info_json_url, full_title, 'Downloading info JSON')
 
-        info = json.loads(info_json)
         return self._extract_info_dict(info, full_title, secret_token=token)
 
+
 class SoundcloudSetIE(SoundcloudIE):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
+    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
     IE_NAME = 'soundcloud:set'
     # it's in tests/test_playlists.py
     _TESTS = []
@@ -225,24 +228,23 @@ class SoundcloudSetIE(SoundcloudIE):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
+            raise ExtractorError('Invalid URL: %s' % url)
 
         # extract uploader (which is in the url)
         uploader = mobj.group(1)
         # extract simple title (uploader + slug of song title)
-        slug_title =  mobj.group(2)
+        slug_title = mobj.group(2)
         full_title = '%s/sets/%s' % (uploader, slug_title)
 
         self.report_resolve(full_title)
 
         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
         resolv_url = self._resolv_url(url)
-        info_json = self._download_webpage(resolv_url, full_title)
+        info = self._download_json(resolv_url, full_title)
 
-        info = json.loads(info_json)
         if 'errors' in info:
             for err in info['errors']:
-                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
+                self._downloader.report_error('unable to download video webpage: %s' % compat_str(err['error_message']))
             return
 
         self.report_extraction(full_title)
@@ -266,26 +268,55 @@ class SoundcloudUserIE(SoundcloudIE):
 
         url = 'http://soundcloud.com/%s/' % uploader
         resolv_url = self._resolv_url(url)
-        user_json = self._download_webpage(resolv_url, uploader,
-            'Downloading user info')
-        user = json.loads(user_json)
+        user = self._download_json(
+            resolv_url, uploader, 'Downloading user info')
+        base_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % uploader
 
-        tracks = []
+        entries = []
         for i in itertools.count():
-            data = compat_urllib_parse.urlencode({'offset': i*50,
-                                                  'client_id': self._CLIENT_ID,
-                                                  })
-            tracks_url = 'http://api.soundcloud.com/users/%s/tracks.json?' % user['id'] + data
-            response = self._download_webpage(tracks_url, uploader, 
-                'Downloading tracks page %s' % (i+1))
-            new_tracks = json.loads(response)
-            tracks.extend(self._extract_info_dict(track, quiet=True) for track in new_tracks)
-            if len(new_tracks) < 50:
+            data = compat_urllib_parse.urlencode({
+                'offset': i * 50,
+                'client_id': self._CLIENT_ID,
+            })
+            new_entries = self._download_json(
+                base_url + data, uploader, 'Downloading track page %s' % (i + 1))
+            entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries)
+            if len(new_entries) < 50:
                 break
 
         return {
             '_type': 'playlist',
             'id': compat_str(user['id']),
             'title': user['username'],
-            'entries': tracks,
+            'entries': entries,
+        }
+
+
+class SoundcloudPlaylistIE(SoundcloudIE):
+    _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)'
+    IE_NAME = 'soundcloud:playlist'
+
+     # it's in tests/test_playlists.py
+    _TESTS = []
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+        base_url = '%s//api.soundcloud.com/playlists/%s.json?' % (self.http_scheme(), playlist_id)
+
+        data = compat_urllib_parse.urlencode({
+            'client_id': self._CLIENT_ID,
+        })
+        data = self._download_json(
+            base_url + data, playlist_id, 'Downloading playlist')
+
+        entries = [
+            self._extract_info_dict(t, quiet=True) for t in data['tracks']]
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': data.get('title'),
+            'description': data.get('description'),
+            'entries': entries,
         }
index 4a3e52ad8c02d0c7ccf653d761fbf95c63ed7bc8..d34aefeaa24a2b8b307005e9164ad9349b638e88 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -8,14 +10,14 @@ from ..utils import RegexNotFoundError, ExtractorError
 class SpaceIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:www|m)\.)?space\.com/\d+-(?P<title>[^/\.\?]*?)-video\.html'
     _TEST = {
-        u'add_ie': ['Brightcove'],
-        u'url': u'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
-        u'info_dict': {
-            u'id': u'2780937028001',
-            u'ext': u'mp4',
-            u'title': u'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
-            u'description': u'md5:db81cf7f3122f95ed234b631a6ea1e61',
-            u'uploader': u'TechMedia Networks',
+        'add_ie': ['Brightcove'],
+        'url': 'http://www.space.com/23373-huge-martian-landforms-detail-revealed-by-european-probe-video.html',
+        'info_dict': {
+            'id': '2780937028001',
+            'ext': 'mp4',
+            'title': 'Huge Martian Landforms\' Detail Revealed By European Probe | Video',
+            'description': 'md5:db81cf7f3122f95ed234b631a6ea1e61',
+            'uploader': 'TechMedia Networks',
         },
     }
 
index 3362b3db85c65c97d8839d8707e36fecf8bc6646..2007a00134dfe73cd721dbee9e86e4f349a2e034 100644 (file)
@@ -1,6 +1,5 @@
 from __future__ import unicode_literals
 
-import os
 import re
 
 from .common import InfoExtractor
@@ -8,23 +7,27 @@ from ..utils import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urllib_parse,
+    unified_strdate,
+    str_to_int,
+    int_or_none,
 )
-from ..aes import (
-    aes_decrypt_text
-)
+from ..aes import aes_decrypt_text
 
 
 class SpankwireIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>spankwire\.com/[^/]*/video(?P<videoid>[0-9]+)/?)'
     _TEST = {
         'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/',
-        'file': '103545.mp4',
-        'md5': '1b3f55e345500552dbc252a3e9c1af43',
+        'md5': '8bbfde12b101204b39e4b9fe7eb67095',
         'info_dict': {
-            "uploader": "oreusz",
-            "title": "Buckcherry`s X Rated Music Video Crazy Bitch",
-            "description": "Crazy Bitch X rated music video.",
-            "age_limit": 18,
+            'id': '103545',
+            'ext': 'mp4',
+            'title': 'Buckcherry`s X Rated Music Video Crazy Bitch',
+            'description': 'Crazy Bitch X rated music video.',
+            'uploader': 'oreusz',
+            'uploader_id': '124697',
+            'upload_date': '20070508',
+            'age_limit': 18,
         }
     }
 
@@ -37,13 +40,26 @@ class SpankwireIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
-        video_uploader = self._html_search_regex(
-            r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
-        thumbnail = self._html_search_regex(
-            r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
+        title = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
         description = self._html_search_regex(
             r'<div\s+id="descriptionContent">([^<]+)<', webpage, 'description', fatal=False)
+        thumbnail = self._html_search_regex(
+            r'flashvars\.image_url = "([^"]+)', webpage, 'thumbnail', fatal=False)
+
+        uploader = self._html_search_regex(
+            r'by:\s*<a [^>]*>(.+?)</a>', webpage, 'uploader', fatal=False)
+        uploader_id = self._html_search_regex(
+            r'by:\s*<a href="/Profile\.aspx\?.*?UserId=(\d+).*?"', webpage, 'uploader id', fatal=False)
+        upload_date = self._html_search_regex(r'</a> on (.+?) at \d+:\d+', webpage, 'upload date', fatal=False)
+        if upload_date:
+            upload_date = unified_strdate(upload_date)
+        
+        view_count = self._html_search_regex(
+            r'<div id="viewsCounter"><span>([^<]+)</span> views</div>', webpage, 'view count', fatal=False)
+        if view_count:
+            view_count = str_to_int(view_count)
+        comment_count = int_or_none(self._html_search_regex(
+            r'<span id="spCommentCount">\s*(\d+)</span> Comments</div>', webpage, 'comment count', fatal=False))
 
         video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
         if webpage.find('flashvars\.encrypted = "true"') != -1:
@@ -53,16 +69,13 @@ class SpankwireIE(InfoExtractor):
         formats = []
         for video_url in video_urls:
             path = compat_urllib_parse_urlparse(video_url).path
-            extension = os.path.splitext(path)[1][1:]
             format = path.split('/')[4].split('_')[:2]
             resolution, bitrate_str = format
             format = "-".join(format)
-            height = int(resolution.rstrip('P'))
-            tbr = int(bitrate_str.rstrip('K'))
-
+            height = int(resolution.rstrip('Pp'))
+            tbr = int(bitrate_str.rstrip('Kk'))
             formats.append({
                 'url': video_url,
-                'ext': extension,
                 'resolution': resolution,
                 'format': format,
                 'tbr': tbr,
@@ -75,10 +88,14 @@ class SpankwireIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'uploader': video_uploader,
-            'title': video_title,
-            'thumbnail': thumbnail,
+            'title': title,
             'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'comment_count': comment_count,
             'formats': formats,
             'age_limit': age_limit,
         }
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
new file mode 100644 (file)
index 0000000..7f388ac
--- /dev/null
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+from .common import InfoExtractor
+
+
+class SpiegeltvIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/filme/(?P<id>[\-a-z0-9]+)'
+    _TEST = {
+        'url': 'http://www.spiegel.tv/filme/flug-mh370/',
+        'info_dict': {
+            'id': 'flug-mh370',
+            'ext': 'm4v',
+            'title': 'Flug MH370',
+            'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines',
+            'thumbnail': 're:http://.*\.jpg$',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title')
+
+        apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com'
+        version_json = self._download_json(
+            '%s/version.json' % apihost, video_id,
+            note='Downloading version information')
+        version_name = version_json['version_name']
+
+        slug_json = self._download_json(
+            '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id),
+            video_id,
+            note='Downloading object information')
+        oid = slug_json['object_id']
+
+        media_json = self._download_json(
+            '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid),
+            video_id, note='Downloading media information')
+        uuid = media_json['uuid']
+        is_wide = media_json['is_wide']
+
+        server_json = self._download_json(
+            'http://www.spiegel.tv/streaming_servers/', video_id,
+            note='Downloading server information')
+        server = server_json[0]['endpoint']
+
+        thumbnails = []
+        for image in media_json['images']:
+            thumbnails.append({
+                'url': image['url'],
+                'width': image['width'],
+                'height': image['height'],
+            })
+
+        description = media_json['subtitle']
+        duration = media_json['duration_in_ms'] / 1000.
+
+        if is_wide:
+            format = '16x9'
+        else:
+            format = '4x3'
+
+        url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v'
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': url,
+            'ext': 'm4v',
+            'description': description,
+            'duration': duration,
+            'thumbnails': thumbnails
+        }
\ No newline at end of file
index 56682ac45595344c80ff06fc9861ae54469b4a95..a3adf54e3097a5f91a5617c24f286cae5e374e24 100644 (file)
@@ -1,10 +1,15 @@
 from __future__ import unicode_literals
 
+import re
+
 from .mtv import MTVServicesInfoExtractor
 
 
 class SpikeIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'https?://www\.spike\.com/(video-clips|episodes)/.+'
+    _VALID_URL = r'''(?x)https?://
+        (www\.spike\.com/(video-clips|episodes)/.+|
+         m\.spike\.com/videos/video.rbml\?id=(?P<mobile_id>[^&]+))
+        '''
     _TEST = {
         'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
         'md5': '1a9265f32b0c375793d6c4ce45255256',
@@ -17,3 +22,11 @@ class SpikeIE(MTVServicesInfoExtractor):
     }
 
     _FEED_URL = 'http://www.spike.com/feeds/mrss/'
+    _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s'
+
+    def _real_extract(self, url):
+        mobj = re.search(self._VALID_URL, url)
+        mobile_id = mobj.group('mobile_id')
+        if mobile_id is not None:
+            url = 'http://www.spike.com/video-clips/%s' % mobile_id
+        return super(SpikeIE, self)._real_extract(url)
index 91658f8925cac6199bda5f7aa05aa0a2a73e85e4..1d8d5722468a4dcf763d1c1b36a91ab1cf69b0e3 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -8,78 +10,114 @@ from ..utils import (
 
 
 class SteamIE(InfoExtractor):
-    _VALID_URL = r"""http://store\.steampowered\.com/
-                (agecheck/)?
-                (?P<urltype>video|app)/ #If the page is only for videos or for a game
-                (?P<gameID>\d+)/?
-                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
-                """
+    _VALID_URL = r"""(?x)
+        https?://store\.steampowered\.com/
+            (agecheck/)?
+            (?P<urltype>video|app)/ #If the page is only for videos or for a game
+            (?P<gameID>\d+)/?
+            (?P<videoID>\d*)(?P<extra>\??) # For urltype == video we sometimes get the videoID
+        |
+        https?://(?:www\.)?steamcommunity\.com/sharedfiles/filedetails/\?id=(?P<fileID>[0-9]+)
+    """
     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
-    _TEST = {
-        u"url": u"http://store.steampowered.com/video/105600/",
-        u"playlist": [
+    _TESTS = [{
+        "url": "http://store.steampowered.com/video/105600/",
+        "playlist": [
             {
-                u"file": u"81300.flv",
-                u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
-                u"info_dict": {
-                        u"title": u"Terraria 1.1 Trailer",
-                        u'playlist_index': 1,
+                "md5": "f870007cee7065d7c76b88f0a45ecc07",
+                "info_dict": {
+                    'id': '81300',
+                    'ext': 'flv',
+                    "title": "Terraria 1.1 Trailer",
+                    'playlist_index': 1,
                 }
             },
             {
-                u"file": u"80859.flv",
-                u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
-                u"info_dict": {
-                    u"title": u"Terraria Trailer",
-                    u'playlist_index': 2,
+                "md5": "61aaf31a5c5c3041afb58fb83cbb5751",
+                "info_dict": {
+                    'id': '80859',
+                    'ext': 'flv',
+                    "title": "Terraria Trailer",
+                    'playlist_index': 2,
                 }
             }
-        ]
-    }
-
-
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+        ],
+        'params': {
+            'playlistend': 2,
+        }
+    }, {
+        'url': 'http://steamcommunity.com/sharedfiles/filedetails/?id=242472205',
+        'info_dict': {
+            'id': 'WB5DvDOOvAY',
+            'ext': 'mp4',
+            'upload_date': '20140329',
+            'title': 'FRONTIERS - Final Greenlight Trailer',
+            'description': "The final trailer for the Steam Greenlight launch. Hooray, progress! Here's the official Greenlight page: http://steamcommunity.com/sharedfiles/filedetails/?id=242472205",
+            'uploader': 'AAD Productions',
+            'uploader_id': 'AtomicAgeDogGames',
+        }
+    }]
 
     def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url, re.VERBOSE)
-        gameID = m.group('gameID')
-
-        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
-        webpage = self._download_webpage(videourl, gameID)
+        m = re.match(self._VALID_URL, url)
+        fileID = m.group('fileID')
+        if fileID:
+            videourl = url
+            playlist_id = fileID
+        else:
+            gameID = m.group('gameID')
+            playlist_id = gameID
+            videourl = self._VIDEO_PAGE_TEMPLATE % playlist_id
+        webpage = self._download_webpage(videourl, playlist_id)
 
         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
-            videourl = self._AGECHECK_TEMPLATE % gameID
+            videourl = self._AGECHECK_TEMPLATE % playlist_id
             self.report_age_confirmation()
-            webpage = self._download_webpage(videourl, gameID)
+            webpage = self._download_webpage(videourl, playlist_id)
+
+        if fileID:
+            playlist_title = self._html_search_regex(
+                r'<div class="workshopItemTitle">(.+)</div>', webpage, 'title')
+            mweb = re.finditer(r'''(?x)
+                'movie_(?P<videoID>[0-9]+)':\s*\{\s*
+                YOUTUBE_VIDEO_ID:\s*"(?P<youtube_id>[^"]+)",
+                ''', webpage)
+            videos = [{
+                '_type': 'url',
+                'url': vid.group('youtube_id'),
+                'ie_key': 'Youtube',
+            } for vid in mweb]
+        else:
+            playlist_title = self._html_search_regex(
+                r'<h2 class="pageheader">(.*?)</h2>', webpage, 'game title')
+
+            mweb = re.finditer(r'''(?x)
+                'movie_(?P<videoID>[0-9]+)':\s*\{\s*
+                FILENAME:\s*"(?P<videoURL>[\w:/\.\?=]+)"
+                (,\s*MOVIE_NAME:\s*\"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},
+                ''', webpage)
+            titles = re.finditer(
+                r'<span class="title">(?P<videoName>.+?)</span>', webpage)
+            thumbs = re.finditer(
+                r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)
+            videos = []
 
-        self.report_extraction(gameID)
-        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
-                                             webpage, 'game title')
+            for vid, vtitle, thumb in zip(mweb, titles, thumbs):
+                video_id = vid.group('videoID')
+                title = vtitle.group('videoName')
+                video_url = vid.group('videoURL')
+                video_thumb = thumb.group('thumbnail')
+                if not video_url:
+                    raise ExtractorError('Cannot find video url for %s' % video_id)
+                videos.append({
+                    'id': video_id,
+                    'url': video_url,
+                    'ext': 'flv',
+                    'title': unescapeHTML(title),
+                    'thumbnail': video_thumb
+                })
+        if not videos:
+            raise ExtractorError('Could not find any videos')
 
-        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
-        mweb = re.finditer(urlRE, webpage)
-        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
-        titles = re.finditer(namesRE, webpage)
-        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
-        thumbs = re.finditer(thumbsRE, webpage)
-        videos = []
-        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
-            video_id = vid.group('videoID')
-            title = vtitle.group('videoName')
-            video_url = vid.group('videoURL')
-            video_thumb = thumb.group('thumbnail')
-            if not video_url:
-                raise ExtractorError(u'Cannot find video url for %s' % video_id)
-            info = {
-                'id':video_id,
-                'url':video_url,
-                'ext': 'flv',
-                'title': unescapeHTML(title),
-                'thumbnail': video_thumb
-                  }
-            videos.append(info)
-        return [self.playlist_result(videos, gameID, game_title)]
+        return self.playlist_result(videos, playlist_id, playlist_title)
index 7362904db50588acb6f1f988b7fc6665f9b8ce2e..73efe95420ff7b83412864de02d8d5601690b537 100644 (file)
@@ -5,13 +5,16 @@ import re
 import json
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    compat_str,
+)
 
 
 class StreamCZIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?stream\.cz/.+/(?P<videoid>.+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.stream.cz/peklonataliri/765767-ecka-pro-deti',
         'md5': '6d3ca61a8d0633c9c542b92fcb936b0c',
         'info_dict': {
@@ -22,7 +25,18 @@ class StreamCZIE(InfoExtractor):
             'thumbnail': 'http://im.stream.cz/episode/52961d7e19d423f8f06f0100',
             'duration': 256,
         },
-    }
+    }, {
+        'url': 'http://www.stream.cz/blanik/10002447-tri-roky-pro-mazanka',
+        'md5': '246272e753e26bbace7fcd9deca0650c',
+        'info_dict': {
+            'id': '10002447',
+            'ext': 'mp4',
+            'title': 'Kancelář Blaník: Tři roky pro Mazánka',
+            'description': 'md5:9177695a8b756a0a8ab160de4043b392',
+            'thumbnail': 'http://im.stream.cz/episode/537f838c50c11f8d21320000',
+            'duration': 368,
+        },
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -57,7 +71,7 @@ class StreamCZIE(InfoExtractor):
         self._sort_formats(formats)
 
         return {
-            'id': str(jsonData['id']),
+            'id': compat_str(jsonData['episode_id']),
             'title': self._og_search_title(webpage),
             'thumbnail': jsonData['episode_image_original_url'].replace('//', 'http://'),
             'formats': formats,
diff --git a/youtube_dl/extractor/swrmediathek.py b/youtube_dl/extractor/swrmediathek.py
new file mode 100644 (file)
index 0000000..6c688c5
--- /dev/null
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class SWRMediathekIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?swrmediathek\.de/player\.htm\?show=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+    _TESTS = [{
+        'url': 'http://swrmediathek.de/player.htm?show=849790d0-dab8-11e3-a953-0026b975f2e6',
+        'md5': '8c5f6f0172753368547ca8413a7768ac',
+        'info_dict': {
+            'id': '849790d0-dab8-11e3-a953-0026b975f2e6',
+            'ext': 'mp4',
+            'title': 'SWR odysso',
+            'description': 'md5:2012e31baad36162e97ce9eb3f157b8a',
+            'thumbnail': 're:^http:.*\.jpg$',
+            'duration': 2602,
+            'upload_date': '20140515',
+            'uploader': 'SWR Fernsehen',
+            'uploader_id': '990030',
+        },
+    }, {
+        'url': 'http://swrmediathek.de/player.htm?show=0e1a8510-ddf2-11e3-9be3-0026b975f2e6',
+        'md5': 'b10ab854f912eecc5a6b55cd6fc1f545',
+        'info_dict': {
+            'id': '0e1a8510-ddf2-11e3-9be3-0026b975f2e6',
+            'ext': 'mp4',
+            'title': 'Nachtcafé - Alltagsdroge Alkohol - zwischen Sektempfang und Komasaufen',
+            'description': 'md5:e0a3adc17e47db2c23aab9ebc36dbee2',
+            'thumbnail': 're:http://.*\.jpg',
+            'duration': 5305,
+            'upload_date': '20140516',
+            'uploader': 'SWR Fernsehen',
+            'uploader_id': '990030',
+        },
+    }, {
+        'url': 'http://swrmediathek.de/player.htm?show=bba23e10-cb93-11e3-bf7f-0026b975f2e6',
+        'md5': '4382e4ef2c9d7ce6852535fa867a0dd3',
+        'info_dict': {
+            'id': 'bba23e10-cb93-11e3-bf7f-0026b975f2e6',
+            'ext': 'mp3',
+            'title': 'Saša Stanišic: Vor dem Fest',
+            'description': 'md5:5b792387dc3fbb171eb709060654e8c9',
+            'thumbnail': 're:http://.*\.jpg',
+            'duration': 3366,
+            'upload_date': '20140520',
+            'uploader': 'SWR 2',
+            'uploader_id': '284670',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        video = self._download_json(
+            'http://swrmediathek.de/AjaxEntry?ekey=%s' % video_id, video_id, 'Downloading video JSON')
+
+        attr = video['attr']
+        media_type = attr['entry_etype']
+
+        formats = []
+        for entry in video['sub']:
+            if entry['name'] != 'entry_media':
+                continue
+
+            entry_attr = entry['attr']
+            codec = entry_attr['val0']
+            quality = int(entry_attr['val1'])
+
+            fmt = {
+                'url': entry_attr['val2'],
+                'quality': quality,
+            }
+
+            if media_type == 'Video':
+                fmt.update({
+                    'format_note': ['144p', '288p', '544p'][quality-1],
+                    'vcodec': codec,
+                })
+            elif media_type == 'Audio':
+                fmt.update({
+                    'acodec': codec,
+                })
+            formats.append(fmt)
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': attr['entry_title'],
+            'description': attr['entry_descl'],
+            'thumbnail': attr['entry_image_16_9'],
+            'duration': parse_duration(attr['entry_durat']),
+            'upload_date': attr['entry_pdatet'][:-4],
+            'uploader': attr['channel_title'],
+            'uploader_id': attr['channel_idkey'],
+            'formats': formats,
+        }
\ No newline at end of file
index 8809a57fe31d23ea081e553302ef38222fb02221..f76b6e2b22c7fa391664218f9e59fa62908c4c08 100644 (file)
@@ -6,9 +6,9 @@ from .common import InfoExtractor
 
 
 class SyfyIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.syfy\.com/videos/.+?vid:(?P<id>\d+)'
+    _VALID_URL = r'https?://www\.syfy\.com/(?:videos/.+?vid:(?P<id>[0-9]+)|(?!videos)(?P<video_name>[^/]+)(?:$|[?#]))'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458',
         'md5': 'e07de1d52c7278adbb9b9b1c93a66849',
         'info_dict': {
@@ -18,10 +18,30 @@ class SyfyIE(InfoExtractor):
             'description': 'Listen to what insights George Lucas give his daughter Amanda.',
         },
         'add_ie': ['ThePlatform'],
-    }
+    }, {
+        'url': 'http://www.syfy.com/wilwheaton',
+        'md5': '94dfa54ee3ccb63295b276da08c415f6',
+        'info_dict': {
+            'id': '4yoffOOXC767',
+            'ext': 'flv',
+            'title': 'The Wil Wheaton Project - Premiering May 27th at 10/9c.',
+            'description': 'The Wil Wheaton Project premieres May 27th at 10/9c. Don\'t miss it.',
+        },
+        'add_ie': ['ThePlatform'],
+        'skip': 'Blocked outside the US',
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_name = mobj.group('video_name')
+        if video_name:
+            generic_webpage = self._download_webpage(url, video_name)
+            video_id = self._search_regex(
+                r'<iframe.*?class="video_iframe_page"\s+src="/_utils/video/thP_video_controller.php.*?_vid([0-9]+)">',
+                generic_webpage, 'video ID')
+            url = 'http://www.syfy.com/videos/%s/%s/vid:%s' % (
+                video_name, video_name, video_id)
+        else:
+            video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
         return self.url_result(self._og_search_video_url(webpage))
diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py
new file mode 100644 (file)
index 0000000..3633152
--- /dev/null
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TagesschauIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/video/video(?P<id>-?[0-9]+)\.html'
+
+    _TESTS = [{
+        'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
+        'md5': 'bcdeac2194fb296d599ce7929dfa4009',
+        'info_dict': {
+            'id': '1399128',
+            'ext': 'mp4',
+            'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
+            'description': 'md5:69da3c61275b426426d711bde96463ab',
+            'thumbnail': 're:^http:.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/video/video-196.html',
+        'md5': '8aaa8bf3ae1ca2652309718c03019128',
+        'info_dict': {
+            'id': '196',
+            'ext': 'mp4',
+            'title': 'Ukraine-Konflikt: Klitschko in Kiew als Bürgermeister vereidigt',
+            'description': 'md5:f22e4af75821d174fa6c977349682691',
+            'thumbnail': 're:http://.*\.jpg',
+        },
+    }]
+
+    _FORMATS = {
+        's': {'width': 256, 'height': 144, 'quality': 1},
+        'm': {'width': 512, 'height': 288, 'quality': 2},
+        'l': {'width': 960, 'height': 544, 'quality': 3},
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        if video_id.startswith('-'):
+            display_id = video_id.strip('-')
+        else:
+            display_id = video_id
+
+        webpage = self._download_webpage(url, display_id)
+
+        playerpage = self._download_webpage(
+            'http://www.tagesschau.de/multimedia/video/video%s~player_autoplay-true.html' % video_id,
+            display_id, 'Downloading player page')
+
+        medias = re.findall(
+            r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
+            playerpage)
+
+        formats = []
+        for url, ext, res in medias:
+            f = {
+                'format_id': res + '_' + ext,
+                'url': url,
+                'ext': ext,
+            }
+            f.update(self._FORMATS.get(res, {}))
+            formats.append(f)
+
+        self._sort_formats(formats)
+
+        thumbnail = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
+
+        return {
+            'id': display_id,
+            'title': self._og_search_title(webpage).strip(),
+            'thumbnail': 'http://www.tagesschau.de' + thumbnail,
+            'formats': formats,
+            'description': self._og_search_description(webpage).strip(),
+        }
diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py
new file mode 100644 (file)
index 0000000..6d52763
--- /dev/null
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    determine_ext,
+)
+
+
+class TeacherTubeIE(InfoExtractor):
+    IE_NAME = 'teachertube'
+    IE_DESC = 'teachertube.com videos'
+
+    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/(viewVideo\.php\?video_id=|music\.php\?music_id=)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.teachertube.com/viewVideo.php?video_id=339997',
+        'md5': 'f9434ef992fd65936d72999951ee254c',
+        'info_dict': {
+            'id': '339997',
+            'ext': 'mp4',
+            'title': 'Measures of dispersion from a frequency table_x264',
+            'description': 'md5:a3e9853487185e9fcd7181a07164650b',
+            'thumbnail': 're:http://.*\.jpg',
+        },
+    }, {
+        'url': 'http://www.teachertube.com/viewVideo.php?video_id=340064',
+        'md5': '0d625ec6bc9bf50f70170942ad580676',
+        'info_dict': {
+            'id': '340064',
+            'ext': 'mp4',
+            'title': 'How to Make Paper Dolls _ Paper Art Projects',
+            'description': 'md5:2ca52b20cd727773d1dc418b3d6bd07b',
+            'thumbnail': 're:http://.*\.jpg',
+        },
+    }, {
+        'url': 'http://www.teachertube.com/music.php?music_id=8805',
+        'md5': '01e8352006c65757caf7b961f6050e21',
+        'info_dict': {
+            'id': '8805',
+            'ext': 'mp3',
+            'title': 'PER ASPERA AD ASTRA',
+            'description': 'RADIJSKA EMISIJA ZRAKOPLOVNE TEHNIČKE ŠKOLE PER ASPERA AD ASTRA',
+        },
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        quality = qualities(['mp3', 'flv', 'mp4'])
+
+        formats = [
+            {
+                'url': media_url,
+                'quality': quality(determine_ext(media_url))
+            } for media_url in set(zip(*re.findall(r'([\'"])file\1\s*:\s*"([^"]+)"', webpage))[1])
+        ]
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'formats': formats,
+            'description': self._og_search_description(webpage),
+        }
+
+
+class TeacherTubeClassroomIE(InfoExtractor):
+    IE_NAME = 'teachertube:classroom'
+    IE_DESC = 'teachertube.com online classrooms'
+
+    _VALID_URL = r'https?://(?:www\.)?teachertube\.com/view_classroom\.php\?user=(?P<user>[0-9a-zA-Z]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group('user')
+
+        rss = self._download_xml('http://www.teachertube.com/rssclassroom.php?mode=user&username=%s' % user_id,
+                                      user_id, 'Downloading classroom RSS')
+
+        entries = []
+        for url in rss.findall('.//{http://search.yahoo.com/mrss/}player'):
+            entries.append(self.url_result(url.attrib['url'], 'TeacherTube'))
+
+        return self.playlist_result(entries, user_id)
diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py
new file mode 100644 (file)
index 0000000..117afa9
--- /dev/null
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TeachingChannelIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P<title>.+)'
+
+    _TEST = {
+        'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
+        'info_dict': {
+            'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
+            'ext': 'mp4',
+            'title': 'A History of Teaming',
+            'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title = mobj.group('title')
+        webpage = self._download_webpage(url, title)
+        ooyala_code = self._search_regex(
+            r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code')
+
+        return OoyalaIE._build_url_result(ooyala_code)
index 9dcffead04d5466c14c6f2ff60995ecfb5435e6d..f8dd7e955ada5ce58fd04d668027587eda1b6c00 100644 (file)
@@ -3,14 +3,21 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
 
 
 class TeamcocoIE(InfoExtractor):
-    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
-    _TEST = {
+    _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)'
+    _TESTS = [
+    {
+        'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant',
+        'file': '80187.mp4',
+        'md5': '3f7746aa0dc86de18df7539903d399ea',
+        'info_dict': {
+            'title': 'Conan Becomes A Mary Kay Beauty Consultant',
+            'description': 'Mary Kay is perhaps the most trusted name in female beauty, so of course Conan is a natural choice to sell their products.'
+        }
+    },
+    {
         'url': 'http://teamcoco.com/video/louis-ck-interview-george-w-bush',
         'file': '19705.mp4',
         'md5': 'cde9ba0fa3506f5f017ce11ead928f9a',
@@ -19,22 +26,23 @@ class TeamcocoIE(InfoExtractor):
             "title": "Louis C.K. Interview Pt. 1 11/3/11"
         }
     }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-        url_title = mobj.group('url_title')
-        webpage = self._download_webpage(url, url_title)
-
-        video_id = self._html_search_regex(
-            r'<article class="video" data-id="(\d+?)"',
-            webpage, 'video id')
 
-        self.report_extraction(video_id)
+        display_id = mobj.group('display_id')
+        webpage = self._download_webpage(url, display_id)
+        
+        video_id = mobj.group("video_id")
+        if not video_id:
+            video_id = self._html_search_regex(
+                r'<article class="video" data-id="(\d+?)"',
+                webpage, 'video id')
 
         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
-        data = self._download_xml(data_url, video_id, 'Downloading data webpage')
+        data = self._download_xml(
+            data_url, display_id, 'Downloading data webpage')
 
         qualities = ['500k', '480p', '1000k', '720p', '1080p']
         formats = []
@@ -69,6 +77,7 @@ class TeamcocoIE(InfoExtractor):
 
         return {
             'id': video_id,
+            'display_id': display_id,
             'formats': formats,
             'title': self._og_search_title(webpage),
             'thumbnail': self._og_search_thumbnail(webpage),
index 8b31caa92c1e44473aa42953427b3cc2d71762f7..d260c91c2172deabb697c3ad242cfda24f395d41 100644 (file)
@@ -6,115 +6,185 @@ import re
 from .subtitles import SubtitlesInfoExtractor
 
 from ..utils import (
-    RegexNotFoundError,
+    compat_str,
 )
 
 
 class TEDIE(SubtitlesInfoExtractor):
-    _VALID_URL=r'''http://www\.ted\.com/
-                   (
-                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
-                        |
-                        ((?P<type_talk>talks)) # We have a simple talk
-                   )
-                   (/lang/(.*?))? # The url may contain the language
-                   /(?P<name>\w+) # Here goes the name and then ".html"
-                   '''
-    _TEST = {
+    _VALID_URL = r'''(?x)
+        (?P<proto>https?://)
+        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
+        (
+            (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
+            |
+            ((?P<type_talk>talks)) # We have a simple talk
+            |
+            (?P<type_watch>watch)/[^/]+/[^/]+
+        )
+        (/lang/(.*?))? # The url may contain the language
+        /(?P<name>[\w-]+) # Here goes the name and then ".html"
+        .*)$
+        '''
+    _TESTS = [{
         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
-        'file': '102.mp4',
         'md5': '4ea1dada91e4174b53dac2bb8ace429d',
         'info_dict': {
-            "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922",
-            "title": "Dan Dennett: The illusion of consciousness"
+            'id': '102',
+            'ext': 'mp4',
+            'title': 'The illusion of consciousness',
+            'description': ('Philosopher Dan Dennett makes a compelling '
+                'argument that not only don\'t we understand our own '
+                'consciousness, but that half the time our brains are '
+                'actively fooling us.'),
+            'uploader': 'Dan Dennett',
+            'width': 854,
         }
+    }, {
+        'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
+        'md5': '226f4fb9c62380d11b7995efa4c87994',
+        'info_dict': {
+            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
+            'ext': 'mp4',
+            'title': 'Vishal Sikka: The beauty and power of algorithms',
+            'thumbnail': 're:^https?://.+\.jpg',
+            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
+        }
+    }, {
+        'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+        'md5': '49144e345a899b8cb34d315f3b9cfeeb',
+        'info_dict': {
+            'id': '1972',
+            'ext': 'mp4',
+            'title': 'Be passionate. Be courageous. Be your best.',
+            'uploader': 'Gabby Giffords and Mark Kelly',
+            'description': 'md5:5174aed4d0f16021b704120360f72b92',
+        },
+    }]
+
+    _NATIVE_FORMATS = {
+        'low': {'preference': 1, 'width': 320, 'height': 180},
+        'medium': {'preference': 2, 'width': 512, 'height': 288},
+        'high': {'preference': 3, 'width': 854, 'height': 480},
     }
 
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+    def _extract_info(self, webpage):
+        info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
+            webpage, 'info json')
+        return json.loads(info_json)
 
     def _real_extract(self, url):
-        m=re.match(self._VALID_URL, url, re.VERBOSE)
+        m = re.match(self._VALID_URL, url, re.VERBOSE)
+        if m.group('type') == 'embed':
+            desktop_url = m.group('proto') + 'www' + m.group('urlmain')
+            return self.url_result(desktop_url, 'TED')
+        name = m.group('name')
         if m.group('type_talk'):
-            return self._talk_info(url)
-        else :
-            playlist_id=m.group('playlist_id')
-            name=m.group('name')
-            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
-            return [self._playlist_videos_info(url,name,playlist_id)]
-
+            return self._talk_info(url, name)
+        elif m.group('type_watch'):
+            return self._watch_info(url, name)
+        else:
+            return self._playlist_videos_info(url, name)
 
-    def _playlist_videos_info(self, url, name, playlist_id):
+    def _playlist_videos_info(self, url, name):
         '''Returns the videos of the playlist'''
 
-        webpage = self._download_webpage(
-            url, playlist_id, 'Downloading playlist webpage')
-        matches = re.finditer(
-            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
-            webpage)
-
-        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
-                                                 webpage, 'playlist title')
+        webpage = self._download_webpage(url, name,
+            'Downloading playlist webpage')
+        info = self._extract_info(webpage)
+        playlist_info = info['playlist']
 
         playlist_entries = [
-            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
-            for m in matches
+            self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
+            for talk in info['talks']
         ]
         return self.playlist_result(
-            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
+            playlist_entries,
+            playlist_id=compat_str(playlist_info['id']),
+            playlist_title=playlist_info['title'])
 
-    def _talk_info(self, url, video_id=0):
-        """Return the video for the talk in the url"""
-        m = re.match(self._VALID_URL, url,re.VERBOSE)
-        video_name = m.group('name')
-        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
+    def _talk_info(self, url, video_name):
+        webpage = self._download_webpage(url, video_name)
         self.report_extraction(video_name)
-        # If the url includes the language we get the title translated
-        title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
-                                        webpage, 'title')
-        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
-                                    webpage, 'json data')
-        info = json.loads(json_data)
-        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
-                                       webpage, 'description', flags = re.DOTALL)
-        
-        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
-                                       webpage, 'thumbnail')
-        formats = [{
-            'ext': 'mp4',
-            'url': stream['file'],
-            'format': stream['id']
-        } for stream in info['htmlStreams']]
 
-        video_id = info['id']
+        talk_info = self._extract_info(webpage)['talks'][0]
 
+        formats = [{
+            'url': format_url,
+            'format_id': format_id,
+            'format': format_id,
+        } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
+        if formats:
+            for f in formats:
+                finfo = self._NATIVE_FORMATS.get(f['format_id'])
+                if finfo:
+                    f.update(finfo)
+        else:
+            # Use rtmp downloads
+            formats = [{
+                'format_id': f['name'],
+                'url': talk_info['streamer'],
+                'play_path': f['file'],
+                'ext': 'flv',
+                'width': f['width'],
+                'height': f['height'],
+                'tbr': f['bitrate'],
+            } for f in talk_info['resources']['rtmp']]
+        self._sort_formats(formats)
+
+        video_id = compat_str(talk_info['id'])
         # subtitles
-        video_subtitles = self.extract_subtitles(video_id, webpage)
+        video_subtitles = self.extract_subtitles(video_id, talk_info)
         if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, webpage)
+            self._list_available_subtitles(video_id, talk_info)
             return
 
+        thumbnail = talk_info['thumb']
+        if not thumbnail.startswith('http'):
+            thumbnail = 'http://' + thumbnail
         return {
             'id': video_id,
-            'title': title,
+            'title': talk_info['title'],
+            'uploader': talk_info['speaker'],
             'thumbnail': thumbnail,
-            'description': desc,
+            'description': self._og_search_description(webpage),
             'subtitles': video_subtitles,
             'formats': formats,
         }
 
-    def _get_available_subtitles(self, video_id, webpage):
-        try:
-            options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
-            languages = re.findall(r'(?:<option value=")(\S+)"', options)
-            if languages:
-                sub_lang_list = {}
-                for l in languages:
-                    url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
-                    sub_lang_list[l] = url
-                return sub_lang_list
-        except RegexNotFoundError:
-            self._downloader.report_warning(u'video doesn\'t have subtitles')
-        return {}
+    def _get_available_subtitles(self, video_id, talk_info):
+        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
+        if languages:
+            sub_lang_list = {}
+            for l in languages:
+                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
+                sub_lang_list[l] = url
+            return sub_lang_list
+        else:
+            self._downloader.report_warning('video doesn\'t have subtitles')
+            return {}
+
+    def _watch_info(self, url, name):
+        webpage = self._download_webpage(url, name)
+
+        config_json = self._html_search_regex(
+            r"data-config='([^']+)", webpage, 'config')
+        config = json.loads(config_json)
+        video_url = config['video']['url']
+        thumbnail = config.get('image', {}).get('url')
+
+        title = self._html_search_regex(
+            r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
+        description = self._html_search_regex(
+            [
+                r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
+                r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
+            ],
+            webpage, 'description', fatal=False)
+
+        return {
+            'id': name,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'description': description,
+        }
diff --git a/youtube_dl/extractor/testurl.py b/youtube_dl/extractor/testurl.py
new file mode 100644 (file)
index 0000000..c7d5593
--- /dev/null
@@ -0,0 +1,68 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class TestURLIE(InfoExtractor):
+    """ Allows adressing of the test cases as test:yout.*be_1 """
+
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$'
+
+    def _real_extract(self, url):
+        from ..extractor import gen_extractors
+
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        extractor_id = mobj.group('extractor')
+        all_extractors = gen_extractors()
+
+        rex = re.compile(extractor_id, flags=re.IGNORECASE)
+        matching_extractors = [
+            e for e in all_extractors if rex.search(e.IE_NAME)]
+
+        if len(matching_extractors) == 0:
+            raise ExtractorError(
+                'No extractors matching %r found' % extractor_id,
+                expected=True)
+        elif len(matching_extractors) > 1:
+            # Is it obvious which one to pick?
+            try:
+                extractor = next(
+                    ie for ie in matching_extractors
+                    if ie.IE_NAME.lower() == extractor_id.lower())
+            except StopIteration:
+                raise ExtractorError(
+                    ('Found multiple matching extractors: %s' %
+                        ' '.join(ie.IE_NAME for ie in matching_extractors)),
+                    expected=True)
+        else:
+            extractor = matching_extractors[0]
+
+        num_str = mobj.group('num')
+        num = int(num_str) if num_str else 0
+
+        testcases = []
+        t = getattr(extractor, '_TEST', None)
+        if t:
+            testcases.append(t)
+        testcases.extend(getattr(extractor, '_TESTS', []))
+
+        try:
+            tc = testcases[num]
+        except IndexError:
+            raise ExtractorError(
+                ('Test case %d not found, got only %d tests' %
+                    (num, len(testcases))),
+                expected=True)
+
+        self.to_screen('Test URL: %s' % tc['url'])
+
+        return {
+            '_type': 'url',
+            'url': tc['url'],
+            'id': video_id,
+        }
index 2c5c88be8ede5ae6d0fa9f3c4e540cddb13190b6..fdae17b1b817efd2a7666d44cc2cc38de1ccfa22 100644 (file)
@@ -1,33 +1,37 @@
 # coding: utf-8
+from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
 
+
 class TF1IE(InfoExtractor):
     """TF1 uses the wat.tv player."""
-    _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html'
+    _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P<id>.*?)\.html'
     _TEST = {
-        u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
-        u'file': u'10635995.mp4',
-        u'md5': u'2e378cc28b9957607d5e88f274e637d8',
-        u'info_dict': {
-            u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
-            u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+        'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
+        'info_dict': {
+            'id': '10635995',
+            'ext': 'mp4',
+            'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle',
+            'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
+        },
+        'params': {
+            # Sometimes wat serves the whole file with the --test option
+            'skip_download': True,
         },
-        u'skip': u'Sometimes wat serves the whole file with the --test option',
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        id = mobj.group(1)
-        webpage = self._download_webpage(url, id)
-        embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"',
-                                webpage, 'embed url')
-        embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page')
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        embed_url = self._html_search_regex(
+            r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url')
+        embed_page = self._download_webpage(embed_url, video_id,
+            'Downloading embed player page')
         wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id')
-        wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info')
-        wat_info = json.loads(wat_info)['media']
-        wat_url = wat_info['url']
-        return self.url_result(wat_url, 'Wat')
+        wat_info = self._download_json(
+            'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id)
+        return self.url_result(wat_info['media']['url'], 'Wat')
index d60702325d7d4ffebafcb45cdc9054c8fa4d2af7..b6b2dba9ca9e6ee02c7dc6b2cf01d3601874a6b2 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 import json
 
@@ -13,22 +15,22 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language
 class ThePlatformIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/[^/]+/
-           (?P<config>[^/\?]+/(?:swf|config)/select/)?
+           (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)?
          |theplatform:)(?P<id>[^/\?&]+)'''
 
     _TEST = {
         # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
-        u'url': u'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
-        u'info_dict': {
-            u'id': u'e9I_cZgTgIPd',
-            u'ext': u'flv',
-            u'title': u'Blackberry\'s big, bold Z30',
-            u'description': u'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
-            u'duration': 247,
+        'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
+        'info_dict': {
+            'id': 'e9I_cZgTgIPd',
+            'ext': 'flv',
+            'title': 'Blackberry\'s big, bold Z30',
+            'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
+            'duration': 247,
         },
-        u'params': {
+        'params': {
             # rtmp download
-            u'skip_download': True,
+            'skip_download': True,
         },
     }
 
@@ -39,7 +41,7 @@ class ThePlatformIE(InfoExtractor):
             error_msg = next(
                 n.attrib['abstract']
                 for n in meta.findall(_x('.//smil:ref'))
-                if n.attrib.get('title') == u'Geographic Restriction')
+                if n.attrib.get('title') == 'Geographic Restriction')
         except StopIteration:
             pass
         else:
@@ -52,12 +54,17 @@ class ThePlatformIE(InfoExtractor):
         head = meta.find(_x('smil:head'))
         body = meta.find(_x('smil:body'))
 
-        f4m_node = body.find(_x('smil:seq/smil:video'))
+        f4m_node = body.find(_x('smil:seq//smil:video'))
         if f4m_node is not None:
+            f4m_url = f4m_node.attrib['src']
+            if 'manifest.f4m?' not in f4m_url:
+                f4m_url += '?'
+            # the parameters are from syfy.com, other sites may use others,
+            # they also work for nbc.com
+            f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3'
             formats = [{
                 'ext': 'flv',
-                # the parameters are from syfy.com, other sites may use others
-                'url': f4m_node.attrib['src'] + '?g=UXWGVKRWHFSP&hdcore=3.0.3',
+                'url': f4m_url,
             }]
         else:
             base_url = head.find(_x('smil:meta')).attrib['base']
@@ -95,9 +102,9 @@ class ThePlatformIE(InfoExtractor):
         if mobj.group('config'):
             config_url = url+ '&form=json'
             config_url = config_url.replace('swf/', 'config/')
-            config_json = self._download_webpage(config_url, video_id, u'Downloading config')
-            config = json.loads(config_json)
-            smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4'
+            config_url = config_url.replace('onsite/', 'onsite/config/')
+            config = self._download_json(config_url, video_id, 'Downloading config')
+            smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m'
         else:
             smil_url = ('http://link.theplatform.com/s/dJ5BDC/{0}/meta.smil?'
                 'format=smil&mbr=true'.format(video_id))
index 2246d27b2f741047eb6b53ad0e4f54e79293294b..a4aa25f661223301b9d16c7ac87b6c502aa0e0ff 100644 (file)
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from youtube_dl.utils import ExtractorError
+from ..utils import ExtractorError
 
 
 class TinyPicIE(InfoExtractor):
diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py
new file mode 100644 (file)
index 0000000..ad175b8
--- /dev/null
@@ -0,0 +1,60 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+from .discovery import DiscoveryIE
+
+
+class TlcIE(DiscoveryIE):
+    IE_NAME = 'tlc.com'
+    _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?'
+
+    _TEST = {
+        'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm',
+        'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a',
+        'info_dict': {
+            'id': '853232',
+            'ext': 'mp4',
+            'title': 'Cake Boss: Too Big to Fly',
+            'description': 'Buddy has taken on a high flying task.',
+            'duration': 119,
+        },
+    }
+
+
+class TlcDeIE(InfoExtractor):
+    IE_NAME = 'tlc.de'
+    _VALID_URL = r'http://www\.tlc\.de/sendungen/[^/]+/videos/(?P<title>[^/?]+)'
+
+    _TEST = {
+        'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
+        'info_dict': {
+            'id': '3235167922001',
+            'ext': 'mp4',
+            'title': 'Breaking Amish: Die Welt da draußen',
+            'uploader': 'Discovery Networks - Germany',
+            'description': 'Vier Amische und eine Mennonitin wagen in New York'
+                '  den Sprung in ein komplett anderes Leben. Begleitet sie auf'
+                ' ihrem spannenden Weg.',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        title = mobj.group('title')
+        webpage = self._download_webpage(url, title)
+        iframe_url = self._search_regex(
+            '<iframe src="(http://www\.tlc\.de/wp-content/.+?)"', webpage,
+            'iframe url')
+        # Otherwise we don't get the correct 'BrightcoveExperience' element,
+        # example: http://www.tlc.de/sendungen/cake-boss/videos/cake-boss-cannoli-drama/
+        iframe_url = iframe_url.replace('.htm?', '.php?')
+        iframe = self._download_webpage(iframe_url, title)
+
+        return {
+            '_type': 'url',
+            'url': BrightcoveIE._extract_brightcove_url(iframe),
+            'ie': BrightcoveIE.ie_key(),
+        }
diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py
new file mode 100644 (file)
index 0000000..34008af
--- /dev/null
@@ -0,0 +1,75 @@
+from .common import InfoExtractor
+import re
+
+
+class ToypicsIE(InfoExtractor):
+    IE_DESC = 'Toypics user profile'
+    _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
+    _TEST = {
+        'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
+        'md5': '16e806ad6d6f58079d210fe30985e08b',
+        'info_dict': {
+            'id': '514',
+            'ext': 'mp4',
+            'title': 'Chance-Bulge\'d, 2',
+            'age_limit': 18,
+            'uploader': 'kidsune',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        page = self._download_webpage(url, video_id)
+        video_url = self._html_search_regex(
+            r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
+        title = self._html_search_regex(
+            r'<title>Toypics - ([^<]+)</title>', page, 'title')
+        username = self._html_search_regex(
+            r'toypics.net/([^/"]+)" class="user-name">', page, 'username')
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'uploader': username,
+            'age_limit': 18,
+        }
+
+
+class ToypicsUserIE(InfoExtractor):
+    IE_DESC = 'Toypics user profile'
+    _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        username = mobj.group('username')
+
+        profile_page = self._download_webpage(
+            url, username, note='Retrieving profile page')
+
+        video_count = int(self._search_regex(
+            r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page,
+            'video count'))
+
+        PAGE_SIZE = 8
+        urls = []
+        page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
+        for n in range(1, page_count + 1):
+            lpage_url = url + '/public/%d' % n
+            lpage = self._download_webpage(
+                lpage_url, username,
+                note='Downloading page %d/%d' % (n, page_count))
+            urls.extend(
+                re.findall(
+                    r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">',
+                    lpage))
+
+        return {
+            '_type': 'playlist',
+            'id': username,
+            'entries': [{
+                '_type': 'url',
+                'url': eurl,
+                'ie_key': 'Toypics',
+            } for eurl in urls]
+        }
diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py
new file mode 100644 (file)
index 0000000..57f9566
--- /dev/null
@@ -0,0 +1,44 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class TruTubeIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?trutube\.tv/video/(?P<id>[0-9]+)/.*'
+    _TEST = {
+        'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-',
+        'md5': 'c5b6e301b0a2040b074746cbeaa26ca1',
+        'info_dict': {
+            'id': '14880',
+            'ext': 'flv',
+            'title': 'Ramses II - Proven To Be A Red Headed Caucasoid',
+            'thumbnail': 're:^http:.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        video_title = self._og_search_title(webpage).strip()
+        thumbnail = self._search_regex(
+            r"var splash_img = '([^']+)';", webpage, 'thumbnail', fatal=False)
+
+        all_formats = re.finditer(
+            r"var (?P<key>[a-z]+)_video_file\s*=\s*'(?P<url>[^']+)';", webpage)
+        formats = [{
+            'format_id': m.group('key'),
+            'quality': -i,
+            'url': m.group('url'),
+        } for i, m in enumerate(all_formats)]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+        }
index 3ec9442c9392fe6168d315925ee83f0df6222b5d..36bc36ad8aa7bf3680db6e30d081fec644d61bcd 100644 (file)
@@ -1,63 +1,83 @@
-import os
+from __future__ import unicode_literals
+
+import json
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
+    int_or_none,
+    str_to_int,
 )
-from ..aes import (
-    aes_decrypt_text
-)
+from ..aes import aes_decrypt_text
+
 
 class Tube8IE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>tube8\.com/.+?/(?P<videoid>\d+)/?)$'
+    _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P<id>\d+)'
     _TEST = {
-        u'url': u'http://www.tube8.com/teen/kasia-music-video/229795/',
-        u'file': u'229795.mp4',
-        u'md5': u'e9e0b0c86734e5e3766e653509475db0',
-        u'info_dict': {
-            u"description": u"hot teen Kasia grinding", 
-            u"uploader": u"unknown", 
-            u"title": u"Kasia music video",
-            u"age_limit": 18,
+        'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
+        'file': '229795.mp4',
+        'md5': 'e9e0b0c86734e5e3766e653509475db0',
+        'info_dict': {
+            'description': 'hot teen Kasia grinding',
+            'uploader': 'unknown',
+            'title': 'Kasia music video',
+            'age_limit': 18,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('videoid')
-        url = 'http://www.' + mobj.group('url')
+        video_id = mobj.group('id')
 
         req = compat_urllib_request.Request(url)
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        video_title = self._html_search_regex(r'videotitle     ="([^"]+)', webpage, u'title')
-        video_description = self._html_search_regex(r'>Description:</strong>(.+?)<', webpage, u'description', fatal=False)
-        video_uploader = self._html_search_regex(r'>Submitted by:</strong>(?:\s|<[^>]*>)*(.+?)<', webpage, u'uploader', fatal=False)
-        thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, u'thumbnail', fatal=False)
-        if thumbnail:
-            thumbnail = thumbnail.replace('\\/', '/')
-
-        video_url = self._html_search_regex(r'"video_url":"([^"]+)', webpage, u'video_url')
-        if webpage.find('"encrypted":true')!=-1:
-            password = self._html_search_regex(r'"video_title":"([^"]+)', webpage, u'password')
-            video_url = aes_decrypt_text(video_url, password, 32).decode('utf-8')
+        flashvars = json.loads(self._html_search_regex(
+            r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars'))
+
+        video_url = flashvars['video_url']
+        if flashvars.get('encrypted') is True:
+            video_url = aes_decrypt_text(video_url, flashvars['video_title'], 32).decode('utf-8')
         path = compat_urllib_parse_urlparse(video_url).path
-        extension = os.path.splitext(path)[1][1:]
-        format = path.split('/')[4].split('_')[:2]
-        format = "-".join(format)
+        format_id = '-'.join(path.split('/')[4].split('_')[:2])
+
+        thumbnail = flashvars.get('image_url')
+
+        title = self._html_search_regex(
+            r'videotitle\s*=\s*"([^"]+)', webpage, 'title')
+        description = self._html_search_regex(
+            r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False)
+        uploader = self._html_search_regex(
+            r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>',
+            webpage, 'uploader', fatal=False)
+
+        like_count = int_or_none(self._html_search_regex(
+            r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False))
+        dislike_count = int_or_none(self._html_search_regex(
+            r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False))
+        view_count = self._html_search_regex(
+            r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False)
+        if view_count:
+            view_count = str_to_int(view_count)
+        comment_count = self._html_search_regex(
+            r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False)
+        if comment_count:
+            comment_count = str_to_int(comment_count)
 
         return {
             'id': video_id,
-            'uploader': video_uploader,
-            'title': video_title,
-            'thumbnail': thumbnail,
-            'description': video_description,
             'url': video_url,
-            'ext': extension,
-            'format': format,
-            'format_id': format,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'format_id': format_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
             'age_limit': 18,
         }
diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py
new file mode 100644 (file)
index 0000000..0921cc5
--- /dev/null
@@ -0,0 +1,84 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    unified_strdate,
+    clean_html,
+    int_or_none,
+)
+
+
+class TvigleIE(InfoExtractor):
+    IE_NAME = 'tvigle'
+    IE_DESC = 'Интернет-телевидение Tvigle.ru'
+    _VALID_URL = r'http://(?:www\.)?tvigle\.ru/category/.+?[\?&]v(?:ideo)?=(?P<id>\d+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.tvigle.ru/category/cinema/1608/?video=503081',
+            'md5': '09afba4616666249f087efc6dcf83cb3',
+            'info_dict': {
+                'id': '503081',
+                'ext': 'flv',
+                'title': 'Брат 2 ',
+                'description': 'md5:f5a42970f50648cee3d7ad740f3ae769',
+                'upload_date': '20110919',
+            },
+        },
+        {
+            'url': 'http://www.tvigle.ru/category/men/vysotskiy_vospominaniya02/?flt=196&v=676433',
+            'md5': 'e7efe5350dd5011d0de6550b53c3ba7b',
+            'info_dict': {
+                'id': '676433',
+                'ext': 'flv',
+                'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком',
+                'description': 'md5:027f7dc872948f14c96d19b4178428a4',
+                'upload_date': '20121218',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        video_data = self._download_xml(
+            'http://www.tvigle.ru/xml/single.php?obj=%s' % video_id, video_id, 'Downloading video XML')
+
+        video = video_data.find('./video')
+
+        title = video.get('name')
+        description = video.get('anons')
+        if description:
+            description = clean_html(description)
+        thumbnail = video_data.get('img')
+        upload_date = unified_strdate(video.get('date'))
+        like_count = int_or_none(video.get('vtp'))
+
+        formats = []
+        for num, (format_id, format_note) in enumerate([['low_file', 'SQ'], ['file', 'HQ'], ['hd', 'HD 720']]):
+            video_url = video.get(format_id)
+            if not video_url:
+                continue
+            formats.append({
+                'url': video_url,
+                'format_id': format_id,
+                'format_note': format_note,
+                'quality': num,
+            })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'like_count': like_count,
+            'age_limit': 18,
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
new file mode 100644 (file)
index 0000000..054f427
--- /dev/null
@@ -0,0 +1,164 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+    ExtractorError,
+)
+
+
+class UdemyIE(InfoExtractor):
+    IE_NAME = 'udemy'
+    _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)'
+    _LOGIN_URL = 'https://www.udemy.com/join/login-submit/'
+    _NETRC_MACHINE = 'udemy'
+
+    _TESTS = [{
+        'url': 'https://www.udemy.com/java-tutorial/#/lecture/172757',
+        'md5': '98eda5b657e752cf945d8445e261b5c5',
+        'info_dict': {
+            'id': '160614',
+            'ext': 'mp4',
+            'title': 'Introduction and Installation',
+            'description': 'md5:c0d51f6f21ef4ec65f091055a5eef876',
+            'duration': 579.29,
+        },
+        'skip': 'Requires udemy account credentials',
+    }]
+
+    def _handle_error(self, response):
+        if not isinstance(response, dict):
+            return
+        error = response.get('error')
+        if error:
+            error_str = 'Udemy returned error #%s: %s' % (error.get('code'), error.get('message'))
+            error_data = error.get('data')
+            if error_data:
+                error_str += ' - %s' % error_data.get('formErrors')
+            raise ExtractorError(error_str, expected=True)
+
+    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+        response = super(UdemyIE, self)._download_json(url, video_id, note)
+        self._handle_error(response)
+        return response
+
+    def _real_initialize(self):
+        self._login()
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            raise ExtractorError(
+                'Udemy account is required, use --username and --password options to provide account credentials.',
+                expected=True)
+
+        login_popup = self._download_webpage(
+            'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None,
+            'Downloading login popup')
+
+        if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>':
+            return
+
+        csrf = self._html_search_regex(r'<input type="hidden" name="csrf" value="(.+?)"', login_popup, 'csrf token')
+
+        login_form = {
+            'email': username,
+            'password': password,
+            'csrf': csrf,
+            'displayType': 'json',
+            'isSubmitted': '1',
+        }
+        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
+        response = self._download_json(request, None, 'Logging in as %s' % username)
+
+        if 'returnUrl' not in response:
+            raise ExtractorError('Unable to log in')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        lecture_id = mobj.group('id')
+
+        lecture = self._download_json(
+            'https://www.udemy.com/api-1.1/lectures/%s' % lecture_id, lecture_id, 'Downloading lecture JSON')
+
+        if lecture['assetType'] != 'Video':
+            raise ExtractorError('Lecture %s is not a video' % lecture_id, expected=True)
+
+        asset = lecture['asset']
+
+        stream_url = asset['streamUrl']
+        mobj = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', stream_url)
+        if mobj:
+            return self.url_result(mobj.group(1), 'Youtube')
+
+        video_id = asset['id']
+        thumbnail = asset['thumbnailUrl']
+        duration = asset['data']['duration']
+
+        download_url = asset['downloadUrl']
+
+        formats = [
+            {
+                'url': download_url['Video480p'][0],
+                'format_id': '360p',
+            },
+            {
+                'url': download_url['Video'][0],
+                'format_id': '720p',
+            },
+        ]
+
+        title = lecture['title']
+        description = lecture['description']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats
+        }
+
+
+class UdemyCourseIE(UdemyIE):
+    IE_NAME = 'udemy:course'
+    _VALID_URL = r'https?://www\.udemy\.com/(?P<coursepath>[\da-z-]+)'
+    _SUCCESSFULLY_ENROLLED = '>You have enrolled in this course!<'
+    _ALREADY_ENROLLED = '>You are already taking this course.<'
+    _TESTS = []
+
+    @classmethod
+    def suitable(cls, url):
+        return False if UdemyIE.suitable(url) else super(UdemyCourseIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_path = mobj.group('coursepath')
+
+        response = self._download_json(
+            'https://www.udemy.com/api-1.1/courses/%s' % course_path, course_path, 'Downloading course JSON')
+
+        course_id = int(response['id'])
+        course_title = response['title']
+
+        webpage = self._download_webpage(
+            'https://www.udemy.com/course/subscribe/?courseId=%s' % course_id, course_id, 'Enrolling in the course')
+
+        if self._SUCCESSFULLY_ENROLLED in webpage:
+            self.to_screen('%s: Successfully enrolled in' % course_id)
+        elif self._ALREADY_ENROLLED in webpage:
+            self.to_screen('%s: Already enrolled in' % course_id)
+
+        response = self._download_json('https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id,
+            course_id, 'Downloading course curriculum')
+
+        entries = [
+            self.url_result('https://www.udemy.com/%s/#/lecture/%s' % (course_path, asset['id']), 'Udemy')
+            for asset in response if asset.get('assetType') == 'Video'
+        ]
+
+        return self.playlist_result(entries, course_id, course_title)
\ No newline at end of file
diff --git a/youtube_dl/extractor/urort.py b/youtube_dl/extractor/urort.py
new file mode 100644 (file)
index 0000000..5d06fcc
--- /dev/null
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    unified_strdate,
+)
+
+
+class UrortIE(InfoExtractor):
+    IE_DESC = 'NRK P3 Urørt'
+    _VALID_URL = r'https?://(?:www\.)?urort\.p3\.no/#!/Band/(?P<id>[^/]+)$'
+
+    _TEST = {
+        'url': 'https://urort.p3.no/#!/Band/Gerilja',
+        'md5': '5ed31a924be8a05e47812678a86e127b',
+        'info_dict': {
+            'id': '33124-4',
+            'ext': 'mp3',
+            'title': 'The Bomb',
+            'thumbnail': 're:^https?://.+\.jpg',
+            'like_count': int,
+            'uploader': 'Gerilja',
+            'uploader_id': 'Gerilja',
+            'upload_date': '20100323',
+        },
+        'params': {
+            'matchtitle': '^The Bomb$',  # To test, we want just one video
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id = mobj.group('id')
+
+        fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id)
+        json_url = 'http://urort.p3.no/breeze/urort/TrackDtos?$filter=' + fstr
+        songs = self._download_json(json_url, playlist_id)
+        print(songs[0])
+
+        entries = [{
+            'id': '%d-%s' % (s['BandId'], s['$id']),
+            'title': s['Title'],
+            'url': s['TrackUrl'],
+            'ext': 'mp3',
+            'uploader_id': playlist_id,
+            'uploader': s.get('BandName', playlist_id),
+            'like_count': s.get('LikeCount'),
+            'thumbnail': 'http://urort.p3.no/cloud/images/%s' % s['Image'],
+            'upload_date': unified_strdate(s.get('Released')),
+        } for s in songs]
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': playlist_id,
+            'entries': entries,
+        }
index 7fa2b9e159ed1a60c056140f05f51851663830e9..488b10df96e298c683cd02287e2da0c49f21a1cc 100644 (file)
@@ -11,15 +11,16 @@ from ..utils import (
 
 
 class UstreamIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
+    _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<videoID>\d+)'
     IE_NAME = 'ustream'
     _TEST = {
         'url': 'http://www.ustream.tv/recorded/20274954',
-        'file': '20274954.flv',
         'md5': '088f151799e8f572f84eb62f17d73e5c',
         'info_dict': {
-            "uploader": "Young Americans for Liberty",
-            "title": "Young Americans for Liberty February 7, 2012 2:28 AM",
+            'id': '20274954',
+            'ext': 'flv',
+            'uploader': 'Young Americans for Liberty',
+            'title': 'Young Americans for Liberty February 7, 2012 2:28 AM',
         },
     }
 
@@ -27,6 +28,19 @@ class UstreamIE(InfoExtractor):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('videoID')
 
+        # some sites use this embed format (see: http://github.com/rg3/youtube-dl/issues/2990)
+        if m.group('type') == 'embed/recorded':
+            video_id = m.group('videoID')
+            desktop_url = 'http://www.ustream.tv/recorded/' + video_id
+            return self.url_result(desktop_url, 'Ustream')
+        if m.group('type') == 'embed':
+            video_id = m.group('videoID')
+            webpage = self._download_webpage(url, video_id)
+            desktop_video_id = self._html_search_regex(
+                r'ContentVideoIds=\["([^"]*?)"\]', webpage, 'desktop_video_id')
+            desktop_url = 'http://www.ustream.tv/recorded/' + desktop_video_id
+            return self.url_result(desktop_url, 'Ustream')
+
         video_url = 'http://tcdn.ustream.tv/video/%s' % video_id
         webpage = self._download_webpage(url, video_id)
 
index baa57f3438603e6a5d0d2b406389df4110353ebe..d16993daf0ddb8546f838ed59220a7efeb6cdcc6 100644 (file)
@@ -4,44 +4,118 @@ import re
 import json
 
 from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+    int_or_none,
+)
 
 
 class VeohIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/v(?P<id>\d*)'
-
-    _TEST = {
-        'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
-        'file': '56314296.mp4',
-        'md5': '620e68e6a3cff80086df3348426c9ca3',
-        'info_dict': {
-            'title': 'Straight Backs Are Stronger',
-            'uploader': 'LUMOback',
-            'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
+    _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
+
+    _TESTS = [
+        {
+            'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+            'md5': '620e68e6a3cff80086df3348426c9ca3',
+            'info_dict': {
+                'id': '56314296',
+                'ext': 'mp4',
+                'title': 'Straight Backs Are Stronger',
+                'uploader': 'LUMOback',
+                'description': 'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
+            },
+        },
+        {
+            'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
+            'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
+            'info_dict': {
+                'id': '27701988',
+                'ext': 'mp4',
+                'title': 'Chile workers cover up to avoid skin damage',
+                'description': 'md5:2bd151625a60a32822873efc246ba20d',
+                'uploader': 'afp-news',
+                'duration': 123,
+            },
+        },
+        {
+            'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
+            'md5': '4fde7b9e33577bab2f2f8f260e30e979',
+            'note': 'Embedded ooyala video',
+            'info_dict': {
+                'id': '69525809',
+                'ext': 'mp4',
+                'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
+                'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
+                'uploader': 'newsy-videos',
+            },
+        },
+    ]
+
+    def _extract_formats(self, source):
+        formats = []
+        link = source.get('aowPermalink')
+        if link:
+            formats.append({
+                'url': link,
+                'ext': 'mp4',
+                'format_id': 'aow',
+            })
+        link = source.get('fullPreviewHashLowPath')
+        if link:
+            formats.append({
+                'url': link,
+                'format_id': 'low',
+            })
+        link = source.get('fullPreviewHashHighPath')
+        if link:
+            formats.append({
+                'url': link,
+                'format_id': 'high',
+            })
+        return formats
+
+    def _extract_video(self, source):
+        return {
+            'id': source.get('videoId'),
+            'title': source.get('title'),
+            'description': source.get('description'),
+            'thumbnail': source.get('highResImage') or source.get('medResImage'),
+            'uploader': source.get('username'),
+            'duration': int_or_none(source.get('length')),
+            'view_count': int_or_none(source.get('views')),
+            'age_limit': 18 if source.get('isMature') == 'true' or source.get('isSexy') == 'true' else 0,
+            'formats': self._extract_formats(source),
         }
-    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
+
+        if video_id.startswith('v'):
+            rsp = self._download_xml(
+                r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML')
+            if rsp.get('stat') == 'ok':
+                return self._extract_video(rsp.find('./videoList/video'))
+
         webpage = self._download_webpage(url, video_id)
+        age_limit = 0
+        if 'class="adultwarning-container"' in webpage:
+            self.report_age_confirmation()
+            age_limit = 18
+            request = compat_urllib_request.Request(url)
+            request.add_header('Cookie', 'confirmedAdult=true')
+            webpage = self._download_webpage(request, video_id)
 
-        m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
+        m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|"|\?)', webpage)
         if m_youtube is not None:
             youtube_id = m_youtube.group(1)
             self.to_screen('%s: detected Youtube video.' % video_id)
             return self.url_result(youtube_id, 'Youtube')
 
-        self.report_extraction(video_id)
-        info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
-        info = json.loads(info)
-        video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+        info = json.loads(
+            self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info').replace('\\\'', '\''))
 
-        return {
-            'id': info['videoId'],
-            'title': info['title'],
-            'url': video_url,
-            'uploader': info['username'],
-            'thumbnail': info.get('highResImage') or info.get('medResImage'),
-            'description': info['description'],
-            'view_count': info['views'],
-        }
+        video = self._extract_video(info)
+        video['age_limit'] = age_limit
+
+        return video
index f51d4dcfa6c0cbda5fb8c53d0421ac099cea8295..27f9acb670b1b10ffa2ee0220f62dc411e49d8d7 100644 (file)
@@ -4,14 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none
-)
+from ..utils import ExtractorError
+from .rutv import RUTVIE
 
 
 class VestiIE(InfoExtractor):
-    IE_NAME = 'vesti'
     IE_DESC = 'Вести.Ru'
     _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)'
 
@@ -30,6 +27,20 @@ class VestiIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://www.vesti.ru/doc.html?id=1349233',
+            'info_dict': {
+                'id': '773865',
+                'ext': 'mp4',
+                'title': 'Участники митинга штурмуют Донецкую областную администрацию',
+                'description': 'md5:1a160e98b3195379b4c849f2f4958009',
+                'duration': 210,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
         {
             'url': 'http://www.vesti.ru/only_video.html?vid=576180',
             'info_dict': {
@@ -44,6 +55,20 @@ class VestiIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://hitech.vesti.ru/news/view/id/4000',
+            'info_dict': {
+                'id': '766888',
+                'ext': 'mp4',
+                'title': 'Вести.net: интернет-гиганты начали перетягивание программных "одеял"',
+                'description': 'md5:65ddd47f9830c4f42ed6475f8730c995',
+                'duration': 279,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            },
+        },
         {
             'url': 'http://sochi2014.vesti.ru/video/index/video_id/766403',
             'info_dict': {
@@ -57,7 +82,7 @@ class VestiIE(InfoExtractor):
                 # m3u8 download
                 'skip_download': True,
             },
-            'skip': 'Blocked outside Russia'
+            'skip': 'Blocked outside Russia',
         },
         {
             'url': 'http://sochi2014.vesti.ru/live/play/live_id/301',
@@ -72,7 +97,7 @@ class VestiIE(InfoExtractor):
                 'skip_download': True,
             },
             'skip': 'Translation has finished'
-        }
+        },
     ]
 
     def _real_extract(self, url):
@@ -81,90 +106,16 @@ class VestiIE(InfoExtractor):
 
         page = self._download_webpage(url, video_id, 'Downloading page')
 
-        mobj = re.search(r'<meta property="og:video" content=".+?\.swf\?v?id=(?P<id>\d+).*?" />', page)
+        mobj = re.search(
+            r'<meta[^>]+?property="og:video"[^>]+?content="http://www\.vesti\.ru/i/flvplayer_videoHost\.swf\?vid=(?P<id>\d+)',
+            page)
         if mobj:
-            video_type = 'video'
-            video_id = mobj.group('id')
-        else:
-            mobj = re.search(
-                r'<iframe.+?src="http://player\.rutv\.ru/iframe/(?P<type>[^/]+)/id/(?P<id>\d+)[^"]*".*?></iframe>', page)
-
-            if not mobj:
-                raise ExtractorError('No media found')
-
-            video_type = mobj.group('type')
             video_id = mobj.group('id')
+            page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id,
+                'Downloading video page')
 
-        json_data = self._download_json(
-            'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id),
-            video_id, 'Downloading JSON')
-
-        if json_data['errors']:
-            raise ExtractorError('vesti returned error: %s' % json_data['errors'], expected=True)
-
-        playlist = json_data['data']['playlist']
-        medialist = playlist['medialist']
-        media = medialist[0]
-
-        if media['errors']:
-            raise ExtractorError('vesti returned error: %s' % media['errors'], expected=True)
-
-        view_count = playlist.get('count_views')
-        priority_transport = playlist['priority_transport']
-
-        thumbnail = media['picture']
-        width = media['width']
-        height = media['height']
-        description = media['anons']
-        title = media['title']
-        duration = int_or_none(media.get('duration'))
-
-        formats = []
-
-        for transport, links in media['sources'].items():
-            for quality, url in links.items():
-                if transport == 'rtmp':
-                    mobj = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+))/(?P<playpath>.+)$', url)
-                    if not mobj:
-                        continue
-                    fmt = {
-                        'url': mobj.group('url'),
-                        'play_path': mobj.group('playpath'),
-                        'app': mobj.group('app'),
-                        'page_url': 'http://player.rutv.ru',
-                        'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22',
-                        'rtmp_live': True,
-                        'ext': 'flv',
-                        'vbr': int(quality),
-                    }
-                elif transport == 'm3u8':
-                    fmt = {
-                        'url': url,
-                        'ext': 'mp4',
-                    }
-                else:
-                    fmt = {
-                        'url': url
-                    }
-                fmt.update({
-                    'width': width,
-                    'height': height,
-                    'format_id': '%s-%s' % (transport, quality),
-                    'preference': -1 if priority_transport == transport else -2,
-                })
-                formats.append(fmt)
-
-        if not formats:
-            raise ExtractorError('No media links available for %s' % video_id)
-
-        self._sort_formats(formats)
+        rutv_url = RUTVIE._extract_url(page)
+        if rutv_url:
+            return self.url_result(rutv_url, 'RUTV')
 
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'view_count': view_count,
-            'duration': duration,
-            'formats': formats,
-        }
\ No newline at end of file
+        raise ExtractorError('No video found', expected=True)
\ No newline at end of file
index e458ac961f701391c2fb651b1557ac7070dc6e2a..eada13ce920b9f4e892f952242ef87bfac504600 100644 (file)
@@ -2,7 +2,6 @@ from __future__ import unicode_literals
 
 import re
 import xml.etree.ElementTree
-import datetime
 
 from .common import InfoExtractor
 from ..utils import (
@@ -17,22 +16,55 @@ class VevoIE(InfoExtractor):
     (currently used by MTVIE)
     """
     _VALID_URL = r'''(?x)
-        (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?|
+        (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?|
            https?://cache\.vevo\.com/m/html/embed\.html\?video=|
            https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
            vevo:)
         (?P<id>[^&?#]+)'''
+
     _TESTS = [{
         'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280',
-        'file': 'GB1101300280.mp4',
         "md5": "06bea460acb744eab74a9d7dcb4bfd61",
         'info_dict': {
+            'id': 'GB1101300280',
+            'ext': 'mp4',
             "upload_date": "20130624",
             "uploader": "Hurts",
             "title": "Somebody to Die For",
             "duration": 230.12,
             "width": 1920,
             "height": 1080,
+            # timestamp and upload_date are often incorrect; seem to change randomly
+            'timestamp': int,
+        }
+    }, {
+        'note': 'v3 SMIL format',
+        'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
+        'md5': '893ec0e0d4426a1d96c01de8f2bdff58',
+        'info_dict': {
+            'id': 'USUV71302923',
+            'ext': 'mp4',
+            'upload_date': '20140219',
+            'uploader': 'Cassadee Pope',
+            'title': 'I Wish I Could Break Your Heart',
+            'duration': 226.101,
+            'age_limit': 0,
+            'timestamp': int,
+        }
+    }, {
+        'note': 'Age-limited video',
+        'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
+        'info_dict': {
+            'id': 'USRV81300282',
+            'ext': 'mp4',
+            'age_limit': 18,
+            'title': 'Tunnel Vision (Explicit)',
+            'uploader': 'Justin Timberlake',
+            'upload_date': 're:2013070[34]',
+            'timestamp': int,
+        },
+        'params': {
+            'skip_download': 'true',
         }
     }]
     _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/'
@@ -102,12 +134,40 @@ class VevoIE(InfoExtractor):
         video_id = mobj.group('id')
 
         json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id
-        video_info = self._download_json(json_url, video_id)['video']
+        response = self._download_json(json_url, video_id)
+        video_info = response['video']
+
+        if not video_info:
+            if 'statusMessage' in response:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True)
+            raise ExtractorError('Unable to extract videos')
 
         formats = self._formats_from_json(video_info)
+
+        is_explicit = video_info.get('isExplicit')
+        if is_explicit is True:
+            age_limit = 18
+        elif is_explicit is False:
+            age_limit = 0
+        else:
+            age_limit = None
+
+        # Download SMIL
+        smil_blocks = sorted((
+            f for f in video_info['videoVersions']
+            if f['sourceType'] == 13),
+            key=lambda f: f['version'])
+
+        smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
+            self._SMIL_BASE_URL, video_id, video_id.lower())
+        if smil_blocks:
+            smil_url_m = self._search_regex(
+                r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL',
+                fatal=False)
+            if smil_url_m is not None:
+                smil_url = smil_url_m
+
         try:
-            smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % (
-                self._SMIL_BASE_URL, video_id, video_id.lower())
             smil_xml = self._download_webpage(smil_url, video_id,
                                               'Downloading SMIL info')
             formats.extend(self._formats_from_smil(smil_xml))
@@ -119,13 +179,14 @@ class VevoIE(InfoExtractor):
 
         timestamp_ms = int(self._search_regex(
             r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date'))
-        upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000)
+
         return {
             'id': video_id,
             'title': video_info['title'],
             'formats': formats,
             'thumbnail': video_info['imageUrl'],
-            'upload_date': upload_date.strftime('%Y%m%d'),
+            'timestamp': timestamp_ms // 1000,
             'uploader': video_info['mainArtists'][0]['artistName'],
             'duration': video_info['duration'],
+            'age_limit': age_limit,
         }
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
new file mode 100644 (file)
index 0000000..2f77e38
--- /dev/null
@@ -0,0 +1,124 @@
+from __future__ import unicode_literals
+
+from .mtv import MTVIE
+
+import re
+from ..utils import fix_xml_ampersands
+
+
+class VH1IE(MTVIE):
+    IE_NAME = 'vh1.com'
+    _FEED_URL = 'http://www.vh1.com/player/embed/AS3/fullepisode/rss/'
+    _TESTS = [{
+        'url': 'http://www.vh1.com/video/metal-evolution/full-episodes/progressive-metal/1678612/playlist.jhtml',
+        'playlist': [
+            {
+                'md5': '7827a7505f59633983165bbd2c119b52',
+                'info_dict': {
+                    'id': '731565',
+                    'ext': 'mp4',
+                    'title': 'Metal Evolution: Ep. 11 Act 1',
+                    'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 12 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+                }
+            },
+            {
+                'md5': '34fb4b7321c546b54deda2102a61821f',
+                'info_dict': {
+                    'id': '731567',
+                    'ext': 'mp4',
+                    'title': 'Metal Evolution: Ep. 11 Act 2',
+                    'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+                }
+            },
+            {
+                'md5': '813f38dba4c1b8647196135ebbf7e048',
+                'info_dict': {
+                    'id': '731568',
+                    'ext': 'mp4',
+                    'title': 'Metal Evolution: Ep. 11 Act 3',
+                    'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+                }
+            },
+            {
+                'md5': '51adb72439dfaed11c799115d76e497f',
+                'info_dict': {
+                    'id': '731569',
+                    'ext': 'mp4',
+                    'title': 'Metal Evolution: Ep. 11 Act 4',
+                    'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+                }
+            },
+            {
+                'md5': '93d554aaf79320703b73a95288c76a6e',
+                'info_dict': {
+                    'id': '731570',
+                    'ext': 'mp4',
+                    'title': 'Metal Evolution: Ep. 11 Act 5',
+                    'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.'
+                }
+            }
+        ],
+        'skip': 'Blocked outside the US',
+    }, {
+        # Clip
+        'url': 'http://www.vh1.com/video/misc/706675/metal-evolution-episode-1-pre-metal-show-clip.jhtml#id=1674118',
+        'md5': '7d67cf6d9cdc6b4f3d3ac97a55403844',
+        'info_dict': {
+            'id': '706675',
+            'ext': 'mp4',
+            'title': 'Metal Evolution: Episode 1 Pre-Metal Show Clip',
+            'description': 'The greatest documentary ever made about Heavy Metal begins as our host Sam Dunn travels the globe to seek out the origins and influences that helped create Heavy Metal. Sam speaks to legends like Kirk Hammett, Alice Cooper, Slash, Bill Ward, Geezer Butler, Tom Morello, Ace Frehley, Lemmy Kilmister, Dave Davies, and many many more. This episode is the prologue for the 11 hour series, and Sam goes back to the very beginning to reveal how Heavy Metal was created.'
+        },
+        'skip': 'Blocked outside the US',
+    }, {
+        # Short link
+        'url': 'http://www.vh1.com/video/play.jhtml?id=1678353',
+        'md5': '853192b87ad978732b67dd8e549b266a',
+        'info_dict': {
+            'id': '730355',
+            'ext': 'mp4',
+            'title': 'Metal Evolution: Episode 11 Progressive Metal Sneak',
+            'description': 'In Metal Evolution\'s finale sneak, Sam sits with Michael Giles of King Crimson and gets feedback from Metallica guitarist Kirk Hammett on why the group was influential.'
+        },
+        'skip': 'Blocked outside the US',
+    }, {
+        'url': 'http://www.vh1.com/video/macklemore-ryan-lewis/900535/cant-hold-us-ft-ray-dalton.jhtml',
+        'md5': 'b1bcb5b4380c9d7f544065589432dee7',
+        'info_dict': {
+            'id': '900535',
+            'ext': 'mp4',
+            'title': 'Macklemore & Ryan Lewis - "Can\'t Hold Us ft. Ray Dalton"',
+            'description': 'The Heist'
+        },
+        'skip': 'Blocked outside the US',
+    }]
+
+    _VALID_URL = r'''(?x)
+        https?://www\.vh1\.com/video/
+        (?:
+            .+?/full-episodes/.+?/(?P<playlist_id>[^/]+)/playlist\.jhtml
+        |
+            (?:
+            play.jhtml\?id=|
+            misc/.+?/.+?\.jhtml\#id=
+            )
+            (?P<video_id>[0-9]+)$
+        |
+            [^/]+/(?P<music_id>[0-9]+)/[^/]+?
+        )
+    '''
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj.group('music_id'):
+            id_field = 'vid'
+            video_id = mobj.group('music_id')
+        else:
+            video_id = mobj.group('playlist_id') or mobj.group('video_id')
+            id_field = 'id'
+        doc_url = '%s?%s=%s' % (self._FEED_URL, id_field, video_id)
+
+        idoc = self._download_xml(
+            doc_url, video_id,
+            'Downloading info', transform_source=fix_xml_ampersands)
+        return [self._get_video_info(item) for item in idoc.findall('.//item')]
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
deleted file mode 100644 (file)
index 87812d6..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-import re
-
-from .common import InfoExtractor
-from .ooyala import OoyalaIE
-from ..utils import ExtractorError
-
-
-class ViceIE(InfoExtractor):
-    _VALID_URL = r'http://www\.vice\.com/.*?/(?P<name>.+)'
-
-    _TEST = {
-        u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1',
-        u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4',
-        u'info_dict': {
-            u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
-        },
-        u'params': {
-            # Requires ffmpeg (m3u8 manifest)
-            u'skip_download': True,
-        },
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        name = mobj.group('name')
-        webpage = self._download_webpage(url, name)
-        try:
-            ooyala_url = self._og_search_video_url(webpage)
-        except ExtractorError:
-            try:
-                embed_code = self._search_regex(
-                    r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage,
-                    u'ooyala embed code')
-                ooyala_url = OoyalaIE._url_for_embed_code(embed_code)
-            except ExtractorError:
-                raise ExtractorError(u'The page doesn\'t contain a video', expected=True)
-        return self.url_result(ooyala_url, ie='Ooyala')
-
diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py
new file mode 100644 (file)
index 0000000..fed95ef
--- /dev/null
@@ -0,0 +1,81 @@
+from __future__ import unicode_literals
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class VideoBamIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)'
+
+    _TESTS = [
+        {
+            'url': 'http://videobam.com/OiJQM',
+            'md5': 'db471f27763a531f10416a0c58b5a1e0',
+            'info_dict': {
+                'id': 'OiJQM',
+                'ext': 'mp4',
+                'title': 'Is Alcohol Worse Than Ecstasy?',
+                'description': 'md5:d25b96151515c91debc42bfbb3eb2683',
+                'uploader': 'frihetsvinge',
+            },
+        },
+        {
+            'url': 'http://videobam.com/pqLvq',
+            'md5': 'd9a565b5379a99126ef94e1d7f9a383e',
+            'note': 'HD video',
+            'info_dict': {
+                'id': 'pqLvq',
+                'ext': 'mp4',
+                'title': '_',
+            }
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page')
+
+        formats = []
+
+        for preference, format_id in enumerate(['low', 'high']):
+            mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page)
+            if not mobj:
+                continue
+            formats.append({
+                'url': mobj.group('url'),
+                'ext': 'mp4',
+                'format_id': format_id,
+                'preference': preference,
+            })
+
+        if not formats:
+            player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config'))
+            formats = [{
+                'url': item['url'],
+                'ext': 'mp4',
+            } for item in player_config['playlist'] if 'autoPlay' in item]
+
+        self._sort_formats(formats)
+
+        title = self._og_search_title(page, default='_', fatal=False)
+        description = self._og_search_description(page, default=None)
+        thumbnail = self._og_search_thumbnail(page)
+        uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None)
+        view_count = int_or_none(
+            self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'view_count': view_count,
+            'formats': formats,
+            'age_limit': 18,
+        }
\ No newline at end of file
index 265dd5b91fd9e5c4fc5a0cac8a9f36dd36731cfe..ac6c255376442d132948eb5f54e0517bca5a66f4 100644 (file)
@@ -1,22 +1,23 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 from .internetvideoarchive import InternetVideoArchiveIE
-from ..utils import (
-    compat_urlparse,
-)
+from ..utils import compat_urlparse
 
 
 class VideoDetectiveIE(InfoExtractor):
     _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)'
 
     _TEST = {
-        u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487',
-        u'file': u'194487.mp4',
-        u'info_dict': {
-            u'title': u'KICK-ASS 2',
-            u'description': u'md5:65ba37ad619165afac7d432eaded6013',
-            u'duration': 135,
+        'url': 'http://www.videodetective.com/movies/kick-ass-2/194487',
+        'info_dict': {
+            'id': '194487',
+            'ext': 'mp4',
+            'title': 'KICK-ASS 2',
+            'description': 'md5:65ba37ad619165afac7d432eaded6013',
+            'duration': 135,
         },
     }
 
@@ -26,5 +27,4 @@ class VideoDetectiveIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         og_video = self._og_search_video_url(webpage)
         query = compat_urlparse.urlparse(og_video).query
-        return self.url_result(InternetVideoArchiveIE._build_url(query),
-            ie=InternetVideoArchiveIE.ie_key())
+        return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py
new file mode 100644 (file)
index 0000000..ebd2a3d
--- /dev/null
@@ -0,0 +1,70 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    find_xpath_attr,
+    int_or_none,
+    parse_duration,
+    unified_strdate,
+)
+
+
+class VideoLecturesNetIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
+    IE_NAME = 'videolectures.net'
+
+    _TEST = {
+        'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/',
+        'info_dict': {
+            'id': 'promogram_igor_mekjavic_eng',
+            'ext': 'mp4',
+            'title': 'Automatics, robotics and biocybernetics',
+            'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+            'upload_date': '20130627',
+            'duration': 565,
+            'thumbnail': 're:http://.*\.jpg',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id
+        smil = self._download_xml(smil_url, video_id)
+
+        title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content']
+        description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract')
+        description = (
+            None if description_el is None
+            else description_el.attrib['content'])
+        upload_date = unified_strdate(
+            find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content'])
+
+        switch = smil.find('.//switch')
+        duration = parse_duration(switch.attrib.get('dur'))
+        thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail')
+        thumbnail = (
+            None if thumbnail_el is None else thumbnail_el.attrib.get('src'))
+
+        formats = [{
+            'url': v.attrib['src'],
+            'width': int_or_none(v.attrib.get('width')),
+            'height': int_or_none(v.attrib.get('height')),
+            'filesize': int_or_none(v.attrib.get('size')),
+            'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0,
+            'ext': v.attrib.get('ext'),
+        } for v in switch.findall('./video')
+            if v.attrib.get('proto') == 'http']
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py
new file mode 100644 (file)
index 0000000..b5034b0
--- /dev/null
@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+
+import re
+import base64
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+
+class VideoTtIE(InfoExtractor):
+    ID_NAME = 'video.tt'
+    IE_DESC = 'video.tt - Your True Tube'
+    _VALID_URL = r'http://(?:www\.)?video\.tt/(?:video/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})'
+
+    _TEST = {
+        'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8',
+        'md5': 'b13aa9e2f267effb5d1094443dff65ba',
+        'info_dict': {
+            'id': 'amd5YujV8',
+            'ext': 'flv',
+            'title': 'Motivational video Change your mind in just 2.50 mins',
+            'description': '',
+            'upload_date': '20130827',
+            'uploader': 'joseph313',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        settings = self._download_json(
+            'http://www.video.tt/player_control/settings.php?v=%s' % video_id, video_id,
+            'Downloading video JSON')['settings']
+
+        video = settings['video_details']['video']
+
+        formats = [
+            {
+                'url': base64.b64decode(res['u']).decode('utf-8'),
+                'ext': 'flv',
+                'format_id': res['l'],
+            } for res in settings['res'] if res['u']
+        ]
+
+        return {
+            'id': video_id,
+            'title': video['title'],
+            'description': video['description'],
+            'thumbnail': settings['config']['thumbnail'],
+            'upload_date': unified_strdate(video['added']),
+            'uploader': video['owner'],
+            'view_count': int(video['view_count']),
+            'comment_count': int(video['comment_count']),
+            'like_count': int(video['liked']),
+            'dislike_count': int(video['disliked']),
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py
new file mode 100644 (file)
index 0000000..4a08ddd
--- /dev/null
@@ -0,0 +1,26 @@
+from __future__ import unicode_literals
+
+from .novamov import NovaMovIE
+
+
+class VideoWeedIE(NovaMovIE):
+    IE_NAME = 'videoweed'
+    IE_DESC = 'VideoWeed'
+
+    _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'}
+
+    _HOST = 'www.videoweed.es'
+
+    _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<'
+    _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>'
+
+    _TEST = {
+        'url': 'http://www.videoweed.es/file/b42178afbea14',
+        'md5': 'abd31a2132947262c50429e1d16c1bfd',
+        'info_dict': {
+            'id': 'b42178afbea14',
+            'ext': 'flv',
+            'title': 'optical illusion  dissapeared image magic illusion',
+            'description': ''
+        },
+    }
\ No newline at end of file
index 2206a06d59f57093f59135f6faa8d68381695a95..15f31529822bcba124cfb12bcb9e56566b3bfba7 100644 (file)
@@ -1,29 +1,33 @@
+from __future__ import unicode_literals
+
 import re
 
 from ..utils import (
     ExtractorError,
     unescapeHTML,
     unified_strdate,
+    US_RATINGS,
 )
 from .subtitles import SubtitlesInfoExtractor
 
 
 class VikiIE(SubtitlesInfoExtractor):
-    IE_NAME = u'viki'
+    IE_NAME = 'viki'
 
     _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
     _TEST = {
-        u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
-        u'file': u'1023585v.mp4',
-        u'md5': u'a21454021c2646f5433514177e2caa5f',
-        u'info_dict': {
-            u'title': u'Heirs Episode 14',
-            u'uploader': u'SBS',
-            u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
-            u'upload_date': u'20131121',
-            u'age_limit': 13,
+        'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14',
+        'md5': 'a21454021c2646f5433514177e2caa5f',
+        'info_dict': {
+            'id': '1023585v',
+            'ext': 'mp4',
+            'title': 'Heirs Episode 14',
+            'uploader': 'SBS',
+            'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+            'upload_date': '20131121',
+            'age_limit': 13,
         },
-        u'skip': u'Blocked in the US',
+        'skip': 'Blocked in the US',
     }
 
     def _real_extract(self, url):
@@ -44,28 +48,21 @@ class VikiIE(SubtitlesInfoExtractor):
 
         rating_str = self._html_search_regex(
             r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
-            u'rating information', default='').strip()
-        RATINGS = {
-            'G': 0,
-            'PG': 10,
-            'PG-13': 13,
-            'R': 16,
-            'NC': 18,
-        }
-        age_limit = RATINGS.get(rating_str)
+            'rating information', default='').strip()
+        age_limit = US_RATINGS.get(rating_str)
 
         info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
         info_webpage = self._download_webpage(
-            info_url, video_id, note=u'Downloading info page')
+            info_url, video_id, note='Downloading info page')
         if re.match(r'\s*<div\s+class="video-error', info_webpage):
             raise ExtractorError(
-                u'Video %s is blocked from your location.' % video_id,
+                'Video %s is blocked from your location.' % video_id,
                 expected=True)
         video_url = self._html_search_regex(
-            r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+            r'<source[^>]+src="([^"]+)"', info_webpage, 'video URL')
 
         upload_date_str = self._html_search_regex(
-            r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+            r'"created_at":"([^"]+)"', info_webpage, 'upload date')
         upload_date = (
             unified_strdate(upload_date_str)
             if upload_date_str is not None
index 4bc262049c1a937a420f2f255ff5f9e09bcca0fd..255855558cf64ddfe847db56e00e029f4bbbdf22 100644 (file)
@@ -8,6 +8,7 @@ import itertools
 from .common import InfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
+    compat_HTTPError,
     compat_urllib_parse,
     compat_urllib_request,
     clean_html,
@@ -16,10 +17,39 @@ from ..utils import (
     RegexNotFoundError,
     std_headers,
     unsmuggle_url,
+    urlencode_postdata,
+    int_or_none,
 )
 
 
-class VimeoIE(SubtitlesInfoExtractor):
+class VimeoBaseInfoExtractor(InfoExtractor):
+    _NETRC_MACHINE = 'vimeo'
+    _LOGIN_REQUIRED = False
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return
+        self.report_login()
+        login_url = 'https://vimeo.com/log_in'
+        webpage = self._download_webpage(login_url, None, False)
+        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
+        data = urlencode_postdata({
+            'email': username,
+            'password': password,
+            'action': 'login',
+            'service': 'vimeo',
+            'token': token,
+        })
+        login_request = compat_urllib_request.Request(login_url, data)
+        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_request.add_header('Cookie', 'xsrft=%s' % token)
+        self._download_webpage(login_request, None, False, 'Wrong login info')
+
+
+class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor):
     """Information extractor for vimeo.com."""
 
     # _VALID_URL matches Vimeo URLs
@@ -32,53 +62,60 @@ class VimeoIE(SubtitlesInfoExtractor):
         (?:videos?/)?
         (?P<id>[0-9]+)
         /?(?:[?&].*)?(?:[#].*)?$'''
-    _NETRC_MACHINE = 'vimeo'
     IE_NAME = 'vimeo'
     _TESTS = [
         {
             'url': 'http://vimeo.com/56015672#at=0',
-            'file': '56015672.mp4',
             'md5': '8879b6cc097e987f02484baf890129e5',
             'info_dict': {
-                "upload_date": "20121220", 
-                "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 
-                "uploader_id": "user7108434", 
-                "uploader": "Filippo Valsorda", 
+                'id': '56015672',
+                'ext': 'mp4',
+                "upload_date": "20121220",
+                "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+                "uploader_id": "user7108434",
+                "uploader": "Filippo Valsorda",
                 "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
+                "duration": 10,
             },
         },
         {
             'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
-            'file': '68093876.mp4',
             'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82',
             'note': 'Vimeo Pro video (#1197)',
             'info_dict': {
+                'id': '68093876',
+                'ext': 'mp4',
                 'uploader_id': 'openstreetmapus',
                 'uploader': 'OpenStreetMap US',
                 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
+                'duration': 1595,
             },
         },
         {
             'url': 'http://player.vimeo.com/video/54469442',
-            'file': '54469442.mp4',
             'md5': '619b811a4417aa4abe78dc653becf511',
             'note': 'Videos that embed the url in the player page',
             'info_dict': {
+                'id': '54469442',
+                'ext': 'mp4',
                 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software',
                 'uploader': 'The BLN & Business of Software',
                 'uploader_id': 'theblnbusinessofsoftware',
+                'duration': 3610,
             },
         },
         {
             'url': 'http://vimeo.com/68375962',
-            'file': '68375962.mp4',
             'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7',
             'note': 'Video protected with password',
             'info_dict': {
+                'id': '68375962',
+                'ext': 'mp4',
                 'title': 'youtube-dl password protected test video',
                 'upload_date': '20130614',
                 'uploader_id': 'user18948128',
                 'uploader': 'Jaime Marquínez Ferrándiz',
+                'duration': 10,
             },
             'params': {
                 'videopassword': 'youtube-dl',
@@ -96,42 +133,35 @@ class VimeoIE(SubtitlesInfoExtractor):
                 'upload_date': '20131015',
                 'uploader_id': 'staff',
                 'uploader': 'Vimeo Staff',
+                'duration': 62,
             }
         },
     ]
 
-    def _login(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            return
-        self.report_login()
-        login_url = 'https://vimeo.com/log_in'
-        webpage = self._download_webpage(login_url, None, False)
-        token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
-        data = compat_urllib_parse.urlencode({'email': username,
-                                              'password': password,
-                                              'action': 'login',
-                                              'service': 'vimeo',
-                                              'token': token,
-                                              })
-        login_request = compat_urllib_request.Request(login_url, data)
-        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        login_request.add_header('Cookie', 'xsrft=%s' % token)
-        self._download_webpage(login_request, None, False, 'Wrong login info')
+    @classmethod
+    def suitable(cls, url):
+        if VimeoChannelIE.suitable(url):
+            # Otherwise channel urls like http://vimeo.com/channels/31259 would
+            # match
+            return False
+        else:
+            return super(VimeoIE, cls).suitable(url)
 
     def _verify_video_password(self, url, video_id, webpage):
         password = self._downloader.params.get('videopassword', None)
         if password is None:
             raise ExtractorError('This video is protected by a password, use the --video-password option')
         token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token')
-        data = compat_urllib_parse.urlencode({'password': password,
-                                              'token': token})
+        data = compat_urllib_parse.urlencode({
+            'password': password,
+            'token': token,
+        })
         # I didn't manage to use the password with https
         if url.startswith('https'):
-            pass_url = url.replace('https','http')
+            pass_url = url.replace('https', 'http')
         else:
             pass_url = url
-        password_request = compat_urllib_request.Request(pass_url+'/password', data)
+        password_request = compat_urllib_request.Request(pass_url + '/password', data)
         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         password_request.add_header('Cookie', 'xsrft=%s' % token)
         self._download_webpage(password_request, video_id,
@@ -171,7 +201,18 @@ class VimeoIE(SubtitlesInfoExtractor):
 
         # Retrieve video webpage to extract further information
         request = compat_urllib_request.Request(url, None, headers)
-        webpage = self._download_webpage(request, video_id)
+        try:
+            webpage = self._download_webpage(request, video_id)
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                errmsg = ee.cause.read()
+                if b'Because of its privacy settings, this video cannot be played here' in errmsg:
+                    raise ExtractorError(
+                        'Cannot download embed-only video without embedding '
+                        'URL. Please call youtube-dl with the URL of the page '
+                        'that embeds this video.',
+                        expected=True)
+            raise
 
         # Now we begin extracting as much information as we can from what we
         # retrieved. First we extract the information common to all extractors,
@@ -220,13 +261,16 @@ class VimeoIE(SubtitlesInfoExtractor):
         # Extract video thumbnail
         video_thumbnail = config["video"].get("thumbnail")
         if video_thumbnail is None:
-            _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1]
+            video_thumbs = config["video"].get("thumbs")
+            if video_thumbs and isinstance(video_thumbs, dict):
+                _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1]
 
         # Extract video description
         video_description = None
         try:
-            video_description = get_element_by_attribute("itemprop", "description", webpage)
-            if video_description: video_description = clean_html(video_description)
+            video_description = get_element_by_attribute("class", "description_wrapper", webpage)
+            if video_description:
+                video_description = clean_html(video_description)
         except AssertionError as err:
             # On some pages like (http://player.vimeo.com/video/54469442) the
             # html tags are not closed, python 2.6 cannot handle it
@@ -235,6 +279,9 @@ class VimeoIE(SubtitlesInfoExtractor):
             else:
                 raise
 
+        # Extract video duration
+        video_duration = int_or_none(config["video"].get("duration"))
+
         # Extract upload date
         video_upload_date = None
         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
@@ -272,7 +319,7 @@ class VimeoIE(SubtitlesInfoExtractor):
                     file_info = {}
                 if video_url is None:
                     video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
-                        %(video_id, sig, timestamp, quality, codec_name.upper())
+                        % (video_id, sig, timestamp, quality, codec_name.upper())
 
                 files[key].append({
                     'ext': codec_extension,
@@ -306,6 +353,7 @@ class VimeoIE(SubtitlesInfoExtractor):
             'title': video_title,
             'thumbnail': video_thumbnail,
             'description': video_description,
+            'duration': video_duration,
             'formats': formats,
             'webpage_url': url,
             'view_count': view_count,
@@ -317,7 +365,7 @@ class VimeoIE(SubtitlesInfoExtractor):
 
 class VimeoChannelIE(InfoExtractor):
     IE_NAME = 'vimeo:channel'
-    _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)'
+    _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)/?(\?.*)?$'
     _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
     _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
 
@@ -331,7 +379,7 @@ class VimeoChannelIE(InfoExtractor):
         video_ids = []
         for pagenum in itertools.count(1):
             webpage = self._download_webpage(
-                self._page_url(base_url, pagenum) ,list_id,
+                self._page_url(base_url, pagenum)list_id,
                 'Downloading page %s' % pagenum)
             video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
             if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
@@ -347,7 +395,7 @@ class VimeoChannelIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        channel_id =  mobj.group('id')
+        channel_id = mobj.group('id')
         return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
 
 
@@ -414,3 +462,25 @@ class VimeoReviewIE(InfoExtractor):
         video_id = mobj.group('id')
         player_url = 'https://player.vimeo.com/player/' + video_id
         return self.url_result(player_url, 'Vimeo', video_id)
+
+
+class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE):
+    IE_NAME = 'vimeo:watchlater'
+    IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)'
+    _VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater'
+    _LOGIN_REQUIRED = True
+    _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<'
+
+    def _real_initialize(self):
+        self._login()
+
+    def _page_url(self, base_url, pagenum):
+        url = '%s/page:%d/' % (base_url, pagenum)
+        request = compat_urllib_request.Request(url)
+        # Set the header to get a partial html page with the ids,
+        # the normal page doesn't contain them.
+        request.add_header('X-Requested-With', 'XMLHttpRequest')
+        return request
+
+    def _real_extract(self, url):
+        return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater')
index e14ff91d44b80b6d6278faece8453b856c39485e..076c87119943f3879845ccc3aaf74cdbebf73859 100644 (file)
@@ -1,8 +1,11 @@
 from __future__ import unicode_literals
 
 import re
+import json
+import itertools
 
 from .common import InfoExtractor
+from ..utils import unified_strdate
 
 
 class VineIE(InfoExtractor):
@@ -13,31 +16,76 @@ class VineIE(InfoExtractor):
         'info_dict': {
             'id': 'b9KOOWX7HUx',
             'ext': 'mp4',
-            'uploader': 'Jack Dorsey',
             'title': 'Chicken.',
+            'description': 'Chicken.',
+            'upload_date': '20130519',
+            'uploader': 'Jack Dorsey',
+            'uploader_id': '76',
         },
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-
         video_id = mobj.group('id')
-        webpage_url = 'https://vine.co/v/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
 
-        self.report_extraction(video_id)
+        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
 
-        video_url = self._html_search_meta('twitter:player:stream', webpage,
-            'video URL')
+        data = json.loads(self._html_search_regex(
+            r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data'))
 
-        uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
-            webpage, 'uploader', fatal=False, flags=re.DOTALL)
+        formats = [
+            {
+                'url': data['videoLowURL'],
+                'ext': 'mp4',
+                'format_id': 'low',
+            },
+            {
+                'url': data['videoUrl'],
+                'ext': 'mp4',
+                'format_id': 'standard',
+            }
+        ]
 
         return {
             'id': video_id,
-            'url': video_url,
-            'ext': 'mp4',
             'title': self._og_search_title(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'uploader': uploader,
+            'description': data['description'],
+            'thumbnail': data['thumbnailUrl'],
+            'upload_date': unified_strdate(data['created']),
+            'uploader': data['username'],
+            'uploader_id': data['userIdStr'],
+            'like_count': data['likes']['count'],
+            'comment_count': data['comments']['count'],
+            'repost_count': data['reposts']['count'],
+            'formats': formats,
         }
+
+
+class VineUserIE(InfoExtractor):
+    IE_NAME = 'vine:user'
+    _VALID_URL = r'(?:https?://)?vine\.co/(?P<user>[^/]+)/?(\?.*)?$'
+    _VINE_BASE_URL = "https://vine.co/"
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user = mobj.group('user')
+
+        profile_url = "%sapi/users/profiles/vanity/%s" % (
+            self._VINE_BASE_URL, user)
+        profile_data = self._download_json(
+            profile_url, user, note='Downloading user profile data')
+
+        user_id = profile_data['data']['userId']
+        timeline_data = []
+        for pagenum in itertools.count(1):
+            timeline_url = "%sapi/timelines/users/%s?page=%s" % (
+                self._VINE_BASE_URL, user_id, pagenum)
+            timeline_page = self._download_json(
+                timeline_url, user, note='Downloading page %d' % pagenum)
+            timeline_data.extend(timeline_page['data']['records'])
+            if timeline_page['data']['nextPage'] is None:
+                break
+
+        entries = [
+            self.url_result(e['permalinkUrl'], 'Vine') for e in timeline_data]
+        return self.playlist_result(entries, user)
index a293b8875138d6fca999cc1240f47d15df433290..fb082f36412bb714669e59d071bf609622a5cc56 100644 (file)
@@ -16,7 +16,7 @@ from ..utils import (
 
 class VKIE(InfoExtractor):
     IE_NAME = 'vk.com'
-    _VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P<id>.*?)(?:\?|%2F|$)'
+    _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)|(?:videos.*?\?.*?z=)?video(?P<videoid>.*?)(?:\?|%2F|$))'
     _NETRC_MACHINE = 'vk'
 
     _TESTS = [
@@ -37,11 +37,23 @@ class VKIE(InfoExtractor):
             'info_dict': {
                 'id': '163339118',
                 'ext': 'mp4',
-                'uploader': 'Elvira Dzhonik',
+                'uploader': 'Elya Iskhakova',
                 'title': 'Dream Theater - Hollow Years Live at Budokan 720*',
                 'duration': 558,
             }
         },
+        {
+            'note': 'Embedded video',
+            'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
+            'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
+            'info_dict': {
+                'id': '162925554',
+                'ext': 'mp4',
+                'uploader': 'Vladimir Gavrin',
+                'title': 'Lin Dan',
+                'duration': 101,
+            }
+        },
         {
             'url': 'http://vk.com/video-8871596_164049491',
             'md5': 'a590bcaf3d543576c9bd162812387666',
@@ -54,7 +66,7 @@ class VKIE(InfoExtractor):
                 'duration': 8352,
             },
             'skip': 'Requires vk account credentials',
-        }
+        },
     ]
 
     def _login(self):
@@ -82,7 +94,10 @@ class VKIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.group('videoid')
+
+        if not video_id:
+            video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
 
         info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id
         info_page = self._download_webpage(info_url, video_id)
@@ -93,7 +108,7 @@ class VKIE(InfoExtractor):
 
         m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page)
         if m_yt is not None:
-            self.to_screen(u'Youtube video detected')
+            self.to_screen('Youtube video detected')
             return self.url_result(m_yt.group(1), 'Youtube')
         data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars')
         data = json.loads(data_json)
index fbdff471afcff7f26c203c8a895217dfb6436b41..7b77865cb172f6b46cb86561a5ae022d0263a95d 100644 (file)
@@ -1,47 +1,69 @@
 from __future__ import unicode_literals
 
 import re
-import datetime
 
 from .common import InfoExtractor
+from ..utils import int_or_none
 
 
 class VubeIE(InfoExtractor):
     IE_NAME = 'vube'
     IE_DESC = 'Vube.com'
-    _VALID_URL = r'http://vube\.com/[^/]+/(?P<id>[\da-zA-Z]{10})'
+    _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b'
 
-    _TEST = {
-        'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
-        'md5': 'f81dcf6d0448e3291f54380181695821',
-        'info_dict': {
-            'id': 'YL2qNPkqon',
-            'ext': 'mp4',
-            'title': 'Chiara Grispo - Price Tag by Jessie J',
-            'description': 'md5:8ea652a1f36818352428cb5134933313',
-            'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg',
-            'uploader': 'Chiara.Grispo',
-            'uploader_id': '1u3hX0znhP',
-            'upload_date': '20140103',
-            'duration': 170.56
+    _TESTS = [
+        {
+            'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon',
+            'md5': 'db7aba89d4603dadd627e9d1973946fe',
+            'info_dict': {
+                'id': 'YL2qNPkqon',
+                'ext': 'mp4',
+                'title': 'Chiara Grispo - Price Tag by Jessie J',
+                'description': 'md5:8ea652a1f36818352428cb5134933313',
+                'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg',
+                'uploader': 'Chiara.Grispo',
+                'uploader_id': '1u3hX0znhP',
+                'timestamp': 1388743358,
+                'upload_date': '20140103',
+                'duration': 170.56
+            }
+        },
+        {
+            'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1',
+            'md5': '5d4a52492d76f72712117ce6b0d98d08',
+            'info_dict': {
+                'id': 'UeBhTudbfS',
+                'ext': 'mp4',
+                'title': 'My 7 year old Sister and I singing "Alive" by Krewella',
+                'description': 'md5:40bcacb97796339f1690642c21d56f4a',
+                'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102265d5a9f-0f17-4f6b-5753-adf08484ee1e.jpg',
+                'uploader': 'Seraina',
+                'uploader_id': 'XU9VE2BQ2q',
+                'timestamp': 1396492438,
+                'upload_date': '20140403',
+                'duration': 240.107
+            }
         }
-    }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
 
-        video = self._download_json('http://vube.com/api/v2/video/%s' % video_id,
-            video_id, 'Downloading video JSON')
+        video = self._download_json(
+            'http://vube.com/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON')
 
         public_id = video['public_id']
 
-        formats = [{'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id),
-                   'height': int(fmt['height']),
-                   'abr': int(fmt['audio_bitrate']),
-                   'vbr': int(fmt['video_bitrate']),
-                   'format_id': fmt['media_resolution_id']
-                   } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed']
+        formats = [
+            {
+                'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id),
+                'height': int(fmt['height']),
+                'abr': int(fmt['audio_bitrate']),
+                'vbr': int(fmt['video_bitrate']),
+                'format_id': fmt['media_resolution_id']
+            } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed'
+        ]
 
         self._sort_formats(formats)
 
@@ -52,16 +74,16 @@ class VubeIE(InfoExtractor):
             thumbnail = 'http:' + thumbnail
         uploader = video['user_alias']
         uploader_id = video['user_url_id']
-        upload_date = datetime.datetime.fromtimestamp(int(video['upload_time'])).strftime('%Y%m%d')
+        timestamp = int(video['upload_time'])
         duration = video['duration']
-        view_count = video['raw_view_count']
-        like_count = video['total_likes']
-        dislike_count= video['total_hates']
+        view_count = video.get('raw_view_count')
+        like_count = video.get('total_likes')
+        dislike_count= video.get('total_hates')
 
-        comment = self._download_json('http://vube.com/api/video/%s/comment' % video_id,
-            video_id, 'Downloading video comment JSON')
+        comment = self._download_json(
+            'http://vube.com/api/video/%s/comment' % video_id, video_id, 'Downloading video comment JSON')
 
-        comment_count = comment['total']
+        comment_count = int_or_none(comment.get('total'))
 
         return {
             'id': video_id,
@@ -71,10 +93,10 @@ class VubeIE(InfoExtractor):
             'thumbnail': thumbnail,
             'uploader': uploader,
             'uploader_id': uploader_id,
-            'upload_date': upload_date,
+            'timestamp': timestamp,
             'duration': duration,
             'view_count': view_count,
             'like_count': like_count,
             'dislike_count': dislike_count,
             'comment_count': comment_count,
-        }
\ No newline at end of file
+        }
diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py
new file mode 100644 (file)
index 0000000..fb0600f
--- /dev/null
@@ -0,0 +1,66 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse_urlparse,
+    parse_duration,
+    qualities,
+)
+
+
+class VuClipIE(InfoExtractor):
+    _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)'
+
+    _TEST = {
+        'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434',
+        'md5': '92ac9d1ccefec4f0bb474661ab144fcf',
+        'info_dict': {
+            'id': '843902317',
+            'ext': '3gp',
+            'title': 'Movie Trailer: Noah',
+            'duration': 139,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        ad_m = re.search(
+            r'''value="No.*?" onClick="location.href='([^"']+)'"''', webpage)
+        if ad_m:
+            urlr = compat_urllib_parse_urlparse(url)
+            adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1)
+            webpage = self._download_webpage(
+                adfree_url, video_id, note='Download post-ad page')
+
+        links_code = self._search_regex(
+            r'(?s)<div class="social align_c".*?>(.*?)<hr\s*/?>', webpage,
+            'links')
+        title = self._html_search_regex(
+            r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()
+
+        quality_order = qualities(['Reg', 'Hi'])
+        formats = []
+        for url, q in re.findall(
+                r'<a href="(?P<url>[^"]+)".*?>(?P<q>[^<]+)</a>', links_code):
+            format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q
+            formats.append({
+                'format_id': format_id,
+                'url': url,
+                'quality': quality_order(q),
+            })
+        self._sort_formats(formats)
+
+        duration = parse_duration(self._search_regex(
+            r'\(([0-9:]+)\)</span></h1>', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': title,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
new file mode 100644 (file)
index 0000000..cb8f088
--- /dev/null
@@ -0,0 +1,103 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    strip_jsonp,
+)
+
+
+class WashingtonPostIE(InfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'
+    _TEST = {
+        'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
+        'playlist': [{
+            'md5': 'c3f4b4922ffa259243f68e928db2db8c',
+            'info_dict': {
+                'id': 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'Breaking Points: The Paper Mine',
+                'duration': 1287,
+                'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
+                'uploader': 'The Washington Post',
+                'timestamp': 1395527908,
+                'upload_date': '20140322',
+            },
+        }, {
+            'md5': 'f645a07652c2950cd9134bb852c5f5eb',
+            'info_dict': {
+                'id': '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f',
+                'ext': 'mp4',
+                'title': 'The town bureaucracy sustains',
+                'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
+                'duration': 2217,
+                'timestamp': 1395528005,
+                'upload_date': '20140322',
+                'uploader': 'The Washington Post',
+            },
+        }]
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, page_id)
+        title = self._og_search_title(webpage)
+        uuids = re.findall(r'data-video-uuid="([^"]+)"', webpage)
+        entries = []
+        for i, uuid in enumerate(uuids, start=1):
+            vinfo_all = self._download_json(
+                'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
+                page_id,
+                transform_source=strip_jsonp,
+                note='Downloading information of video %d/%d' % (i, len(uuids))
+            )
+            vinfo = vinfo_all[0]['contentConfig']
+            uploader = vinfo.get('credits', {}).get('source')
+            timestamp = int_or_none(
+                vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
+
+            formats = [{
+                'format_id': (
+                    '%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
+                    if s.get('width')
+                    else s.get('type')),
+                'vbr': s.get('bitrate') if s.get('width') != 0 else None,
+                'width': s.get('width'),
+                'height': s.get('height'),
+                'acodec': s.get('audioCodec'),
+                'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
+                'filesize': s.get('fileSize'),
+                'url': s.get('url'),
+                'ext': 'mp4',
+                'protocol': {
+                    'MP4': 'http',
+                    'F4F': 'f4m',
+                }.get(s.get('type'))
+            } for s in vinfo.get('streams', [])]
+            source_media_url = vinfo.get('sourceMediaURL')
+            if source_media_url:
+                formats.append({
+                    'format_id': 'source_media',
+                    'url': source_media_url,
+                })
+            self._sort_formats(formats)
+            entries.append({
+                'id': uuid,
+                'title': vinfo['title'],
+                'description': vinfo.get('blurb'),
+                'uploader': uploader,
+                'formats': formats,
+                'duration': int_or_none(vinfo.get('videoDuration'), 100),
+                'timestamp': timestamp,
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'id': page_id,
+            'title': title,
+        }
index 4fab6c6e8511711047e3ba9143452397a0aca0fa..a584e08966ac57354c51a71d7fee520d7ce67df8 100644 (file)
@@ -1,37 +1,37 @@
 # coding: utf-8
+from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
-
 from ..utils import (
     unified_strdate,
 )
 
 
 class WatIE(InfoExtractor):
-    _VALID_URL=r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
+    _VALID_URL = r'http://www\.wat\.tv/.*-(?P<shortID>.*?)_.*?\.html'
     IE_NAME = 'wat.tv'
     _TEST = {
-        u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
-        u'file': u'10631273.mp4',
-        u'md5': u'd8b2231e1e333acd12aad94b80937e19',
-        u'info_dict': {
-            u'title': u'World War Z - Philadelphia VOST',
-            u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+        'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
+        'info_dict': {
+            'id': '10631273',
+            'ext': 'mp4',
+            'title': 'World War Z - Philadelphia VOST',
+            'description': 'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
+        },
+        'params': {
+            # Sometimes wat serves the whole file with the --test option
+            'skip_download': True,
         },
-        u'skip': u'Sometimes wat serves the whole file with the --test option',
     }
-    
+
     def download_video_info(self, real_id):
         # 'contentv4' is used in the website, but it also returns the related
         # videos, we don't need them
-        info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info')
-        info = json.loads(info)
+        info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id)
         return info['media']
 
-
     def _real_extract(self, url):
         def real_id_for_chapter(chapter):
             return chapter['tc_start'].split('-')[0]
@@ -56,17 +56,17 @@ class WatIE(InfoExtractor):
             entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
             return self.playlist_result(entries, real_id, video_info['title'])
 
+        upload_date = None
+        if 'date_diffusion' in first_chapter:
+            upload_date = unified_strdate(first_chapter['date_diffusion'])
         # Otherwise we can continue and extract just one part, we have to use
         # the short id for getting the video url
-        info = {'id': real_id,
-                'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
-                'ext': 'mp4',
-                'title': first_chapter['title'],
-                'thumbnail': first_chapter['preview'],
-                'description': first_chapter['description'],
-                'view_count': video_info['views'],
-                }
-        if 'date_diffusion' in first_chapter:
-            info['upload_date'] = unified_strdate(first_chapter['date_diffusion'])
-
-        return info
+        return {
+            'id': real_id,
+            'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
+            'title': first_chapter['title'],
+            'thumbnail': first_chapter['preview'],
+            'description': first_chapter['description'],
+            'view_count': video_info['views'],
+            'upload_date': upload_date,
+        }
diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py
new file mode 100644 (file)
index 0000000..feeb44b
--- /dev/null
@@ -0,0 +1,224 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_parse_qs,
+    compat_urlparse,
+    determine_ext,
+    unified_strdate,
+)
+
+
+class WDRIE(InfoExtractor):
+    _PLAYER_REGEX = '-(?:video|audio)player(?:_size-[LMS])?'
+    _VALID_URL = r'(?P<url>https?://www\d?\.(?:wdr\d?|funkhauseuropa)\.de/)(?P<id>.+?)(?P<player>%s)?\.html' % _PLAYER_REGEX
+
+    _TESTS = [
+        {
+            'url': 'http://www1.wdr.de/mediathek/video/sendungen/servicezeit/videoservicezeit560-videoplayer_size-L.html',
+            'info_dict': {
+                'id': 'mdb-362427',
+                'ext': 'flv',
+                'title': 'Servicezeit',
+                'description': 'md5:c8f43e5e815eeb54d0b96df2fba906cb',
+                'upload_date': '20140310',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www1.wdr.de/themen/av/videomargaspiegelisttot101-videoplayer.html',
+            'info_dict': {
+                'id': 'mdb-363194',
+                'ext': 'flv',
+                'title': 'Marga Spiegel ist tot',
+                'description': 'md5:2309992a6716c347891c045be50992e4',
+                'upload_date': '20140311',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            'url': 'http://www1.wdr.de/themen/kultur/audioerlebtegeschichtenmargaspiegel100-audioplayer.html',
+            'md5': '83e9e8fefad36f357278759870805898',
+            'info_dict': {
+                'id': 'mdb-194332',
+                'ext': 'mp3',
+                'title': 'Erlebte Geschichten: Marga Spiegel (29.11.2009)',
+                'description': 'md5:2309992a6716c347891c045be50992e4',
+                'upload_date': '20091129',
+            },
+        },
+        {
+            'url': 'http://www.funkhauseuropa.de/av/audiogrenzenlosleckerbaklava101-audioplayer.html',
+            'md5': 'cfff440d4ee64114083ac44676df5d15',
+            'info_dict': {
+                'id': 'mdb-363068',
+                'ext': 'mp3',
+                'title': 'Grenzenlos lecker - Baklava',
+                'description': 'md5:7b29e97e10dfb6e265238b32fa35b23a',
+                'upload_date': '20140311',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        page_url = mobj.group('url')
+        page_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, page_id)
+
+        if mobj.group('player') is None:
+            entries = [
+                self.url_result(page_url + href, 'WDR')
+                for href in re.findall(r'<a href="/?(.+?%s\.html)" rel="nofollow"' % self._PLAYER_REGEX, webpage)
+            ]
+            return self.playlist_result(entries, page_id)
+
+        flashvars = compat_urlparse.parse_qs(
+            self._html_search_regex(r'<param name="flashvars" value="([^"]+)"', webpage, 'flashvars'))
+
+        page_id = flashvars['trackerClipId'][0]
+        video_url = flashvars['dslSrc'][0]
+        title = flashvars['trackerClipTitle'][0]
+        thumbnail = flashvars['startPicture'][0] if 'startPicture' in flashvars else None
+
+        if 'trackerClipAirTime' in flashvars:
+            upload_date = flashvars['trackerClipAirTime'][0]
+        else:
+            upload_date = self._html_search_meta('DC.Date', webpage, 'upload date')
+
+        if upload_date:
+            upload_date = unified_strdate(upload_date)
+
+        if video_url.endswith('.f4m'):
+            video_url += '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18'
+            ext = 'flv'
+        else:
+            ext = determine_ext(video_url)
+
+        description = self._html_search_meta('Description', webpage, 'description')
+
+        return {
+            'id': page_id,
+            'url': video_url,
+            'ext': ext,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
+
+
+class WDRMobileIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+        https?://mobile-ondemand\.wdr\.de/
+        .*?/fsk(?P<age_limit>[0-9]+)
+        /[0-9]+/[0-9]+/
+        (?P<id>[0-9]+)_(?P<title>[0-9]+)'''
+    IE_NAME = 'wdr:mobile'
+    _TEST = {
+        'url': 'http://mobile-ondemand.wdr.de/CMS2010/mdb/ondemand/weltweit/fsk0/42/421735/421735_4283021.mp4',
+        'info_dict': {
+            'title': '4283021',
+            'id': '421735',
+            'age_limit': 0,
+        },
+        '_skip': 'Will be depublicized shortly'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        return {
+            'id': mobj.group('id'),
+            'title': mobj.group('title'),
+            'age_limit': int(mobj.group('age_limit')),
+            'url': url,
+            'user_agent': 'mobile',
+        }
+
+
+class WDRMausIE(InfoExtractor):
+    _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))'
+    IE_DESC = 'Sendung mit der Maus'
+    _TESTS = [{
+        'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5',
+        'info_dict': {
+            'id': 'aktuelle-sendung',
+            'ext': 'mp4',
+            'thumbnail': 're:^http://.+\.jpg',
+            'upload_date': 're:^[0-9]{8}$',
+            'title': 're:^[0-9.]{10} - Aktuelle Sendung$',
+        }
+    }, {
+        'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/40_jahre_maus.php5',
+        'md5': '3b1227ca3ed28d73ec5737c65743b2a3',
+        'info_dict': {
+            'id': '40_jahre_maus',
+            'ext': 'mp4',
+            'thumbnail': 're:^http://.+\.jpg',
+            'upload_date': '20131007',
+            'title': '12.03.2011 - 40 Jahre Maus',
+        }
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        param_code = self._html_search_regex(
+            r'<a href="\?startVideo=1&amp;([^"]+)"', webpage, 'parameters')
+
+        title_date = self._search_regex(
+            r'<div class="sendedatum"><p>Sendedatum:\s*([0-9\.]+)</p>',
+            webpage, 'air date')
+        title_str = self._html_search_regex(
+            r'<h1>(.*?)</h1>', webpage, 'title')
+        title = '%s - %s' % (title_date, title_str)
+        upload_date = unified_strdate(
+            self._html_search_meta('dc.date', webpage))
+
+        fields = compat_parse_qs(param_code)
+        video_url = fields['firstVideo'][0]
+        thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0])
+
+        formats = [{
+            'format_id': 'rtmp',
+            'url': video_url,
+        }]
+
+        jscode = self._download_webpage(
+            'http://www.wdrmaus.de/codebase/js/extended-medien.min.js',
+            video_id, fatal=False,
+            note='Downloading URL translation table',
+            errnote='Could not download URL translation table')
+        if jscode:
+            for m in re.finditer(
+                    r"stream:\s*'dslSrc=(?P<stream>[^']+)',\s*download:\s*'(?P<dl>[^']+)'\s*\}",
+                    jscode):
+                if video_url.startswith(m.group('stream')):
+                    http_url = video_url.replace(
+                        m.group('stream'), m.group('dl'))
+                    formats.append({
+                        'format_id': 'http',
+                        'url': http_url,
+                    })
+                    break
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+        }
+
+# TODO test _1
\ No newline at end of file
index fa784ab994d2b8acede7e4b4496b12779a787de6..b24297a409911c79433cca404dc94206009aefe5 100644 (file)
@@ -1,10 +1,11 @@
 # coding: utf-8
+from __future__ import unicode_literals
 
 import re
-import json
 
 from .common import InfoExtractor
 
+
 class WeiboIE(InfoExtractor):
     """
     The videos in Weibo come from different sites, this IE just finds the link
@@ -13,16 +14,16 @@ class WeiboIE(InfoExtractor):
     _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
 
     _TEST = {
-        u'add_ie': ['Sina'],
-        u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
-        u'file': u'98322879.flv',
-        u'info_dict': {
-            u'title': u'魔声耳机最新广告“All Eyes On Us”',
+        'url': 'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
+        'info_dict': {
+            'id': '98322879',
+            'ext': 'flv',
+            'title': '魔声耳机最新广告“All Eyes On Us”',
         },
-        u'note': u'Sina video',
-        u'params': {
-            u'skip_download': True,
+        'params': {
+            'skip_download': True,
         },
+        'add_ie': ['Sina'],
     }
 
     # Additional example videos from different sites
@@ -33,17 +34,16 @@ class WeiboIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
         video_id = mobj.group('id')
         info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
-        info_page = self._download_webpage(info_url, video_id)
-        info = json.loads(info_page)
+        info = self._download_json(info_url, video_id)
 
         videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
-        #Prefer sina video since they have thumbnails
-        videos_urls = sorted(videos_urls, key=lambda u: u'video.sina.com' in u)
+        # Prefer sina video since they have thumbnails
+        videos_urls = sorted(videos_urls, key=lambda u: 'video.sina.com' in u)
         player_url = videos_urls[-1]
-        m_sina = re.match(r'https?://video.sina.com.cn/v/b/(\d+)-\d+.html', player_url)
+        m_sina = re.match(r'https?://video\.sina\.com\.cn/v/b/(\d+)-\d+\.html',
+            player_url)
         if m_sina is not None:
             self.to_screen('Sina video detected')
             sina_id = m_sina.group(1)
             player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
         return self.url_result(player_url)
-
index 9a6bb0c768a046e96bac0aa3dd39875821119e83..c27dda9440e62274e13b9359f24c2a909516b4bc 100644 (file)
@@ -3,19 +3,34 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from .youtube import YoutubeIE
 
 
 class WimpIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?wimp\.com/([^/]+)/'
-    _TEST = {
-        'url': 'http://www.wimp.com/deerfence/',
-        'file': 'deerfence.flv',
-        'md5': '8b215e2e0168c6081a1cf84b2846a2b5',
+    _VALID_URL = r'http://(?:www\.)?wimp\.com/([^/]+)/'
+    _TESTS = [{
+        'url': 'http://www.wimp.com/maruexhausted/',
+        'md5': 'f1acced123ecb28d9bb79f2479f2b6a1',
         'info_dict': {
-            "title": "Watch Till End: Herd of deer jump over a fence.",
-            "description": "These deer look as fluid as running water when they jump over this fence as a herd. This video is one that needs to be watched until the very end for the true majesty to be witnessed, but once it comes, it's sure to take your breath away.",
+            'id': 'maruexhausted',
+            'ext': 'flv',
+            'title': 'Maru is exhausted.',
+            'description': 'md5:57e099e857c0a4ea312542b684a869b8',
         }
-    }
+    }, {
+        # youtube video
+        'url': 'http://www.wimp.com/clowncar/',
+        'info_dict': {
+            'id': 'cG4CEr2aiSg',
+            'ext': 'mp4',
+            'title': 'Basset hound clown car...incredible!',
+            'description': 'md5:8d228485e0719898c017203f900b3a35',
+            'uploader': 'Gretchen Hoey',
+            'uploader_id': 'gretchenandjeff1',
+            'upload_date': '20140303',
+        },
+        'add_ie': ['Youtube'],
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
@@ -23,6 +38,13 @@ class WimpIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         video_url = self._search_regex(
             r's1\.addVariable\("file",\s*"([^"]+)"\);', webpage, 'video URL')
+        if YoutubeIE.suitable(video_url):
+            self.to_screen('Found YouTube video')
+            return {
+                '_type': 'url',
+                'url': video_url,
+                'ie_key': YoutubeIE.ie_key(),
+            }
 
         return {
             'id': video_id,
index 3237596a3ace9796001f8ab78921ca9b6c84d2d1..4e89acd81bbb5ddfb97b1da3381b1b9f25873c96 100644 (file)
@@ -1,3 +1,5 @@
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
@@ -7,14 +9,14 @@ class WorldStarHipHopIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
     _TEST = {
         "url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO",
-        "file": "wshh6a7q1ny0G34ZwuIO.mp4",
         "md5": "9d04de741161603bf7071bbf4e883186",
         "info_dict": {
+            "id": "wshh6a7q1ny0G34ZwuIO",
+            "ext": "mp4",
             "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!"
         }
     }
 
-
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
         video_id = m.group('id')
@@ -22,42 +24,33 @@ class WorldStarHipHopIE(InfoExtractor):
         webpage_src = self._download_webpage(url, video_id)
 
         m_vevo_id = re.search(r'videoId=(.*?)&amp?',
-            webpage_src)
-        
+                              webpage_src)
         if m_vevo_id is not None:
-            self.to_screen(u'Vevo video detected:')
             return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
 
-        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
-            webpage_src, u'video URL')
+        video_url = self._search_regex(
+            r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL')
 
         if 'youtube' in video_url:
-            self.to_screen(u'Youtube video detected:')
             return self.url_result(video_url, ie='Youtube')
 
-        if 'mp4' in video_url:
-            ext = 'mp4'
-        else:
-            ext = 'flv'
-
-        video_title = self._html_search_regex(r"<title>(.*)</title>",
-            webpage_src, u'title')
+        video_title = self._html_search_regex(
+            r"<title>(.*)</title>", webpage_src, 'title')
 
         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
-            webpage_src, u'thumbnail', fatal=False)
-
+        thumbnail = self._html_search_regex(
+            r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail',
+            fatal=False)
         if not thumbnail:
             _title = r"""candytitles.*>(.*)</span>"""
             mobj = re.search(_title, webpage_src)
             if mobj is not None:
                 video_title = mobj.group(1)
 
-        results = [{
-                    'id': video_id,
-                    'url' : video_url,
-                    'title' : video_title,
-                    'thumbnail' : thumbnail,
-                    'ext' : ext,
-                    }]
-        return results
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'thumbnail': thumbnail,
+        }
+
diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py
new file mode 100644 (file)
index 0000000..71bd7c4
--- /dev/null
@@ -0,0 +1,50 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+)
+
+
+class XBefIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking',
+        'md5': 'a478b565baff61634a98f5e5338be995',
+        'info_dict': {
+            'id': '5119',
+            'ext': 'mp4',
+            'title': 'md5:7358a9faef8b7b57acda7c04816f170e',
+            'age_limit': 18,
+            'thumbnail': 're:^http://.*\.jpg',
+        }
+    }
+
+    def _real_extract(self, url):
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(
+            r'<h1[^>]*>(.*?)</h1>', webpage, 'title')
+
+        config_url_enc = self._download_webpage(
+            'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id,
+            note='Retrieving config URL')
+        config_url = compat_urllib_parse.unquote(config_url_enc)
+        config = self._download_xml(
+            config_url, video_id, note='Retrieving config')
+
+        video_url = config.find('./file').text
+        thumbnail = config.find('./image').text
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
+
index f6c515f7f8a5ad9b038b7f417800c3c833fb8ac3..5374495f9b08f4d13fd7552fd612c19339b99e54 100644 (file)
@@ -4,51 +4,51 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_urllib_parse,
     ExtractorError,
+    unified_strdate,
+    str_to_int,
+    int_or_none,
+    parse_duration,
 )
 
 
 class XHamsterIE(InfoExtractor):
     """Information Extractor for xHamster"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
-    _TESTS = [{
-        'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
-        'file': '1509445.mp4',
-        'md5': '8281348b8d3c53d39fffb377d24eac4e',
-        'info_dict': {
-            "upload_date": "20121014",
-            "uploader_id": "Ruseful2011",
-            "title": "FemaleAgent Shy beauty takes the bait",
-            "age_limit": 18,
-        }
-    },
-    {
-        'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
-        'file': '2221348.flv',
-        'md5': 'e767b9475de189320f691f49c679c4c7',
-        'info_dict': {
-            "upload_date": "20130914",
-            "uploader_id": "jojo747400",
-            "title": "Britney Spears  Sexy Booty",
-            "age_limit": 18,
+    _VALID_URL = r'http://(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?'
+    _TESTS = [
+        {
+            'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
+            'md5': '8281348b8d3c53d39fffb377d24eac4e',
+            'info_dict': {
+                'id': '1509445',
+                'ext': 'mp4',
+                'title': 'FemaleAgent Shy beauty takes the bait',
+                'upload_date': '20121014',
+                'uploader_id': 'Ruseful2011',
+                'duration': 893,
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
+            'md5': '4cbd8d56708ecb4fb4124c23e4acb81a',
+            'info_dict': {
+                'id': '2221348',
+                'ext': 'mp4',
+                'title': 'Britney Spears  Sexy Booty',
+                'upload_date': '20130914',
+                'uploader_id': 'jojo747400',
+                'duration': 200,
+                'age_limit': 18,
+            }
         }
-    }]
+    ]
 
     def _real_extract(self,url):
         def extract_video_url(webpage):
-            mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
-            if mobj is None:
-                raise ExtractorError('Unable to extract media URL')
-            if len(mobj.group('server')) == 0:
-                return compat_urllib_parse.unquote(mobj.group('file'))
-            else:
-                return mobj.group('server')+'/key='+mobj.group('file')
-
-        def extract_mp4_video_url(webpage):
-            mp4 = re.search(r'<a href=\"(.+?)\" class=\"mp4Play\"',webpage)
+            mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage)
             if mp4 is None:
-                return None
+                raise ExtractorError('Unable to extract media URL')
             else:
                 return mp4.group(1)
 
@@ -62,50 +62,49 @@ class XHamsterIE(InfoExtractor):
         mrss_url = 'http://xhamster.com/movies/%s/%s.html' % (video_id, seo)
         webpage = self._download_webpage(mrss_url, video_id)
 
-        video_title = self._html_search_regex(
-            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
+        title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, 'title')
 
         # Only a few videos have an description
         mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
-        video_description = mobj.group(1) if mobj else None
+        description = mobj.group(1) if mobj else None
 
-        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
-        if mobj:
-            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-        else:
-            video_upload_date = None
-            self._downloader.report_warning('Unable to extract upload date')
+        upload_date = self._html_search_regex(r'hint=\'(\d{4}-\d{2}-\d{2}) \d{2}:\d{2}:\d{2} [A-Z]{3,4}\'',
+            webpage, 'upload date', fatal=False)
+        if upload_date:
+            upload_date = unified_strdate(upload_date)
 
-        video_uploader_id = self._html_search_regex(
-            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
+        uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
             webpage, 'uploader id', default='anonymous')
 
-        video_thumbnail = self._search_regex(
-            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
-            webpage, 'thumbnail', fatal=False)
+        thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False)
+
+        duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>',
+            webpage, 'duration', fatal=False))
+
+        view_count = self._html_search_regex(r'<span>Views:</span> ([^<]+)</div>', webpage, 'view count', fatal=False)
+        if view_count:
+            view_count = str_to_int(view_count)
+
+        mobj = re.search(r"hint='(?P<likecount>\d+) Likes / (?P<dislikecount>\d+) Dislikes'", webpage)
+        (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None)
+
+        mobj = re.search(r'</label>Comments \((?P<commentcount>\d+)\)</div>', webpage)
+        comment_count = mobj.group('commentcount') if mobj else 0
 
         age_limit = self._rta_search(webpage)
 
         hd = is_hd(webpage)
+
         video_url = extract_video_url(webpage)
         formats = [{
             'url': video_url,
             'format_id': 'hd' if hd else 'sd',
-            'preference': 0,
+            'preference': 1,
         }]
 
-        video_mp4_url = extract_mp4_video_url(webpage)
-        if video_mp4_url is not None:
-            formats.append({
-                'url': video_mp4_url,
-                'ext': 'mp4',
-                'format_id': 'mp4-hd' if hd else 'mp4-sd',
-                'preference': 1,
-            })
-
         if not hd:
-            webpage = self._download_webpage(
-                mrss_url + '?hd', video_id, note='Downloading HD webpage')
+            mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
+            webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
             if is_hd(webpage):
                 video_url = extract_video_url(webpage)
                 formats.append({
@@ -118,11 +117,16 @@ class XHamsterIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': video_title,
-            'formats': formats,
-            'description': video_description,
-            'upload_date': video_upload_date,
-            'uploader_id': video_uploader_id,
-            'thumbnail': video_thumbnail,
+            'title': title,
+            'description': description,
+            'upload_date': upload_date,
+            'uploader_id': uploader_id,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'view_count': view_count,
+            'like_count': int_or_none(like_count),
+            'dislike_count': int_or_none(dislike_count),
+            'comment_count': int_or_none(comment_count),
             'age_limit': age_limit,
+            'formats': formats,
         }
index 1177a4b14ec04748bebb5ab17db2f0a29c68ca5c..7a73b243080406b29b6c4a17d50a3e4d1ac023cb 100644 (file)
@@ -1,55 +1,49 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
 import re
 
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
-
-    ExtractorError,
 )
 
 
 class XNXXIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:video|www)\.xnxx\.com/video([0-9]+)/(.*)'
-    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
-    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
-    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
+    _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P<id>[0-9]+)/(.*)'
     _TEST = {
-        u'url': u'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
-        u'file': u'1135332.flv',
-        u'md5': u'0831677e2b4761795f68d417e0b7b445',
-        u'info_dict': {
-            u"title": u"lida \u00bb Naked Funny Actress  (5)",
-            u"age_limit": 18,
+        'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_',
+        'md5': '0831677e2b4761795f68d417e0b7b445',
+        'info_dict': {
+            'id': '1135332',
+            'ext': 'flv',
+            'title': 'lida » Naked Funny Actress  (5)',
+            'age_limit': 18,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        video_id = mobj.group(1)
+        video_id = mobj.group('id')
 
         # Get webpage content
         webpage = self._download_webpage(url, video_id)
 
-        video_url = self._search_regex(self.VIDEO_URL_RE,
-            webpage, u'video URL')
+        video_url = self._search_regex(r'flv_url=(.*?)&amp;',
+            webpage, 'video URL')
         video_url = compat_urllib_parse.unquote(video_url)
 
-        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
-            webpage, u'title')
+        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM',
+            webpage, 'title')
 
-        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
-            webpage, u'thumbnail', fatal=False)
+        video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&amp;',
+            webpage, 'thumbnail', fatal=False)
 
-        return [{
+        return {
             'id': video_id,
             'url': video_url,
-            'uploader': None,
-            'upload_date': None,
             'title': video_title,
             'ext': 'flv',
             'thumbnail': video_thumbnail,
-            'description': None,
             'age_limit': 18,
-        }]
+        }
index 982619922d8ef5fdd0f260902b0253f5dce024dd..b293e2665b81b9a486bff2ec91b410da4a6d9998 100644 (file)
@@ -1,25 +1,29 @@
 from __future__ import unicode_literals
 
-import os
 import re
+import json
 
 from .common import InfoExtractor
 from ..utils import (
-    compat_urllib_parse_urlparse,
     compat_urllib_request,
+    parse_duration,
+    str_to_int,
 )
 
+
 class XTubeIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))'
     _TEST = {
         'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_',
-        'file': 'kVTUy_G222_.mp4',
         'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab',
         'info_dict': {
-            "title": "strange erotica",
-            "description": "surreal gay themed erotica...almost an ET kind of thing",
-            "uploader": "greenshowers",
-            "age_limit": 18,
+            'id': 'kVTUy_G222_',
+            'ext': 'mp4',
+            'title': 'strange erotica',
+            'description': 'surreal gay themed erotica...almost an ET kind of thing',
+            'uploader': 'greenshowers',
+            'duration': 450,
+            'age_limit': 18,
         }
     }
 
@@ -32,25 +36,79 @@ class XTubeIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1')
         webpage = self._download_webpage(req, video_id)
 
-        video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title')
-        video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
-        video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False)
-        video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/')
-        path = compat_urllib_parse_urlparse(video_url).path
-        extension = os.path.splitext(path)[1][1:]
-        format = path.split('/')[5].split('_')[:2]
-        format[0] += 'p'
-        format[1] += 'k'
-        format = "-".join(format)
+        video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title')
+        video_uploader = self._html_search_regex(
+            r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False)
+        video_description = self._html_search_regex(
+            r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False)
+        duration = parse_duration(self._html_search_regex(
+            r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False))
+        view_count = self._html_search_regex(
+            r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False)
+        if view_count:
+            view_count = str_to_int(view_count)
+        comment_count = self._html_search_regex(
+            r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False)
+        if comment_count:
+            comment_count = str_to_int(comment_count)
+
+        player_quality_option = json.loads(self._html_search_regex(
+            r'playerQualityOption = ({.+?});', webpage, 'player quality option'))
+
+        QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080']
+        formats = [
+            {
+                'url': furl,
+                'format_id': format_id,
+                'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1,
+            } for format_id, furl in player_quality_option.items()
+        ]
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
             'title': video_title,
             'uploader': video_uploader,
             'description': video_description,
-            'url': video_url,
-            'ext': extension,
-            'format': format,
-            'format_id': format,
+            'duration': duration,
+            'view_count': view_count,
+            'comment_count': comment_count,
+            'formats': formats,
             'age_limit': 18,
         }
+
+class XTubeUserIE(InfoExtractor):
+    IE_DESC = 'XTube user profile'
+    _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        username = mobj.group('username')
+
+        profile_page = self._download_webpage(
+            url, username, note='Retrieving profile page')
+
+        video_count = int(self._search_regex(
+            r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page,
+            'video count'))
+
+        PAGE_SIZE = 25
+        urls = []
+        page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
+        for n in range(1, page_count + 1):
+            lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username)
+            lpage = self._download_webpage(
+                lpage_url, username,
+                note='Downloading page %d/%d' % (n, page_count))
+            urls.extend(
+                re.findall(r'addthis:url="([^"]+)"', lpage))
+
+        return {
+            '_type': 'playlist',
+            'id': username,
+            'entries': [{
+                '_type': 'url',
+                'url': eurl,
+                'ie_key': 'XTube',
+            } for eurl in urls]
+        }
index 85e99e1b02b8ab7e7647d6c91dbad08f4827d5f3..7e00448246beb9ab9b7c25f33b05e6f4f1bb8283 100644 (file)
@@ -5,18 +5,21 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_parse,
+    ExtractorError,
+    clean_html,
 )
 
 
 class XVideosIE(InfoExtractor):
     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
     _TEST = {
-        'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1',
-        'file': '939581.flv',
-        'md5': '1d0c835822f0a71a7bf011855db929d0',
+        'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
+        'md5': '4b46ae6ea5e6e9086e714d883313c0c9',
         'info_dict': {
-            "title": "Funny Porns By >>>>S<<<<<< -1",
-            "age_limit": 18,
+            'id': '4588838',
+            'ext': 'flv',
+            'title': 'Biker Takes his Girl',
+            'age_limit': 18,
         }
     }
 
@@ -28,6 +31,10 @@ class XVideosIE(InfoExtractor):
 
         self.report_extraction(video_id)
 
+        mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
+        if mobj:
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True)
+
         # Extract video URL
         video_url = compat_urllib_parse.unquote(
             self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL'))
index d92d14f718158f285b2696944afb155fdd664538..d84be25620eecb944845b74299510067772c583f 100644 (file)
@@ -14,27 +14,39 @@ from ..utils import (
 
 
 class YahooIE(InfoExtractor):
-    IE_DESC = 'Yahoo screen'
-    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
+    IE_DESC = 'Yahoo screen and movies'
+    _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
     _TESTS = [
         {
             'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
-            'file': '214727115.mp4',
             'md5': '4962b075c08be8690a922ee026d05e69',
             'info_dict': {
+                'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
+                'ext': 'mp4',
                 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
                 'description': 'Julian and Travis watch Julian Smith',
             },
         },
         {
             'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
-            'file': '103000935.mp4',
             'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
             'info_dict': {
+                'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
+                'ext': 'mp4',
                 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
                 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
             },
         },
+        {
+            'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html',
+            'md5': '410b7104aa9893b765bc22787a22f3d9',
+            'info_dict': {
+                'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845',
+                'ext': 'mp4',
+                'title': 'The World Loves Spider-Man',
+                'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
+            }
+        }
     ]
 
     def _real_extract(self, url):
@@ -42,16 +54,25 @@ class YahooIE(InfoExtractor):
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
 
-        items_json = self._search_regex(r'mediaItems: ({.*?})$',
-            webpage, 'items', flags=re.MULTILINE)
-        items = json.loads(items_json)
-        info = items['mediaItems']['query']['results']['mediaObj'][0]
-        # The 'meta' field is not always in the video webpage, we request it
-        # from another page
-        long_id = info['id']
-        return self._get_info(long_id, video_id)
-
-    def _get_info(self, long_id, video_id):
+        items_json = self._search_regex(
+            r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
+            default=None)
+        if items_json is None:
+            CONTENT_ID_REGEXES = [
+                r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
+                r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"'
+            ]
+            long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
+            video_id = long_id
+        else:
+            items = json.loads(items_json)
+            info = items['mediaItems']['query']['results']['mediaObj'][0]
+            # The 'meta' field is not always in the video webpage, we request it
+            # from another page
+            long_id = info['id']
+        return self._get_info(long_id, video_id, webpage)
+
+    def _get_info(self, long_id, video_id, webpage):
         query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
                  ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
                  ' AND protocol="http"' % long_id)
@@ -60,10 +81,9 @@ class YahooIE(InfoExtractor):
             'env': 'prod',
             'format': 'json',
         })
-        query_result_json = self._download_webpage(
+        query_result = self._download_json(
             'http://video.query.yahoo.com/v1/public/yql?' + data,
             video_id, 'Downloading video info')
-        query_result = json.loads(query_result_json)
         info = query_result['query']['results']['mediaObj'][0]
         meta = info['meta']
 
@@ -86,7 +106,6 @@ class YahooIE(InfoExtractor):
             else:
                 format_url = compat_urlparse.urljoin(host, path)
                 format_info['url'] = format_url
-                
             formats.append(format_info)
 
         self._sort_formats(formats)
@@ -96,7 +115,7 @@ class YahooIE(InfoExtractor):
             'title': meta['title'],
             'formats': formats,
             'description': clean_html(meta['description']),
-            'thumbnail': meta['thumbnail'],
+            'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
         }
 
 
@@ -104,7 +123,7 @@ class YahooNewsIE(YahooIE):
     IE_NAME = 'yahoo:news'
     _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
         'md5': '67010fdf3a08d290e060a4dd96baa07b',
         'info_dict': {
@@ -113,17 +132,14 @@ class YahooNewsIE(YahooIE):
             'title': 'China Moses Is Crazy About the Blues',
             'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
         },
-    }
-
-    # Overwrite YahooIE properties we don't want
-    _TESTS = []
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
         webpage = self._download_webpage(url, video_id)
         long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
-        return self._get_info(long_id, video_id)
+        return self._get_info(long_id, video_id, webpage)
 
 
 class YahooSearchIE(SearchInfoExtractor):
@@ -134,27 +150,25 @@ class YahooSearchIE(SearchInfoExtractor):
 
     def _get_n_results(self, query, n):
         """Get a specified number of results for a query"""
-
-        res = {
-            '_type': 'playlist',
-            'id': query,
-            'entries': []
-        }
-        for pagenum in itertools.count(0): 
+        entries = []
+        for pagenum in itertools.count(0):
             result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
-            webpage = self._download_webpage(result_url, query,
-                                             note='Downloading results page '+str(pagenum+1))
-            info = json.loads(webpage)
+            info = self._download_json(result_url, query,
+                note='Downloading results page '+str(pagenum+1))
             m = info['m']
             results = info['results']
 
             for (i, r) in enumerate(results):
-                if (pagenum * 30) +i >= n:
+                if (pagenum * 30) + i >= n:
                     break
                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
-                res['entries'].append(e)
-            if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):
+                entries.append(e)
+            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
                 break
 
-        return res
+        return {
+            '_type': 'playlist',
+            'id': query,
+            'entries': entries,
+        }
index 77ad423c44b38af655fc14a8918dfbcf677ca936..d456c4da522d689ac7bcbd33c5f8a3b1204c3b00 100644 (file)
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+
 import json
 import re
 import sys
@@ -17,24 +20,25 @@ from ..aes import (
 
 
 class YouPornIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
+    _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))'
     _TEST = {
-        u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
-        u'file': u'505835.mp4',
-        u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89',
-        u'info_dict': {
-            u"upload_date": u"20101221",
-            u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?",
-            u"uploader": u"Ask Dan And Jennifer",
-            u"title": u"Sex Ed: Is It Safe To Masturbate Daily?",
-            u"age_limit": 18,
+        'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/',
+        'md5': '71ec5fcfddacf80f495efa8b6a8d9a89',
+        'info_dict': {
+            'id': '505835',
+            'ext': 'mp4',
+            'upload_date': '20101221',
+            'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?',
+            'uploader': 'Ask Dan And Jennifer',
+            'title': 'Sex Ed: Is It Safe To Masturbate Daily?',
+            'age_limit': 18,
         }
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
-        url = 'http://www.' + mobj.group('url')
+        url = mobj.group('proto') + 'www.' + mobj.group('url')
 
         req = compat_urllib_request.Request(url)
         req.add_header('Cookie', 'age_verified=1')
@@ -42,7 +46,7 @@ class YouPornIE(InfoExtractor):
         age_limit = self._rta_search(webpage)
 
         # Get JSON parameters
-        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters')
         try:
             params = json.loads(json_params)
         except:
@@ -61,7 +65,7 @@ class YouPornIE(InfoExtractor):
         # Get all of the links from the page
         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
-            webpage, u'download list').strip()
+            webpage, 'download list').strip()
         LINK_RE = r'<a href="([^"]+)">'
         links = re.findall(LINK_RE, download_list_html)
 
@@ -86,7 +90,7 @@ class YouPornIE(InfoExtractor):
             resolution = format_parts[0]
             height = int(resolution[:-len('p')])
             bitrate = int(format_parts[1][:-len('k')])
-            format = u'-'.join(format_parts) + u'-' + dn
+            format = '-'.join(format_parts) + '-' + dn
 
             formats.append({
                 'url': video_url,
index a8103684301eef0711fdb1f28bd15dcb40c6430b..7c50881c4453eaff4ac69776fcc2dc94feef8d31 100644 (file)
@@ -7,13 +7,13 @@ import itertools
 import json
 import os.path
 import re
-import string
 import struct
 import traceback
 import zlib
 
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
+from ..jsinterp import JSInterpreter
 from ..utils import (
     compat_chr,
     compat_parse_qs,
@@ -29,7 +29,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     PagedList,
-    RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
     orderedSet,
@@ -138,19 +137,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                             (?:www\.)?deturl\.com/www\.youtube\.com/|
                             (?:www\.)?pwnyoutube\.com/|
+                            (?:www\.)?yourepeat\.com/|
                             tube\.majestyc\.net/|
                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                          (?:                                                  # the various things that can precede the ID:
                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                              |(?:                                             # or the v= param in all its forms
-                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                  v=
                              )
                          ))
                          |youtu\.be/                                          # just youtu.be/xxxx
+                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                          )
                      )?                                                       # all until now is optional -> you can pass the naked ID
                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
@@ -176,32 +177,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
 
         # 3d videos
-        '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
-        '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
-        '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
-        '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20},
-        '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20},
-        '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20},
-        '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20},
+        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
+        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
+        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
+        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
+        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
+        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
+        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 
         # Apple HTTP Live Streaming
-        '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
-        '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10},
-        '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10},
-        '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10},
-        '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10},
-        '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10},
-        '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
+        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
+        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
+        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
+        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
+        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
+        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
+        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 
         # DASH mp4 video
-        '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
-        '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
-        '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
-        '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
-        '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
-        '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
-        '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
-        '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 
         # Dash mp4 audio
         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
@@ -209,23 +210,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 
         # Dash webm
-        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
-        '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
-        '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
-        '244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
-        '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
-        '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
-        '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40},
-        '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40},
+        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 
         # Dash webm audio
-        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50},
-        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50},
+        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
+        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 
         # RTMP (unnamed)
         '_rtmp': {'protocol': 'rtmp'},
@@ -241,7 +243,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 u"uploader": u"Philipp Hagemeister",
                 u"uploader_id": u"phihag",
                 u"upload_date": u"20121002",
-                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
+                u"categories": [u'Science & Technology'],
             }
         },
         {
@@ -251,7 +254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             u"info_dict": {
                 u"upload_date": u"20120506",
                 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
-                u"description": u"md5:5b292926389560516e384ac437c0ec07",
+                u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
                 u"uploader": u"Icona Pop",
                 u"uploader_id": u"IconaPop"
             }
@@ -296,6 +299,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                 u"format": "141",
             },
         },
+        # DASH manifest with encrypted signature
+        {
+            u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+            u'info_dict': {
+                u'id': u'IB3lcPjvWLA',
+                u'ext': u'm4a',
+                u'title': u'Afrojack - The Spark ft. Spree Wilson',
+                u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
+                u'uploader': u'AfrojackVEVO',
+                u'uploader_id': u'AfrojackVEVO',
+                u'upload_date': u'20131011',
+            },
+            u"params": {
+                u'youtube_include_dash_manifest': True,
+                u'format': '141',
+            },
+        },
     ]
 
 
@@ -421,113 +441,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
     def _parse_sig_js(self, jscode):
         funcname = self._search_regex(
             r'signature=([a-zA-Z]+)', jscode,
-            u'Initial JS player signature function name')
-
-        functions = {}
-
-        def argidx(varname):
-            return string.lowercase.index(varname)
-
-        def interpret_statement(stmt, local_vars, allow_recursion=20):
-            if allow_recursion < 0:
-                raise ExtractorError(u'Recursion limit reached')
-
-            if stmt.startswith(u'var '):
-                stmt = stmt[len(u'var '):]
-            ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
-                             r'=(?P<expr>.*)$', stmt)
-            if ass_m:
-                if ass_m.groupdict().get('index'):
-                    def assign(val):
-                        lvar = local_vars[ass_m.group('out')]
-                        idx = interpret_expression(ass_m.group('index'),
-                                                   local_vars, allow_recursion)
-                        assert isinstance(idx, int)
-                        lvar[idx] = val
-                        return val
-                    expr = ass_m.group('expr')
-                else:
-                    def assign(val):
-                        local_vars[ass_m.group('out')] = val
-                        return val
-                    expr = ass_m.group('expr')
-            elif stmt.startswith(u'return '):
-                assign = lambda v: v
-                expr = stmt[len(u'return '):]
-            else:
-                raise ExtractorError(
-                    u'Cannot determine left side of statement in %r' % stmt)
-
-            v = interpret_expression(expr, local_vars, allow_recursion)
-            return assign(v)
-
-        def interpret_expression(expr, local_vars, allow_recursion):
-            if expr.isdigit():
-                return int(expr)
-
-            if expr.isalpha():
-                return local_vars[expr]
-
-            m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
-            if m:
-                member = m.group('member')
-                val = local_vars[m.group('in')]
-                if member == 'split("")':
-                    return list(val)
-                if member == 'join("")':
-                    return u''.join(val)
-                if member == 'length':
-                    return len(val)
-                if member == 'reverse()':
-                    return val[::-1]
-                slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
-                if slice_m:
-                    idx = interpret_expression(
-                        slice_m.group('idx'), local_vars, allow_recursion-1)
-                    return val[idx:]
-
-            m = re.match(
-                r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
-            if m:
-                val = local_vars[m.group('in')]
-                idx = interpret_expression(m.group('idx'), local_vars,
-                                           allow_recursion-1)
-                return val[idx]
-
-            m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
-            if m:
-                a = interpret_expression(m.group('a'),
-                                         local_vars, allow_recursion)
-                b = interpret_expression(m.group('b'),
-                                         local_vars, allow_recursion)
-                return a % b
-
-            m = re.match(
-                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
-            if m:
-                fname = m.group('func')
-                if fname not in functions:
-                    functions[fname] = extract_function(fname)
-                argvals = [int(v) if v.isdigit() else local_vars[v]
-                           for v in m.group('args').split(',')]
-                return functions[fname](argvals)
-            raise ExtractorError(u'Unsupported JS expression %r' % expr)
-
-        def extract_function(funcname):
-            func_m = re.search(
-                r'function ' + re.escape(funcname) +
-                r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
-                jscode)
-            argnames = func_m.group('args').split(',')
-
-            def resf(args):
-                local_vars = dict(zip(argnames, args))
-                for stmt in func_m.group('code').split(';'):
-                    res = interpret_statement(stmt, local_vars)
-                return res
-            return resf
-
-        initial_function = extract_function(funcname)
+             u'Initial JS player signature function name')
+
+        jsi = JSInterpreter(jscode)
+        initial_function = jsi.extract_function(funcname)
         return lambda s: initial_function([s])
 
     def _parse_sig_swf(self, file_contents):
@@ -1113,14 +1030,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.')
 
     def _real_extract(self, url):
+        proto = (
+            u'http' if self._downloader.params.get('prefer_insecure', False)
+            else u'https')
+
         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
         mobj = re.search(self._NEXT_URL_RE, url)
         if mobj:
-            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+            url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
         video_id = self.extract_id(url)
 
         # Get video webpage
-        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+        url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
         video_webpage = self._download_webpage(url, video_id)
 
         # Attempt to extract SWF player URL
@@ -1145,7 +1066,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                                                   'asv': 3,
                                                   'sts':'1588',
                                                   })
-            video_info_url = 'https://www.youtube.com/get_video_info?' + data
+            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
             video_info_webpage = self._download_webpage(video_info_url, video_id,
                                     note=False,
                                     errnote='unable to download video info webpage')
@@ -1153,7 +1074,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         else:
             age_gate = False
             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+                video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                         % (video_id, el_type))
                 video_info_webpage = self._download_webpage(video_info_url, video_id,
                                         note=False,
@@ -1163,9 +1084,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                     break
         if 'token' not in video_info:
             if 'reason' in video_info:
-                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
+                raise ExtractorError(
+                    u'YouTube said: %s' % video_info['reason'][0],
+                    expected=True, video_id=video_id)
             else:
-                raise ExtractorError(u'"token" parameter not in video info for unknown reason')
+                raise ExtractorError(
+                    u'"token" parameter not in video info for unknown reason',
+                    video_id=video_id)
 
         if 'view_count' in video_info:
             view_count = int(video_info['view_count'][0])
@@ -1194,7 +1119,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
         # title
         if 'title' in video_info:
-            video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
+            video_title = video_info['title'][0]
         else:
             self._downloader.report_warning(u'Unable to extract video title')
             video_title = u'_'
@@ -1213,11 +1138,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
         # upload date
         upload_date = None
-        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
+        mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
+        if mobj is None:
+            mobj = re.search(
+                r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
+                video_webpage)
         if mobj is not None:
             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
             upload_date = unified_strdate(upload_date)
 
+        m_cat_container = get_element_by_id("eow-category", video_webpage)
+        if m_cat_container:
+            category = self._html_search_regex(
+                r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
+                default=None)
+            video_categories = None if category is None else [category]
+        else:
+            video_categories = None
+
         # description
         video_description = get_element_by_id("eow-description", video_webpage)
         if video_description:
@@ -1268,11 +1206,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
         # Decide which formats to download
         try:
-            mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
+            mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
             if not mobj:
                 raise ValueError('Could not find vevo ID')
-            info = json.loads(mobj.group(1))
-            args = info['args']
+            json_code = uppercase_escape(mobj.group(1))
+            ytplayer_config = json.loads(json_code)
+            args = ytplayer_config['args']
             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
             # this signatures are encrypted
             if 'url_encoded_fmt_stream_map' not in args:
@@ -1365,12 +1304,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 
         # Look for the DASH manifest
-        dash_manifest_url_lst = video_info.get('dashmpd')
-        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
-                self._downloader.params.get('youtube_include_dash_manifest', False)):
+        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
             try:
+                # The DASH manifest used needs to be the one from the original video_webpage.
+                # The one found in get_video_info seems to be using different signatures.
+                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
+                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
+                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
+                if age_gate:
+                    dash_manifest_url = video_info.get('dashmpd')[0]
+                else:
+                    dash_manifest_url = ytplayer_config['args']['dashmpd']
+                def decrypt_sig(mobj):
+                    s = mobj.group(1)
+                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+                    return '/signature/%s' % dec_s
+                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                 dash_doc = self._download_xml(
-                    dash_manifest_url_lst[0], video_id,
+                    dash_manifest_url, video_id,
                     note=u'Downloading DASH manifest',
                     errnote=u'Could not download DASH manifest')
                 for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
@@ -1411,11 +1362,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             'title':        video_title,
             'thumbnail':    video_thumbnail,
             'description':  video_description,
+            'categories':   video_categories,
             'subtitles':    video_subtitles,
             'duration':     video_duration,
             'age_limit':    18 if age_gate else 0,
             'annotations':  video_annotations,
-            'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+            'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
             'view_count':   view_count,
             'like_count': like_count,
             'dislike_count': dislike_count,
@@ -1442,9 +1394,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                      |
                         ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                      )"""
-    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
     _MORE_PAGES_INDICATOR = r'data-link-type="next"'
-    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
     IE_NAME = u'youtube:playlist'
 
     def _real_initialize(self):
@@ -1459,11 +1411,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         # the id of the playlist is just 'RD' + video_id
         url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
         webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
-        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
-            get_element_by_attribute('class', 'title ', webpage))
+        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+        title_span = (search_title('playlist-title') or
+            search_title('title long-title') or search_title('title'))
         title = clean_html(title_span)
-        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
-        ids = orderedSet(re.findall(video_re, webpage))
+        video_re = r'''(?x)data-video-username=".*?".*?
+                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
+        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
         url_results = self._ids_to_results(ids)
 
         return self.playlist_result(url_results, playlist_id, title)
@@ -1483,7 +1437,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                 self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                 return self.url_result(video_id, 'Youtube', video_id=video_id)
             else:
-                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+                self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
 
         if playlist_id.startswith('RD'):
             # Mixes require a custom extraction process
@@ -1492,29 +1446,41 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
             raise ExtractorError(u'For downloading YouTube.com top lists, use '
                 u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
 
+        url = self._TEMPLATE_URL % playlist_id
+        page = self._download_webpage(url, playlist_id)
+        more_widget_html = content_html = page
+
+        # Check if the playlist exists or is private
+        if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None:
+            raise ExtractorError(
+                u'The playlist doesn\'t exist or is private, use --username or '
+                '--netrc to access it.',
+                expected=True)
+
         # Extract the video ids from the playlist pages
         ids = []
 
         for page_num in itertools.count(1):
-            url = self._TEMPLATE_URL % (playlist_id, page_num)
-            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
-            matches = re.finditer(self._VIDEO_RE, page)
+            matches = re.finditer(self._VIDEO_RE, content_html)
             # We remove the duplicates and the link with index 0
             # (it's not the first video of the playlist)
             new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
             ids.extend(new_ids)
 
-            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
                 break
 
-        try:
-            playlist_title = self._og_search_title(page)
-        except RegexNotFoundError:
-            self.report_warning(
-                u'Playlist page is missing OpenGraph title, falling back ...',
-                playlist_id)
-            playlist_title = self._html_search_regex(
-                r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            more_widget_html = more['load_more_widget_html']
+
+        playlist_title = self._html_search_regex(
+            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
+            page, u'title')
 
         url_results = self._ids_to_results(ids)
         return self.playlist_result(url_results, playlist_id, playlist_title)
@@ -1610,7 +1576,7 @@ class YoutubeChannelIE(InfoExtractor):
 
 class YoutubeUserIE(InfoExtractor):
     IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
     _GDATA_PAGE_SIZE = 50
     _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
@@ -1672,7 +1638,7 @@ class YoutubeUserIE(InfoExtractor):
 
 class YoutubeSearchIE(SearchInfoExtractor):
     IE_DESC = u'YouTube.com searches'
-    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
+    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
     _MAX_RESULTS = 1000
     IE_NAME = u'youtube:search'
     _SEARCH_KEY = 'ytsearch'
@@ -1683,9 +1649,12 @@ class YoutubeSearchIE(SearchInfoExtractor):
         video_ids = []
         pagenum = 0
         limit = n
+        PAGE_SIZE = 50
 
-        while (50 * pagenum) < limit:
-            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
+        while (PAGE_SIZE * pagenum) < limit:
+            result_url = self._API_URL % (
+                compat_urllib_parse.quote_plus(query.encode('utf-8')),
+                (PAGE_SIZE * pagenum) + 1)
             data_json = self._download_webpage(
                 result_url, video_id=u'query "%s"' % query,
                 note=u'Downloading page %s' % (pagenum + 1),
@@ -1709,12 +1678,50 @@ class YoutubeSearchIE(SearchInfoExtractor):
                   for video_id in video_ids]
         return self.playlist_result(videos, query)
 
+
 class YoutubeSearchDateIE(YoutubeSearchIE):
     IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
     _SEARCH_KEY = 'ytsearchdate'
     IE_DESC = u'YouTube.com searches, newest videos first'
 
+
+class YoutubeSearchURLIE(InfoExtractor):
+    IE_DESC = u'YouTube.com search URLs'
+    IE_NAME = u'youtube:search_url'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        query = compat_urllib_parse.unquote_plus(mobj.group('query'))
+
+        webpage = self._download_webpage(url, query)
+        result_code = self._search_regex(
+            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
+
+        part_codes = re.findall(
+            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
+        entries = []
+        for part_code in part_codes:
+            part_title = self._html_search_regex(
+                r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False)
+            part_url_snippet = self._html_search_regex(
+                r'(?s)href="([^"]+)"', part_code, 'item URL')
+            part_url = compat_urlparse.urljoin(
+                'https://www.youtube.com/', part_url_snippet)
+            entries.append({
+                '_type': 'url',
+                'url': part_url,
+                'title': part_title,
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': query,
+        }
+
+
 class YoutubeShowIE(InfoExtractor):
     IE_DESC = u'YouTube.com (multi-season) shows'
     _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
@@ -1758,23 +1765,25 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
         feed_entries = []
         paging = 0
         for i in itertools.count(1):
-            info = self._download_webpage(self._FEED_TEMPLATE % paging,
+            info = self._download_json(self._FEED_TEMPLATE % paging,
                                           u'%s feed' % self._FEED_NAME,
                                           u'Downloading page %s' % i)
-            info = json.loads(info)
-            feed_html = info['feed_html']
+            feed_html = info.get('feed_html') or info.get('content_html')
             m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
             ids = orderedSet(m.group(1) for m in m_ids)
             feed_entries.extend(
                 self.url_result(video_id, 'Youtube', video_id=video_id)
                 for video_id in ids)
-            if info['paging'] is None:
+            mobj = re.search(
+                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
+                feed_html)
+            if mobj is None:
                 break
-            paging = info['paging']
+            paging = mobj.group('paging')
         return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
 
 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
-    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
     _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
     _FEED_NAME = 'subscriptions'
     _PLAYLIST_TITLE = u'Youtube Subscriptions'
@@ -1815,7 +1824,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
     IE_NAME = 'youtube:truncated_url'
     IE_DESC = False  # Do not list
     _VALID_URL = r'''(?x)
-        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
+        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
         (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
     '''
 
index 829f002cf02f9c908a5057ab3c6b20f520e2f2ea..3b1ac4e9f5246e268e0c0b49d64249196270e9d4 100644 (file)
@@ -1,4 +1,5 @@
 # coding: utf-8
+from __future__ import unicode_literals
 
 import re
 
@@ -13,52 +14,42 @@ class ZDFIE(InfoExtractor):
     _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
 
     _TEST = {
-        u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
-        u"file": u"2037704.webm",
-        u"info_dict": {
-            u"upload_date": u"20131127",
-            u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
-            u"uploader": u"spezial",
-            u"title": u"ZDFspezial - Ende des Machtpokers"
+        'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt',
+        'info_dict': {
+            'id': '2037704',
+            'ext': 'webm',
+            'title': 'ZDFspezial - Ende des Machtpokers',
+            'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".',
+            'duration': 1022,
+            'uploader': 'spezial',
+            'uploader_id': '225948',
+            'upload_date': '20131127',
         },
-        u"skip": u"Videos on ZDF.de are depublicised in short order",
+        'skip': 'Videos on ZDF.de are depublicised in short order',
     }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('video_id')
 
-        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
         doc = self._download_xml(
             xml_url, video_id,
-            note=u'Downloading video info',
-            errnote=u'Failed to download video info')
+            note='Downloading video info',
+            errnote='Failed to download video info')
 
         title = doc.find('.//information/title').text
         description = doc.find('.//information/detail').text
+        duration = int(doc.find('.//details/lengthSec').text)
         uploader_node = doc.find('.//details/originChannelTitle')
         uploader = None if uploader_node is None else uploader_node.text
-        duration_str = doc.find('.//details/length').text
-        duration_m = re.match(r'''(?x)^
-            (?P<hours>[0-9]{2})
-            :(?P<minutes>[0-9]{2})
-            :(?P<seconds>[0-9]{2})
-            (?:\.(?P<ms>[0-9]+)?)
-            ''', duration_str)
-        duration = (
-            (
-                (int(duration_m.group('hours')) * 60 * 60) +
-                (int(duration_m.group('minutes')) * 60) +
-                int(duration_m.group('seconds'))
-            )
-            if duration_m
-            else None
-        )
+        uploader_id_node = doc.find('.//details/originChannelId')
+        uploader_id = None if uploader_id_node is None else uploader_id_node.text
         upload_date = unified_strdate(doc.find('.//details/airtime').text)
 
         def xml_to_format(fnode):
             video_url = fnode.find('url').text
-            is_available = u'http://www.metafilegenerator' not in video_url
+            is_available = 'http://www.metafilegenerator' not in video_url
 
             format_id = fnode.attrib['basetype']
             format_m = re.match(r'''(?x)
@@ -71,22 +62,28 @@ class ZDFIE(InfoExtractor):
 
             quality = fnode.find('./quality').text
             abr = int(fnode.find('./audioBitrate').text) // 1000
-            vbr = int(fnode.find('./videoBitrate').text) // 1000
+            vbr_node = fnode.find('./videoBitrate')
+            vbr = None if vbr_node is None else int(vbr_node.text) // 1000
 
-            format_note = u''
+            width_node = fnode.find('./width')
+            width = None if width_node is None else int_or_none(width_node.text)
+            height_node = fnode.find('./height')
+            height = None if height_node is None else int_or_none(height_node.text)
+
+            format_note = ''
             if not format_note:
                 format_note = None
 
             return {
-                'format_id': format_id + u'-' + quality,
+                'format_id': format_id + '-' + quality,
                 'url': video_url,
                 'ext': ext,
                 'acodec': format_m.group('acodec'),
                 'vcodec': format_m.group('vcodec'),
                 'abr': abr,
                 'vbr': vbr,
-                'width': int_or_none(fnode.find('./width').text),
-                'height': int_or_none(fnode.find('./height').text),
+                'width': width,
+                'height': height,
                 'filesize': int_or_none(fnode.find('./filesize').text),
                 'format_note': format_note,
                 'protocol': proto,
@@ -103,9 +100,10 @@ class ZDFIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'formats': formats,
             'description': description,
-            'uploader': uploader,
             'duration': duration,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
             'upload_date': upload_date,
-        }
+            'formats': formats,
+        }
\ No newline at end of file
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
new file mode 100644 (file)
index 0000000..449482d
--- /dev/null
@@ -0,0 +1,116 @@
+from __future__ import unicode_literals
+
+import re
+
+from .utils import (
+    ExtractorError,
+)
+
+
+class JSInterpreter(object):
+    def __init__(self, code):
+        self.code = code
+        self._functions = {}
+
+    def interpret_statement(self, stmt, local_vars, allow_recursion=20):
+        if allow_recursion < 0:
+            raise ExtractorError('Recursion limit reached')
+
+        if stmt.startswith('var '):
+            stmt = stmt[len('var '):]
+        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
+                         r'=(?P<expr>.*)$', stmt)
+        if ass_m:
+            if ass_m.groupdict().get('index'):
+                def assign(val):
+                    lvar = local_vars[ass_m.group('out')]
+                    idx = self.interpret_expression(
+                        ass_m.group('index'), local_vars, allow_recursion)
+                    assert isinstance(idx, int)
+                    lvar[idx] = val
+                    return val
+                expr = ass_m.group('expr')
+            else:
+                def assign(val):
+                    local_vars[ass_m.group('out')] = val
+                    return val
+                expr = ass_m.group('expr')
+        elif stmt.startswith('return '):
+            assign = lambda v: v
+            expr = stmt[len('return '):]
+        else:
+            raise ExtractorError(
+                'Cannot determine left side of statement in %r' % stmt)
+
+        v = self.interpret_expression(expr, local_vars, allow_recursion)
+        return assign(v)
+
+    def interpret_expression(self, expr, local_vars, allow_recursion):
+        if expr.isdigit():
+            return int(expr)
+
+        if expr.isalpha():
+            return local_vars[expr]
+
+        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
+        if m:
+            member = m.group('member')
+            val = local_vars[m.group('in')]
+            if member == 'split("")':
+                return list(val)
+            if member == 'join("")':
+                return u''.join(val)
+            if member == 'length':
+                return len(val)
+            if member == 'reverse()':
+                return val[::-1]
+            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
+            if slice_m:
+                idx = self.interpret_expression(
+                    slice_m.group('idx'), local_vars, allow_recursion - 1)
+                return val[idx:]
+
+        m = re.match(
+            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(
+                m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+
+        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
+        if m:
+            a = self.interpret_expression(
+                m.group('a'), local_vars, allow_recursion)
+            b = self.interpret_expression(
+                m.group('b'), local_vars, allow_recursion)
+            return a % b
+
+        m = re.match(
+            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+        if m:
+            fname = m.group('func')
+            if fname not in self._functions:
+                self._functions[fname] = self.extract_function(fname)
+            argvals = [int(v) if v.isdigit() else local_vars[v]
+                       for v in m.group('args').split(',')]
+            return self._functions[fname](argvals)
+        raise ExtractorError('Unsupported JS expression %r' % expr)
+
+    def extract_function(self, funcname):
+        func_m = re.search(
+            (r'(?:function %s|%s\s*=\s*function)' % (
+                re.escape(funcname), re.escape(funcname))) +
+            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+            self.code)
+        if func_m is None:
+            raise ExtractorError('Could not find JS function %r' % funcname)
+        argnames = func_m.group('args').split(',')
+
+        def resf(args):
+            local_vars = dict(zip(argnames, args))
+            for stmt in func_m.group('code').split(';'):
+                res = self.interpret_statement(stmt, local_vars)
+            return res
+        return resf
+
index 7f19f717f20dc3f6b76c4bbea9f89e12d5cb9b10..08e6ddd00cbfe5691fb14943d6b2217748e96398 100644 (file)
@@ -1,5 +1,7 @@
 
+from .atomicparsley import AtomicParsleyPP
 from .ffmpeg import (
+    FFmpegAudioFixPP,
     FFmpegMergerPP,
     FFmpegMetadataPP,
     FFmpegVideoConvertor,
@@ -9,6 +11,8 @@ from .ffmpeg import (
 from .xattrpp import XAttrMetadataPP
 
 __all__ = [
+    'AtomicParsleyPP',
+    'FFmpegAudioFixPP',
     'FFmpegMergerPP',
     'FFmpegMetadataPP',
     'FFmpegVideoConvertor',
diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py
new file mode 100644 (file)
index 0000000..765b2d9
--- /dev/null
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+
+import os
+import subprocess
+
+from .common import PostProcessor
+
+from ..utils import (
+    check_executable,
+    compat_urlretrieve,
+    encodeFilename,
+    PostProcessingError,
+    prepend_extension,
+    shell_quote
+)
+
+
+class AtomicParsleyPPError(PostProcessingError):
+    pass
+
+
+class AtomicParsleyPP(PostProcessor):
+    def run(self, info):
+        if not check_executable('AtomicParsley', ['-v']):
+            raise AtomicParsleyPPError('AtomicParsley was not found. Please install.')
+
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+        temp_thumbnail = prepend_extension(filename, 'thumb')
+
+        if not info.get('thumbnail'):
+            raise AtomicParsleyPPError('Thumbnail was not found. Nothing to do.')
+
+        compat_urlretrieve(info['thumbnail'], temp_thumbnail)
+
+        cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename]
+
+        self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename)
+
+        if self._downloader.params.get('verbose', False):
+            self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd))
+
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = p.communicate()
+
+        if p.returncode != 0:
+            msg = stderr.decode('utf-8', 'replace').strip()
+            raise AtomicParsleyPPError(msg)
+
+        os.remove(encodeFilename(filename))
+        os.remove(encodeFilename(temp_thumbnail))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        return True, info
index c22f2cdc648b440f52a08cb1ba7947e02a985019..45328ed43ef44052fedb9dcf5f2de012aa3007ae 100644 (file)
@@ -9,6 +9,7 @@ from .common import AudioConversionError, PostProcessor
 from ..utils import (
     check_executable,
     compat_subprocess_get_DEVNULL,
+    encodeArgument,
     encodeFilename,
     PostProcessingError,
     prepend_extension,
@@ -48,13 +49,13 @@ class FFmpegPostProcessor(PostProcessor):
         for path in input_paths:
             files_cmd.extend(['-i', encodeFilename(path, True)])
         cmd = ([self._get_executable(), '-y'] + files_cmd
-               + opts +
+               + [encodeArgument(o) for o in opts] +
                [encodeFilename(self._ffmpeg_filename_argument(out_path), True)])
 
         if self._downloader.params.get('verbose', False):
             self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
         p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout,stderr = p.communicate()
+        stdout, stderr = p.communicate()
         if p.returncode != 0:
             stderr = stderr.decode('utf-8', 'replace')
             msg = stderr.strip().split('\n')[-1]
@@ -464,7 +465,11 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
         filename = info['filepath']
         temp_filename = prepend_extension(filename, 'temp')
 
-        options = ['-c', 'copy']
+        if info['ext'] == u'm4a':
+            options = ['-vn', '-acodec', 'copy']
+        else:
+            options = ['-c', 'copy']
+
         for (name, value) in metadata.items():
             options.extend(['-metadata', '%s=%s' % (name, value)])
 
@@ -483,3 +488,17 @@ class FFmpegMergerPP(FFmpegPostProcessor):
         self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args)
         return True, info
 
+
+class FFmpegAudioFixPP(FFmpegPostProcessor):
+    def run(self, info):
+        filename = info['filepath']
+        temp_filename = prepend_extension(filename, 'temp')
+
+        options = ['-vn', '-acodec', 'copy']
+        self._downloader.to_screen(u'[ffmpeg] Fixing audio file "%s"' % filename)
+        self.run_ffmpeg(filename, temp_filename, options)
+
+        os.remove(encodeFilename(filename))
+        os.rename(encodeFilename(temp_filename), encodeFilename(filename))
+
+        return True, info
index 18979241cd9f0bb70cf2d8e9c00709ec43db1fea..f6940940b340dea5c23e5ce118b8fcc4d7ee8574 100644 (file)
@@ -6,6 +6,7 @@ from .common import PostProcessor
 from ..utils import (
     check_executable,
     hyphenate_date,
+    subprocess_check_output
 )
 
 
@@ -57,7 +58,7 @@ class XAttrMetadataPP(PostProcessor):
                         elif user_has_xattr:
                             cmd = ['xattr', '-w', key, value, path]
 
-                        subprocess.check_output(cmd)
+                        subprocess_check_output(cmd)
 
                 else:
                     # On Unix, and can't find pyxattr, setfattr, or xattr.
index 057cd20d1d70977dd9e56e3d857907a648b228c8..b97e62ae9307f7e2380db7ec9c723e8ae8517708 100644 (file)
@@ -1,10 +1,14 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import calendar
+import codecs
+import contextlib
 import ctypes
 import datetime
 import email.utils
 import errno
+import getpass
 import gzip
 import itertools
 import io
@@ -21,6 +25,7 @@ import struct
 import subprocess
 import sys
 import traceback
+import xml.etree.ElementTree
 import zlib
 
 try:
@@ -174,6 +179,11 @@ try:
 except NameError:
     compat_chr = chr
 
+try:
+    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
+except ImportError:  # Python 2.6
+    from xml.parsers.expat import ExpatError as compat_xml_parse_error
+
 def compat_ord(c):
     if type(c) is int: return c
     else: return ord(c)
@@ -493,13 +503,13 @@ def orderedSet(iterable):
             res.append(el)
     return res
 
+
 def unescapeHTML(s):
-    """
-    @param s a string
-    """
-    assert type(s) == type(u'')
+    if s is None:
+        return None
+    assert type(s) == compat_str
 
-    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
+    result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
     return result
 
 
@@ -531,6 +541,15 @@ def encodeFilename(s, for_subprocess=False):
     return s.encode(encoding, 'ignore')
 
 
+def encodeArgument(s):
+    if not isinstance(s, compat_str):
+        # Legacy code that uses byte strings
+        # Uncomment the following line after fixing all post processors
+        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
+        s = s.decode('ascii')
+    return encodeFilename(s, True)
+
+
 def decodeOption(optval):
     if optval is None:
         return optval
@@ -585,13 +604,15 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 
 class ExtractorError(Exception):
     """Error during info extraction."""
-    def __init__(self, msg, tb=None, expected=False, cause=None):
+    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
         """ tb, if given, is the original traceback (so that it can be printed out).
         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
         """
 
         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
             expected = True
+        if video_id is not None:
+            msg = video_id + ': ' + msg
         if not expected:
             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
         super(ExtractorError, self).__init__(msg)
@@ -599,6 +620,7 @@ class ExtractorError(Exception):
         self.traceback = tb
         self.exc_info = sys.exc_info()  # preserve original exception
         self.cause = cause
+        self.video_id = video_id
 
     def format_traceback(self):
         if self.traceback is None:
@@ -753,8 +775,37 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
     https_response = http_response
 
 
+def parse_iso8601(date_str):
+    """ Return a UNIX timestamp from the given date """
+
+    if date_str is None:
+        return None
+
+    m = re.search(
+        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
+        date_str)
+    if not m:
+        timezone = datetime.timedelta()
+    else:
+        date_str = date_str[:-len(m.group(0))]
+        if not m.group('sign'):
+            timezone = datetime.timedelta()
+        else:
+            sign = 1 if m.group('sign') == '+' else -1
+            timezone = datetime.timedelta(
+                hours=sign * int(m.group('hours')),
+                minutes=sign * int(m.group('minutes')))
+
+    dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
+    return calendar.timegm(dt.timetuple())
+
+
 def unified_strdate(date_str):
     """Return a string with the date in the format YYYYMMDD"""
+
+    if date_str is None:
+        return None
+
     upload_date = None
     #Replace commas
     date_str = date_str.replace(',', ' ')
@@ -766,14 +817,17 @@ def unified_strdate(date_str):
         '%B %d %Y',
         '%b %d %Y',
         '%Y-%m-%d',
+        '%d.%m.%Y',
         '%d/%m/%Y',
         '%Y/%m/%d %H:%M:%S',
         '%Y-%m-%d %H:%M:%S',
         '%d.%m.%Y %H:%M',
+        '%d.%m.%Y %H.%M',
         '%Y-%m-%dT%H:%M:%SZ',
         '%Y-%m-%dT%H:%M:%S.%fZ',
         '%Y-%m-%dT%H:%M:%S.%f0Z',
         '%Y-%m-%dT%H:%M:%S',
+        '%Y-%m-%dT%H:%M:%S.%f',
         '%Y-%m-%dT%H:%M',
     ]
     for expression in format_expressions:
@@ -869,25 +923,97 @@ def platform_name():
     return res
 
 
-def write_string(s, out=None):
+def _windows_write_string(s, out):
+    """ Returns True if the string was written using special methods,
+    False if it has yet to be written out."""
+    # Adapted from http://stackoverflow.com/a/3259271/35070
+
+    import ctypes
+    import ctypes.wintypes
+
+    WIN_OUTPUT_IDS = {
+        1: -11,
+        2: -12,
+    }
+
+    try:
+        fileno = out.fileno()
+    except AttributeError:
+        # If the output stream doesn't have a fileno, it's virtual
+        return False
+    if fileno not in WIN_OUTPUT_IDS:
+        return False
+
+    GetStdHandle = ctypes.WINFUNCTYPE(
+        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
+        ("GetStdHandle", ctypes.windll.kernel32))
+    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
+
+    WriteConsoleW = ctypes.WINFUNCTYPE(
+        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
+        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
+        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
+    written = ctypes.wintypes.DWORD(0)
+
+    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
+    FILE_TYPE_CHAR = 0x0002
+    FILE_TYPE_REMOTE = 0x8000
+    GetConsoleMode = ctypes.WINFUNCTYPE(
+        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
+        ctypes.POINTER(ctypes.wintypes.DWORD))(
+        ("GetConsoleMode", ctypes.windll.kernel32))
+    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
+
+    def not_a_console(handle):
+        if handle == INVALID_HANDLE_VALUE or handle is None:
+            return True
+        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
+                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
+
+    if not_a_console(h):
+        return False
+
+    def next_nonbmp_pos(s):
+        try:
+            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
+        except StopIteration:
+            return len(s)
+
+    while s:
+        count = min(next_nonbmp_pos(s), 1024)
+
+        ret = WriteConsoleW(
+            h, s, count if count else 2, ctypes.byref(written), None)
+        if ret == 0:
+            raise OSError('Failed to write string')
+        if not count:  # We just wrote a non-BMP character
+            assert written.value == 2
+            s = s[1:]
+        else:
+            assert written.value > 0
+            s = s[written.value:]
+    return True
+
+
+def write_string(s, out=None, encoding=None):
     if out is None:
         out = sys.stderr
     assert type(s) == compat_str
 
+    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
+        if _windows_write_string(s, out):
+            return
+
     if ('b' in getattr(out, 'mode', '') or
             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
-        s = s.encode(preferredencoding(), 'ignore')
-    try:
+        byt = s.encode(encoding or preferredencoding(), 'ignore')
+        out.write(byt)
+    elif hasattr(out, 'buffer'):
+        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
+        byt = s.encode(enc, 'ignore')
+        out.buffer.write(byt)
+    else:
         out.write(s)
-    except UnicodeEncodeError:
-        # In Windows shells, this can fail even when the codec is just charmap!?
-        # See https://wiki.python.org/moin/PrintFails#Issue
-        if sys.platform == 'win32' and hasattr(out, 'encoding'):
-            s = s.encode(out.encoding, 'ignore').decode(out.encoding)
-            out.write(s)
-        else:
-            raise
-
     out.flush()
 
 
@@ -1111,11 +1237,11 @@ def setproctitle(title):
         libc = ctypes.cdll.LoadLibrary("libc.so.6")
     except OSError:
         return
-    title = title
-    buf = ctypes.create_string_buffer(len(title) + 1)
-    buf.value = title.encode('utf-8')
+    title_bytes = title.encode('utf-8')
+    buf = ctypes.create_string_buffer(len(title_bytes))
+    buf.value = title_bytes
     try:
-        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
+        libc.prctl(15, buf, 0, 0, 0)
     except AttributeError:
         return  # Strange libc, just skip this
 
@@ -1136,8 +1262,15 @@ class HEADRequest(compat_urllib_request.Request):
         return "HEAD"
 
 
-def int_or_none(v, scale=1):
-    return v if v is None else (int(v) // scale)
+def int_or_none(v, scale=1, default=None, get_attr=None):
+    if get_attr:
+        if v is not None:
+            v = getattr(v, get_attr, None)
+    return default if v is None else (int(v) // scale)
+
+
+def float_or_none(v, scale=1, default=None):
+    return default if v is None else (float(v) / scale)
 
 
 def parse_duration(s):
@@ -1145,7 +1278,7 @@ def parse_duration(s):
         return None
 
     m = re.match(
-        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
+        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
     if not m:
         return None
     res = int(m.group('secs'))
@@ -1219,9 +1352,11 @@ class PagedList(object):
 
 
 def uppercase_escape(s):
+    unicode_escape = codecs.getdecoder('unicode_escape')
     return re.sub(
-        r'\\U([0-9a-fA-F]{8})',
-        lambda m: compat_chr(int(m.group(1), base=16)), s)
+        r'\\U[0-9a-fA-F]{8}',
+        lambda m: unicode_escape(m.group(0))[0],
+        s)
 
 try:
     struct.pack(u'!I', 0)
@@ -1239,3 +1374,80 @@ except TypeError:
 else:
     struct_pack = struct.pack
     struct_unpack = struct.unpack
+
+
+def read_batch_urls(batch_fd):
+    def fixup(url):
+        if not isinstance(url, compat_str):
+            url = url.decode('utf-8', 'replace')
+        BOM_UTF8 = u'\xef\xbb\xbf'
+        if url.startswith(BOM_UTF8):
+            url = url[len(BOM_UTF8):]
+        url = url.strip()
+        if url.startswith(('#', ';', ']')):
+            return False
+        return url
+
+    with contextlib.closing(batch_fd) as fd:
+        return [url for url in map(fixup, fd) if url]
+
+
+def urlencode_postdata(*args, **kargs):
+    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+
+
+def parse_xml(s):
+    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
+        def doctype(self, name, pubid, system):
+            pass  # Ignore doctypes
+
+    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
+    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
+    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
+
+
+if sys.version_info < (3, 0) and sys.platform == 'win32':
+    def compat_getpass(prompt, *args, **kwargs):
+        if isinstance(prompt, compat_str):
+            prompt = prompt.encode(preferredencoding())
+        return getpass.getpass(prompt, *args, **kwargs)
+else:
+    compat_getpass = getpass.getpass
+
+
+US_RATINGS = {
+    'G': 0,
+    'PG': 10,
+    'PG-13': 13,
+    'R': 16,
+    'NC': 18,
+}
+
+
+def strip_jsonp(code):
+    return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code)
+
+
+def qualities(quality_ids):
+    """ Get a numeric quality value out of a list of possible values """
+    def q(qid):
+        try:
+            return quality_ids.index(qid)
+        except ValueError:
+            return -1
+    return q
+
+
+DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
+
+try:
+    subprocess_check_output = subprocess.check_output
+except AttributeError:
+    def subprocess_check_output(*args, **kwargs):
+        assert 'input' not in kwargs
+        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
+        output, _ = p.communicate()
+        ret = p.poll()
+        if ret:
+            raise subprocess.CalledProcessError(ret, p.args, output=output)
+        return output
index a9fead95d4c6ca5fe2c464565adf1b0bfe6a7146..6fe7c7b257d64cc3c928a445e213c9a79e7d030f 100644 (file)
@@ -1,2 +1,2 @@
 
-__version__ = '2014.02.17'
+__version__ = '2014.06.07'