]> Raphaël G. Git Repositories - youtubedl/commitdiff
Imported Upstream version 2016.08.17
authorRogério Brito <rbrito@ime.usp.br>
Wed, 17 Aug 2016 12:19:41 +0000 (09:19 -0300)
committerRogério Brito <rbrito@ime.usp.br>
Wed, 17 Aug 2016 12:19:41 +0000 (09:19 -0300)
224 files changed:
.github/ISSUE_TEMPLATE.md
.github/PULL_REQUEST_TEMPLATE.md [new file with mode: 0644]
.travis.yml
AUTHORS
CONTRIBUTING.md
ChangeLog [new file with mode: 0644]
Makefile
README.md
devscripts/gh-pages/generate-download.py
devscripts/install_srelay.sh [deleted file]
devscripts/prepare_manpage.py
devscripts/release.sh
devscripts/show-downloads-statistics.py [new file with mode: 0644]
docs/supportedsites.md
test/test_InfoExtractor.py
test/test_YoutubeDL.py
test/test_all_urls.py
test/test_compat.py
test/test_http.py
test/test_utils.py
test/test_verbose_output.py [new file with mode: 0644]
youtube_dl/YoutubeDL.py
youtube_dl/__init__.py
youtube_dl/compat.py
youtube_dl/downloader/common.py
youtube_dl/downloader/external.py
youtube_dl/downloader/f4m.py
youtube_dl/downloader/hls.py
youtube_dl/extractor/adobepass.py [new file with mode: 0644]
youtube_dl/extractor/adultswim.py
youtube_dl/extractor/aenetworks.py
youtube_dl/extractor/amcnetworks.py [new file with mode: 0644]
youtube_dl/extractor/amp.py
youtube_dl/extractor/animeondemand.py
youtube_dl/extractor/aol.py
youtube_dl/extractor/aparat.py
youtube_dl/extractor/archiveorg.py
youtube_dl/extractor/ard.py
youtube_dl/extractor/arkena.py [new file with mode: 0644]
youtube_dl/extractor/arte.py
youtube_dl/extractor/bbc.py
youtube_dl/extractor/bigflix.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/biobiochiletv.py
youtube_dl/extractor/biqle.py
youtube_dl/extractor/bloomberg.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/buzzfeed.py
youtube_dl/extractor/camdemy.py
youtube_dl/extractor/cbc.py
youtube_dl/extractor/cbsinteractive.py
youtube_dl/extractor/cbslocal.py
youtube_dl/extractor/cbsnews.py
youtube_dl/extractor/chaturbate.py
youtube_dl/extractor/chirbit.py
youtube_dl/extractor/cliphunter.py
youtube_dl/extractor/cliprs.py
youtube_dl/extractor/cloudy.py
youtube_dl/extractor/cmt.py
youtube_dl/extractor/comedycentral.py
youtube_dl/extractor/common.py
youtube_dl/extractor/condenast.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/ctsnews.py
youtube_dl/extractor/ctv.py [new file with mode: 0644]
youtube_dl/extractor/ctvnews.py [new file with mode: 0644]
youtube_dl/extractor/cwtv.py
youtube_dl/extractor/dailymail.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/daum.py
youtube_dl/extractor/dbtv.py
youtube_dl/extractor/dcn.py
youtube_dl/extractor/discoverygo.py [new file with mode: 0644]
youtube_dl/extractor/dreisat.py
youtube_dl/extractor/drtuber.py
youtube_dl/extractor/eagleplatform.py
youtube_dl/extractor/ellentv.py
youtube_dl/extractor/engadget.py
youtube_dl/extractor/eporner.py
youtube_dl/extractor/expotv.py
youtube_dl/extractor/extractors.py
youtube_dl/extractor/extremetube.py
youtube_dl/extractor/facebook.py
youtube_dl/extractor/fivemin.py
youtube_dl/extractor/flipagram.py [new file with mode: 0644]
youtube_dl/extractor/formula1.py
youtube_dl/extractor/fourtube.py
youtube_dl/extractor/fox.py
youtube_dl/extractor/franceculture.py
youtube_dl/extractor/francetv.py
youtube_dl/extractor/fusion.py [new file with mode: 0644]
youtube_dl/extractor/fxnetworks.py [new file with mode: 0644]
youtube_dl/extractor/gamekings.py [deleted file]
youtube_dl/extractor/gamespot.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/goldenmoustache.py [deleted file]
youtube_dl/extractor/hgtv.py [new file with mode: 0644]
youtube_dl/extractor/hrti.py [new file with mode: 0644]
youtube_dl/extractor/imgur.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/iqiyi.py
youtube_dl/extractor/jwplatform.py
youtube_dl/extractor/kaltura.py
youtube_dl/extractor/kamcord.py [new file with mode: 0644]
youtube_dl/extractor/keezmovies.py
youtube_dl/extractor/kuwo.py
youtube_dl/extractor/la7.py
youtube_dl/extractor/lcp.py [new file with mode: 0644]
youtube_dl/extractor/leeco.py
youtube_dl/extractor/lifenews.py
youtube_dl/extractor/limelight.py
youtube_dl/extractor/lynda.py
youtube_dl/extractor/m6.py
youtube_dl/extractor/meta.py [new file with mode: 0644]
youtube_dl/extractor/metacafe.py
youtube_dl/extractor/mgtv.py
youtube_dl/extractor/miomio.py
youtube_dl/extractor/mitele.py
youtube_dl/extractor/mofosex.py
youtube_dl/extractor/msn.py [new file with mode: 0644]
youtube_dl/extractor/mtv.py
youtube_dl/extractor/muenchentv.py
youtube_dl/extractor/nationalgeographic.py
youtube_dl/extractor/naver.py
youtube_dl/extractor/nextmovie.py [deleted file]
youtube_dl/extractor/nick.py
youtube_dl/extractor/ninecninemedia.py [new file with mode: 0644]
youtube_dl/extractor/ninenow.py [new file with mode: 0644]
youtube_dl/extractor/nintendo.py [new file with mode: 0644]
youtube_dl/extractor/ntvru.py
youtube_dl/extractor/odatv.py [new file with mode: 0644]
youtube_dl/extractor/onet.py [new file with mode: 0644]
youtube_dl/extractor/onionstudios.py
youtube_dl/extractor/openload.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/pbs.py
youtube_dl/extractor/periscope.py
youtube_dl/extractor/pladform.py
youtube_dl/extractor/playvid.py
youtube_dl/extractor/pokemon.py [new file with mode: 0644]
youtube_dl/extractor/polskieradio.py [new file with mode: 0644]
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/pornotube.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/radiocanada.py
youtube_dl/extractor/rai.py
youtube_dl/extractor/rbmaradio.py
youtube_dl/extractor/rds.py
youtube_dl/extractor/roosterteeth.py [new file with mode: 0644]
youtube_dl/extractor/rozhlas.py [new file with mode: 0644]
youtube_dl/extractor/rtlnl.py
youtube_dl/extractor/rtve.py
youtube_dl/extractor/rtvnh.py
youtube_dl/extractor/rudo.py [new file with mode: 0644]
youtube_dl/extractor/safari.py
youtube_dl/extractor/sandia.py
youtube_dl/extractor/sendtonews.py
youtube_dl/extractor/shahid.py
youtube_dl/extractor/shared.py
youtube_dl/extractor/sixplay.py [new file with mode: 0644]
youtube_dl/extractor/skynewsarabia.py
youtube_dl/extractor/skysports.py [new file with mode: 0644]
youtube_dl/extractor/slideshare.py
youtube_dl/extractor/smotri.py
youtube_dl/extractor/sohu.py
youtube_dl/extractor/sonyliv.py [new file with mode: 0644]
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/southpark.py
youtube_dl/extractor/spiegel.py
youtube_dl/extractor/spike.py
youtube_dl/extractor/srmediathek.py
youtube_dl/extractor/stitcher.py
youtube_dl/extractor/streamable.py [new file with mode: 0644]
youtube_dl/extractor/sunporno.py
youtube_dl/extractor/svt.py
youtube_dl/extractor/syfy.py
youtube_dl/extractor/tapely.py [deleted file]
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/telegraaf.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/threeqsdn.py
youtube_dl/extractor/tmz.py
youtube_dl/extractor/tnaflix.py
youtube_dl/extractor/toutv.py
youtube_dl/extractor/tube8.py
youtube_dl/extractor/tv2.py
youtube_dl/extractor/tvland.py
youtube_dl/extractor/tvp.py
youtube_dl/extractor/tvplay.py
youtube_dl/extractor/tweakers.py
youtube_dl/extractor/twentyfourvideo.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/uol.py [new file with mode: 0644]
youtube_dl/extractor/uplynk.py [new file with mode: 0644]
youtube_dl/extractor/urplay.py [new file with mode: 0644]
youtube_dl/extractor/vbox7.py
youtube_dl/extractor/vgtv.py
youtube_dl/extractor/viceland.py [new file with mode: 0644]
youtube_dl/extractor/vidbit.py [new file with mode: 0644]
youtube_dl/extractor/vidzi.py
youtube_dl/extractor/viki.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/vk.py
youtube_dl/extractor/vodplatform.py [new file with mode: 0644]
youtube_dl/extractor/vrt.py
youtube_dl/extractor/vuclip.py
youtube_dl/extractor/wat.py
youtube_dl/extractor/xiami.py
youtube_dl/extractor/xtube.py
youtube_dl/extractor/xuite.py
youtube_dl/extractor/xvideos.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/yandexmusic.py
youtube_dl/extractor/youjizz.py
youtube_dl/extractor/youku.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zippcast.py [deleted file]
youtube_dl/options.py
youtube_dl/postprocessor/metadatafromtitle.py
youtube_dl/socks.py
youtube_dl/utils.py
youtube_dl/version.py

index c73f9a9044f211e732f8ca6fb5181ea5e0db9ad9..ae28d83d50d64d73b88993eeaa5bfc3e6c421b16 100644 (file)
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.25*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.25**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.17**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.06.25
+[debug] youtube-dl version 2016.08.17
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644 (file)
index 0000000..f24bb4b
--- /dev/null
@@ -0,0 +1,22 @@
+## Please follow the guide below
+
+- You will be asked some questions, please read them **carefully** and answer honestly
+- Put an `x` into all the boxes [ ] relevant to your *pull request* (like that [x])
+- Use *Preview* tab to see how your *pull request* will actually look like
+
+---
+
+### Before submitting a *pull request* make sure you have:
+- [ ] At least skimmed through [adding new extractor tutorial](https://github.com/rg3/youtube-dl#adding-support-for-a-new-site) and [youtube-dl coding conventions](https://github.com/rg3/youtube-dl#youtube-dl-coding-conventions) sections
+- [ ] [Searched](https://github.com/rg3/youtube-dl/search?q=is%3Apr&type=Issues) the bugtracker for similar pull requests
+
+### What is the purpose of your *pull request*?
+- [ ] Bug fix
+- [ ] New extractor
+- [ ] New feature
+
+---
+
+### Description of your *pull request* and other information
+
+Explanation of your *pull request* in arbitrary form goes here. Please make sure the description explains the purpose and effect of your *pull request* and is worded well enough to be understood. Provide as much context and examples as possible.
index 136c339f0cf9058fd9811077c6257afc553e30cf..c74c9cc1295ba7c54228c5a25a71e469f8ea8026 100644 (file)
@@ -7,9 +7,6 @@ python:
   - "3.4"
   - "3.5"
 sudo: false
-install:
-  - bash ./devscripts/install_srelay.sh
-  - export PATH=$PATH:$(pwd)/tmp/srelay-0.4.8b6
 script: nosetests test --verbose
 notifications:
   email:
diff --git a/AUTHORS b/AUTHORS
index cdf655c39cde859cffb2aced10b8d1588363c6df..1fd4be78522b5d9e66e8531cb3d885a7ae65c73d 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -175,3 +175,9 @@ Tomáš Čech
 Déstin Reed
 Roman Tsiupa
 Artur Krysiak
+Jakub Adam Wieczorek
+Aleksandar Topuzović
+Nehal Patel
+Rob van Bekkum
+Petr Zvoníček
+Pratyush Singh
index a59fac9b2328f9817b90c820c0027afd1ad2f049..95392030ea2fc7f78cbb23b368ee37a9fd5ee010 100644 (file)
@@ -46,7 +46,7 @@ Make sure that someone has not already opened the issue you're trying to open. S
 
 ###  Why are existing options not enough?
 
-Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.
+Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.
 
 ###  Is there enough context in your bug report?
 
@@ -97,9 +97,17 @@ If you want to add support for a new site, first of all **make sure** this site
 After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`):
 
 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
-2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
-3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+2. Check out the source code with:
+
+        git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
+
+3. Start a new git branch with
+
+        cd youtube-dl
+        git checkout -b yourextractor
+
 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
     ```python
     # coding: utf-8
     from __future__ import unicode_literals
@@ -143,16 +151,148 @@ After you have ensured this site is distributing it's content legally, you can f
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`.
-9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/extractors.py
         $ git add youtube_dl/extractor/yourextractor.py
         $ git commit -m '[yourextractor] Add new extractor'
         $ git push origin yourextractor
 
-11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
 
 In any case, thank you very much for your contributions!
 
+## youtube-dl coding conventions
+
+This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code.
+
+Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hoster out of your control and this layout tend to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize code dependency on source's layout changes and even to make the code foresee potential future changes and be ready for that. This is important because it will allow extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with fix incorporated all the previous version become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say some may never receive an update at all that is possible for non rolling release distros.
+
+### Mandatory and optional metafields
+
+For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl:
+
+ - `id` (media identifier)
+ - `title` (media title)
+ - `url` (media download URL) or `formats`
+
+In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken.
+
+[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
+
+#### Example
+
+Say you have some source dictionary `meta` that you've fetched as JSON with HTTP request and it has a key `summary`:
+
+```python
+meta = self._download_json(url, video_id)
+```
+    
+Assume at this point `meta`'s layout is:
+
+```python
+{
+    ...
+    "summary": "some fancy summary text",
+    ...
+}
+```
+
+Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like:
+
+```python
+description = meta.get('summary')  # correct
+```
+
+and not like:
+
+```python
+description = meta['summary']  # incorrect
+```
+
+The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some time later but with former approach extraction will just go ahead with `description` set to `None` that is perfectly fine (remember `None` is equivalent for absence of data). 
+
+Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', fatal=False)
+```
+
+With `fatal` set to `False` if `_search_regex` fails to extract `description` it will emit a warning and continue extraction.
+
+You can also pass `default=<some fallback value>`, for example:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', default=None)
+```
+
+On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that are known to may or may not be present.
+### Provide fallbacks
+
+When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources became unavailable.
+
+#### Example
+
+Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like:
+
+```python
+title = meta['title']
+```
+
+If `title` disappeares from `meta` in future due to some changes on hoster's side the extraction would fail since `title` is mandatory. That's expected.
+
+Assume that you have some another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario:
+
+```python
+title = meta.get('title') or self._og_search_title(webpage)
+```
+
+This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`.
+
+### Make regular expressions flexible
+
+When using regular expressions try to write them fuzzy and flexible.
+#### Example
+
+Say you need to extract `title` from the following HTML code:
+
+```html
+<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span>
+```
+
+The code for that task should look similar to:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
+```
+
+Or even better:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)',
+    webpage, 'title', group='title')
+```
+
+Note how you tolerate potential changes in `style` attribute's value or switch from using double quotes to single for `class` attribute: 
+
+The code definitely should not look like:
+
+```python
+title = self._search_regex(
+    r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
+    webpage, 'title', group='title')
+```
+
+### Use safe conversion functions
+
+Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
+
diff --git a/ChangeLog b/ChangeLog
new file mode 100644 (file)
index 0000000..354306a
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,423 @@
+version 2016.08.17
+
+Core
++ Add _get_netrc_login_info
+
+Extractors
+* [mofosex] Extract all formats (#10335)
++ [generic] Add support for vbox7 embeds
++ [vbox7] Add support for embed URLs
++ [viafree] Add extractor (#10358)
++ [mtg] Add support for viafree URLs (#10358)
+* [theplatform] Extract all subtitles per language
++ [xvideos] Fix HLS extraction (#10356)
++ [amcnetworks] Add extractor
++ [bbc:playlist] Add support for pagination (#10349)
++ [fxnetworks] Add extractor (#9462)
+* [cbslocal] Fix extraction for SendtoNews-based videos
+* [sendtonews] Fix extraction
+* [jwplatform] Extract video id from JWPlayer data
+- [zippcast] Remove extractor (#10332)
++ [viceland] Add extractor (#8799)
++ [adobepass] Add base extractor for Adobe Pass Authentication
+* [life:embed] Improve extraction
+* [vgtv] Detect geo restricted videos (#10348)
++ [uplynk] Add extractor
+* [xiami] Fix extraction (#10342)
+
+
+version 2016.08.13
+
+Core
+* Show progress for curl external downloader
+* Forward more options to curl external downloader
+
+Extractors
+* [pbs] Fix description extraction
+* [franceculture] Fix extraction (#10324)
+* [pornotube] Fix extraction (#10322)
+* [4tube] Fix metadata extraction (#10321)
+* [imgur] Fix width and height extraction (#10325)
+* [expotv] Improve extraction
++ [vbox7] Fix extraction (#10309)
+- [tapely] Remove extractor (#10323)
+* [muenchentv] Fix extraction (#10313)
++ [24video] Add support for .me and .xxx TLDs
+* [24video] Fix comment count extraction
+* [sunporno] Add support for embed URLs
+* [sunporno] Fix metadata extraction (#10316)
++ [hgtv] Add extractor for hgtv.ca (#3999)
+- [pbs] Remove request to unavailable API
++ [pbs] Add support for high quality HTTP formats
++ [crunchyroll] Add support for HLS formats (#10301)
+
+
+version 2016.08.12
+
+Core
+* Subtitles are now written as is. Newline conversions are disabled. (#10268)
++ Recognize more formats in unified_timestamp
+
+Extractors
+- [goldenmoustache] Remove extractor (#10298)
+* [drtuber] Improve title extraction
+* [drtuber] Make dislike count optional (#10297)
+* [chirbit] Fix extraction (#10296)
+* [francetvinfo] Relax URL regular expression
+* [rtlnl] Relax URL regular expression (#10282)
+* [formula1] Relax URL regular expression (#10283)
+* [wat] Improve extraction (#10281)
+* [ctsnews] Fix extraction
+
+
+version 2016.08.10
+
+Core
+* Make --metadata-from-title non fatal when title does not match the pattern
+* Introduce options for randomized sleep before each download
+  --min-sleep-interval and --max-sleep-interval (#9930)
+* Respect default in _search_json_ld
+
+Extractors
++ [uol] Add extractor for uol.com.br (#4263)
+* [rbmaradio] Fix extraction and extract all formats (#10242)
++ [sonyliv] Add extractor for sonyliv.com (#10258)
+* [aparat] Fix extraction
+* [cwtv] Extract HTTP formats
++ [rozhlas] Add extractor for prehravac.rozhlas.cz (#10253)
+* [kuwo:singer] Fix extraction
+
+
+version 2016.08.07
+
+Core
++ Add support for TV Parental Guidelines ratings in parse_age_limit
++ Add decode_png (#9706)
++ Add support for partOfTVSeries in JSON-LD
+* Lower master M3U8 manifest preference for better format sorting
+
+Extractors
++ [discoverygo] Add extractor (#10245)
+* [flipagram] Make JSON-LD extraction non fatal
+* [generic] Make JSON-LD extraction non fatal
++ [bbc] Add support for morph embeds (#10239)
+* [tnaflixnetworkbase] Improve title extraction
+* [tnaflix] Fix metadata extraction (#10249)
+* [fox] Fix theplatform release URL query
+* [openload] Fix extraction (#9706)
+* [bbc] Skip duplicate manifest URLs
+* [bbc] Improve format code
++ [bbc] Add support for DASH and F4M
+* [bbc] Improve format sorting and listing
+* [bbc] Improve playlist extraction
++ [pokemon] Add extractor (#10093)
++ [condenast] Add fallback scenario for video info extraction
+
+
+version 2016.08.06
+
+Core
+* Add support for JSON-LD root list entries (#10203)
+* Improve unified_timestamp
+* Lower preference of RTSP formats in generic sorting
++ Add support for multiple properties in _og_search_property
+* Improve password hiding from verbose output
+
+Extractors
++ [adultswim] Add support for trailers (#10235)
+* [archiveorg] Improve extraction (#10219)
++ [jwplatform] Add support for playlists
++ [jwplatform] Add support for relative URLs
+* [jwplatform] Improve audio detection
++ [tvplay] Capture and output native error message
++ [tvplay] Extract series metadata
++ [tvplay] Add support for subtitles (#10194)
+* [tvp] Improve extraction (#7799)
+* [cbslocal] Fix timestamp parsing (#10213)
++ [naver] Add support for subtitles (#8096)
+* [naver] Improve extraction
+* [condenast] Improve extraction
+* [engadget] Relax URL regular expression
+* [5min] Fix extraction
++ [nationalgeographic] Add support for Episode Guide
++ [kaltura] Add support for subtitles
+* [kaltura] Optimize network requests
++ [vodplatform] Add extractor for vod-platform.net
+- [gamekings] Remove extractor
+* [limelight] Extract HTTP formats
+* [ntvru] Fix extraction
++ [comedycentral] Re-add :tds and :thedailyshow shortnames
+
+
+version 2016.08.01
+
+Fixed/improved extractors
+- [yandexmusic:track] Adapt to changes in track location JSON (#10193)
+- [bloomberg] Support another form of player (#10187)
+- [limelight] Skip DRM protected videos
+- [safari] Relax regular expressions for URL matching (#10202)
+- [cwtv] Add support for cwtvpr.com (#10196)
+
+
+version 2016.07.30
+
+Fixed/improved extractors
+- [twitch:clips] Sort formats
+- [tv2] Use m3u8_native
+- [tv2:article] Fix video detection (#10188)
+- rtve (#10076)
+- [dailymotion:playlist] Optimize download archive processing (#10180)
+
+
+version 2016.07.28
+
+Fixed/improved extractors
+- shared (#10170)
+- soundcloud (#10179)
+- twitch (#9767)
+
+
+version 2016.07.26.2
+
+Fixed/improved extractors
+- smotri
+- camdemy
+- mtv
+- comedycentral
+- cmt
+- cbc
+- mgtv
+- orf
+
+
+version 2016.07.24
+
+New extractors
+- arkena (#8682)
+- lcp (#8682)
+
+Fixed/improved extractors
+- facebook (#10151)
+- dailymail
+- telegraaf
+- dcn
+- onet
+- tvp
+
+Miscellaneous
+- Support $Time$ in DASH manifests
+
+
+version 2016.07.22
+
+New extractors
+- odatv (#9285)
+
+Fixed/improved extractors
+- bbc
+- youjizz (#10131)
+- youtube (#10140)
+- pornhub (#10138)
+- eporner (#10139)
+
+
+version 2016.07.17
+
+New extractors
+- nintendo (#9986)
+- streamable (#9122)
+
+Fixed/improved extractors
+- ard (#10095)
+- mtv
+- comedycentral (#10101)
+- viki (#10098)
+- spike (#10106)
+
+Miscellaneous
+- Improved twitter player detection (#10090)
+
+
+version 2016.07.16
+
+New extractors
+- ninenow (#5181)
+
+Fixed/improved extractors
+- rtve (#10076)
+- brightcove
+- 3qsdn
+- syfy (#9087, #3820, #2388)
+- youtube (#10083)
+
+Miscellaneous
+- Fix subtitle embedding for video-only and audio-only files (#10081)
+
+
+version 2016.07.13
+
+New extractors
+- rudo
+
+Fixed/improved extractors
+- biobiochiletv
+- tvplay
+- dbtv
+- brightcove
+- tmz
+- youtube (#10059)
+- shahid (#10062)
+- vk
+- ellentv (#10067)
+
+
+version 2016.07.11
+
+New Extractors
+- roosterteeth (#9864)
+
+Fixed/improved extractors
+- miomio (#9605)
+- vuclip
+- youtube
+- vidzi (#10058)
+
+
+version 2016.07.09.2
+
+Fixed/improved extractors
+- vimeo (#1638)
+- facebook (#10048)
+- lynda (#10047)
+- animeondemand
+
+Fixed/improved features
+- Embedding subtitles no longer throws an error with problematic inputs (#9063)
+
+
+version 2016.07.09.1
+
+Fixed/improved extractors
+- youtube
+- ard
+- srmediatek (#9373)
+
+
+version 2016.07.09
+
+New extractors
+- Flipagram (#9898)
+
+Fixed/improved extractors
+- telecinco
+- toutv
+- radiocanada
+- tweakers (#9516)
+- lynda
+- nick (#7542)
+- polskieradio (#10028)
+- le
+- facebook (#9851)
+- mgtv
+- animeondemand (#10031)
+
+Fixed/improved features
+- `--postprocessor-args` and `--downloader-args` now accepts non-ASCII inputs
+  on non-Windows systems
+
+
+version 2016.07.07
+
+New extractors
+- kamcord (#10001)
+
+Fixed/improved extractors
+- spiegel (#10018)
+- metacafe (#8539, #3253)
+- onet (#9950)
+- francetv (#9955)
+- brightcove (#9965)
+- daum (#9972)
+
+
+version 2016.07.06
+
+Fixed/improved extractors
+- youtube (#10007, #10009)
+- xuite
+- stitcher
+- spiegel
+- slideshare
+- sandia
+- rtvnh
+- prosiebensat1
+- onionstudios
+
+
+version 2016.07.05
+
+Fixed/improved extractors
+- brightcove
+- yahoo (#9995)
+- pornhub (#9997)
+- iqiyi
+- kaltura (#5557)
+- la7
+- Changed features
+- Rename --cn-verfication-proxy to --geo-verification-proxy
+Miscellaneous
+- Add script for displaying downloads statistics
+
+
+version 2016.07.03.1
+
+Fixed/improved extractors
+- theplatform
+- aenetworks
+- nationalgeographic
+- hrti (#9482)
+- facebook (#5701)
+- buzzfeed (#5701)
+- rai (#8617, #9157, #9232, #8552, #8551)
+- nationalgeographic (#9991)
+- iqiyi
+
+
+version 2016.07.03
+
+New extractors
+- hrti (#9482)
+
+Fixed/improved extractors
+- vk (#9981)
+- facebook (#9938)
+- xtube (#9953, #9961)
+
+
+version 2016.07.02
+
+New extractors
+- fusion (#9958)
+
+Fixed/improved extractors
+- twitch (#9975)
+- vine (#9970)
+- periscope (#9967)
+- pornhub (#8696)
+
+
+version 2016.07.01
+
+New extractors
+- 9c9media
+- ctvnews (#2156)
+- ctv (#4077)
+
+Fixed/Improved extractors
+- rds
+- meta (#8789)
+- pornhub (#9964)
+- sixplay (#2183)
+
+New features
+- Accept quoted strings across multiple lines (#9940)
index 6ee4ba4ebc6804ad78061d6b346fd67cd3fd01e5..354052c50822b716423c7bc4d27606e90ed58796 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -94,7 +94,7 @@ _EXTRACTOR_FILES != find youtube_dl/extractor -iname '*.py' -and -not -iname 'la
 youtube_dl/extractor/lazy_extractors.py: devscripts/make_lazy_extractors.py devscripts/lazy_load_template.py $(_EXTRACTOR_FILES)
        $(PYTHON) devscripts/make_lazy_extractors.py $@
 
-youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish
+youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish ChangeLog
        @tar -czf youtube-dl.tar.gz --transform "s|^|youtube-dl/|" --owner 0 --group 0 \
                --exclude '*.DS_Store' \
                --exclude '*.kate-swp' \
@@ -107,7 +107,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-
                --exclude 'docs/_build' \
                -- \
                bin devscripts test youtube_dl docs \
-               LICENSE README.md README.txt \
+               ChangeLog LICENSE README.md README.txt \
                Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion \
                youtube-dl.zsh youtube-dl.fish setup.py \
                youtube-dl
index c6feef1162e1a614e4db4d7afa02a543c15943b8..cabbbef76690a6e2ce6513d8f7a03291507aa343 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ youtube-dl - download videos from youtube.com or other video platforms
 
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
-    sudo curl -L https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
+    sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
     sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
@@ -103,9 +103,9 @@ which means you can modify it, redistribute it or use it however you like.
                                      (experimental)
     -6, --force-ipv6                 Make all connections via IPv6
                                      (experimental)
-    --cn-verification-proxy URL      Use this proxy to verify the IP address for
-                                     some Chinese sites. The default proxy
-                                     specified by --proxy (or none, if the
+    --geo-verification-proxy URL     Use this proxy to verify the IP address for
+                                     some geo-restricted sites. The default
+                                     proxy specified by --proxy (or none, if the
                                      options is not present) is used for the
                                      actual downloading. (experimental)
 
@@ -330,7 +330,15 @@ which means you can modify it, redistribute it or use it however you like.
                                      bidirectional text support. Requires bidiv
                                      or fribidi executable in PATH
     --sleep-interval SECONDS         Number of seconds to sleep before each
-                                     download.
+                                     download when used alone or a lower bound
+                                     of a range for randomized sleep before each
+                                     download (minimum possible number of
+                                     seconds to sleep) when used along with
+                                     --max-sleep-interval.
+    --max-sleep-interval SECONDS     Upper bound of a range for randomized sleep
+                                     before each download (maximum possible
+                                     number of seconds to sleep). Must only be
+                                     used along with --min-sleep-interval.
 
 ## Video Format Options:
     -f, --format FORMAT              Video format code, see the "FORMAT
@@ -424,7 +432,7 @@ which means you can modify it, redistribute it or use it however you like.
 
 # CONFIGURATION
 
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`.
+You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.
 
 For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory:
 ```
@@ -432,6 +440,7 @@ For example, with the following configuration file youtube-dl will always extrac
 --no-mtime
 --proxy 127.0.0.1:3128
 -o ~/Movies/%(title)s.%(ext)s
+# Lines starting with # are comments
 ```
 
 Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`.
@@ -890,9 +899,17 @@ If you want to add support for a new site, first of all **make sure** this site
 After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`):
 
 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork)
-2. Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git`
-3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor`
+2. Check out the source code with:
+
+        git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git
+
+3. Start a new git branch with
+
+        cd youtube-dl
+        git checkout -b yourextractor
+
 4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`:
+
     ```python
     # coding: utf-8
     from __future__ import unicode_literals
@@ -936,19 +953,151 @@ After you have ensured this site is distributing it's content legally, you can f
 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L148-L252) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`.
-9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-10. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
 
         $ git add youtube_dl/extractor/extractors.py
         $ git add youtube_dl/extractor/yourextractor.py
         $ git commit -m '[yourextractor] Add new extractor'
         $ git push origin yourextractor
 
-11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
 
 In any case, thank you very much for your contributions!
 
+## youtube-dl coding conventions
+
+This section introduces a guide lines for writing idiomatic, robust and future-proof extractor code.
+
+Extractors are very fragile by nature since they depend on the layout of the source data provided by 3rd party media hoster out of your control and this layout tend to change. As an extractor implementer your task is not only to write code that will extract media links and metadata correctly but also to minimize code dependency on source's layout changes and even to make the code foresee potential future changes and be ready for that. This is important because it will allow extractor not to break on minor layout changes thus keeping old youtube-dl versions working. Even though this breakage issue is easily fixed by emitting a new version of youtube-dl with fix incorporated all the previous version become broken in all repositories and distros' packages that may not be so prompt in fetching the update from us. Needless to say some may never receive an update at all that is possible for non rolling release distros.
+
+### Mandatory and optional metafields
+
+For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in *info dict* are considered mandatory for successful extraction process by youtube-dl:
+
+ - `id` (media identifier)
+ - `title` (media title)
+ - `url` (media download URL) or `formats`
+
+In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` to be mandatory. Thus aforementioned metafields are the critical data the extraction does not make any sense without and if any of them fail to be extracted then extractor is considered completely broken.
+
+[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
+
+#### Example
+
+Say you have some source dictionary `meta` that you've fetched as JSON with HTTP request and it has a key `summary`:
+
+```python
+meta = self._download_json(url, video_id)
+```
+    
+Assume at this point `meta`'s layout is:
+
+```python
+{
+    ...
+    "summary": "some fancy summary text",
+    ...
+}
+```
+
+Assume you want to extract `summary` and put into resulting info dict as `description`. Since `description` is optional metafield you should be ready that this key may be missing from the `meta` dict, so that you should extract it like:
+
+```python
+description = meta.get('summary')  # correct
+```
+
+and not like:
+
+```python
+description = meta['summary']  # incorrect
+```
+
+The latter will break extraction process with `KeyError` if `summary` disappears from `meta` at some time later but with former approach extraction will just go ahead with `description` set to `None` that is perfectly fine (remember `None` is equivalent for absence of data). 
+
+Similarly, you should pass `fatal=False` when extracting optional data from a webpage with `_search_regex`, `_html_search_regex` or similar methods, for instance:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', fatal=False)
+```
+
+With `fatal` set to `False` if `_search_regex` fails to extract `description` it will emit a warning and continue extraction.
+
+You can also pass `default=<some fallback value>`, for example:
+
+```python
+description = self._search_regex(
+    r'<span[^>]+id="title"[^>]*>([^<]+)<',
+    webpage, 'description', default=None)
+```
+
+On failure this code will silently continue the extraction with `description` set to `None`. That is useful for metafields that are known to may or may not be present.
+### Provide fallbacks
+
+When extracting metadata try to provide several scenarios for that. For example if `title` is present in several places/sources try extracting from at least some of them. This would make it more future-proof in case some of the sources became unavailable.
+
+#### Example
+
+Say `meta` from previous example has a `title` and you are about to extract it. Since `title` is mandatory meta field you should end up with something like:
+
+```python
+title = meta['title']
+```
+
+If `title` disappeares from `meta` in future due to some changes on hoster's side the extraction would fail since `title` is mandatory. That's expected.
+
+Assume that you have some another source you can extract `title` from, for example `og:title` HTML meta of a `webpage`. In this case you can provide a fallback scenario:
+
+```python
+title = meta.get('title') or self._og_search_title(webpage)
+```
+
+This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`.
+
+### Make regular expressions flexible
+
+When using regular expressions try to write them fuzzy and flexible.
+#### Example
+
+Say you need to extract `title` from the following HTML code:
+
+```html
+<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">some fancy title</span>
+```
+
+The code for that task should look similar to:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class="title"[^>]*>([^<]+)', webpage, 'title')
+```
+
+Or even better:
+
+```python
+title = self._search_regex(
+    r'<span[^>]+class=(["\'])title\1[^>]*>(?P<title>[^<]+)',
+    webpage, 'title', group='title')
+```
+
+Note how you tolerate potential changes in `style` attribute's value or switch from using double quotes to single for `class` attribute: 
+
+The code definitely should not look like:
+
+```python
+title = self._search_regex(
+    r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
+    webpage, 'title', group='title')
+```
+
+### Use safe conversion functions
+
+Wrap all extracted numeric data into safe functions from `utils`: `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
+
 # EMBEDDING YOUTUBE-DL
 
 youtube-dl makes the best effort to be a good command-line program, and thus should be callable from any programming language. If you encounter any problems parsing its output, feel free to [create a report](https://github.com/rg3/youtube-dl/issues/new).
@@ -1055,7 +1204,7 @@ Make sure that someone has not already opened the issue you're trying to open. S
 
 ###  Why are existing options not enough?
 
-Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#synopsis). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.
+Before requesting a new feature, please have a quick peek at [the list of supported options](https://github.com/rg3/youtube-dl/blob/master/README.md#options). Many feature requests are for features that actually exist already! Please, absolutely do show off your work in the issue report and detail how the existing similar options do *not* solve your problem.
 
 ###  Is there enough context in your bug report?
 
index 392e3ba21ab86070f2df164362fbd177b750c726..fcd7e1dff663f1607c2842081b40dc1890cf677a 100755 (executable)
@@ -15,13 +15,9 @@ data = urllib.request.urlopen(URL).read()
 with open('download.html.in', 'r', encoding='utf-8') as tmplf:
     template = tmplf.read()
 
-md5sum = hashlib.md5(data).hexdigest()
-sha1sum = hashlib.sha1(data).hexdigest()
 sha256sum = hashlib.sha256(data).hexdigest()
 template = template.replace('@PROGRAM_VERSION@', version)
 template = template.replace('@PROGRAM_URL@', URL)
-template = template.replace('@PROGRAM_MD5SUM@', md5sum)
-template = template.replace('@PROGRAM_SHA1SUM@', sha1sum)
 template = template.replace('@PROGRAM_SHA256SUM@', sha256sum)
 template = template.replace('@EXE_URL@', versions_info['versions'][version]['exe'][0])
 template = template.replace('@EXE_SHA256SUM@', versions_info['versions'][version]['exe'][1])
diff --git a/devscripts/install_srelay.sh b/devscripts/install_srelay.sh
deleted file mode 100755 (executable)
index 33ce8a3..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-mkdir -p tmp && cd tmp
-wget -N http://downloads.sourceforge.net/project/socks-relay/socks-relay/srelay-0.4.8/srelay-0.4.8b6.tar.gz
-tar zxvf srelay-0.4.8b6.tar.gz
-cd srelay-0.4.8b6
-./configure
-make
index e3f6339b5a60fc0a19106e7447842d08e680dce2..ce548739f57f5c52e6c73d46f2095e894c8f940d 100644 (file)
@@ -54,17 +54,21 @@ def filter_options(readme):
 
         if in_options:
             if line.lstrip().startswith('-'):
-                option, description = re.split(r'\s{2,}', line.lstrip())
-                split_option = option.split(' ')
-
-                if not split_option[-1].startswith('-'):  # metavar
-                    option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]])
-
-                # Pandoc's definition_lists. See http://pandoc.org/README.html
-                # for more information.
-                ret += '\n%s\n:   %s\n' % (option, description)
-            else:
-                ret += line.lstrip() + '\n'
+                split = re.split(r'\s{2,}', line.lstrip())
+                # Description string may start with `-` as well. If there is
+                # only one piece then it's a description bit not an option.
+                if len(split) > 1:
+                    option, description = split
+                    split_option = option.split(' ')
+
+                    if not split_option[-1].startswith('-'):  # metavar
+                        option = ' '.join(split_option[:-1] + ['*%s*' % split_option[-1]])
+
+                    # Pandoc's definition_lists. See http://pandoc.org/README.html
+                    # for more information.
+                    ret += '\n%s\n:   %s\n' % (option, description)
+                    continue
+            ret += line.lstrip() + '\n'
         else:
             ret += line + '\n'
 
index f8d466ba879ff7832a652ea41cfb2527b49ce954..ca6ae1b491215b0bcff9a6b5398f5e6de127a0d7 100755 (executable)
@@ -71,9 +71,12 @@ fi
 /bin/echo -e "\n### Changing version in version.py..."
 sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py
 
+/bin/echo -e "\n### Changing version in ChangeLog..."
+sed -i "s/<unreleased>/$version/" ChangeLog
+
 /bin/echo -e "\n### Committing documentation, templates and youtube_dl/version.py..."
 make README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md supportedsites
-git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py
+git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE.md docs/supportedsites.md youtube_dl/version.py ChangeLog
 git commit $gpg_sign_commits -m "release $version"
 
 /bin/echo -e "\n### Now tagging, signing and pushing..."
diff --git a/devscripts/show-downloads-statistics.py b/devscripts/show-downloads-statistics.py
new file mode 100644 (file)
index 0000000..e25d284
--- /dev/null
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+
+import itertools
+import json
+import os
+import re
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.compat import (
+    compat_print,
+    compat_urllib_request,
+)
+from youtube_dl.utils import format_bytes
+
+
+def format_size(bytes):
+    return '%s (%d bytes)' % (format_bytes(bytes), bytes)
+
+
+total_bytes = 0
+
+for page in itertools.count(1):
+    releases = json.loads(compat_urllib_request.urlopen(
+        'https://api.github.com/repos/rg3/youtube-dl/releases?page=%s' % page
+    ).read().decode('utf-8'))
+
+    if not releases:
+        break
+
+    for release in releases:
+        compat_print(release['name'])
+        for asset in release['assets']:
+            asset_name = asset['name']
+            total_bytes += asset['download_count'] * asset['size']
+            if all(not re.match(p, asset_name) for p in (
+                    r'^youtube-dl$',
+                    r'^youtube-dl-\d{4}\.\d{2}\.\d{2}(?:\.\d+)?\.tar\.gz$',
+                    r'^youtube-dl\.exe$')):
+                continue
+            compat_print(
+                ' %s size: %s downloads: %d'
+                % (asset_name, format_size(asset['size']), asset['download_count']))
+
+compat_print('total downloads traffic: %s' % format_size(total_bytes))
index 891499f59f71583cc4e29a3c7c61b003f373227d..189b9301dfc6eb84610305a458f3e16685e7e83a 100644 (file)
@@ -14,6 +14,7 @@
  - **8tracks**
  - **91porn**
  - **9gag**
+ - **9now.com.au**
  - **abc.net.au**
  - **Abc7News**
  - **abcnews**
@@ -34,6 +35,7 @@
  - **AlJazeera**
  - **Allocine**
  - **AlphaPorno**
+ - **AMCNetworks**
  - **AnimeOnDemand**
  - **anitube.se**
  - **AnySex**
@@ -45,7 +47,7 @@
  - **archive.org**: archive.org videos
  - **ARD**
  - **ARD:mediathek**
- - **ARD:mediathek**: Saarländischer Rundfunk
+ - **Arkena**
  - **arte.tv**
  - **arte.tv:+7**
  - **arte.tv:cinema**
  - **CollegeRama**
  - **ComCarCoff**
  - **ComedyCentral**
- - **ComedyCentralShows**: The Daily Show / The Colbert Report
+ - **ComedyCentralShortname**
+ - **ComedyCentralTV**
  - **CondeNast**: Condé Nast media group: Allure, Architectural Digest, Ars Technica, Bon Appétit, Brides, Condé Nast, Condé Nast Traveler, Details, Epicurious, GQ, Glamour, Golf Digest, SELF, Teen Vogue, The New Yorker, Vanity Fair, Vogue, W Magazine, WIRED
  - **Coub**
  - **Cracked**
  - **CSNNE**
  - **CSpan**: C-SPAN
  - **CtsNews**: 華視新聞
+ - **CTV**
+ - **CTVNews**
  - **culturebox.francetvinfo.fr**
  - **CultureUnplugged**
  - **CWTV**
  - **DigitallySpeaking**
  - **Digiteka**
  - **Discovery**
+ - **DiscoveryGo**
  - **Dotsub**
  - **DouyuTV**: 斗鱼
  - **DPlay**
  - **Firstpost**
  - **FiveTV**
  - **Flickr**
+ - **Flipagram**
  - **Folketinget**: Folketinget (ft.dk; Danish parliament)
  - **FootyRoom**
  - **Formula1**
  - **FoxSports**
  - **france2.fr:generation-quoi**
  - **FranceCulture**
- - **FranceCultureEmission**
  - **FranceInter**
  - **francetv**: France 2, 3, 4, 5 and Ô
  - **francetvinfo.fr**
  - **FreeVideo**
  - **Funimation**
  - **FunnyOrDie**
+ - **Fusion**
+ - **FXNetworks**
  - **GameInformer**
- - **Gamekings**
  - **GameOne**
  - **gameone:playlist**
  - **Gamersyde**
  - **GloboArticle**
  - **GodTube**
  - **GodTV**
- - **GoldenMoustache**
  - **Golem**
  - **GoogleDrive**
  - **Goshgay**
  - **HellPorno**
  - **Helsinki**: helsinki.fi
  - **HentaiStigma**
+ - **HGTV**
  - **HistoricFilms**
+ - **history:topic**: History.com Topic
  - **hitbox**
  - **hitbox:live**
  - **HornBunny**
  - **HotStar**
  - **Howcast**
  - **HowStuffWorks**
+ - **HRTi**
+ - **HRTiPlaylist**
  - **HuffPost**: Huffington Post
  - **Hypem**
  - **Iconosquare**
  - **jpopsuki.tv**
  - **JWPlatform**
  - **Kaltura**
+ - **Kamcord**
  - **KanalPlay**: Kanal 5/9/11 Play
  - **Kankan**
  - **Karaoketv**
  - **kuwo:mv**: 酷我音乐 - MV
  - **kuwo:singer**: 酷我音乐 - 歌手
  - **kuwo:song**: 酷我音乐
- - **la7.tv**
+ - **la7.it**
  - **Laola1Tv**
+ - **Lcp**
+ - **LcpPlay**
  - **Le**: 乐视网
  - **Learnr**
  - **Lecture2Go**
  - **MatchTV**
  - **MDR**: MDR.DE and KiKA
  - **media.ccc.de**
+ - **META**
  - **metacafe**
  - **Metacritic**
  - **Mgoon**
  - **MovieFap**
  - **Moviezine**
  - **MPORA**
+ - **MSN**
+ - **mtg**: MTG services
  - **MTV**
  - **mtv.de**
- - **mtviggy.com**
  - **mtvservices:embedded**
  - **MuenchenTV**: münchen.tv
  - **MusicPlayOn**
  - **MyVidster**
  - **n-tv.de**
  - **natgeo**
- - **natgeo:channel**
+ - **natgeo:episodeguide**
+ - **natgeo:video**
  - **Naver**
  - **NBA**
  - **NBC**
  - **Newstube**
  - **NextMedia**: 蘋果日報
  - **NextMediaActionNews**: 蘋果日報 - 動新聞
- - **nextmovie.com**
  - **nfb**: National Film Board of Canada
  - **nfl.com**
  - **nhl.com**
  - **nick.de**
  - **niconico**: ニコニコ動画
  - **NiconicoPlaylist**
+ - **NineCNineMedia**
+ - **Nintendo**
  - **njoy**: N-JOY
  - **njoy:embed**
  - **Noco**
  - **NYTimes**
  - **NYTimesArticle**
  - **ocw.mit.edu**
+ - **OdaTV**
  - **Odnoklassniki**
  - **OktoberfestTV**
  - **on.aol.com**
+ - **onet.tv**
+ - **onet.tv:channel**
  - **OnionStudios**
  - **Ooyala**
  - **OoyalaExternal**
  - **plus.google**: Google Plus
  - **pluzz.francetv.fr**
  - **podomatic**
+ - **Pokemon**
+ - **PolskieRadio**
  - **PornHd**
- - **PornHub**
+ - **PornHub**: PornHub and Thumbzilla
  - **PornHubPlaylist**
  - **PornHubUserVideos**
  - **Pornotube**
  - **RICE**
  - **RingTV**
  - **RockstarGames**
+ - **RoosterTeeth**
  - **RottenTomatoes**
  - **Roxwel**
+ - **Rozhlas**
  - **RTBF**
  - **rte**: Raidió Teilifís Éireann TV
  - **rte:radio**: Raidió Teilifís Éireann radio
  - **rtve.es:alacarta**: RTVE a la carta
  - **rtve.es:infantil**: RTVE infantil
  - **rtve.es:live**: RTVE.es live streams
+ - **rtve.es:television**
  - **RTVNH**
+ - **Rudo**
  - **RUHD**
  - **RulePorn**
  - **rutube**: Rutube videos
  - **Shared**: shared.sx and vivo.sx
  - **ShareSix**
  - **Sina**
+ - **SixPlay**
+ - **skynewsarabia:article**
  - **skynewsarabia:video**
- - **skynewsarabia:video**
+ - **SkySports**
  - **Slideshare**
  - **Slutload**
  - **smotri**: Smotri.com
  - **smotri:user**: Smotri.com user videos
  - **Snotr**
  - **Sohu**
+ - **SonyLIV**
  - **soundcloud**
  - **soundcloud:playlist**
  - **soundcloud:search**: Soundcloud search
  - **SportBoxEmbed**
  - **SportDeutschland**
  - **Sportschau**
+ - **sr:mediathek**: Saarländischer Rundfunk
  - **SRGSSR**
  - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites
  - **SSA**
  - **stanfordoc**: Stanford Open ClassRoom
  - **Steam**
  - **Stitcher**
+ - **Streamable**
  - **streamcloud.eu**
  - **StreamCZ**
  - **StreetVoice**
  - **SztvHu**
  - **Tagesschau**
  - **tagesschau:player**
- - **Tapely**
  - **Tass**
  - **TDSLifeway**
  - **teachertube**: teachertube.com videos
  - **TNAFlix**
  - **TNAFlixNetworkEmbed**
  - **toggle**
+ - **Tosh**: Tosh.0
  - **tou.tv**
  - **Toypics**: Toypics user profile
  - **ToypicsUser**: Toypics user profile
  - **tvigle**: Интернет-телевидение Tvigle.ru
  - **tvland.com**
  - **tvp**: Telewizja Polska
+ - **tvp:embed**: Telewizja Polska
  - **tvp:series**
- - **TVPlay**: TV3Play and related services
  - **Tweakers**
  - **twitch:chapter**
  - **twitch:clips**
  - **udemy:course**
  - **UDNEmbed**: 聯合影音
  - **Unistra**
+ - **uol.com.br**
+ - **uplynk**
+ - **uplynk:preplay**
  - **Urort**: NRK P3 Urørt
+ - **URPlay**
  - **USAToday**
  - **ustream**
  - **ustream:channel**
  - **VevoPlaylist**
  - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet
  - **vh1.com**
+ - **Viafree**
  - **Vice**
+ - **Viceland**
  - **ViceShow**
+ - **Vidbit**
  - **Viddler**
  - **video.google:search**: Google Video search
  - **video.mit.edu**
  - **vine:user**
  - **vk**: VK
  - **vk:uservideos**: VK - User's Videos
+ - **vk:wallpost**
  - **vlive**
  - **Vodlocker**
+ - **VODPlatform**
  - **VoiceRepublic**
  - **VoxMedia**
  - **Vporn**
  - **youtube:search**: YouTube.com searches
  - **youtube:search:date**: YouTube.com searches, newest videos first
  - **youtube:search_url**: YouTube.com search URLs
+ - **youtube:shared**
  - **youtube:show**: YouTube.com (multi-season) shows
  - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
  - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
  - **ZDFChannel**
  - **zingmp3:album**: mp3.zing.vn albums
  - **zingmp3:song**: mp3.zing.vn songs
- - **ZippCast**
index 6404ac89f55df282e9525f6ae1a8e62f7344dd40..a98305c747635c1b1638f761d7bdf9bead353d19 100644 (file)
@@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from test.helper import FakeYDL
 from youtube_dl.extractor.common import InfoExtractor
 from youtube_dl.extractor import YoutubeIE, get_info_extractor
-from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError
+from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
 
 
 class TestIE(InfoExtractor):
@@ -48,6 +48,9 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
         self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
         self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
+        self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
+        self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
+        self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
 
     def test_html_search_meta(self):
         ie = self.ie
@@ -66,6 +69,11 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertEqual(ie._html_search_meta('d', html), '4')
         self.assertEqual(ie._html_search_meta('e', html), '5')
         self.assertEqual(ie._html_search_meta('f', html), '6')
+        self.assertEqual(ie._html_search_meta(('a', 'b', 'c'), html), '1')
+        self.assertEqual(ie._html_search_meta(('c', 'b', 'a'), html), '3')
+        self.assertEqual(ie._html_search_meta(('z', 'x', 'c'), html), '3')
+        self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True)
+        self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True)
 
     def test_download_json(self):
         uri = encode_data_uri(b'{"foo": "blah"}', 'application/json')
index ca25025e23a1eb1fd82eed39a534072bffe293ac..0dfe25c00165a015338f2732cdaf1bccf769195a 100644 (file)
@@ -335,6 +335,40 @@ class TestFormatSelection(unittest.TestCase):
             downloaded = ydl.downloaded_info_dicts[0]
             self.assertEqual(downloaded['format_id'], f1['format_id'])
 
+    def test_audio_only_extractor_format_selection(self):
+        # For extractors with incomplete formats (all formats are audio-only or
+        # video-only) best and worst should fallback to corresponding best/worst
+        # video-only or audio-only formats (as per
+        # https://github.com/rg3/youtube-dl/pull/5556)
+        formats = [
+            {'format_id': 'low', 'ext': 'mp3', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL},
+            {'format_id': 'high', 'ext': 'mp3', 'preference': 2, 'vcodec': 'none', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        ydl = YDL({'format': 'best'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'high')
+
+        ydl = YDL({'format': 'worst'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'low')
+
+    def test_format_not_available(self):
+        formats = [
+            {'format_id': 'regular', 'ext': 'mp4', 'height': 360, 'url': TEST_URL},
+            {'format_id': 'video', 'ext': 'mp4', 'height': 720, 'acodec': 'none', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        # This must fail since complete video-audio format does not match filter
+        # and extractor does not provide incomplete only formats (i.e. only
+        # video-only or audio-only).
+        ydl = YDL({'format': 'best[height>360]'})
+        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+
     def test_invalid_format_specs(self):
         def assert_syntax_error(format_spec):
             ydl = YDL({'format': format_spec})
index f5af184e6e0a79ccc11a9a66c2a9f19434087108..cd1cd4b24708da589ae2b15c79801271cc71166b 100644 (file)
@@ -6,6 +6,7 @@ from __future__ import unicode_literals
 import os
 import sys
 import unittest
+import collections
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 
@@ -100,8 +101,6 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch(':ytsubs', ['youtube:subscriptions'])
         self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
         self.assertMatch(':ythistory', ['youtube:history'])
-        self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
-        self.assertMatch(':tds', ['ComedyCentralShows'])
 
     def test_vimeo_matching(self):
         self.assertMatch('https://vimeo.com/channels/tributes', ['vimeo:channel'])
@@ -130,6 +129,15 @@ class TestAllURLsMatching(unittest.TestCase):
             'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
             ['Yahoo'])
 
+    def test_no_duplicated_ie_names(self):
+        name_accu = collections.defaultdict(list)
+        for ie in self.ies:
+            name_accu[ie.IE_NAME.lower()].append(type(ie).__name__)
+        for (ie_name, ie_list) in name_accu.items():
+            self.assertEqual(
+                len(ie_list), 1,
+                'Multiple extractors with the same IE_NAME "%s" (%s)' % (ie_name, ', '.join(ie_list)))
+
 
 if __name__ == '__main__':
     unittest.main()
index f5317ac3e24290d5aa73e12c7e490bfed72d6c21..b574249489a3ded4cf5dcd66e49c123829c2331e 100644 (file)
@@ -87,6 +87,8 @@ class TestCompat(unittest.TestCase):
 
     def test_compat_shlex_split(self):
         self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
+        self.assertEqual(compat_shlex_split('-option "one\ntwo" \n -flag'), ['-option', 'one\ntwo', '-flag'])
+        self.assertEqual(compat_shlex_split('-val 中文'), ['-val', '中文'])
 
     def test_compat_etree_fromstring(self):
         xml = '''
index 5076ced510847b83b2ac7a5374b92ef248ba8ec8..fdc68ccb42c85410788ecb7bcb1eafd802b3a794 100644 (file)
@@ -138,27 +138,27 @@ class TestProxy(unittest.TestCase):
         self.proxy_thread.daemon = True
         self.proxy_thread.start()
 
-        self.cn_proxy = compat_http_server.HTTPServer(
-            ('localhost', 0), _build_proxy_handler('cn'))
-        self.cn_port = http_server_port(self.cn_proxy)
-        self.cn_proxy_thread = threading.Thread(target=self.cn_proxy.serve_forever)
-        self.cn_proxy_thread.daemon = True
-        self.cn_proxy_thread.start()
+        self.geo_proxy = compat_http_server.HTTPServer(
+            ('localhost', 0), _build_proxy_handler('geo'))
+        self.geo_port = http_server_port(self.geo_proxy)
+        self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever)
+        self.geo_proxy_thread.daemon = True
+        self.geo_proxy_thread.start()
 
     def test_proxy(self):
-        cn_proxy = 'localhost:{0}'.format(self.cn_port)
+        geo_proxy = 'localhost:{0}'.format(self.geo_port)
         ydl = YoutubeDL({
             'proxy': 'localhost:{0}'.format(self.port),
-            'cn_verification_proxy': cn_proxy,
+            'geo_verification_proxy': geo_proxy,
         })
         url = 'http://foo.com/bar'
         response = ydl.urlopen(url).read().decode('utf-8')
         self.assertEqual(response, 'normal: {0}'.format(url))
 
         req = compat_urllib_request.Request(url)
-        req.add_header('Ytdl-request-proxy', cn_proxy)
+        req.add_header('Ytdl-request-proxy', geo_proxy)
         response = ydl.urlopen(req).read().decode('utf-8')
-        self.assertEqual(response, 'cn: {0}'.format(url))
+        self.assertEqual(response, 'geo: {0}'.format(url))
 
     def test_proxy_with_idn(self):
         ydl = YoutubeDL({
index b7ef51f8defa05da4a329315a069f10133d26bea..74fcf91c0cb622662d3a7c2d4985377a0ea6ce7b 100644 (file)
@@ -33,6 +33,7 @@ from youtube_dl.utils import (
     ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
+    get_element_by_class,
     InAdvancePagedList,
     intlist_to_bytes,
     is_html,
@@ -41,6 +42,7 @@ from youtube_dl.utils import (
     ohdave_rsa_encrypt,
     OnDemandPagedList,
     orderedSet,
+    parse_age_limit,
     parse_duration,
     parse_filesize,
     parse_count,
@@ -60,11 +62,13 @@ from youtube_dl.utils import (
     timeconvert,
     unescapeHTML,
     unified_strdate,
+    unified_timestamp,
     unsmuggle_url,
     uppercase_escape,
     lowercase_escape,
     url_basename,
     urlencode_postdata,
+    urshift,
     update_url_query,
     version_tuple,
     xpath_with_ns,
@@ -78,6 +82,7 @@ from youtube_dl.utils import (
     cli_option,
     cli_valueless_option,
     cli_bool_option,
+    parse_codecs,
 )
 from youtube_dl.compat import (
     compat_chr,
@@ -283,8 +288,29 @@ class TestUtil(unittest.TestCase):
             '20150202')
         self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214')
         self.assertEqual(unified_strdate('25-09-2014'), '20140925')
+        self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227')
         self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None)
 
+    def test_unified_timestamps(self):
+        self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600)
+        self.assertEqual(unified_timestamp('8/7/2009'), 1247011200)
+        self.assertEqual(unified_timestamp('Dec 14, 2012'), 1355443200)
+        self.assertEqual(unified_timestamp('2012/10/11 01:56:38 +0000'), 1349920598)
+        self.assertEqual(unified_timestamp('1968 12 10'), -33436800)
+        self.assertEqual(unified_timestamp('1968-12-10'), -33436800)
+        self.assertEqual(unified_timestamp('28/01/2014 21:00:00 +0100'), 1390939200)
+        self.assertEqual(
+            unified_timestamp('11/26/2014 11:30:00 AM PST', day_first=False),
+            1417001400)
+        self.assertEqual(
+            unified_timestamp('2/2/2015 6:47:40 PM', day_first=False),
+            1422902860)
+        self.assertEqual(unified_timestamp('Feb 14th 2016 5:45PM'), 1455471900)
+        self.assertEqual(unified_timestamp('25-09-2014'), 1411603200)
+        self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200)
+        self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)
+        self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500)
+
     def test_determine_ext(self):
         self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
         self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None)
@@ -383,6 +409,12 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(res_url, url)
         self.assertEqual(res_data, None)
 
+        smug_url = smuggle_url(url, {'a': 'b'})
+        smug_smug_url = smuggle_url(smug_url, {'c': 'd'})
+        res_url, res_data = unsmuggle_url(smug_smug_url)
+        self.assertEqual(res_url, url)
+        self.assertEqual(res_data, {'a': 'b', 'c': 'd'})
+
     def test_shell_quote(self):
         args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
         self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""")
@@ -401,6 +433,20 @@ class TestUtil(unittest.TestCase):
             url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4'),
             'trailer.mp4')
 
+    def test_parse_age_limit(self):
+        self.assertEqual(parse_age_limit(None), None)
+        self.assertEqual(parse_age_limit(False), None)
+        self.assertEqual(parse_age_limit('invalid'), None)
+        self.assertEqual(parse_age_limit(0), 0)
+        self.assertEqual(parse_age_limit(18), 18)
+        self.assertEqual(parse_age_limit(21), 21)
+        self.assertEqual(parse_age_limit(22), None)
+        self.assertEqual(parse_age_limit('18'), 18)
+        self.assertEqual(parse_age_limit('18+'), 18)
+        self.assertEqual(parse_age_limit('PG-13'), 13)
+        self.assertEqual(parse_age_limit('TV-14'), 14)
+        self.assertEqual(parse_age_limit('TV-MA'), 17)
+
     def test_parse_duration(self):
         self.assertEqual(parse_duration(None), None)
         self.assertEqual(parse_duration(False), None)
@@ -579,6 +625,29 @@ class TestUtil(unittest.TestCase):
             limit_length('foo bar baz asd', 12).startswith('foo bar'))
         self.assertTrue('...' in limit_length('foo bar baz asd', 12))
 
+    def test_parse_codecs(self):
+        self.assertEqual(parse_codecs(''), {})
+        self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), {
+            'vcodec': 'avc1.77.30',
+            'acodec': 'mp4a.40.2',
+        })
+        self.assertEqual(parse_codecs('mp4a.40.2'), {
+            'vcodec': 'none',
+            'acodec': 'mp4a.40.2',
+        })
+        self.assertEqual(parse_codecs('mp4a.40.5,avc1.42001e'), {
+            'vcodec': 'avc1.42001e',
+            'acodec': 'mp4a.40.5',
+        })
+        self.assertEqual(parse_codecs('avc3.640028'), {
+            'vcodec': 'avc3.640028',
+            'acodec': 'none',
+        })
+        self.assertEqual(parse_codecs(', h264,,newcodec,aac'), {
+            'vcodec': 'h264',
+            'acodec': 'aac',
+        })
+
     def test_escape_rfc3986(self):
         reserved = "!*'();:@&=+$,/?#[]"
         unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~'
@@ -899,6 +968,7 @@ The first line
         self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
         self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
         self.assertEqual(cli_option({}, '--proxy', 'proxy'), [])
+        self.assertEqual(cli_option({'retries': 10}, '--retries', 'retries'), ['--retries', '10'])
 
     def test_cli_valueless_option(self):
         self.assertEqual(cli_valueless_option(
@@ -959,5 +1029,17 @@ The first line
         self.assertRaises(ValueError, encode_base_n, 0, 70)
         self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table)
 
+    def test_urshift(self):
+        self.assertEqual(urshift(3, 1), 1)
+        self.assertEqual(urshift(-3, 1), 2147483646)
+
+    def test_get_element_by_class(self):
+        html = '''
+            <span class="foo bar">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), 'nice')
+        self.assertEqual(get_element_by_class('no-such-class', html), None)
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py
new file mode 100644 (file)
index 0000000..96a66f7
--- /dev/null
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import unittest
+
+import sys
+import os
+import subprocess
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+rootDir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+class TestVerboseOutput(unittest.TestCase):
+    def test_private_info_arg(self):
+        outp = subprocess.Popen(
+            [
+                sys.executable, 'youtube_dl/__main__.py', '-v',
+                '--username', 'johnsmith@gmail.com',
+                '--password', 'secret',
+            ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        sout, serr = outp.communicate()
+        self.assertTrue(b'--username' in serr)
+        self.assertTrue(b'johnsmith' not in serr)
+        self.assertTrue(b'--password' in serr)
+        self.assertTrue(b'secret' not in serr)
+
+    def test_private_info_shortarg(self):
+        outp = subprocess.Popen(
+            [
+                sys.executable, 'youtube_dl/__main__.py', '-v',
+                '-u', 'johnsmith@gmail.com',
+                '-p', 'secret',
+            ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        sout, serr = outp.communicate()
+        self.assertTrue(b'-u' in serr)
+        self.assertTrue(b'johnsmith' not in serr)
+        self.assertTrue(b'-p' in serr)
+        self.assertTrue(b'secret' not in serr)
+
+    def test_private_info_eq(self):
+        outp = subprocess.Popen(
+            [
+                sys.executable, 'youtube_dl/__main__.py', '-v',
+                '--username=johnsmith@gmail.com',
+                '--password=secret',
+            ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        sout, serr = outp.communicate()
+        self.assertTrue(b'--username' in serr)
+        self.assertTrue(b'johnsmith' not in serr)
+        self.assertTrue(b'--password' in serr)
+        self.assertTrue(b'secret' not in serr)
+
+    def test_private_info_shortarg_eq(self):
+        outp = subprocess.Popen(
+            [
+                sys.executable, 'youtube_dl/__main__.py', '-v',
+                '-u=johnsmith@gmail.com',
+                '-p=secret',
+            ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        sout, serr = outp.communicate()
+        self.assertTrue(b'-u' in serr)
+        self.assertTrue(b'johnsmith' not in serr)
+        self.assertTrue(b'-p' in serr)
+        self.assertTrue(b'secret' not in serr)
+
+if __name__ == '__main__':
+    unittest.main()
index 5036289b062f9ee05ab5c872c76ab38babcb4a54..e844dc98a5b3915070ffae079395233de7ed04f7 100755 (executable)
@@ -5,6 +5,7 @@ from __future__ import absolute_import, unicode_literals
 
 import collections
 import contextlib
+import copy
 import datetime
 import errno
 import fileinput
@@ -196,8 +197,8 @@ class YoutubeDL(object):
     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                        At the moment, this is only supported by YouTube.
     proxy:             URL of the proxy server to use
-    cn_verification_proxy:  URL of the proxy to use for IP address verification
-                       on Chinese sites. (Experimental)
+    geo_verification_proxy:  URL of the proxy to use for IP address verification
+                       on geo-restricted sites. (Experimental)
     socket_timeout:    Time to wait for unresponsive hosts, in seconds
     bidi_workaround:   Work around buggy terminals without bidirectional text
                        support, using fridibi
@@ -248,7 +249,16 @@ class YoutubeDL(object):
     source_address:    (Experimental) Client-side IP address to bind to.
     call_home:         Boolean, true iff we are allowed to contact the
                        youtube-dl servers for debugging.
-    sleep_interval:    Number of seconds to sleep before each download.
+    sleep_interval:    Number of seconds to sleep before each download when
+                       used alone or a lower bound of a range for randomized
+                       sleep before each download (minimum possible number
+                       of seconds to sleep) when used along with
+                       max_sleep_interval.
+    max_sleep_interval:Upper bound of a range for randomized sleep before each
+                       download (maximum possible number of seconds to sleep).
+                       Must only be used along with sleep_interval.
+                       Actual sleep time will be a random float from range
+                       [sleep_interval; max_sleep_interval].
     listformats:       Print an overview of available video formats and exit.
     list_thumbnails:   Print a table of all thumbnails and exit.
     match_filter:      A function that gets called with the info_dict of
@@ -304,6 +314,11 @@ class YoutubeDL(object):
         self.params.update(params)
         self.cache = Cache(self)
 
+        if self.params.get('cn_verification_proxy') is not None:
+            self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.')
+            if self.params.get('geo_verification_proxy') is None:
+                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
+
         if params.get('bidi_workaround', False):
             try:
                 import pty
@@ -1046,9 +1061,9 @@ class YoutubeDL(object):
             if isinstance(selector, list):
                 fs = [_build_selector_function(s) for s in selector]
 
-                def selector_function(formats):
+                def selector_function(ctx):
                     for f in fs:
-                        for format in f(formats):
+                        for format in f(ctx):
                             yield format
                 return selector_function
             elif selector.type == GROUP:
@@ -1056,17 +1071,17 @@ class YoutubeDL(object):
             elif selector.type == PICKFIRST:
                 fs = [_build_selector_function(s) for s in selector.selector]
 
-                def selector_function(formats):
+                def selector_function(ctx):
                     for f in fs:
-                        picked_formats = list(f(formats))
+                        picked_formats = list(f(ctx))
                         if picked_formats:
                             return picked_formats
                     return []
             elif selector.type == SINGLE:
                 format_spec = selector.selector
 
-                def selector_function(formats):
-                    formats = list(formats)
+                def selector_function(ctx):
+                    formats = list(ctx['formats'])
                     if not formats:
                         return
                     if format_spec == 'all':
@@ -1079,9 +1094,10 @@ class YoutubeDL(object):
                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                         if audiovideo_formats:
                             yield audiovideo_formats[format_idx]
-                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
-                        elif (all(f.get('acodec') != 'none' for f in formats) or
-                              all(f.get('vcodec') != 'none' for f in formats)):
+                        # for extractors with incomplete formats (audio only (soundcloud)
+                        # or video only (imgur)) we will fallback to best/worst
+                        # {video,audio}-only format
+                        elif ctx['incomplete_formats']:
                             yield formats[format_idx]
                     elif format_spec == 'bestaudio':
                         audio_formats = [
@@ -1155,17 +1171,18 @@ class YoutubeDL(object):
                     }
                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
 
-                def selector_function(formats):
-                    formats = list(formats)
-                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
+                def selector_function(ctx):
+                    for pair in itertools.product(
+                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                         yield _merge(pair)
 
             filters = [self._build_format_filter(f) for f in selector.filters]
 
-            def final_selector(formats):
+            def final_selector(ctx):
+                ctx_copy = copy.deepcopy(ctx)
                 for _filter in filters:
-                    formats = list(filter(_filter, formats))
-                return selector_function(formats)
+                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
+                return selector_function(ctx_copy)
             return final_selector
 
         stream = io.BytesIO(format_spec.encode('utf-8'))
@@ -1372,7 +1389,34 @@ class YoutubeDL(object):
             req_format_list.append('best')
             req_format = '/'.join(req_format_list)
         format_selector = self.build_format_selector(req_format)
-        formats_to_download = list(format_selector(formats))
+
+        # While in format selection we may need to have an access to the original
+        # format set in order to calculate some metrics or do some processing.
+        # For now we need to be able to guess whether original formats provided
+        # by extractor are incomplete or not (i.e. whether extractor provides only
+        # video-only or audio-only formats) for proper formats selection for
+        # extractors with such incomplete formats (see
+        # https://github.com/rg3/youtube-dl/pull/5556).
+        # Since formats may be filtered during format selection and may not match
+        # the original formats the results may be incorrect. Thus original formats
+        # or pre-calculated metrics should be passed to format selection routines
+        # as well.
+        # We will pass a context object containing all necessary additional data
+        # instead of just formats.
+        # This fixes incorrect format selection issue (see
+        # https://github.com/rg3/youtube-dl/issues/10083).
+        incomplete_formats = (
+            # All formats are video-only or
+            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
+            # all formats are audio-only
+            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
+
+        ctx = {
+            'formats': formats,
+            'incomplete_formats': incomplete_formats,
+        }
+
+        formats_to_download = list(format_selector(ctx))
         if not formats_to_download:
             raise ExtractorError('requested format not available',
                                  expected=True)
@@ -1559,7 +1603,9 @@ class YoutubeDL(object):
                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
                     else:
                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
-                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
+                        # Use newline='' to prevent conversion of newline characters
+                        # See https://github.com/rg3/youtube-dl/issues/10268
+                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
                             subfile.write(sub_data)
                 except (OSError, IOError):
                     self.report_error('Cannot write subtitles file ' + sub_filename)
index 4905674ad26f29323264cdcb188f1c9aed621a0a..a9730292cd6801c1bbd7f68b586e288bad7fe21d 100644 (file)
@@ -145,6 +145,16 @@ def _real_main(argv=None):
         if numeric_limit is None:
             parser.error('invalid max_filesize specified')
         opts.max_filesize = numeric_limit
+    if opts.sleep_interval is not None:
+        if opts.sleep_interval < 0:
+            parser.error('sleep interval must be positive or 0')
+    if opts.max_sleep_interval is not None:
+        if opts.max_sleep_interval < 0:
+            parser.error('max sleep interval must be positive or 0')
+        if opts.max_sleep_interval < opts.sleep_interval:
+            parser.error('max sleep interval must be greater than or equal to min sleep interval')
+    else:
+        opts.max_sleep_interval = opts.sleep_interval
 
     def parse_retries(retries):
         if retries in ('inf', 'infinite'):
@@ -370,6 +380,7 @@ def _real_main(argv=None):
         'source_address': opts.source_address,
         'call_home': opts.call_home,
         'sleep_interval': opts.sleep_interval,
+        'max_sleep_interval': opts.max_sleep_interval,
         'external_downloader': opts.external_downloader,
         'list_thumbnails': opts.list_thumbnails,
         'playlist_items': opts.playlist_items,
@@ -382,6 +393,8 @@ def _real_main(argv=None):
         'external_downloader_args': external_downloader_args,
         'postprocessor_args': postprocessor_args,
         'cn_verification_proxy': opts.cn_verification_proxy,
+        'geo_verification_proxy': opts.geo_verification_proxy,
+
     }
 
     with YoutubeDL(ydl_opts) as ydl:
index 67db1c7c6595b0f219d393da7df1363c48103f26..b8aaf5a461c9e3ca2884c748ebb3225a2fd9fe29 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import binascii
@@ -2594,15 +2595,19 @@ except ImportError:  # Python < 3.3
             return "'" + s.replace("'", "'\"'\"'") + "'"
 
 
-if sys.version_info >= (2, 7, 3):
+try:
+    args = shlex.split('中文')
+    assert (isinstance(args, list) and
+            isinstance(args[0], compat_str) and
+            args[0] == '中文')
     compat_shlex_split = shlex.split
-else:
+except (AssertionError, UnicodeEncodeError):
     # Working around shlex issue with unicode strings on some python 2
     # versions (see http://bugs.python.org/issue1548891)
     def compat_shlex_split(s, comments=False, posix=True):
         if isinstance(s, compat_str):
             s = s.encode('utf-8')
-        return shlex.split(s, comments, posix)
+        return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix)))
 
 
 def compat_ord(c):
index 1dba9f49a8b9b586b8428c6c7d65ca641de02c58..8482cbd8423dae254db7efff873588cd8fb10b8a 100644 (file)
@@ -4,6 +4,7 @@ import os
 import re
 import sys
 import time
+import random
 
 from ..compat import compat_os_name
 from ..utils import (
@@ -342,8 +343,11 @@ class FileDownloader(object):
             })
             return True
 
-        sleep_interval = self.params.get('sleep_interval')
-        if sleep_interval:
+        min_sleep_interval = self.params.get('sleep_interval')
+        if min_sleep_interval:
+            max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval)
+            print(min_sleep_interval, max_sleep_interval)
+            sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval)
             self.to_screen('[download] Sleeping %s seconds...' % sleep_interval)
             time.sleep(sleep_interval)
 
index fae2450248494a70f237a65d07a4bedcfddaadeb..cf45562217ac0c27b77f3a241f3881d770d800ce 100644 (file)
@@ -96,6 +96,12 @@ class CurlFD(ExternalFD):
         cmd = [self.exe, '--location', '-o', tmpfilename]
         for key, val in info_dict['http_headers'].items():
             cmd += ['--header', '%s: %s' % (key, val)]
+        cmd += self._bool_option('--continue-at', 'continuedl', '-', '0')
+        cmd += self._valueless_option('--silent', 'noprogress')
+        cmd += self._valueless_option('--verbose', 'verbose')
+        cmd += self._option('--limit-rate', 'ratelimit')
+        cmd += self._option('--retry', 'retries')
+        cmd += self._option('--max-filesize', 'max_filesize')
         cmd += self._option('--interface', 'source_address')
         cmd += self._option('--proxy', 'proxy')
         cmd += self._valueless_option('--insecure', 'nocheckcertificate')
@@ -103,6 +109,16 @@ class CurlFD(ExternalFD):
         cmd += ['--', info_dict['url']]
         return cmd
 
+    def _call_downloader(self, tmpfilename, info_dict):
+        cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)]
+
+        self._debug_cmd(cmd)
+
+        # curl writes the progress to stderr so don't capture it.
+        p = subprocess.Popen(cmd)
+        p.communicate()
+        return p.returncode
+
 
 class AxelFD(ExternalFD):
     AVAILABLE_OPT = '-V'
index 8f88b02414a28a8da650e03418c8ecd7e52a59a6..80c21d40bc88382a64634b0eeb9daa3eaaccc303 100644 (file)
@@ -196,6 +196,11 @@ def build_fragments_list(boot_info):
     first_frag_number = fragment_run_entry_table[0]['first']
     fragments_counter = itertools.count(first_frag_number)
     for segment, fragments_count in segment_run_table['segment_run']:
+        # In some live HDS streams (for example Rai), `fragments_count` is
+        # abnormal and causing out-of-memory errors. It's OK to change the
+        # number of fragments for live streams as they are updated periodically
+        if fragments_count == 4294967295 and boot_info['live']:
+            fragments_count = 2
         for _ in range(fragments_count):
             res.append((segment, next(fragments_counter)))
 
@@ -329,7 +334,11 @@ class F4mFD(FragmentFD):
 
         base_url = compat_urlparse.urljoin(man_url, media.attrib['url'])
         bootstrap_node = doc.find(_add_ns('bootstrapInfo'))
-        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url)
+        # From Adobe F4M 3.0 spec:
+        # The <baseURL> element SHALL be the base URL for all relative
+        # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said
+        # URLs should be relative to the location of the containing document.
+        boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url)
         live = boot_info['live']
         metadata_node = media.find(_add_ns('metadata'))
         if metadata_node is not None:
index 3b7bb35087568b084e84fcd0610f951c0eaf8472..8d7971e5d7042e5917deaf589cdebac7d488cd10 100644 (file)
@@ -20,6 +20,7 @@ from ..utils import (
     encodeFilename,
     sanitize_open,
     parse_m3u8_attributes,
+    update_url_query,
 )
 
 
@@ -82,6 +83,7 @@ class HlsFD(FragmentFD):
 
         self._prepare_and_start_frag_download(ctx)
 
+        extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url')
         i = 0
         media_sequence = 0
         decrypt_info = {'METHOD': 'NONE'}
@@ -95,6 +97,8 @@ class HlsFD(FragmentFD):
                         if re.match(r'^https?://', line)
                         else compat_urlparse.urljoin(man_url, line))
                     frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
+                    if extra_param_to_segment_url:
+                        frag_url = update_url_query(frag_url, extra_param_to_segment_url)
                     success = ctx['dl'].download(frag_filename, {'url': frag_url})
                     if not success:
                         return False
@@ -120,6 +124,8 @@ class HlsFD(FragmentFD):
                         if not re.match(r'^https?://', decrypt_info['URI']):
                             decrypt_info['URI'] = compat_urlparse.urljoin(
                                 man_url, decrypt_info['URI'])
+                        if extra_param_to_segment_url:
+                            decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_param_to_segment_url)
                         decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
                 elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
                     media_sequence = int(line[22:])
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
new file mode 100644 (file)
index 0000000..9e3a3e3
--- /dev/null
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import re
+import time
+import xml.etree.ElementTree as etree
+
+from .common import InfoExtractor
+from ..utils import (
+    unescapeHTML,
+    urlencode_postdata,
+    unified_timestamp,
+)
+
+
+class AdobePassIE(InfoExtractor):
+    _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s'
+    _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
+
+    @staticmethod
+    def _get_mvpd_resource(provider_id, title, guid, rating):
+        channel = etree.Element('channel')
+        channel_title = etree.SubElement(channel, 'title')
+        channel_title.text = provider_id
+        item = etree.SubElement(channel, 'item')
+        resource_title = etree.SubElement(item, 'title')
+        resource_title.text = title
+        resource_guid = etree.SubElement(item, 'guid')
+        resource_guid.text = guid
+        resource_rating = etree.SubElement(item, 'media:rating')
+        resource_rating.attrib = {'scheme': 'urn:v-chip'}
+        resource_rating.text = rating
+        return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>'
+
+    def _extract_mvpd_auth(self, url, video_id, requestor_id, resource):
+        def xml_text(xml_str, tag):
+            return self._search_regex(
+                '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag)
+
+        mvpd_headers = {
+            'ap_42': 'anonymous',
+            'ap_11': 'Linux i686',
+            'ap_z': self._USER_AGENT,
+            'User-Agent': self._USER_AGENT,
+        }
+
+        guid = xml_text(resource, 'guid')
+        requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {}
+        authn_token = requestor_info.get('authn_token')
+        if authn_token:
+            token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(authn_token, 'simpleTokenExpires')))
+            if token_expires and token_expires <= int(time.time()):
+                authn_token = None
+                requestor_info = {}
+        if not authn_token:
+            # TODO add support for other TV Providers
+            mso_id = 'DTV'
+            username, password = self._get_netrc_login_info(mso_id)
+            if not username or not password:
+                return ''
+
+            def post_form(form_page, note, data={}):
+                post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url')
+                return self._download_webpage(
+                    post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={
+                        'Content-Type': 'application/x-www-form-urlencoded',
+                    })
+
+            provider_redirect_page = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id,
+                'Downloading Provider Redirect Page', query={
+                    'noflash': 'true',
+                    'mso_id': mso_id,
+                    'requestor_id': requestor_id,
+                    'no_iframe': 'false',
+                    'domain_name': 'adobe.com',
+                    'redirect_url': url,
+                })
+            provider_login_page = post_form(
+                provider_redirect_page, 'Downloading Provider Login Page')
+            mvpd_confirm_page = post_form(provider_login_page, 'Logging in', {
+                'username': username,
+                'password': password,
+            })
+            post_form(mvpd_confirm_page, 'Confirming Login')
+
+            session = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id,
+                'Retrieving Session', data=urlencode_postdata({
+                    '_method': 'GET',
+                    'requestor_id': requestor_id,
+                }), headers=mvpd_headers)
+            if '<pendingLogout' in session:
+                self._downloader.cache.store('mvpd', requestor_id, {})
+                return self._extract_mvpd_auth(url, video_id, requestor_id, resource)
+            authn_token = unescapeHTML(xml_text(session, 'authnToken'))
+            requestor_info['authn_token'] = authn_token
+            self._downloader.cache.store('mvpd', requestor_id, requestor_info)
+
+        authz_token = requestor_info.get(guid)
+        if not authz_token:
+            authorize = self._download_webpage(
+                self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id,
+                'Retrieving Authorization Token', data=urlencode_postdata({
+                    'resource_id': resource,
+                    'requestor_id': requestor_id,
+                    'authentication_token': authn_token,
+                    'mso_id': xml_text(authn_token, 'simpleTokenMsoID'),
+                    'userMeta': '1',
+                }), headers=mvpd_headers)
+            if '<pendingLogout' in authorize:
+                self._downloader.cache.store('mvpd', requestor_id, {})
+                return self._extract_mvpd_auth(url, video_id, requestor_id, resource)
+            authz_token = unescapeHTML(xml_text(authorize, 'authzToken'))
+            requestor_info[guid] = authz_token
+            self._downloader.cache.store('mvpd', requestor_id, requestor_info)
+
+        mvpd_headers.update({
+            'ap_19': xml_text(authn_token, 'simpleSamlNameID'),
+            'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'),
+        })
+
+        short_authorize = self._download_webpage(
+            self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize',
+            video_id, 'Retrieving Media Token', data=urlencode_postdata({
+                'authz_token': authz_token,
+                'requestor_id': requestor_id,
+                'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'),
+                'hashed_guid': 'false',
+            }), headers=mvpd_headers)
+        if '<pendingLogout' in short_authorize:
+            self._downloader.cache.store('mvpd', requestor_id, {})
+            return self._extract_mvpd_auth(url, video_id, requestor_id, resource)
+        return short_authorize
index 8157da2cb63af8a7079fda8c388be3108281a7ad..3f7f8c03624a9e25611b239e89cf900b41cbce0e 100644 (file)
@@ -83,6 +83,20 @@ class AdultSwimIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         }
+    }, {
+        # heroMetadata.trailer
+        'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
+        'info_dict': {
+            'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
+            'ext': 'mp4',
+            'title': 'Decker - Inside Decker: A New Hero',
+            'description': 'md5:c916df071d425d62d70c86d4399d3ee0',
+            'duration': 249.008,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
     }]
 
     @staticmethod
@@ -133,20 +147,26 @@ class AdultSwimIE(InfoExtractor):
             if video_info is None:
                 if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
                     video_info = bootstrapped_data['slugged_video']
-                else:
-                    raise ExtractorError('Unable to find video info')
+            if not video_info:
+                video_info = bootstrapped_data.get('heroMetadata', {}).get('trailer').get('video')
+            if not video_info:
+                raise ExtractorError('Unable to find video info')
 
             show = bootstrapped_data['show']
             show_title = show['title']
             stream = video_info.get('stream')
-            clips = [stream] if stream else video_info.get('clips')
-            if not clips:
+            if stream and stream.get('videoPlaybackID'):
+                segment_ids = [stream['videoPlaybackID']]
+            elif video_info.get('clips'):
+                segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
+            elif video_info.get('videoPlaybackID'):
+                segment_ids = [video_info['videoPlaybackID']]
+            else:
                 raise ExtractorError(
                     'This video is only available via cable service provider subscription that'
                     ' is not currently supported. You may want to use --cookies.'
                     if video_info.get('auth') is True else 'Unable to find stream or clips',
                     expected=True)
-            segment_ids = [clip['videoPlaybackID'] for clip in clips]
 
         episode_id = video_info['id']
         episode_title = video_info['title']
index 1bbfe264177dab80c6e40009e8543e52600c3d9d..6adb6d824c00ec733afaf1bbe1b243f7d623b647 100644 (file)
@@ -2,41 +2,33 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
+from .theplatform import ThePlatformIE
 from ..utils import (
     smuggle_url,
     update_url_query,
     unescapeHTML,
+    extract_attributes,
+    get_element_by_attribute,
 )
+from ..compat import (
+    compat_urlparse,
+)
+
 
+class AENetworksBaseIE(ThePlatformIE):
+    _THEPLATFORM_KEY = 'crazyjava'
+    _THEPLATFORM_SECRET = 's3cr3t'
 
-class AENetworksIE(InfoExtractor):
+
+class AENetworksIE(AENetworksBaseIE):
     IE_NAME = 'aenetworks'
     IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])'
-
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)'
     _TESTS = [{
-        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
-        'info_dict': {
-            'id': 'g12m5Gyt3fdR',
-            'ext': 'mp4',
-            'title': "Bet You Didn't Know: Valentine's Day",
-            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
-            'timestamp': 1375819729,
-            'upload_date': '20130806',
-            'uploader': 'AENE-NEW',
-        },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'add_ie': ['ThePlatform'],
-        'expected_warnings': ['JSON-LD'],
-    }, {
         'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
         'md5': '8ff93eb073449f151d6b90c0ae1ef0c7',
         'info_dict': {
-            'id': 'eg47EERs_JsZ',
+            'id': '22253814',
             'ext': 'mp4',
             'title': 'Winter Is Coming',
             'description': 'md5:641f424b7a19d8e24f26dea22cf59d74',
@@ -46,42 +38,171 @@ class AENetworksIE(InfoExtractor):
         },
         'add_ie': ['ThePlatform'],
     }, {
-        'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry',
+        'url': 'http://www.history.com/shows/ancient-aliens/season-1',
+        'info_dict': {
+            'id': '71889446852',
+        },
+        'playlist_mincount': 5,
+    }, {
+        'url': 'http://www.mylifetime.com/shows/atlanta-plastic',
+        'info_dict': {
+            'id': 'SERIES4317',
+            'title': 'Atlanta Plastic',
+        },
+        'playlist_mincount': 2,
+    }, {
+        'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1',
         'only_matching': True
     }, {
-        'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage',
+        'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8',
         'only_matching': True
     }, {
-        'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients',
+        'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6',
+        'only_matching': True
+    }, {
+        'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie',
         'only_matching': True
     }]
+    _DOMAIN_TO_REQUESTOR_ID = {
+        'history.com': 'HISTORY',
+        'aetv.com': 'AETV',
+        'mylifetime.com': 'LIFETIME',
+        'fyi.tv': 'FYI',
+    }
 
     def _real_extract(self, url):
-        page_type, video_id = re.match(self._VALID_URL, url).groups()
+        domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups()
+        display_id = show_path or movie_display_id
+        webpage = self._download_webpage(url, display_id)
+        if show_path:
+            url_parts = show_path.split('/')
+            url_parts_len = len(url_parts)
+            if url_parts_len == 1:
+                entries = []
+                for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
+                    entries.append(self.url_result(
+                        compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
+                return self.playlist_result(
+                    entries, self._html_search_meta('aetn:SeriesId', webpage),
+                    self._html_search_meta('aetn:SeriesTitle', webpage))
+            elif url_parts_len == 2:
+                entries = []
+                for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage):
+                    episode_attributes = extract_attributes(episode_item)
+                    episode_url = compat_urlparse.urljoin(
+                        url, episode_attributes['data-canonical'])
+                    entries.append(self.url_result(
+                        episode_url, 'AENetworks',
+                        episode_attributes['data-videoid']))
+                return self.playlist_result(
+                    entries, self._html_search_meta('aetn:SeasonId', webpage))
+
+        query = {
+            'mbr': 'true',
+            'assetTypes': 'medium_video_s3'
+        }
+        video_id = self._html_search_meta('aetn:VideoID', webpage)
+        media_url = self._search_regex(
+            r"media_url\s*=\s*'([^']+)'", webpage, 'video url')
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        if theplatform_metadata.get('AETN$isBehindWall'):
+            requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain]
+            resource = self._get_mvpd_resource(
+                requestor_id, theplatform_metadata['title'],
+                theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
+                theplatform_metadata['ratings'][0]['rating'])
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, requestor_id, resource)
+        info.update(self._search_json_ld(webpage, video_id, fatal=False))
+        media_url = update_url_query(media_url, query)
+        media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET)
+        formats, subtitles = self._extract_theplatform_smil(media_url, video_id)
+        self._sort_formats(formats)
+        info.update({
+            'id': video_id,
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+        return info
 
-        webpage = self._download_webpage(url, video_id)
 
-        video_url_re = [
-            r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id,
-            r"media_url\s*=\s*'([^']+)'"
-        ]
-        video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url'))
-        query = {'mbr': 'true'}
-        if page_type == 'shows':
-            query['assetTypes'] = 'medium_video_s3'
-        if 'switch=hds' in video_url:
-            query['switch'] = 'hls'
+class HistoryTopicIE(AENetworksBaseIE):
+    IE_NAME = 'history:topic'
+    IE_DESC = 'History.com Topic'
+    _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?'
+    _TESTS = [{
+        'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false',
+        'info_dict': {
+            'id': '40700995724',
+            'ext': 'mp4',
+            'title': "Bet You Didn't Know: Valentine's Day",
+            'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7',
+            'timestamp': 1375819729,
+            'upload_date': '20130806',
+            'uploader': 'AENE-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos',
+        'info_dict':
+        {
+            'id': 'world-war-i-history',
+            'title': 'World War I History',
+        },
+        'playlist_mincount': 24,
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i-history/videos',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches',
+        'only_matching': True,
+    }]
 
-        info = self._search_json_ld(webpage, video_id, fatal=False)
-        info.update({
+    def theplatform_url_result(self, theplatform_url, video_id, query):
+        return {
             '_type': 'url_transparent',
+            'id': video_id,
             'url': smuggle_url(
-                update_url_query(video_url, query),
+                update_url_query(theplatform_url, query),
                 {
                     'sig': {
-                        'key': 'crazyjava',
-                        'secret': 's3cr3t'},
+                        'key': self._THEPLATFORM_KEY,
+                        'secret': self._THEPLATFORM_SECRET,
+                    },
                     'force_smil_url': True
                 }),
-        })
-        return info
+            'ie_key': 'ThePlatform',
+        }
+
+    def _real_extract(self, url):
+        topic_id, video_display_id = re.match(self._VALID_URL, url).groups()
+        if video_display_id:
+            webpage = self._download_webpage(url, video_display_id)
+            release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups()
+            release_url = unescapeHTML(release_url)
+
+            return self.theplatform_url_result(
+                release_url, video_id, {
+                    'mbr': 'true',
+                    'switch': 'hls'
+                })
+        else:
+            webpage = self._download_webpage(url, topic_id)
+            entries = []
+            for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage):
+                video_attributes = extract_attributes(episode_item)
+                entries.append(self.theplatform_url_result(
+                    video_attributes['data-release-url'], video_attributes['data-id'], {
+                        'mbr': 'true',
+                        'switch': 'hls'
+                    }))
+            return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage))
diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py
new file mode 100644 (file)
index 0000000..c739d2c
--- /dev/null
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .theplatform import ThePlatformIE
+from ..utils import (
+    update_url_query,
+    parse_age_limit,
+    int_or_none,
+)
+
+
+class AMCNetworksIE(ThePlatformIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies/|shows/[^/]+/(?:full-episodes/)?season-\d+/episode-\d+(?:-(?:[^/]+/)?|/))(?P<id>[^/?#]+)'
+    _TESTS = [{
+        'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1',
+        'md5': '',
+        'info_dict': {
+            'id': 's3MX01Nl4vPH',
+            'ext': 'mp4',
+            'title': 'Maron - Season 4 - Step 1',
+            'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.',
+            'age_limit': 17,
+            'upload_date': '20160505',
+            'timestamp': 1462468831,
+            'uploader': 'AMCN',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ifc.com/movies/chaos',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
+        media_url = self._search_regex(r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', webpage, 'media url')
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), display_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        video_id = theplatform_metadata['pid']
+        title = theplatform_metadata['title']
+        rating = theplatform_metadata['ratings'][0]['rating']
+        auth_required = self._search_regex(r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required')
+        if auth_required == 'true':
+            requestor_id = self._search_regex(r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', webpage, 'requestor id')
+            resource = self._get_mvpd_resource(requestor_id, title, video_id, rating)
+            query['auth'] = self._extract_mvpd_auth(url, video_id, requestor_id, resource)
+        media_url = update_url_query(media_url, query)
+        formats, subtitles = self._extract_theplatform_smil(media_url, video_id)
+        self._sort_formats(formats)
+        info.update({
+            'id': video_id,
+            'subtitles': subtitles,
+            'formats': formats,
+            'age_limit': parse_age_limit(parse_age_limit(rating)),
+        })
+        ns_keys = theplatform_metadata.get('$xmlns', {}).keys()
+        if ns_keys:
+            ns = list(ns_keys)[0]
+            series = theplatform_metadata.get(ns + '$show')
+            season_number = int_or_none(theplatform_metadata.get(ns + '$season'))
+            episode = theplatform_metadata.get(ns + '$episodeTitle')
+            episode_number = int_or_none(theplatform_metadata.get(ns + '$episode'))
+            if season_number:
+                title = 'Season %d - %s' % (season_number, title)
+            if series:
+                title = '%s - %s' % (series, title)
+            info.update({
+                'title': title,
+                'series': series,
+                'season_number': season_number,
+                'episode': episode,
+                'episode_number': episode_number,
+            })
+        return info
index 8545681bead617b2ffedeee04212e032bdc3406e..e8e40126baca4bad27f8593dd9bd026f16fad131 100644 (file)
@@ -5,6 +5,8 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     parse_iso8601,
+    mimetype2ext,
+    determine_ext,
 )
 
 
@@ -50,21 +52,25 @@ class AMPIE(InfoExtractor):
         if isinstance(media_content, dict):
             media_content = [media_content]
         for media_data in media_content:
-            media = media_data['@attributes']
-            media_type = media['type']
-            if media_type in ('video/f4m', 'application/f4m+xml'):
+            media = media_data.get('@attributes', {})
+            media_url = media.get('url')
+            if not media_url:
+                continue
+            ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
+            if ext == 'f4m':
                 formats.extend(self._extract_f4m_formats(
-                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+                    media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                     video_id, f4m_id='hds', fatal=False))
-            elif media_type == 'application/x-mpegURL':
+            elif ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
+                    media_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
             else:
                 formats.append({
                     'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
                     'url': media['url'],
                     'tbr': int_or_none(media.get('bitrate')),
                     'filesize': int_or_none(media.get('fileSize')),
+                    'ext': ext,
                 })
 
         self._sort_formats(formats)
index 9b01e38f5fe8b5a80b2635061433cc214fb1b315..9e28f25790bb284b5ba00376292cf33c8ded8cb1 100644 (file)
@@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor):
     _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
     _NETRC_MACHINE = 'animeondemand'
     _TESTS = [{
+        # jap, OmU
         'url': 'https://www.anime-on-demand.de/anime/161',
         'info_dict': {
             'id': '161',
@@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor):
         },
         'playlist_mincount': 4,
     }, {
-        # Film wording is used instead of Episode
+        # Film wording is used instead of Episode, ger/jap, Dub/OmU
         'url': 'https://www.anime-on-demand.de/anime/39',
         'only_matching': True,
     }, {
-        # Episodes without titles
+        # Episodes without titles, jap, OmU
         'url': 'https://www.anime-on-demand.de/anime/162',
         'only_matching': True,
     }, {
         # ger/jap, Dub/OmU, account required
         'url': 'https://www.anime-on-demand.de/anime/169',
         'only_matching': True,
+    }, {
+        # Full length film, non-series, ger/jap, Dub/OmU, account required
+        'url': 'https://www.anime-on-demand.de/anime/185',
+        'only_matching': True,
     }]
 
     def _login(self):
@@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor):
 
         entries = []
 
-        for num, episode_html in enumerate(re.findall(
-                r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1):
-            episodebox_title = self._search_regex(
-                (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
-                 r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
-                episode_html, 'episodebox title', default=None, group='title')
-            if not episodebox_title:
-                continue
-
-            episode_number = int(self._search_regex(
-                r'(?:Episode|Film)\s*(\d+)',
-                episodebox_title, 'episode number', default=num))
-            episode_title = self._search_regex(
-                r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
-                episodebox_title, 'episode title', default=None)
-
-            video_id = 'episode-%d' % episode_number
-
-            common_info = {
-                'id': video_id,
-                'series': anime_title,
-                'episode': episode_title,
-                'episode_number': episode_number,
-            }
-
+        def extract_info(html, video_id, num=None):
+            title, description = [None] * 2
             formats = []
 
             for input_ in re.findall(
-                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html):
+                    r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html):
                 attributes = extract_attributes(input_)
                 playlist_urls = []
                 for playlist_key in ('data-playlist', 'data-otherplaylist'):
@@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor):
                         format_id_list.append(lang)
                     if kind:
                         format_id_list.append(kind)
-                    if not format_id_list:
+                    if not format_id_list and num is not None:
                         format_id_list.append(compat_str(num))
                     format_id = '-'.join(format_id_list)
                     format_note = ', '.join(filter(None, (kind, lang_note)))
@@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor):
                             })
                         formats.extend(file_formats)
 
-            if formats:
-                self._sort_formats(formats)
+            return {
+                'title': title,
+                'description': description,
+                'formats': formats,
+            }
+
+        def extract_entries(html, video_id, common_info, num=None):
+            info = extract_info(html, video_id, num)
+
+            if info['formats']:
+                self._sort_formats(info['formats'])
                 f = common_info.copy()
-                f.update({
-                    'title': title,
-                    'description': description,
-                    'formats': formats,
-                })
+                f.update(info)
                 entries.append(f)
 
-            # Extract teaser only when full episode is not available
-            if not formats:
+            # Extract teaser/trailer only when full episode is not available
+            if not info['formats']:
                 m = re.search(
-                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<',
-                    episode_html)
+                    r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<',
+                    html)
                 if m:
                     f = common_info.copy()
                     f.update({
-                        'id': '%s-teaser' % f['id'],
+                        'id': '%s-%s' % (f['id'], m.group('kind').lower()),
                         'title': m.group('title'),
                         'url': compat_urlparse.urljoin(url, m.group('href')),
                     })
                     entries.append(f)
 
+        def extract_episodes(html):
+            for num, episode_html in enumerate(re.findall(
+                    r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1):
+                episodebox_title = self._search_regex(
+                    (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
+                     r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
+                    episode_html, 'episodebox title', default=None, group='title')
+                if not episodebox_title:
+                    continue
+
+                episode_number = int(self._search_regex(
+                    r'(?:Episode|Film)\s*(\d+)',
+                    episodebox_title, 'episode number', default=num))
+                episode_title = self._search_regex(
+                    r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
+                    episodebox_title, 'episode title', default=None)
+
+                video_id = 'episode-%d' % episode_number
+
+                common_info = {
+                    'id': video_id,
+                    'series': anime_title,
+                    'episode': episode_title,
+                    'episode_number': episode_number,
+                }
+
+                extract_entries(episode_html, video_id, common_info)
+
+        def extract_film(html, video_id):
+            common_info = {
+                'id': anime_id,
+                'title': anime_title,
+                'description': anime_description,
+            }
+            extract_entries(html, video_id, common_info)
+
+        extract_episodes(webpage)
+
+        if not entries:
+            extract_film(webpage, anime_id)
+
         return self.playlist_result(entries, anime_id, anime_title, anime_description)
index 42c21bf41d975bcb49fd6c398b19ed2897cd3bd2..2cdee33200232dc69c1755213fc2f8298c6c8fa3 100644 (file)
@@ -123,6 +123,10 @@ class AolFeaturesIE(InfoExtractor):
             'title': 'What To Watch - February 17, 2016',
         },
         'add_ie': ['FiveMin'],
+        'params': {
+            # encrypted m3u8 download
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
index 63429780e8abf528165daf7e50a6317bce9a6c7d..025e29aa46fe5db97c323fa95d947470f1f2023a 100644 (file)
@@ -1,8 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
@@ -15,7 +13,7 @@ class AparatIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://www.aparat.com/v/wP8On',
-        'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1',
+        'md5': '131aca2e14fe7c4dcb3c4877ba300c89',
         'info_dict': {
             'id': 'wP8On',
             'ext': 'mp4',
@@ -31,13 +29,13 @@ class AparatIE(InfoExtractor):
         # Note: There is an easier-to-parse configuration at
         # http://www.aparat.com/video/video/config/videohash/%video_id
         # but the URL in there does not work
-        embed_url = ('http://www.aparat.com/video/video/embed/videohash/' +
-                     video_id + '/vt/frame')
+        embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id
         webpage = self._download_webpage(embed_url, video_id)
 
-        video_urls = [video_url.replace('\\/', '/') for video_url in re.findall(
-            r'(?:fileList\[[0-9]+\]\s*=|"file"\s*:)\s*"([^"]+)"', webpage)]
-        for i, video_url in enumerate(video_urls):
+        file_list = self._parse_json(self._search_regex(
+            r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id)
+        for i, item in enumerate(file_list[0]):
+            video_url = item['file']
             req = HEADRequest(video_url)
             res = self._request_webpage(
                 req, video_id, note='Testing video URL %d' % i, errnote=False)
index 8feb7cb7456ec4db8d6a8f28b411a58cb5ac47a1..486dff82d00a44a13384e3b7d8ff1b0189da8451 100644 (file)
@@ -1,67 +1,65 @@
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import unified_strdate
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+    unified_strdate,
+    clean_html,
+)
 
 
-class ArchiveOrgIE(InfoExtractor):
+class ArchiveOrgIE(JWPlatformBaseIE):
     IE_NAME = 'archive.org'
     IE_DESC = 'archive.org videos'
-    _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
     _TESTS = [{
         'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
         'md5': '8af1d4cf447933ed3c7f4871162602db',
         'info_dict': {
             'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
-            'ext': 'ogv',
+            'ext': 'ogg',
             'title': '1968 Demo - FJCC Conference Presentation Reel #1',
-            'description': 'md5:1780b464abaca9991d8968c877bb53ed',
+            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
             'upload_date': '19681210',
             'uploader': 'SRI International'
         }
     }, {
         'url': 'https://archive.org/details/Cops1922',
-        'md5': '18f2a19e6d89af8425671da1cf3d4e04',
+        'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
         'info_dict': {
             'id': 'Cops1922',
-            'ext': 'ogv',
+            'ext': 'mp4',
             'title': 'Buster Keaton\'s "Cops" (1922)',
-            'description': 'md5:70f72ee70882f713d4578725461ffcc3',
+            'description': 'md5:b4544662605877edd99df22f9620d858',
         }
+    }, {
+        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://archive.org/embed/' + video_id, video_id)
+        jwplayer_playlist = self._parse_json(self._search_regex(
+            r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);",
+            webpage, 'jwplayer playlist'), video_id)
+        info = self._parse_jwplayer_data(
+            {'playlist': jwplayer_playlist}, video_id, base_url=url)
 
-        json_url = url + ('&' if '?' in url else '?') + 'output=json'
-        data = self._download_json(json_url, video_id)
-
-        def get_optional(data_dict, field):
-            return data_dict['metadata'].get(field, [None])[0]
-
-        title = get_optional(data, 'title')
-        description = get_optional(data, 'description')
-        uploader = get_optional(data, 'creator')
-        upload_date = unified_strdate(get_optional(data, 'date'))
+        def get_optional(metadata, field):
+            return metadata.get(field, [None])[0]
 
-        formats = [
-            {
-                'format': fdata['format'],
-                'url': 'http://' + data['server'] + data['dir'] + fn,
-                'file_size': int(fdata['size']),
-            }
-            for fn, fdata in data['files'].items()
-            if 'Video' in fdata['format']]
-
-        self._sort_formats(formats)
-
-        return {
-            '_type': 'video',
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': description,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'thumbnail': data.get('misc', {}).get('image'),
-        }
+        metadata = self._download_json(
+            'http://archive.org/details/' + video_id, video_id, query={
+                'output': 'json',
+            })['metadata']
+        info.update({
+            'title': get_optional(metadata, 'title') or info.get('title'),
+            'description': clean_html(get_optional(metadata, 'description')),
+        })
+        if info.get('_type') != 'playlist':
+            info.update({
+                'uploader': get_optional(metadata, 'creator'),
+                'upload_date': unified_strdate(get_optional(metadata, 'date')),
+            })
+        return info
index fd45b3e42b374ec6a7077454d238932c66d16d46..07e67dd3393962eedccd2460954711d01f08efdb 100644 (file)
@@ -13,13 +13,14 @@ from ..utils import (
     parse_duration,
     unified_strdate,
     xpath_text,
+    update_url_query,
 )
 from ..compat import compat_etree_fromstring
 
 
 class ARDMediathekIE(InfoExtractor):
     IE_NAME = 'ARD:mediathek'
-    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
+    _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
 
     _TESTS = [{
         'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114',
@@ -34,6 +35,7 @@ class ARDMediathekIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916',
         'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e',
@@ -44,6 +46,7 @@ class ARDMediathekIE(InfoExtractor):
             'description': 'md5:196392e79876d0ac94c94e8cdb2875f1',
             'duration': 5252,
         },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         # audio
         'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
@@ -55,9 +58,22 @@ class ARDMediathekIE(InfoExtractor):
             'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
             'duration': 3240,
         },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
         'only_matching': True,
+    }, {
+        # audio
+        'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
+        'md5': '4e8f00631aac0395fee17368ac0e9867',
+        'info_dict': {
+            'id': '30796318',
+            'ext': 'mp3',
+            'title': 'Vor dem Fest',
+            'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb',
+            'duration': 3287,
+        },
+        'skip': 'Video is no longer available',
     }]
 
     def _extract_media_info(self, media_info_url, webpage, video_id):
@@ -113,11 +129,14 @@ class ARDMediathekIE(InfoExtractor):
                         continue
                     if ext == 'f4m':
                         formats.extend(self._extract_f4m_formats(
-                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
-                            video_id, preference=-1, f4m_id='hds', fatal=False))
+                            update_url_query(stream_url, {
+                                'hdcore': '3.1.1',
+                                'plugin': 'aasp-3.1.1.69.124'
+                            }),
+                            video_id, f4m_id='hds', fatal=False))
                     elif ext == 'm3u8':
                         formats.extend(self._extract_m3u8_formats(
-                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False))
+                            stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
                     else:
                         if server and server.startswith('rtmp'):
                             f = {
@@ -231,7 +250,8 @@ class ARDIE(InfoExtractor):
             'title': 'Die Story im Ersten: Mission unter falscher Flagge',
             'upload_date': '20140804',
             'thumbnail': 're:^https?://.*\.jpg$',
-        }
+        },
+        'skip': 'HTTP Error 404: Not Found',
     }
 
     def _real_extract(self, url):
diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py
new file mode 100644 (file)
index 0000000..d45cae3
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    float_or_none,
+    int_or_none,
+    mimetype2ext,
+    parse_iso8601,
+    strip_jsonp,
+)
+
+
+class ArkenaIE(InfoExtractor):
+    _VALID_URL = r'https?://play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)'
+    _TESTS = [{
+        'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411',
+        'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
+        'info_dict': {
+            'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'description': 'Royalty free test video',
+            'timestamp': 1432816365,
+            'upload_date': '20150528',
+            'is_live': False,
+        },
+    }, {
+        'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893',
+        'only_matching': True,
+    }, {
+        'url': 'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972',
+        'only_matching': True,
+    }, {
+        'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        account_id = mobj.group('account_id')
+
+        playlist = self._download_json(
+            'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_'
+            % (video_id, account_id),
+            video_id, transform_source=strip_jsonp)['Playlist'][0]
+
+        media_info = playlist['MediaInfo']
+        title = media_info['Title']
+        media_files = playlist['MediaFiles']
+
+        is_live = False
+        formats = []
+        for kind_case, kind_formats in media_files.items():
+            kind = kind_case.lower()
+            for f in kind_formats:
+                f_url = f.get('Url')
+                if not f_url:
+                    continue
+                is_live = f.get('Live') == 'true'
+                exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None))
+                if kind == 'm3u8' or 'm3u8' in exts:
+                    formats.extend(self._extract_m3u8_formats(
+                        f_url, video_id, 'mp4',
+                        entry_protocol='m3u8' if is_live else 'm3u8_native',
+                        m3u8_id=kind, fatal=False, live=is_live))
+                elif kind == 'flash' or 'f4m' in exts:
+                    formats.extend(self._extract_f4m_formats(
+                        f_url, video_id, f4m_id=kind, fatal=False))
+                elif kind == 'dash' or 'mpd' in exts:
+                    formats.extend(self._extract_mpd_formats(
+                        f_url, video_id, mpd_id=kind, fatal=False))
+                elif kind == 'silverlight':
+                    # TODO: process when ism is supported (see
+                    # https://github.com/rg3/youtube-dl/issues/8118)
+                    continue
+                else:
+                    tbr = float_or_none(f.get('Bitrate'), 1000)
+                    formats.append({
+                        'url': f_url,
+                        'format_id': '%s-%d' % (kind, tbr) if tbr else kind,
+                        'tbr': tbr,
+                    })
+        self._sort_formats(formats)
+
+        description = media_info.get('Description')
+        video_id = media_info.get('VideoId') or video_id
+        timestamp = parse_iso8601(media_info.get('PublishDate'))
+        thumbnails = [{
+            'url': thumbnail['Url'],
+            'width': int_or_none(thumbnail.get('Size')),
+        } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'timestamp': timestamp,
+            'is_live': is_live,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
index 049f1fa9ed35404acba9e6ad7f1c12f516b436f2..e0c5c18045312a064d8663a025a9fdaabb7a28df 100644 (file)
@@ -419,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
         'info_dict': {
             'id': 'PL-013263',
             'title': 'Areva & Uramin',
+            'description': 'md5:a1dc0312ce357c262259139cfd48c9bf',
         },
         'playlist_mincount': 6,
     }, {
index 4b3cd8c65a65967c5ad017d53126e9ccc76a71ef..deb9cc1c0fe2855c77c330f36f06069e8448189f 100644 (file)
@@ -2,19 +2,23 @@
 from __future__ import unicode_literals
 
 import re
+import itertools
 
 from .common import InfoExtractor
 from ..utils import (
+    dict_get,
     ExtractorError,
     float_or_none,
     int_or_none,
     parse_duration,
     parse_iso8601,
+    try_get,
     unescapeHTML,
 )
 from ..compat import (
     compat_etree_fromstring,
     compat_HTTPError,
+    compat_urlparse,
 )
 
 
@@ -229,51 +233,6 @@ class BBCCoUkIE(InfoExtractor):
         asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
         return [ref.get('href') for ref in asx.findall('./Entry/ref')]
 
-    def _extract_connection(self, connection, programme_id):
-        formats = []
-        kind = connection.get('kind')
-        protocol = connection.get('protocol')
-        supplier = connection.get('supplier')
-        if protocol == 'http':
-            href = connection.get('href')
-            transfer_format = connection.get('transferFormat')
-            # ASX playlist
-            if supplier == 'asx':
-                for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
-                    formats.append({
-                        'url': ref,
-                        'format_id': 'ref%s_%s' % (i, supplier),
-                    })
-            # Skip DASH until supported
-            elif transfer_format == 'dash':
-                pass
-            elif transfer_format == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    href, programme_id, ext='mp4', entry_protocol='m3u8_native',
-                    m3u8_id=supplier, fatal=False))
-            # Direct link
-            else:
-                formats.append({
-                    'url': href,
-                    'format_id': supplier or kind or protocol,
-                })
-        elif protocol == 'rtmp':
-            application = connection.get('application', 'ondemand')
-            auth_string = connection.get('authString')
-            identifier = connection.get('identifier')
-            server = connection.get('server')
-            formats.append({
-                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
-                'play_path': identifier,
-                'app': '%s?%s' % (application, auth_string),
-                'page_url': 'http://www.bbc.co.uk',
-                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
-                'rtmp_live': False,
-                'ext': 'flv',
-                'format_id': supplier,
-            })
-        return formats
-
     def _extract_items(self, playlist):
         return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS)
 
@@ -294,46 +253,6 @@ class BBCCoUkIE(InfoExtractor):
     def _extract_connections(self, media):
         return self._findall_ns(media, './{%s}connection')
 
-    def _extract_video(self, media, programme_id):
-        formats = []
-        vbr = int_or_none(media.get('bitrate'))
-        vcodec = media.get('encoding')
-        service = media.get('service')
-        width = int_or_none(media.get('width'))
-        height = int_or_none(media.get('height'))
-        file_size = int_or_none(media.get('media_file_size'))
-        for connection in self._extract_connections(media):
-            conn_formats = self._extract_connection(connection, programme_id)
-            for format in conn_formats:
-                format.update({
-                    'width': width,
-                    'height': height,
-                    'vbr': vbr,
-                    'vcodec': vcodec,
-                    'filesize': file_size,
-                })
-                if service:
-                    format['format_id'] = '%s_%s' % (service, format['format_id'])
-            formats.extend(conn_formats)
-        return formats
-
-    def _extract_audio(self, media, programme_id):
-        formats = []
-        abr = int_or_none(media.get('bitrate'))
-        acodec = media.get('encoding')
-        service = media.get('service')
-        for connection in self._extract_connections(media):
-            conn_formats = self._extract_connection(connection, programme_id)
-            for format in conn_formats:
-                format.update({
-                    'format_id': '%s_%s' % (service, format['format_id']),
-                    'abr': abr,
-                    'acodec': acodec,
-                    'vcodec': 'none',
-                })
-            formats.extend(conn_formats)
-        return formats
-
     def _get_subtitles(self, media, programme_id):
         subtitles = {}
         for connection in self._extract_connections(media):
@@ -379,13 +298,87 @@ class BBCCoUkIE(InfoExtractor):
     def _process_media_selector(self, media_selection, programme_id):
         formats = []
         subtitles = None
+        urls = []
 
         for media in self._extract_medias(media_selection):
             kind = media.get('kind')
-            if kind == 'audio':
-                formats.extend(self._extract_audio(media, programme_id))
-            elif kind == 'video':
-                formats.extend(self._extract_video(media, programme_id))
+            if kind in ('video', 'audio'):
+                bitrate = int_or_none(media.get('bitrate'))
+                encoding = media.get('encoding')
+                service = media.get('service')
+                width = int_or_none(media.get('width'))
+                height = int_or_none(media.get('height'))
+                file_size = int_or_none(media.get('media_file_size'))
+                for connection in self._extract_connections(media):
+                    href = connection.get('href')
+                    if href in urls:
+                        continue
+                    if href:
+                        urls.append(href)
+                    conn_kind = connection.get('kind')
+                    protocol = connection.get('protocol')
+                    supplier = connection.get('supplier')
+                    transfer_format = connection.get('transferFormat')
+                    format_id = supplier or conn_kind or protocol
+                    if service:
+                        format_id = '%s_%s' % (service, format_id)
+                    # ASX playlist
+                    if supplier == 'asx':
+                        for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
+                            formats.append({
+                                'url': ref,
+                                'format_id': 'ref%s_%s' % (i, format_id),
+                            })
+                    elif transfer_format == 'dash':
+                        formats.extend(self._extract_mpd_formats(
+                            href, programme_id, mpd_id=format_id, fatal=False))
+                    elif transfer_format == 'hls':
+                        formats.extend(self._extract_m3u8_formats(
+                            href, programme_id, ext='mp4', entry_protocol='m3u8_native',
+                            m3u8_id=format_id, fatal=False))
+                    elif transfer_format == 'hds':
+                        formats.extend(self._extract_f4m_formats(
+                            href, programme_id, f4m_id=format_id, fatal=False))
+                    else:
+                        if not service and not supplier and bitrate:
+                            format_id += '-%d' % bitrate
+                        fmt = {
+                            'format_id': format_id,
+                            'filesize': file_size,
+                        }
+                        if kind == 'video':
+                            fmt.update({
+                                'width': width,
+                                'height': height,
+                                'vbr': bitrate,
+                                'vcodec': encoding,
+                            })
+                        else:
+                            fmt.update({
+                                'abr': bitrate,
+                                'acodec': encoding,
+                                'vcodec': 'none',
+                            })
+                        if protocol == 'http':
+                            # Direct link
+                            fmt.update({
+                                'url': href,
+                            })
+                        elif protocol == 'rtmp':
+                            application = connection.get('application', 'ondemand')
+                            auth_string = connection.get('authString')
+                            identifier = connection.get('identifier')
+                            server = connection.get('server')
+                            fmt.update({
+                                'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
+                                'play_path': identifier,
+                                'app': '%s?%s' % (application, auth_string),
+                                'page_url': 'http://www.bbc.co.uk',
+                                'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
+                                'rtmp_live': False,
+                                'ext': 'flv',
+                            })
+                        formats.append(fmt)
             elif kind == 'captions':
                 subtitles = self.extract_subtitles(media, programme_id)
         return formats, subtitles
@@ -590,6 +583,7 @@ class BBCIE(BBCCoUkIE):
             'id': '150615_telabyad_kentin_cogu',
             'ext': 'mp4',
             'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde",
+            'description': 'md5:33a4805a855c9baf7115fcbde57e7025',
             'timestamp': 1434397334,
             'upload_date': '20150615',
         },
@@ -603,6 +597,7 @@ class BBCIE(BBCCoUkIE):
             'id': '150619_video_honduras_militares_hospitales_corrupcion_aw',
             'ext': 'mp4',
             'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción',
+            'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8',
             'timestamp': 1434713142,
             'upload_date': '20150619',
         },
@@ -652,6 +647,23 @@ class BBCIE(BBCCoUkIE):
             # rtmp download
             'skip_download': True,
         }
+    }, {
+        # single video embedded with Morph
+        'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975',
+        'info_dict': {
+            'id': 'p041vhd0',
+            'ext': 'mp4',
+            'title': "Nigeria v Japan - Men's First Round",
+            'description': 'Live coverage of the first round from Group B at the Amazonia Arena.',
+            'duration': 7980,
+            'uploader': 'BBC Sport',
+            'uploader_id': 'bbc_sport',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'skip': 'Georestricted to UK',
     }, {
         # single video with playlist.sxml URL in playlist param
         'url': 'http://www.bbc.com/sport/0/football/33653409',
@@ -749,7 +761,7 @@ class BBCIE(BBCCoUkIE):
 
         webpage = self._download_webpage(url, playlist_id)
 
-        json_ld_info = self._search_json_ld(webpage, playlist_id, default=None)
+        json_ld_info = self._search_json_ld(webpage, playlist_id, default={})
         timestamp = json_ld_info.get('timestamp')
 
         playlist_title = json_ld_info.get('title')
@@ -818,8 +830,29 @@ class BBCIE(BBCCoUkIE):
                         # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani)
                         playlist = data_playable.get('otherSettings', {}).get('playlist', {})
                         if playlist:
-                            entries.append(self._extract_from_playlist_sxml(
-                                playlist.get('progressiveDownloadUrl'), playlist_id, timestamp))
+                            entry = None
+                            for key in ('streaming', 'progressiveDownload'):
+                                playlist_url = playlist.get('%sUrl' % key)
+                                if not playlist_url:
+                                    continue
+                                try:
+                                    info = self._extract_from_playlist_sxml(
+                                        playlist_url, playlist_id, timestamp)
+                                    if not entry:
+                                        entry = info
+                                    else:
+                                        entry['title'] = info['title']
+                                        entry['formats'].extend(info['formats'])
+                                except Exception as e:
+                                    # Some playlist URL may fail with 500, at the same time
+                                    # the other one may work fine (e.g.
+                                    # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu)
+                                    if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
+                                        continue
+                                    raise
+                            if entry:
+                                self._sort_formats(entry['formats'])
+                                entries.append(entry)
 
         if entries:
             return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
@@ -852,6 +885,50 @@ class BBCIE(BBCCoUkIE):
                 'subtitles': subtitles,
             }
 
+        # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975)
+        # There are several setPayload calls may be present but the video
+        # seems to be always related to the first one
+        morph_payload = self._parse_json(
+            self._search_regex(
+                r'Morph\.setPayload\([^,]+,\s*({.+?})\);',
+                webpage, 'morph payload', default='{}'),
+            playlist_id, fatal=False)
+        if morph_payload:
+            components = try_get(morph_payload, lambda x: x['body']['components'], list) or []
+            for component in components:
+                if not isinstance(component, dict):
+                    continue
+                lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict)
+                if not lead_media:
+                    continue
+                identifiers = lead_media.get('identifiers')
+                if not identifiers or not isinstance(identifiers, dict):
+                    continue
+                programme_id = identifiers.get('vpid') or identifiers.get('playablePid')
+                if not programme_id:
+                    continue
+                title = lead_media.get('title') or self._og_search_title(webpage)
+                formats, subtitles = self._download_media_selector(programme_id)
+                self._sort_formats(formats)
+                description = lead_media.get('summary')
+                uploader = lead_media.get('masterBrand')
+                uploader_id = lead_media.get('mid')
+                duration = None
+                duration_d = lead_media.get('duration')
+                if isinstance(duration_d, dict):
+                    duration = parse_duration(dict_get(
+                        duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration')))
+                return {
+                    'id': programme_id,
+                    'title': title,
+                    'description': description,
+                    'duration': duration,
+                    'uploader': uploader,
+                    'uploader_id': uploader_id,
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
+
         def extract_all(pattern):
             return list(filter(None, map(
                 lambda s: self._parse_json(s, playlist_id, fatal=False),
@@ -869,7 +946,7 @@ class BBCIE(BBCCoUkIE):
             r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage))
         if entries:
             return self.playlist_result(
-                [self.url_result(entry, 'BBCCoUk') for entry in entries],
+                [self.url_result(entry_, 'BBCCoUk') for entry_ in entries],
                 playlist_id, playlist_title, playlist_description)
 
         # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511)
@@ -981,27 +1058,43 @@ class BBCCoUkArticleIE(InfoExtractor):
 
 
 class BBCCoUkPlaylistBaseIE(InfoExtractor):
+    def _entries(self, webpage, url, playlist_id):
+        single_page = 'page' in compat_urlparse.parse_qs(
+            compat_urlparse.urlparse(url).query)
+        for page_num in itertools.count(2):
+            for video_id in re.findall(
+                    self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage):
+                yield self.url_result(
+                    self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
+            if single_page:
+                return
+            next_page = self._search_regex(
+                r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
+                webpage, 'next page url', default=None, group='url')
+            if not next_page:
+                break
+            webpage = self._download_webpage(
+                compat_urlparse.urljoin(url, next_page), playlist_id,
+                'Downloading page %d' % page_num, page_num)
+
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
 
         webpage = self._download_webpage(url, playlist_id)
 
-        entries = [
-            self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key())
-            for video_id in re.findall(
-                self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)]
-
         title, description = self._extract_title_and_description(webpage)
 
-        return self.playlist_result(entries, playlist_id, title, description)
+        return self.playlist_result(
+            self._entries(webpage, url, playlist_id),
+            playlist_id, title, description)
 
 
 class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
     IE_NAME = 'bbc.co.uk:iplayer:playlist'
-    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
+    _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
     _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
     _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
         'info_dict': {
             'id': 'b05rcz9v',
@@ -1009,7 +1102,17 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
             'description': 'French thriller serial about a missing teenager.',
         },
         'playlist_mincount': 6,
-    }
+        'skip': 'This programme is not currently available on BBC iPlayer',
+    }, {
+        # Available for over a year unlike 30 days for most other programmes
+        'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32',
+        'info_dict': {
+            'id': 'p02tcc32',
+            'title': 'Bohemian Icons',
+            'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
+        },
+        'playlist_mincount': 10,
+    }]
 
     def _extract_title_and_description(self, webpage):
         title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
@@ -1032,6 +1135,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE):
             'description': 'French thriller serial about a missing teenager.',
         },
         'playlist_mincount': 7,
+    }, {
+        # multipage playlist, explicit page
+        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1',
+        'info_dict': {
+            'id': 'b00mfl7n',
+            'title': 'Frozen Planet - Clips - BBC One',
+            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
+        },
+        'playlist_mincount': 24,
+    }, {
+        # multipage playlist, all pages
+        'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips',
+        'info_dict': {
+            'id': 'b00mfl7n',
+            'title': 'Frozen Planet - Clips - BBC One',
+            'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c',
+        },
+        'playlist_mincount': 142,
     }, {
         'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06',
         'only_matching': True,
index 33762ad93eae6f7b204cd00c11058f60c621bf50..b4ce767af6735321ab08769e4d2c87b716b93e65 100644 (file)
@@ -11,22 +11,13 @@ from ..compat import compat_urllib_parse_unquote
 class BigflixIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)'
     _TESTS = [{
-        'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537',
-        'md5': 'ec76aa9b1129e2e5b301a474e54fab74',
-        'info_dict': {
-            'id': '16537',
-            'ext': 'mp4',
-            'title': 'Singham Returns',
-            'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d',
-        }
-    }, {
         # 2 formats
         'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070',
         'info_dict': {
             'id': '16070',
             'ext': 'mp4',
             'title': 'Madarasapatinam',
-            'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca',
+            'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b',
             'formats': 'mincount:2',
         },
         'params': {
index b17047b399b6630fe2334aa24f0a5e97aed8506f..d8eb718212b3d8482f7ac7cf479217da2391de6d 100644 (file)
@@ -25,13 +25,13 @@ class BiliBiliIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.bilibili.tv/video/av1074402/',
-        'md5': '5f7d29e1a2872f3df0cf76b1f87d3788',
+        'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
         'info_dict': {
             'id': '1554319',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': '【金坷垃】金泡沫',
             'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923',
-            'duration': 308.067,
+            'duration': 308.315,
             'timestamp': 1398012660,
             'upload_date': '20140420',
             'thumbnail': 're:^https?://.+\.jpg',
@@ -41,73 +41,33 @@ class BiliBiliIE(InfoExtractor):
     }, {
         'url': 'http://www.bilibili.com/video/av1041170/',
         'info_dict': {
-            'id': '1041170',
+            'id': '1507019',
+            'ext': 'mp4',
             'title': '【BD1080P】刀语【诸神&异域】',
             'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~',
+            'timestamp': 1396530060,
+            'upload_date': '20140403',
+            'uploader': '枫叶逝去',
+            'uploader_id': '520116',
         },
-        'playlist_count': 9,
     }, {
         'url': 'http://www.bilibili.com/video/av4808130/',
         'info_dict': {
-            'id': '4808130',
+            'id': '7802182',
+            'ext': 'mp4',
             'title': '【长篇】哆啦A梦443【钉铛】',
             'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
+            'timestamp': 1464564180,
+            'upload_date': '20160529',
+            'uploader': '喜欢拉面',
+            'uploader_id': '151066',
         },
-        'playlist': [{
-            'md5': '55cdadedf3254caaa0d5d27cf20a8f9c',
-            'info_dict': {
-                'id': '4808130_part1',
-                'ext': 'flv',
-                'title': '【长篇】哆啦A梦443【钉铛】',
-                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
-                'timestamp': 1464564180,
-                'upload_date': '20160529',
-                'uploader': '喜欢拉面',
-                'uploader_id': '151066',
-            },
-        }, {
-            'md5': '926f9f67d0c482091872fbd8eca7ea3d',
-            'info_dict': {
-                'id': '4808130_part2',
-                'ext': 'flv',
-                'title': '【长篇】哆啦A梦443【钉铛】',
-                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
-                'timestamp': 1464564180,
-                'upload_date': '20160529',
-                'uploader': '喜欢拉面',
-                'uploader_id': '151066',
-            },
-        }, {
-            'md5': '4b7b225b968402d7c32348c646f1fd83',
-            'info_dict': {
-                'id': '4808130_part3',
-                'ext': 'flv',
-                'title': '【长篇】哆啦A梦443【钉铛】',
-                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
-                'timestamp': 1464564180,
-                'upload_date': '20160529',
-                'uploader': '喜欢拉面',
-                'uploader_id': '151066',
-            },
-        }, {
-            'md5': '7b795e214166501e9141139eea236e91',
-            'info_dict': {
-                'id': '4808130_part4',
-                'ext': 'flv',
-                'title': '【长篇】哆啦A梦443【钉铛】',
-                'description': '(2016.05.27)来组合客人的脸吧&amp;amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;amp;illust_id=56912929',
-                'timestamp': 1464564180,
-                'upload_date': '20160529',
-                'uploader': '喜欢拉面',
-                'uploader_id': '151066',
-            },
-        }],
     }, {
         # Missing upload time
         'url': 'http://www.bilibili.com/video/av1867637/',
         'info_dict': {
             'id': '2880301',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】',
             'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】',
             'uploader': '黑夜为猫',
index 1332281337b2b5d64d0b89eb7d257a11ebffee71..7608c0a085b3c656277b03f61d19c1f60ea8d4f1 100644 (file)
@@ -2,11 +2,15 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import remove_end
+from ..utils import (
+    ExtractorError,
+    remove_end,
+)
+from .rudo import RudoIE
 
 
 class BioBioChileTVIE(InfoExtractor):
-    _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml'
+    _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml'
 
     _TESTS = [{
         'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml',
@@ -18,6 +22,7 @@ class BioBioChileTVIE(InfoExtractor):
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': 'Fernando Atria',
         },
+        'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
     }, {
         # different uploader layout
         'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml',
@@ -32,6 +37,16 @@ class BioBioChileTVIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html',
+    }, {
+        'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml',
+        'info_dict': {
+            'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos',
+            'ext': 'mp4',
+            'uploader': '(none)',
+            'upload_date': '20160708',
+            'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos',
+        },
     }, {
         'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml',
         'only_matching': True,
@@ -45,42 +60,22 @@ class BioBioChileTVIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV')
+        rudo_url = RudoIE._extract_url(webpage)
+        if not rudo_url:
+            raise ExtractorError('No videos found')
 
-        file_url = self._search_regex(
-            r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1',
-            webpage, 'file url', group='url')
-
-        base_url = self._search_regex(
-            r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage,
-            'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/',
-            group='url')
-
-        formats = self._extract_m3u8_formats(
-            '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4',
-            entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
-        f = {
-            'url': '%s%s' % (base_url, file_url),
-            'format_id': 'http',
-            'protocol': 'http',
-            'preference': 1,
-        }
-        if formats:
-            f_copy = formats[-1].copy()
-            f_copy.update(f)
-            f = f_copy
-        formats.append(f)
-        self._sort_formats(formats)
+        title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV')
 
         thumbnail = self._og_search_thumbnail(webpage)
         uploader = self._html_search_regex(
-            r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>',
+            r'<a[^>]+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>',
             webpage, 'uploader', fatal=False)
 
         return {
+            '_type': 'url_transparent',
+            'url': rudo_url,
             'id': video_id,
             'title': title,
             'thumbnail': thumbnail,
             'uploader': uploader,
-            'formats': formats,
         }
index ae4579b33f656a9ead9cbca01a4cb0a08cd585fb..beaebfd2add55b9740ba3d5ac81256f442277abf 100644 (file)
@@ -24,7 +24,8 @@ class BIQLEIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Ребенок в шоке от автоматической мойки',
             'uploader': 'Dmitry Kotov',
-        }
+        },
+        'skip': ' This video was marked as adult.  Embedding adult videos on external sites is prohibited.',
     }]
 
     def _real_extract(self, url):
index bd538be50bc4a650d000eaffb5e292a4e8e76cbe..2a8cd64b99d2551da9777aaa259d356d8cad51ed 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
@@ -20,6 +21,18 @@ class BloombergIE(InfoExtractor):
         'params': {
             'format': 'best[format_id^=hds]',
         },
+    }, {
+        # video ID in BPlayer(...)
+        'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/',
+        'info_dict': {
+            'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74',
+            'ext': 'flv',
+            'title': 'Meet the Real-Life Tech Wizards of Middle Earth',
+            'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.',
+        },
+        'params': {
+            'format': 'best[format_id^=hds]',
+        },
     }, {
         'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets',
         'only_matching': True,
@@ -33,7 +46,11 @@ class BloombergIE(InfoExtractor):
         webpage = self._download_webpage(url, name)
         video_id = self._search_regex(
             r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1',
-            webpage, 'id', group='url')
+            webpage, 'id', group='url', default=None)
+        if not video_id:
+            bplayer_data = self._parse_json(self._search_regex(
+                r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name)
+            video_id = bplayer_data['id']
         title = re.sub(': Video$', '', self._og_search_title(webpage))
 
         embed_info = self._download_json(
index ef560b592792910c2d39174eb2db129cea5590fe..aeb22be168402fd4e7e134c164339db74f95a0f8 100644 (file)
@@ -26,6 +26,8 @@ from ..utils import (
     unescapeHTML,
     unsmuggle_url,
     update_url_query,
+    clean_html,
+    mimetype2ext,
 )
 
 
@@ -90,6 +92,7 @@ class BrightcoveLegacyIE(InfoExtractor):
                 'description': 'md5:363109c02998fee92ec02211bd8000df',
                 'uploader': 'National Ballet of Canada',
             },
+            'skip': 'Video gone',
         },
         {
             # test flv videos served by akamaihd.net
@@ -108,7 +111,7 @@ class BrightcoveLegacyIE(InfoExtractor):
             },
         },
         {
-            # playlist test
+            # playlist with 'videoList'
             # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players
             'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL',
             'info_dict': {
@@ -117,6 +120,15 @@ class BrightcoveLegacyIE(InfoExtractor):
             },
             'playlist_mincount': 7,
         },
+        {
+            # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965)
+            'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg',
+            'info_dict': {
+                'id': '1522758701001',
+                'title': 'Lesson 08',
+            },
+            'playlist_mincount': 10,
+        },
     ]
     FLV_VCODECS = {
         1: 'SORENSON',
@@ -298,13 +310,19 @@ class BrightcoveLegacyIE(InfoExtractor):
             info_url, player_key, 'Downloading playlist information')
 
         json_data = json.loads(playlist_info)
-        if 'videoList' not in json_data:
+        if 'videoList' in json_data:
+            playlist_info = json_data['videoList']
+            playlist_dto = playlist_info['mediaCollectionDTO']
+        elif 'playlistTabs' in json_data:
+            playlist_info = json_data['playlistTabs']
+            playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0]
+        else:
             raise ExtractorError('Empty playlist')
-        playlist_info = json_data['videoList']
-        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+        videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']]
 
         return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'],
-                                    playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+                                    playlist_title=playlist_dto['displayName'])
 
     def _extract_video_info(self, video_info):
         video_id = compat_str(video_info['id'])
@@ -528,14 +546,16 @@ class BrightcoveNewIE(InfoExtractor):
         formats = []
         for source in json_data.get('sources', []):
             container = source.get('container')
-            source_type = source.get('type')
+            ext = mimetype2ext(source.get('type'))
             src = source.get('src')
-            if source_type == 'application/x-mpegURL' or container == 'M2TS':
+            if ext == 'ism':
+                continue
+            elif ext == 'm3u8' or container == 'M2TS':
                 if not src:
                     continue
                 formats.extend(self._extract_m3u8_formats(
                     src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
-            elif source_type == 'application/dash+xml':
+            elif ext == 'mpd':
                 if not src:
                     continue
                 formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False))
@@ -551,7 +571,7 @@ class BrightcoveNewIE(InfoExtractor):
                     'tbr': tbr,
                     'filesize': int_or_none(source.get('size')),
                     'container': container,
-                    'ext': container.lower(),
+                    'ext': ext or container.lower(),
                 }
                 if width == 0 and height == 0:
                     f.update({
@@ -585,6 +605,13 @@ class BrightcoveNewIE(InfoExtractor):
                         'format_id': build_format_id('rtmp'),
                     })
                 formats.append(f)
+
+        errors = json_data.get('errors')
+        if not formats and errors:
+            error = errors[0]
+            raise ExtractorError(
+                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+
         self._sort_formats(formats)
 
         subtitles = {}
@@ -597,7 +624,7 @@ class BrightcoveNewIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'description': json_data.get('description'),
+            'description': clean_html(json_data.get('description')),
             'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
             'duration': float_or_none(json_data.get('duration'), 1000),
             'timestamp': parse_iso8601(json_data.get('published_at')),
index df503ecc0f50283f0cc77a867353912a47eee5dd..75fa92d7cfc0204f4539e10c762585b2537abbb6 100644 (file)
@@ -5,6 +5,7 @@ import json
 import re
 
 from .common import InfoExtractor
+from .facebook import FacebookIE
 
 
 class BuzzFeedIE(InfoExtractor):
@@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor):
             'info_dict': {
                 'id': 'aVCR29aE_OQ',
                 'ext': 'mp4',
+                'title': 'Angry Ram destroys a punching bag..',
+                'description': 'md5:c59533190ef23fd4458a5e8c8c872345',
                 'upload_date': '20141024',
                 'uploader_id': 'Buddhanz1',
-                'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl',
-                'uploader': 'Buddhanz',
-                'title': 'Angry Ram destroys a punching bag',
+                'uploader': 'Angry Ram',
             }
         }]
     }, {
@@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor):
             'info_dict': {
                 'id': 'mVmBL8B-In0',
                 'ext': 'mp4',
+                'title': 're:Munchkin the Teddy Bear gets her exercise',
+                'description': 'md5:28faab95cda6e361bcff06ec12fc21d8',
                 'upload_date': '20141124',
                 'uploader_id': 'CindysMunchkin',
-                'description': 're:© 2014 Munchkin the',
                 'uploader': 're:^Munchkin the',
-                'title': 're:Munchkin the Teddy Bear gets her exercise',
             },
         }]
+    }, {
+        'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK',
+        'info_dict': {
+            'id': 'the-most-adorable-crash-landing-ever',
+            'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing',
+            'description': 'This gosling knows how to stick a landing.',
+        },
+        'playlist': [{
+            'md5': '763ca415512f91ca62e4621086900a23',
+            'info_dict': {
+                'id': '971793786185728',
+                'ext': 'mp4',
+                'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...',
+                'uploader': 'Calgary Outdoor Centre-University of Calgary',
+            },
+        }],
+        'add_ie': ['Facebook'],
     }]
 
     def _real_extract(self, url):
@@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor):
                 continue
             entries.append(self.url_result(video['url']))
 
+        facebook_url = FacebookIE._extract_url(webpage)
+        if facebook_url:
+            entries.append(self.url_result(facebook_url))
+
         return {
             '_type': 'playlist',
             'id': playlist_id,
index 6ffbeabd371fd6f80a9ead1d23762f760a13ba2f..268c34392468e1b145e893a55496ad7bbb728d41 100644 (file)
@@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import datetime
 import re
 
 from .common import InfoExtractor
@@ -10,8 +9,10 @@ from ..compat import (
     compat_urlparse,
 )
 from ..utils import (
-    parse_iso8601,
+    clean_html,
+    parse_duration,
     str_to_int,
+    unified_strdate,
 )
 
 
@@ -26,14 +27,14 @@ class CamdemyIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Ch1-1 Introduction, Signals (02-23-2012)',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': '',
             'creator': 'ss11spring',
+            'duration': 1591,
             'upload_date': '20130114',
-            'timestamp': 1358154556,
             'view_count': int,
         }
     }, {
         # With non-empty description
+        # webpage returns "No permission or not login"
         'url': 'http://www.camdemy.com/media/13885',
         'md5': '4576a3bb2581f86c61044822adbd1249',
         'info_dict': {
@@ -41,64 +42,71 @@ class CamdemyIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'EverCam + Camdemy QuickStart',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:050b62f71ed62928f8a35f1a41e186c9',
+            'description': 'md5:2a9f989c2b153a2342acee579c6e7db6',
             'creator': 'evercam',
-            'upload_date': '20140620',
-            'timestamp': 1403271569,
+            'duration': 318,
         }
     }, {
-        # External source
+        # External source (YouTube)
         'url': 'http://www.camdemy.com/media/14842',
-        'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7',
         'info_dict': {
             'id': '2vsYQzNIsJo',
             'ext': 'mp4',
+            'title': 'Excel 2013 Tutorial - How to add Password Protection',
+            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
             'upload_date': '20130211',
             'uploader': 'Hun Kim',
-            'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection',
             'uploader_id': 'hunkimtutorials',
-            'title': 'Excel 2013 Tutorial - How to add Password Protection',
-        }
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        page = self._download_webpage(url, video_id)
+
+        webpage = self._download_webpage(url, video_id)
 
         src_from = self._html_search_regex(
-            r"<div class='srcFrom'>Source: <a title='([^']+)'", page,
-            'external source', default=None)
+            r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1",
+            webpage, 'external source', default=None, group='url')
         if src_from:
             return self.url_result(src_from)
 
         oembed_obj = self._download_json(
             'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
 
+        title = oembed_obj['title']
         thumb_url = oembed_obj['thumbnail_url']
         video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
         file_list_doc = self._download_xml(
             compat_urlparse.urljoin(video_folder, 'fileList.xml'),
-            video_id, 'Filelist XML')
+            video_id, 'Downloading filelist XML')
         file_name = file_list_doc.find('./video/item/fileName').text
         video_url = compat_urlparse.urljoin(video_folder, file_name)
 
-        timestamp = parse_iso8601(self._html_search_regex(
-            r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<",
-            page, 'creation time', fatal=False),
-            delimiter=' ', timezone=datetime.timedelta(hours=8))
-        view_count = str_to_int(self._html_search_regex(
-            r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<",
-            page, 'view count', fatal=False))
+        # Some URLs return "No permission or not login" in a webpage despite being
+        # freely available via oembed JSON URL (e.g. http://www.camdemy.com/media/13885)
+        upload_date = unified_strdate(self._search_regex(
+            r'>published on ([^<]+)<', webpage,
+            'upload date', default=None))
+        view_count = str_to_int(self._search_regex(
+            r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views',
+            webpage, 'view count', default=None))
+        description = self._html_search_meta(
+            'description', webpage, default=None) or clean_html(
+            oembed_obj.get('description'))
 
         return {
             'id': video_id,
             'url': video_url,
-            'title': oembed_obj['title'],
+            'title': title,
             'thumbnail': thumb_url,
-            'description': self._html_search_meta('description', page),
-            'creator': oembed_obj['author_name'],
-            'duration': oembed_obj['duration'],
-            'timestamp': timestamp,
+            'description': description,
+            'creator': oembed_obj.get('author_name'),
+            'duration': parse_duration(oembed_obj.get('duration')),
+            'upload_date': upload_date,
             'view_count': view_count,
         }
 
index ff663d07947fd345a107203892cbe1811080ac6c..a87e971406b9a45d665d4bfe14e655c851f5a899 100644 (file)
@@ -4,9 +4,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     js_to_json,
     smuggle_url,
+    try_get,
 )
 
 
@@ -25,8 +27,22 @@ class CBCIE(InfoExtractor):
             'upload_date': '20160203',
             'uploader': 'CBCC-NEW',
         },
+        'skip': 'Geo-restricted to Canada',
     }, {
-        # with clipId
+        # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com
+        'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4',
+        'md5': '162adfa070274b144f4fdc3c3b8207db',
+        'info_dict': {
+            'id': '2414435309',
+            'ext': 'mp4',
+            'title': '22 Minutes Update: What Not To Wear Quebec',
+            'description': "This week's latest Canadian top political story is What Not To Wear Quebec.",
+            'upload_date': '20131025',
+            'uploader': 'CBCC-NEW',
+            'timestamp': 1382717907,
+        },
+    }, {
+        # with clipId, feed only available via tpfeed.cbc.ca
         'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
         'md5': '0274a90b51a9b4971fe005c63f592f12',
         'info_dict': {
@@ -64,6 +80,7 @@ class CBCIE(InfoExtractor):
                 'uploader': 'CBCC-NEW',
             },
         }],
+        'skip': 'Geo-restricted to Canada',
     }]
 
     @classmethod
@@ -81,9 +98,15 @@ class CBCIE(InfoExtractor):
             media_id = player_info.get('mediaId')
             if not media_id:
                 clip_id = player_info['clipId']
-                media_id = self._download_json(
-                    'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
-                    clip_id)['entries'][0]['id'].split('/')[-1]
+                feed = self._download_json(
+                    'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id,
+                    clip_id, fatal=False)
+                if feed:
+                    media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str)
+                if not media_id:
+                    media_id = self._download_json(
+                        'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id,
+                        clip_id)['entries'][0]['id'].split('/')[-1]
             return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
         else:
             entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]
@@ -104,6 +127,7 @@ class CBCPlayerIE(InfoExtractor):
             'upload_date': '20160210',
             'uploader': 'CBCC-NEW',
         },
+        'skip': 'Geo-restricted to Canada',
     }, {
         # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/
         'url': 'http://www.cbc.ca/player/play/2657631896',
index 0011c30296971486e66fc121d82f7c9be8c53164..821db20b23052ca71d594c6c05ad705a400129a3 100644 (file)
@@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE):
 
         media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId'])
         formats, subtitles = [], {}
-        if site == 'cnet':
-            formats, subtitles = self._extract_theplatform_smil(
-                self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id)
         for (fkey, vid) in vdata['files'].items():
             if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
                 continue
@@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE):
             subtitles = self._merge_subtitles(subtitles, tp_subtitles)
         self._sort_formats(formats)
 
-        info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id)
+        info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id)
         info.update({
             'id': video_id,
             'display_id': display_id,
index 74adb38a6cbac2cdda2a87dff49876389c9cf75e..4bcd104af7463b1cc4b9c6c88673cafc9ad655e7 100644 (file)
@@ -1,12 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import calendar
-import datetime
-
 from .anvato import AnvatoIE
 from .sendtonews import SendtoNewsIE
 from ..compat import compat_urlparse
+from ..utils import unified_timestamp
 
 
 class CBSLocalIE(AnvatoIE):
@@ -43,13 +41,8 @@ class CBSLocalIE(AnvatoIE):
         'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/',
         'info_dict': {
             'id': 'GxfCe0Zo7D-175909-5588',
-            'ext': 'mp4',
-            'title': 'Recap: CLE 15, CIN 6',
-            'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
-            'upload_date': '20160516',
-            'timestamp': 1463433840,
-            'duration': 49,
         },
+        'playlist_count': 9,
         'params': {
             # m3u8 download
             'skip_download': True,
@@ -62,19 +55,15 @@ class CBSLocalIE(AnvatoIE):
 
         sendtonews_url = SendtoNewsIE._extract_url(webpage)
         if sendtonews_url:
-            info_dict = {
-                '_type': 'url_transparent',
-                'url': compat_urlparse.urljoin(url, sendtonews_url),
-            }
-        else:
-            info_dict = self._extract_anvato_videos(webpage, display_id)
+            return self.url_result(
+                compat_urlparse.urljoin(url, sendtonews_url),
+                ie=SendtoNewsIE.ie_key())
+
+        info_dict = self._extract_anvato_videos(webpage, display_id)
 
         time_str = self._html_search_regex(
             r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False)
-        timestamp = None
-        if time_str:
-            timestamp = calendar.timegm(datetime.datetime.strptime(
-                time_str, '%b %d, %Y %I:%M %p').timetuple())
+        timestamp = unified_timestamp(time_str)
 
         info_dict.update({
             'display_id': display_id,
index 387537e766fc4886201285f221022181b2beeb53..9d3b75526395ee946ef660c14e0629019e10c1e2 100644 (file)
@@ -26,6 +26,7 @@ class CBSNewsIE(CBSBaseIE):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'Subscribers only',
         },
         {
             'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/',
@@ -69,6 +70,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
     IE_DESC = 'CBS News Live Videos'
     _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'
 
+    # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples
     _TEST = {
         'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
         'info_dict': {
@@ -77,6 +79,7 @@ class CBSNewsLiveVideoIE(InfoExtractor):
             'title': 'Clinton, Sanders Prepare To Face Off In NH',
             'duration': 334,
         },
+        'skip': 'Video gone',
     }
 
     def _real_extract(self, url):
index b2234549e8b6747cd76d6d1936af893522e59cae..29a8820d5835b1b3cf7aca3840705a2fb2f2e1e3 100644 (file)
@@ -17,7 +17,8 @@ class ChaturbateIE(InfoExtractor):
         },
         'params': {
             'skip_download': True,
-        }
+        },
+        'skip': 'Room is offline',
     }, {
         'url': 'https://en.chaturbate.com/siswet19/',
         'only_matching': True,
index b1eeaf101dda3a4a962862fa854db97ae1809329..b435186523454e0873a8adedda2f80c1a8a6ecbe 100644 (file)
@@ -1,30 +1,33 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import base64
+
 from .common import InfoExtractor
-from ..utils import (
-    parse_duration,
-    int_or_none,
-)
+from ..utils import parse_duration
 
 
 class ChirbitIE(InfoExtractor):
     IE_NAME = 'chirbit'
     _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
     _TESTS = [{
-        'url': 'http://chirb.it/PrIPv5',
-        'md5': '9847b0dad6ac3e074568bf2cfb197de8',
+        'url': 'http://chirb.it/be2abG',
         'info_dict': {
-            'id': 'PrIPv5',
+            'id': 'be2abG',
             'ext': 'mp3',
-            'title': 'Фасадстрой',
-            'duration': 52,
-            'view_count': int,
-            'comment_count': int,
+            'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
+            'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
+            'duration': 306,
+        },
+        'params': {
+            'skip_download': True,
         }
     }, {
         'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
         'only_matching': True,
+    }, {
+        'url': 'https://chirb.it/wp/MN58c2',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -33,27 +36,30 @@ class ChirbitIE(InfoExtractor):
         webpage = self._download_webpage(
             'http://chirb.it/%s' % audio_id, audio_id)
 
-        audio_url = self._search_regex(
-            r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url')
+        data_fd = self._search_regex(
+            r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'data fd', group='url')
+
+        # Reverse engineered from https://chirb.it/js/chirbit.player.js (look
+        # for soundURL)
+        audio_url = base64.b64decode(
+            data_fd[::-1].encode('ascii')).decode('utf-8')
 
         title = self._search_regex(
-            r'itemprop="name">([^<]+)', webpage, 'title')
-        duration = parse_duration(self._html_search_meta(
-            'duration', webpage, 'duration', fatal=False))
-        view_count = int_or_none(self._search_regex(
-            r'itemprop="playCount"\s*>(\d+)', webpage,
-            'listen count', fatal=False))
-        comment_count = int_or_none(self._search_regex(
-            r'>(\d+) Comments?:', webpage,
-            'comment count', fatal=False))
+            r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
+        description = self._search_regex(
+            r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
+            webpage, 'description', default=None)
+        duration = parse_duration(self._search_regex(
+            r'class=["\']c-length["\'][^>]*>([^<]+)',
+            webpage, 'duration', fatal=False))
 
         return {
             'id': audio_id,
             'url': audio_url,
             'title': title,
+            'description': description,
             'duration': duration,
-            'view_count': view_count,
-            'comment_count': comment_count,
         }
 
 
index 19f8b397e44a679ea936ad638048ccb488dc4b93..252c2e846969c96d733911f2f471054286ec0777 100644 (file)
@@ -23,7 +23,7 @@ class CliphunterIE(InfoExtractor):
         (?P<id>[0-9]+)/
         (?P<seo>.+?)(?:$|[#\?])
     '''
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo',
         'md5': 'b7c9bbd4eb3a226ab91093714dcaa480',
         'info_dict': {
@@ -32,8 +32,19 @@ class CliphunterIE(InfoExtractor):
             'title': 'Fun Jynx Maze solo',
             'thumbnail': 're:^https?://.*\.jpg$',
             'age_limit': 18,
-        }
-    }
+        },
+        'skip': 'Video gone',
+    }, {
+        'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz',
+        'md5': '55a723c67bfc6da6b0cfa00d55da8a27',
+        'info_dict': {
+            'id': '2019449',
+            'ext': 'mp4',
+            'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'age_limit': 18,
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 4f9320ea57cb542f69b6b0be1965bb004fa35574..d55b26d59ff89af6ef9ae7943cb67b4346031f66 100644 (file)
@@ -1,16 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    int_or_none,
-    parse_iso8601,
-)
+from .onet import OnetBaseIE
 
 
-class ClipRsIE(InfoExtractor):
+class ClipRsIE(OnetBaseIE):
     _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
     _TEST = {
         'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
@@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        display_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, display_id)
 
-        video_id = self._search_regex(
-            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+        mvp_id = self._search_mvp_id(webpage)
 
-        response = self._download_json(
-            'http://qi.ckm.onetapi.pl/', video_id,
-            query={
-                'body[id]': video_id,
-                'body[jsonrpc]': '2.0',
-                'body[method]': 'get_asset_detail',
-                'body[params][ID_Publikacji]': video_id,
-                'body[params][Service]': 'www.onet.pl',
-                'content-type': 'application/jsonp',
-                'x-onet-app': 'player.front.onetapi.pl',
-            })
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict['display_id'] = display_id
 
-        error = response.get('error')
-        if error:
-            raise ExtractorError(
-                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
-
-        video = response['result'].get('0')
-
-        formats = []
-        for _, formats_dict in video['formats'].items():
-            if not isinstance(formats_dict, dict):
-                continue
-            for format_id, format_list in formats_dict.items():
-                if not isinstance(format_list, list):
-                    continue
-                for f in format_list:
-                    if not f.get('url'):
-                        continue
-                    formats.append({
-                        'url': f['url'],
-                        'format_id': format_id,
-                        'height': int_or_none(f.get('vertical_resolution')),
-                        'width': int_or_none(f.get('horizontal_resolution')),
-                        'abr': float_or_none(f.get('audio_bitrate')),
-                        'vbr': float_or_none(f.get('video_bitrate')),
-                    })
-        self._sort_formats(formats)
-
-        meta = video.get('meta', {})
-
-        title = self._og_search_title(webpage, default=None) or meta['title']
-        description = self._og_search_description(webpage, default=None) or meta.get('description')
-        duration = meta.get('length') or meta.get('lenght')
-        timestamp = parse_iso8601(meta.get('addDate'), ' ')
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'formats': formats,
-        }
+        return info_dict
index 9a28ef35423a5dfd28295333ffc037b8919873ac..ae5ba0015a0e5026f2db38eaa103417ddc8ce1da 100644 (file)
@@ -6,7 +6,6 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
-    compat_urllib_parse_urlencode,
     compat_HTTPError,
 )
 from ..utils import (
@@ -17,37 +16,26 @@ from ..utils import (
 
 
 class CloudyIE(InfoExtractor):
-    _IE_DESC = 'cloudy.ec and videoraj.ch'
+    _IE_DESC = 'cloudy.ec'
     _VALID_URL = r'''(?x)
-        https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/
+        https?://(?:www\.)?cloudy\.ec/
         (?:v/|embed\.php\?id=)
         (?P<id>[A-Za-z0-9]+)
         '''
-    _EMBED_URL = 'http://www.%s/embed.php?id=%s'
-    _API_URL = 'http://www.%s/api/player.api.php?%s'
+    _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s'
+    _API_URL = 'http://www.cloudy.ec/api/player.api.php'
     _MAX_TRIES = 2
-    _TESTS = [
-        {
-            'url': 'https://www.cloudy.ec/v/af511e2527aac',
-            'md5': '5cb253ace826a42f35b4740539bedf07',
-            'info_dict': {
-                'id': 'af511e2527aac',
-                'ext': 'flv',
-                'title': 'Funny Cats and Animals Compilation june 2013',
-            }
-        },
-        {
-            'url': 'http://www.videoraj.to/v/47f399fd8bb60',
-            'md5': '7d0f8799d91efd4eda26587421c3c3b0',
-            'info_dict': {
-                'id': '47f399fd8bb60',
-                'ext': 'flv',
-                'title': 'Burning a New iPhone 5 with Gasoline - Will it Survive?',
-            }
+    _TEST = {
+        'url': 'https://www.cloudy.ec/v/af511e2527aac',
+        'md5': '5cb253ace826a42f35b4740539bedf07',
+        'info_dict': {
+            'id': 'af511e2527aac',
+            'ext': 'flv',
+            'title': 'Funny Cats and Animals Compilation june 2013',
         }
-    ]
+    }
 
-    def _extract_video(self, video_host, video_id, file_key, error_url=None, try_num=0):
+    def _extract_video(self, video_id, file_key, error_url=None, try_num=0):
 
         if try_num > self._MAX_TRIES - 1:
             raise ExtractorError('Unable to extract video URL', expected=True)
@@ -64,9 +52,8 @@ class CloudyIE(InfoExtractor):
                 'errorUrl': error_url,
             })
 
-        data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form))
         player_data = self._download_webpage(
-            data_url, video_id, 'Downloading player data')
+            self._API_URL, video_id, 'Downloading player data', query=form)
         data = compat_parse_qs(player_data)
 
         try_num += 1
@@ -88,7 +75,7 @@ class CloudyIE(InfoExtractor):
             except ExtractorError as e:
                 if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]:
                     self.report_warning('Invalid video URL, requesting another', video_id)
-                    return self._extract_video(video_host, video_id, file_key, video_url, try_num)
+                    return self._extract_video(video_id, file_key, video_url, try_num)
 
         return {
             'id': video_id,
@@ -98,14 +85,13 @@ class CloudyIE(InfoExtractor):
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        video_host = mobj.group('host')
         video_id = mobj.group('id')
 
-        url = self._EMBED_URL % (video_host, video_id)
+        url = self._EMBED_URL % video_id
         webpage = self._download_webpage(url, video_id)
 
         file_key = self._search_regex(
             [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'],
             webpage, 'file_key')
 
-        return self._extract_video(video_host, video_id, file_key)
+        return self._extract_video(video_id, file_key)
index f1311b14f8f1c572c9647bcd36de3f19de676373..f24568dcc25740f7814c134f9e58e659e7f11855 100644 (file)
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
+
 from .mtv import MTVIE
+from ..utils import ExtractorError
 
 
 class CMTIE(MTVIE):
@@ -16,7 +18,27 @@ class CMTIE(MTVIE):
             'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"',
             'description': 'Blame It All On My Roots',
         },
+        'skip': 'Video not available',
+    }, {
+        'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908',
+        'md5': 'e61a801ca4a183a466c08bd98dccbb1c',
+        'info_dict': {
+            'id': '1504699',
+            'ext': 'mp4',
+            'title': 'Still The King Ep. 109 in 3 Minutes',
+            'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.',
+            'timestamp': 1469421000.0,
+            'upload_date': '20160725',
+        },
     }, {
         'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172',
         'only_matching': True,
     }]
+
+    @classmethod
+    def _transform_rtmp_url(cls, rtmp_video_url):
+        if 'error_not_available.swf' in rtmp_video_url:
+            raise ExtractorError(
+                '%s said: video is not available' % cls.IE_NAME, expected=True)
+
+        return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url)
index 2b6aaa3aa47541669a59b7936477023d54ae7437..88346dde7754a124e2b1d88d5ab8291dca4ca632 100644 (file)
@@ -1,17 +1,7 @@
 from __future__ import unicode_literals
 
-import re
-
 from .mtv import MTVServicesInfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urllib_parse_urlencode,
-)
-from ..utils import (
-    ExtractorError,
-    float_or_none,
-    unified_strdate,
-)
+from .common import InfoExtractor
 
 
 class ComedyCentralIE(MTVServicesInfoExtractor):
@@ -26,8 +16,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
         'info_dict': {
             'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354',
             'ext': 'mp4',
-            'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother',
+            'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother',
             'description': 'After a certain point, breastfeeding becomes c**kblocking.',
+            'timestamp': 1376798400,
+            'upload_date': '20130818',
         },
     }, {
         'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview',
@@ -35,241 +27,92 @@ class ComedyCentralIE(MTVServicesInfoExtractor):
     }]
 
 
-class ComedyCentralShowsIE(MTVServicesInfoExtractor):
-    IE_DESC = 'The Daily Show / The Colbert Report'
-    # urls can be abbreviations like :thedailyshow
-    # urls for episodes like:
-    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
-    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
-    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
-    _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow)
-                      |https?://(:www\.)?
-                          (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/
-                         ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)|
-                          (?P<clip>
-                              (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
-                              |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
-                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
-                          )|
-                          (?P<interview>
-                              extended-interviews/(?P<interID>[0-9a-z]+)/
-                              (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?)
-                              (?:/[^/?#]?|[?#]|$))))
-                     '''
+class ToshIE(MTVServicesInfoExtractor):
+    IE_DESC = 'Tosh.0'
+    _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)'
+    _FEED_URL = 'http://tosh.cc.com/feeds/mrss'
+
     _TESTS = [{
-        'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart',
-        'md5': '4e2f5cb088a83cd8cdb7756132f9739d',
-        'info_dict': {
-            'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55',
-            'ext': 'mp4',
-            'upload_date': '20121213',
-            'description': 'Kristen Stewart learns to let loose in "On the Road."',
-            'uploader': 'thedailyshow',
-            'title': 'thedailyshow kristen-stewart part 1',
-        }
-    }, {
-        'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview',
+        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
         'info_dict': {
-            'id': 'sarah-chayes-extended-interview',
-            'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."',
-            'title': 'thedailyshow Sarah Chayes Extended Interview',
+            'description': 'Tosh asked fans to share their summer plans.',
+            'title': 'Twitter Users Share Summer Plans',
         },
-        'playlist': [
-            {
-                'info_dict': {
-                    'id': '0baad492-cbec-4ec1-9e50-ad91c291127f',
-                    'ext': 'mp4',
-                    'upload_date': '20150129',
-                    'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."',
-                    'uploader': 'thedailyshow',
-                    'title': 'thedailyshow sarah-chayes-extended-interview part 1',
+        'playlist': [{
+            'md5': 'f269e88114c1805bb6d7653fecea9e06',
+            'info_dict': {
+                'id': '90498ec2-ed00-11e0-aca6-0026b9414f30',
+                'ext': 'mp4',
+                'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans',
+                'description': 'Tosh asked fans to share their summer plans.',
+                'thumbnail': 're:^https?://.*\.jpg',
+                # It's really reported to be published on year 2077
+                'upload_date': '20770610',
+                'timestamp': 3390510600,
+                'subtitles': {
+                    'en': 'mincount:3',
                 },
             },
-            {
-                'info_dict': {
-                    'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283',
-                    'ext': 'mp4',
-                    'upload_date': '20150129',
-                    'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."',
-                    'uploader': 'thedailyshow',
-                    'title': 'thedailyshow sarah-chayes-extended-interview part 2',
-                },
-            },
-        ],
+        }]
+    }, {
+        'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def _transform_rtmp_url(cls, rtmp_video_url):
+        new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url)
+        new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm')
+        return new_urls
+
+
+class ComedyCentralTVIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4',
+        'info_dict': {
+            'id': 'local_playlist-f99b626bdfe13568579a',
+            'ext': 'flv',
+            'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1',
+        },
         'params': {
+            # rtmp download
             'skip_download': True,
         },
     }, {
-        'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview',
-        'only_matching': True,
-    }, {
-        'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news',
-        'only_matching': True,
-    }, {
-        'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114',
-        'only_matching': True,
-    }, {
-        'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3',
-        'only_matching': True,
-    }, {
-        'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary',
-        'only_matching': True,
-    }, {
-        'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall',
+        'url': 'http://www.comedycentral.tv/shows/1074-workaholics',
         'only_matching': True,
     }, {
-        'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights',
-        'only_matching': True,
-    }, {
-        'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo',
-        'only_matching': True,
-    }, {
-        'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food',
-        'only_matching': True,
-    }, {
-        'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel',
-        'only_matching': True,
-    }, {
-        'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans',
+        'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus',
         'only_matching': True,
     }]
 
-    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
-
-    _video_extensions = {
-        '3500': 'mp4',
-        '2200': 'mp4',
-        '1700': 'mp4',
-        '1200': 'mp4',
-        '750': 'mp4',
-        '400': 'mp4',
-    }
-    _video_dimensions = {
-        '3500': (1280, 720),
-        '2200': (960, 540),
-        '1700': (768, 432),
-        '1200': (640, 360),
-        '750': (512, 288),
-        '400': (384, 216),
-    }
-
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        if mobj.group('shortname'):
-            return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes')
-
-        if mobj.group('clip'):
-            if mobj.group('videotitle'):
-                epTitle = mobj.group('videotitle')
-            elif mobj.group('showname') == 'thedailyshow':
-                epTitle = mobj.group('tdstitle')
-            else:
-                epTitle = mobj.group('cntitle')
-            dlNewest = False
-        elif mobj.group('interview'):
-            epTitle = mobj.group('interview_title')
-            dlNewest = False
-        else:
-            dlNewest = not mobj.group('episode')
-            if dlNewest:
-                epTitle = mobj.group('showname')
-            else:
-                epTitle = mobj.group('episode')
-        show_name = mobj.group('showname')
-
-        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
-        if dlNewest:
-            url = htmlHandle.geturl()
-            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
-            if mobj is None:
-                raise ExtractorError('Invalid redirected URL: ' + url)
-            if mobj.group('episode') == '':
-                raise ExtractorError('Redirected URL is still not specific: ' + url)
-            epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1]
+        video_id = self._match_id(url)
 
-        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
-        if len(mMovieParams) == 0:
-            # The Colbert Report embeds the information in a without
-            # a URL prefix; so extract the alternate reference
-            # and then add the URL prefix manually.
+        webpage = self._download_webpage(url, video_id)
 
-            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage)
-            if len(altMovieParams) == 0:
-                raise ExtractorError('unable to find Flash URL in webpage ' + url)
-            else:
-                mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])]
+        mrss_url = self._search_regex(
+            r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'mrss url', group='url')
 
-        uri = mMovieParams[0][1]
-        # Correct cc.com in uri
-        uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri)
+        return self._get_videos_info_from_url(mrss_url, video_id)
 
-        index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri}))
-        idoc = self._download_xml(
-            index_url, epTitle,
-            'Downloading show index', 'Unable to download episode index')
 
-        title = idoc.find('./channel/title').text
-        description = idoc.find('./channel/description').text
-
-        entries = []
-        item_els = idoc.findall('.//item')
-        for part_num, itemEl in enumerate(item_els):
-            upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text)
-            thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url')
-
-            content = itemEl.find('.//{http://search.yahoo.com/mrss/}content')
-            duration = float_or_none(content.attrib.get('duration'))
-            mediagen_url = content.attrib['url']
-            guid = itemEl.find('./guid').text.rpartition(':')[-1]
-
-            cdoc = self._download_xml(
-                mediagen_url, epTitle,
-                'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els)))
-
-            turls = []
-            for rendition in cdoc.findall('.//rendition'):
-                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
-                turls.append(finfo)
-
-            formats = []
-            for format, rtmp_video_url in turls:
-                w, h = self._video_dimensions.get(format, (None, None))
-                formats.append({
-                    'format_id': 'vhttp-%s' % format,
-                    'url': self._transform_rtmp_url(rtmp_video_url),
-                    'ext': self._video_extensions.get(format, 'mp4'),
-                    'height': h,
-                    'width': w,
-                })
-                formats.append({
-                    'format_id': 'rtmp-%s' % format,
-                    'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'),
-                    'ext': self._video_extensions.get(format, 'mp4'),
-                    'height': h,
-                    'width': w,
-                })
-                self._sort_formats(formats)
-
-            subtitles = self._extract_subtitles(cdoc, guid)
-
-            virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1)
-            entries.append({
-                'id': guid,
-                'title': virtual_id,
-                'formats': formats,
-                'uploader': show_name,
-                'upload_date': upload_date,
-                'duration': duration,
-                'thumbnail': thumbnail,
-                'description': description,
-                'subtitles': subtitles,
-            })
+class ComedyCentralShortnameIE(InfoExtractor):
+    _VALID_URL = r'^:(?P<id>tds|thedailyshow)$'
+    _TESTS = [{
+        'url': ':tds',
+        'only_matching': True,
+    }, {
+        'url': ':thedailyshow',
+        'only_matching': True,
+    }]
 
-        return {
-            '_type': 'playlist',
-            'id': epTitle,
-            'entries': entries,
-            'title': show_name + ' ' + title,
-            'description': description,
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        shortcut_map = {
+            'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
+            'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes',
         }
+        return self.url_result(shortcut_map[video_id])
index 5a2603b509244810e5a19d8adb2b124d3a5c2d78..9427ff4499243d5236e5349c7cc98c11b9413fb9 100644 (file)
@@ -44,6 +44,7 @@ from ..utils import (
     sanitized_Request,
     unescapeHTML,
     unified_strdate,
+    unified_timestamp,
     url_basename,
     xpath_element,
     xpath_text,
@@ -54,6 +55,8 @@ from ..utils import (
     update_Request,
     update_url_query,
     parse_m3u8_attributes,
+    extract_attributes,
+    parse_codecs,
 )
 
 
@@ -161,6 +164,7 @@ class InfoExtractor(object):
                         * "height" (optional, int)
                         * "resolution" (optional, string "{width}x{height"},
                                         deprecated)
+                        * "filesize" (optional, int)
     thumbnail:      Full URL to a video thumbnail image.
     description:    Full video description.
     uploader:       Full name of the video uploader.
@@ -658,6 +662,24 @@ class InfoExtractor(object):
         else:
             return res
 
+    def _get_netrc_login_info(self, netrc_machine=None):
+        username = None
+        password = None
+        netrc_machine = netrc_machine or self._NETRC_MACHINE
+
+        if self._downloader.params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(netrc_machine)
+                if info is not None:
+                    username = info[0]
+                    password = info[2]
+                else:
+                    raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+
+        return (username, password)
+
     def _get_login_info(self):
         """
         Get the login info as (username, password)
@@ -675,16 +697,8 @@ class InfoExtractor(object):
         if downloader_params.get('username') is not None:
             username = downloader_params['username']
             password = downloader_params['password']
-        elif downloader_params.get('usenetrc', False):
-            try:
-                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
-            except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+        else:
+            username, password = self._get_netrc_login_info()
 
         return (username, password)
 
@@ -723,9 +737,14 @@ class InfoExtractor(object):
                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 
     def _og_search_property(self, prop, html, name=None, **kargs):
+        if not isinstance(prop, (list, tuple)):
+            prop = [prop]
         if name is None:
-            name = 'OpenGraph %s' % prop
-        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+            name = 'OpenGraph %s' % prop[0]
+        og_regexes = []
+        for p in prop:
+            og_regexes.extend(self._og_regexes(p))
+        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
         if escaped is None:
             return None
         return unescapeHTML(escaped)
@@ -749,10 +768,12 @@ class InfoExtractor(object):
         return self._og_search_property('url', html, **kargs)
 
     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+        if not isinstance(name, (list, tuple)):
+            name = [name]
         if display_name is None:
-            display_name = name
+            display_name = name[0]
         return self._html_search_regex(
-            self._meta_regex(name),
+            [self._meta_regex(n) for n in name],
             html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
@@ -801,40 +822,66 @@ class InfoExtractor(object):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
 
-    def _search_json_ld(self, html, video_id, **kwargs):
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
         json_ld = self._search_regex(
             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
             html, 'JSON-LD', group='json_ld', **kwargs)
+        default = kwargs.get('default', NO_DEFAULT)
         if not json_ld:
-            return {}
-        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
-
-    def _json_ld(self, json_ld, video_id, fatal=True):
+            return default if default is not NO_DEFAULT else {}
+        # JSON-LD may be malformed and thus `fatal` should be respected.
+        # At the same time `default` may be passed that assumes `fatal=False`
+        # for _search_regex. Let's simulate the same behavior here as well.
+        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
+        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+
+    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
         if isinstance(json_ld, compat_str):
             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
         if not json_ld:
             return {}
         info = {}
-        if json_ld.get('@context') == 'http://schema.org':
-            item_type = json_ld.get('@type')
-            if item_type == 'TVEpisode':
-                info.update({
-                    'episode': unescapeHTML(json_ld.get('name')),
-                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
-                    'description': unescapeHTML(json_ld.get('description')),
-                })
-                part_of_season = json_ld.get('partOfSeason')
-                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
-                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
-                part_of_series = json_ld.get('partOfSeries')
-                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
-                    info['series'] = unescapeHTML(part_of_series.get('name'))
-            elif item_type == 'Article':
-                info.update({
-                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
-                    'title': unescapeHTML(json_ld.get('headline')),
-                    'description': unescapeHTML(json_ld.get('articleBody')),
-                })
+        if not isinstance(json_ld, (list, tuple, dict)):
+            return info
+        if isinstance(json_ld, dict):
+            json_ld = [json_ld]
+        for e in json_ld:
+            if e.get('@context') == 'http://schema.org':
+                item_type = e.get('@type')
+                if expected_type is not None and expected_type != item_type:
+                    return info
+                if item_type == 'TVEpisode':
+                    info.update({
+                        'episode': unescapeHTML(e.get('name')),
+                        'episode_number': int_or_none(e.get('episodeNumber')),
+                        'description': unescapeHTML(e.get('description')),
+                    })
+                    part_of_season = e.get('partOfSeason')
+                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
+                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
+                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
+                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
+                        info['series'] = unescapeHTML(part_of_series.get('name'))
+                elif item_type == 'Article':
+                    info.update({
+                        'timestamp': parse_iso8601(e.get('datePublished')),
+                        'title': unescapeHTML(e.get('headline')),
+                        'description': unescapeHTML(e.get('articleBody')),
+                    })
+                elif item_type == 'VideoObject':
+                    info.update({
+                        'url': e.get('contentUrl'),
+                        'title': unescapeHTML(e.get('name')),
+                        'description': unescapeHTML(e.get('description')),
+                        'thumbnail': e.get('thumbnailUrl'),
+                        'duration': parse_duration(e.get('duration')),
+                        'timestamp': unified_timestamp(e.get('uploadDate')),
+                        'filesize': float_or_none(e.get('contentSize')),
+                        'tbr': int_or_none(e.get('bitrate')),
+                        'width': int_or_none(e.get('width')),
+                        'height': int_or_none(e.get('height')),
+                    })
+                break
         return dict((k, v) for k, v in info.items() if v is not None)
 
     @staticmethod
@@ -876,7 +923,11 @@ class InfoExtractor(object):
                 f['ext'] = determine_ext(f['url'])
 
             if isinstance(field_preference, (list, tuple)):
-                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+                return tuple(
+                    f.get(field)
+                    if f.get(field) is not None
+                    else ('' if field == 'format_id' else -1)
+                    for field in field_preference)
 
             preference = f.get('preference')
             if preference is None:
@@ -884,7 +935,8 @@ class InfoExtractor(object):
                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                     preference -= 0.5
 
-            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+            protocol = f.get('protocol') or determine_protocol(f)
+            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
 
             if f.get('vcodec') == 'none':  # audio only
                 preference -= 50
@@ -1101,7 +1153,7 @@ class InfoExtractor(object):
             'url': m3u8_url,
             'ext': ext,
             'protocol': 'm3u8',
-            'preference': preference - 1 if preference else -1,
+            'preference': preference - 100 if preference else -100,
             'resolution': 'multiple',
             'format_note': 'Quality selection URL',
         }
@@ -1180,6 +1232,7 @@ class InfoExtractor(object):
                     'url': format_url(line.strip()),
                     'tbr': tbr,
                     'ext': ext,
+                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                     'protocol': entry_protocol,
                     'preference': preference,
                 }
@@ -1188,24 +1241,17 @@ class InfoExtractor(object):
                     width_str, height_str = resolution.split('x')
                     f['width'] = int(width_str)
                     f['height'] = int(height_str)
-                codecs = last_info.get('CODECS')
-                if codecs:
-                    vcodec, acodec = [None] * 2
-                    va_codecs = codecs.split(',')
-                    if len(va_codecs) == 1:
-                        # Audio only entries usually come with single codec and
-                        # no resolution. For more robustness we also check it to
-                        # be mp4 audio.
-                        if not resolution and va_codecs[0].startswith('mp4a'):
-                            vcodec, acodec = 'none', va_codecs[0]
-                        else:
-                            vcodec = va_codecs[0]
-                    else:
-                        vcodec, acodec = va_codecs[:2]
+                # Unified Streaming Platform
+                mobj = re.search(
+                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+                if mobj:
+                    abr, vbr = mobj.groups()
+                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                     f.update({
-                        'acodec': acodec,
-                        'vcodec': vcodec,
+                        'vbr': vbr,
+                        'abr': abr,
                     })
+                f.update(parse_codecs(last_info.get('CODECS')))
                 if last_media is not None:
                     f['m3u8_media'] = last_media
                     last_media = None
@@ -1460,6 +1506,13 @@ class InfoExtractor(object):
             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
 
     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+        """
+        Parse formats from MPD manifest.
+        References:
+         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
+            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
+         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+        """
         if mpd_doc.get('type') == 'dynamic':
             return []
 
@@ -1492,8 +1545,16 @@ class InfoExtractor(object):
                         s_e = segment_timeline.findall(_add_ns('S'))
                         if s_e:
                             ms_info['total_number'] = 0
+                            ms_info['s'] = []
                             for s in s_e:
-                                ms_info['total_number'] += 1 + int(s.get('r', '0'))
+                                r = int(s.get('r', 0))
+                                ms_info['total_number'] += 1 + r
+                                ms_info['s'].append({
+                                    't': int(s.get('t', 0)),
+                                    # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+                                    'd': int(s.attrib['d']),
+                                    'r': r,
+                                })
                     else:
                         timescale = segment_template.get('timescale')
                         if timescale:
@@ -1530,7 +1591,7 @@ class InfoExtractor(object):
                         continue
                     representation_attrib = adaptation_set.attrib.copy()
                     representation_attrib.update(representation.attrib)
-                    # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
+                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                     mime_type = representation_attrib['mimeType']
                     content_type = mime_type.split('/')[0]
                     if content_type == 'text':
@@ -1574,16 +1635,40 @@ class InfoExtractor(object):
                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                             media_template = representation_ms_info['media_template']
                             media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
-                            media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
                             media_template.replace('$$', '$')
-                            representation_ms_info['segment_urls'] = [
-                                media_template % {
-                                    'Number': segment_number,
-                                    'Bandwidth': representation_attrib.get('bandwidth')}
-                                for segment_number in range(
-                                    representation_ms_info['start_number'],
-                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+
+                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+                            # can't be used at the same time
+                            if '%(Number' in media_template:
+                                representation_ms_info['segment_urls'] = [
+                                    media_template % {
+                                        'Number': segment_number,
+                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                    }
+                                    for segment_number in range(
+                                        representation_ms_info['start_number'],
+                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                            else:
+                                representation_ms_info['segment_urls'] = []
+                                segment_time = 0
+
+                                def add_segment_url():
+                                    representation_ms_info['segment_urls'].append(
+                                        media_template % {
+                                            'Time': segment_time,
+                                            'Bandwidth': representation_attrib.get('bandwidth'),
+                                        }
+                                    )
+
+                                for num, s in enumerate(representation_ms_info['s']):
+                                    segment_time = s.get('t') or segment_time
+                                    add_segment_url()
+                                    for r in range(s.get('r', 0)):
+                                        segment_time += s['d']
+                                        add_segment_url()
+                                    segment_time += s['d']
                         if 'segment_urls' in representation_ms_info:
                             f.update({
                                 'segment_urls': representation_ms_info['segment_urls'],
@@ -1610,6 +1695,62 @@ class InfoExtractor(object):
                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
         return formats
 
+    def _parse_html5_media_entries(self, base_url, webpage):
+        def absolute_url(video_url):
+            return compat_urlparse.urljoin(base_url, video_url)
+
+        def parse_content_type(content_type):
+            if not content_type:
+                return {}
+            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+            if ctr:
+                mimetype, codecs = ctr.groups()
+                f = parse_codecs(codecs)
+                f['ext'] = mimetype2ext(mimetype)
+                return f
+            return {}
+
+        entries = []
+        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+            media_info = {
+                'formats': [],
+                'subtitles': {},
+            }
+            media_attributes = extract_attributes(media_tag)
+            src = media_attributes.get('src')
+            if src:
+                media_info['formats'].append({
+                    'url': absolute_url(src),
+                    'vcodec': 'none' if media_type == 'audio' else None,
+                })
+            media_info['thumbnail'] = media_attributes.get('poster')
+            if media_content:
+                for source_tag in re.findall(r'<source[^>]+>', media_content):
+                    source_attributes = extract_attributes(source_tag)
+                    src = source_attributes.get('src')
+                    if not src:
+                        continue
+                    f = parse_content_type(source_attributes.get('type'))
+                    f.update({
+                        'url': absolute_url(src),
+                        'vcodec': 'none' if media_type == 'audio' else None,
+                    })
+                    media_info['formats'].append(f)
+                for track_tag in re.findall(r'<track[^>]+>', media_content):
+                    track_attributes = extract_attributes(track_tag)
+                    kind = track_attributes.get('kind')
+                    if not kind or kind == 'subtitles':
+                        src = track_attributes.get('src')
+                        if not src:
+                            continue
+                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+                        media_info['subtitles'].setdefault(lang, []).append({
+                            'url': absolute_url(src),
+                        })
+            if media_info['formats']:
+                entries.append(media_info)
+        return entries
+
     def _live_title(self, name):
         """ Generate the title for a live video """
         now = datetime.datetime.now()
@@ -1670,7 +1811,7 @@ class InfoExtractor(object):
 
         any_restricted = False
         for tc in self.get_testcases(include_onlymatching=False):
-            if 'playlist' in tc:
+            if tc.get('playlist', []):
                 tc = tc['playlist'][0]
             is_restricted = age_restricted(
                 tc.get('info_dict', {}).get('age_limit'), age_limit)
@@ -1723,6 +1864,13 @@ class InfoExtractor(object):
     def _mark_watched(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def geo_verification_headers(self):
+        headers = {}
+        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+        if geo_verification_proxy:
+            headers['Ytdl-request-proxy'] = geo_verification_proxy
+        return headers
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
index e8f2b5a07591410c16fe6fe096678a12006abe48..8d8f605980bdf531c49a0c5d5067223d1f41dc4d 100644 (file)
@@ -5,13 +5,17 @@ import re
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
     compat_urlparse,
 )
 from ..utils import (
     orderedSet,
     remove_end,
+    extract_attributes,
+    mimetype2ext,
+    determine_ext,
+    int_or_none,
+    parse_iso8601,
 )
 
 
@@ -58,6 +62,9 @@ class CondeNastIE(InfoExtractor):
             'ext': 'mp4',
             'title': '3D Printed Speakers Lit With LED',
             'description': 'Check out these beautiful 3D printed LED speakers.  You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
+            'uploader': 'wired',
+            'upload_date': '20130314',
+            'timestamp': 1363219200,
         }
     }, {
         # JS embed
@@ -67,70 +74,93 @@ class CondeNastIE(InfoExtractor):
             'id': '55f9cf8b61646d1acf00000c',
             'ext': 'mp4',
             'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
+            'uploader': 'arstechnica',
+            'upload_date': '20150916',
+            'timestamp': 1442434955,
         }
     }]
 
     def _extract_series(self, url, webpage):
-        title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
-                                        webpage, 'series title', flags=re.DOTALL)
+        title = self._html_search_regex(
+            r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>',
+            webpage, 'series title')
         url_object = compat_urllib_parse_urlparse(url)
         base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
-        m_paths = re.finditer(r'<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]',
-                              webpage, flags=re.DOTALL)
+        m_paths = re.finditer(
+            r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage)
         paths = orderedSet(m.group(1) for m in m_paths)
         build_url = lambda path: compat_urlparse.urljoin(base_url, path)
         entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
         return self.playlist_result(entries, playlist_title=title)
 
     def _extract_video(self, webpage, url_type):
-        if url_type != 'embed':
-            description = self._html_search_regex(
-                [
-                    r'<div class="cne-video-description">(.+?)</div>',
-                    r'<div class="video-post-content">(.+?)</div>',
-                ],
-                webpage, 'description', fatal=False, flags=re.DOTALL)
+        query = {}
+        params = self._search_regex(
+            r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
+        if params:
+            query.update({
+                'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'),
+                'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'),
+                'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'),
+            })
         else:
-            description = None
-        params = self._search_regex(r'var params = {(.+?)}[;,]', webpage,
-                                    'player params', flags=re.DOTALL)
-        video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id')
-        player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id')
-        target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target')
-        data = compat_urllib_parse_urlencode({'videoId': video_id,
-                                              'playerId': player_id,
-                                              'target': target,
-                                              })
-        base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]',
-                                           webpage, 'base info url',
-                                           default='http://player.cnevids.com/player/loader.js?')
-        info_url = base_info_url + data
-        info_page = self._download_webpage(info_url, video_id,
-                                           'Downloading video info')
-        video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info')
-        video_info = self._parse_json(video_info, video_id)
-
-        formats = [{
-            'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']),
-            'url': fdata['src'],
-            'ext': fdata['type'].split('/')[-1],
-            'quality': 1 if fdata['quality'] == 'high' else 0,
-        } for fdata in video_info['sources'][0]]
+            params = extract_attributes(self._search_regex(
+                r'(<[^>]+data-js="video-player"[^>]+>)',
+                webpage, 'player params element'))
+            query.update({
+                'videoId': params['data-video'],
+                'playerId': params['data-player'],
+                'target': params['id'],
+            })
+        video_id = query['videoId']
+        video_info = None
+        info_page = self._download_webpage(
+            'http://player.cnevids.com/player/video.js',
+            video_id, 'Downloading video info', query=query, fatal=False)
+        if info_page:
+            video_info = self._parse_json(self._search_regex(
+                r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video']
+        else:
+            info_page = self._download_webpage(
+                'http://player.cnevids.com/player/loader.js',
+                video_id, 'Downloading loader info', query=query)
+            video_info = self._parse_json(self._search_regex(
+                r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id)
+        title = video_info['title']
+
+        formats = []
+        for fdata in video_info.get('sources', [{}])[0]:
+            src = fdata.get('src')
+            if not src:
+                continue
+            ext = mimetype2ext(fdata.get('type')) or determine_ext(src)
+            quality = fdata.get('quality')
+            formats.append({
+                'format_id': ext + ('-%s' % quality if quality else ''),
+                'url': src,
+                'ext': ext,
+                'quality': 1 if quality == 'high' else 0,
+            })
         self._sort_formats(formats)
 
-        return {
+        info = self._search_json_ld(
+            webpage, video_id, fatal=False) if url_type != 'embed' else {}
+        info.update({
             'id': video_id,
             'formats': formats,
-            'title': video_info['title'],
-            'thumbnail': video_info['poster_frame'],
-            'description': description,
-        }
+            'title': title,
+            'thumbnail': video_info.get('poster_frame'),
+            'uploader': video_info.get('brand'),
+            'duration': int_or_none(video_info.get('duration')),
+            'tags': video_info.get('tags'),
+            'series': video_info.get('series_title'),
+            'season': video_info.get('season_title'),
+            'timestamp': parse_iso8601(video_info.get('premiere_date')),
+        })
+        return info
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        site = mobj.group('site')
-        url_type = mobj.group('type')
-        item_id = mobj.group('id')
+        site, url_type, item_id = re.match(self._VALID_URL, url).groups()
 
         # Convert JS embed to regular embed
         if url_type == 'embedjs':
index 90a64303d3282c925d1620787922622e99a8b612..6d3abb52f735de655e11ce88c70a4b6b9cc287d6 100644 (file)
@@ -114,6 +114,21 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # rtmp
             'skip_download': True,
         },
+    }, {
+        'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409',
+        'info_dict': {
+            'id': '702409',
+            'ext': 'mp4',
+            'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant',
+            'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'TV TOKYO',
+            'upload_date': '20160508',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
         'only_matching': True,
@@ -336,9 +351,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             if video_encode_id in video_encode_ids:
                 continue
             video_encode_ids.append(video_encode_id)
+
+            video_file = xpath_text(stream_info, './file')
+            if not video_file:
+                continue
+            if video_file.startswith('http'):
+                formats.extend(self._extract_m3u8_formats(
+                    video_file, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                continue
+
             video_url = xpath_text(stream_info, './host')
-            video_play_path = xpath_text(stream_info, './file')
-            if not video_url or not video_play_path:
+            if not video_url:
                 continue
             metadata = stream_info.find('./metadata')
             format_info = {
@@ -353,7 +377,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
                 parsed_video_url = compat_urlparse.urlparse(video_url)
                 direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
                     netloc='v.lvlt.crcdn.net',
-                    path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1])))
+                    path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
                 if self._is_valid_url(direct_video_url, video_id, video_format):
                     format_info.update({
                         'url': direct_video_url,
@@ -363,7 +387,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
             format_info.update({
                 'url': video_url,
-                'play_path': video_play_path,
+                'play_path': video_file,
                 'ext': 'flv',
             })
             formats.append(format_info)
index 84b36f44cfac7bd45a8a7d28adb6767093a7d19b..7e5d4f2276385a363eade175dba78519cea515fe 100644 (file)
@@ -51,8 +51,11 @@ class CSpanIE(InfoExtractor):
         'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers',
         'info_dict': {
             'id': 'judiciary031715',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Immigration Reforms Needed to Protect Skilled American Workers',
+        },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
         }
     }]
 
index 1622fc844a1b8d4794fc12694f03f37c00076f15..83ca90c3b68a66c8c612bd29cda89ae6d91f1478 100644 (file)
@@ -1,13 +1,12 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import parse_iso8601, ExtractorError
+from ..utils import unified_timestamp
 
 
 class CtsNewsIE(InfoExtractor):
     IE_DESC = '華視新聞'
-    # https connection failed (Connection reset)
     _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html'
     _TESTS = [{
         'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html',
@@ -16,7 +15,7 @@ class CtsNewsIE(InfoExtractor):
             'id': '201501291578109',
             'ext': 'mp4',
             'title': '以色列.真主黨交火 3人死亡',
-            'description': 'md5:95e9b295c898b7ff294f09d450178d7d',
+            'description': '以色列和黎巴嫩真主黨,爆發五年最嚴重衝突,雙方砲轟交火,兩名以軍死亡,還有一名西班牙籍的聯合國維和人...',
             'timestamp': 1422528540,
             'upload_date': '20150129',
         }
@@ -28,7 +27,7 @@ class CtsNewsIE(InfoExtractor):
             'id': '201309031304098',
             'ext': 'mp4',
             'title': '韓國31歲童顏男 貌如十多歲小孩',
-            'description': 'md5:f183feeba3752b683827aab71adad584',
+            'description': '越有年紀的人,越希望看起來年輕一點,而南韓卻有一位31歲的男子,看起來像是11、12歲的小孩,身...',
             'thumbnail': 're:^https?://.*\.jpg$',
             'timestamp': 1378205880,
             'upload_date': '20130903',
@@ -36,8 +35,7 @@ class CtsNewsIE(InfoExtractor):
     }, {
         # With Youtube embedded video
         'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html',
-        'md5': '1d842c771dc94c8c3bca5af2cc1db9c5',
-        'add_ie': ['Youtube'],
+        'md5': 'e4726b2ccd70ba2c319865e28f0a91d1',
         'info_dict': {
             'id': 'OVbfO7d0_hQ',
             'ext': 'mp4',
@@ -47,42 +45,37 @@ class CtsNewsIE(InfoExtractor):
             'upload_date': '20150128',
             'uploader_id': 'TBSCTS',
             'uploader': '中華電視公司',
-        }
+        },
+        'add_ie': ['Youtube'],
     }]
 
     def _real_extract(self, url):
         news_id = self._match_id(url)
         page = self._download_webpage(url, news_id)
 
-        if self._search_regex(r'(CTSPlayer2)', page, 'CTSPlayer2 identifier', default=None):
-            feed_url = self._html_search_regex(
-                r'(http://news\.cts\.com\.tw/action/mp4feed\.php\?news_id=\d+)',
-                page, 'feed url')
-            video_url = self._download_webpage(
-                feed_url, news_id, note='Fetching feed')
+        news_id = self._hidden_inputs(page).get('get_id')
+
+        if news_id:
+            mp4_feed = self._download_json(
+                'http://news.cts.com.tw/action/test_mp4feed.php',
+                news_id, note='Fetching feed', query={'news_id': news_id})
+            video_url = mp4_feed['source_url']
         else:
             self.to_screen('Not CTSPlayer video, trying Youtube...')
             youtube_url = self._search_regex(
-                r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url',
-                default=None)
-            if not youtube_url:
-                raise ExtractorError('The news includes no videos!', expected=True)
+                r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url')
 
-            return {
-                '_type': 'url',
-                'url': youtube_url,
-                'ie_key': 'Youtube',
-            }
+            return self.url_result(youtube_url, ie='Youtube')
 
         description = self._html_search_meta('description', page)
-        title = self._html_search_meta('title', page)
+        title = self._html_search_meta('title', page, fatal=True)
         thumbnail = self._html_search_meta('image', page)
 
         datetime_str = self._html_search_regex(
-            r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time')
-        # Transform into ISO 8601 format with timezone info
-        datetime_str = datetime_str.replace('/', '-') + ':00+0800'
-        timestamp = parse_iso8601(datetime_str, delimiter=' ')
+            r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time', fatal=False)
+        timestamp = None
+        if datetime_str:
+            timestamp = unified_timestamp(datetime_str) - 8 * 3600
 
         return {
             'id': news_id,
diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py
new file mode 100644 (file)
index 0000000..5807fba
--- /dev/null
@@ -0,0 +1,30 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class CTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)'
+    _TESTS = [{
+        'url': 'http://www.ctv.ca/video/player?vid=706966',
+        'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0',
+        'info_dict': {
+            'id': '706966',
+            'ext': 'mp4',
+            'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'',
+            'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.',
+            'upload_date': '20150919',
+            'timestamp': 1442624700,
+        },
+        'expected_warnings': ['HTTP Error 404'],
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': '9c9media:ctv_web:%s' % video_id,
+            'ie_key': 'NineCNineMedia',
+        }
diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py
new file mode 100644 (file)
index 0000000..1023b61
--- /dev/null
@@ -0,0 +1,65 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import orderedSet
+
+
+class CTVNewsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)'
+    _TESTS = [{
+        'url': 'http://www.ctvnews.ca/video?clipId=901995',
+        'md5': '10deb320dc0ccb8d01d34d12fc2ea672',
+        'info_dict': {
+            'id': '901995',
+            'ext': 'mp4',
+            'title': 'Extended: \'That person cannot be me\' Johnson says',
+            'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285',
+            'timestamp': 1467286284,
+            'upload_date': '20160630',
+        }
+    }, {
+        'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224',
+        'info_dict':
+        {
+            'id': '1.2966224',
+        },
+        'playlist_mincount': 19,
+    }, {
+        'url': 'http://www.ctvnews.ca/video?binId=1.2876780',
+        'info_dict':
+        {
+            'id': '1.2876780',
+        },
+        'playlist_mincount': 100,
+    }, {
+        'url': 'http://www.ctvnews.ca/1.810401',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        def ninecninemedia_url_result(clip_id):
+            return {
+                '_type': 'url_transparent',
+                'id': clip_id,
+                'url': '9c9media:ctvnews_web:%s' % clip_id,
+                'ie_key': 'NineCNineMedia',
+            }
+
+        if page_id.isdigit():
+            return ninecninemedia_url_result(page_id)
+        else:
+            webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={
+                'ot': 'example.AjaxPageLayout.ot',
+                'maxItemsPerPage': 1000000,
+            })
+            entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet(
+                re.findall(r'clip\.id\s*=\s*(\d+);', webpage))]
+            return self.playlist_result(entries, page_id)
index ebd14cb1638b6309f1522c142502c0dac5d763a2..1ab9333b2b15c4ec96bcf1583aa6d0f491aafb76 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
 
 
 class CWTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
+    _VALID_URL = r'https?://(?:www\.)?cw(?:tv(?:pr)?|seed)\.com/(?:shows/)?(?:[^/]+/)+[^?]*\?.*\b(?:play|watch)=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
     _TESTS = [{
         'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
         'info_dict': {
@@ -28,7 +28,8 @@ class CWTVIE(InfoExtractor):
         'params': {
             # m3u8 download
             'skip_download': True,
-        }
+        },
+        'skip': 'redirect to http://cwtv.com/shows/arrow/',
     }, {
         'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088',
         'info_dict': {
@@ -44,22 +45,43 @@ class CWTVIE(InfoExtractor):
             'upload_date': '20151006',
             'timestamp': 1444107300,
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        }
     }, {
         'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6',
         'only_matching': True,
+    }, {
+        'url': 'http://cwtvpr.com/the-cw/video?watch=9eee3f60-ef4e-440b-b3b2-49428ac9c54e',
+        'only_matching': True,
+    }, {
+        'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?watch=6b15e985-9345-4f60-baf8-56e96be57c63',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        video_data = self._download_json(
-            'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id)
-
-        formats = self._extract_m3u8_formats(
-            video_data['videos']['variantplaylist']['uri'], video_id, 'mp4')
+        video_data = None
+        formats = []
+        for partner in (154, 213):
+            vdata = self._download_json(
+                'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False)
+            if not vdata:
+                continue
+            video_data = vdata
+            for quality, quality_data in vdata.get('videos', {}).items():
+                quality_url = quality_data.get('uri')
+                if not quality_url:
+                    continue
+                if quality == 'variantplaylist':
+                    formats.extend(self._extract_m3u8_formats(
+                        quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                else:
+                    tbr = int_or_none(quality_data.get('bitrate'))
+                    format_id = 'http' + ('-%d' % tbr if tbr else '')
+                    if self._is_valid_url(quality_url, video_id, format_id):
+                        formats.append({
+                            'format_id': format_id,
+                            'url': quality_url,
+                            'tbr': tbr,
+                        })
         self._sort_formats(formats)
 
         thumbnails = [{
index b60a1d813fa2c48c81e44306402b303922b65e88..98c835bf12f0508ac75d57c37a64e89f14543183 100644 (file)
@@ -5,19 +5,20 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     determine_protocol,
+    unescapeHTML,
 )
 
 
 class DailyMailIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
-        'md5': '2f639d446394f53f3a33658b518b6615',
+        'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
+        'md5': 'f6129624562251f628296c3a9ffde124',
         'info_dict': {
-            'id': '1288527',
+            'id': '1295863',
             'ext': 'mp4',
-            'title': 'Turn any video into an impressionist masterpiece',
-            'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+            'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'',
+            'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84',
         }
     }
 
@@ -26,7 +27,7 @@ class DailyMailIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
         video_data = self._parse_json(self._search_regex(
             r"data-opts='({.+?})'", webpage, 'video data'), video_id)
-        title = video_data['title']
+        title = unescapeHTML(video_data['title'])
         video_sources = self._download_json(video_data.get(
             'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
 
@@ -55,7 +56,7 @@ class DailyMailIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'description': video_data.get('descr'),
+            'description': unescapeHTML(video_data.get('descr')),
             'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
             'formats': formats,
         }
index 2e6226ea0774af2e636cbc4b4a4ca9f1ecb763a3..496883d15bbc0b3e1a3503d0545ea1e46c33e6c9 100644 (file)
@@ -16,6 +16,7 @@ from ..utils import (
     sanitized_Request,
     str_to_int,
     unescapeHTML,
+    mimetype2ext,
 )
 
 
@@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         }
     ]
 
+    @staticmethod
+    def _extract_urls(webpage):
+        # Look for embedded Dailymotion player
+        matches = re.findall(
+            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+        return list(map(lambda m: unescapeHTML(m[1]), matches))
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                     type_ = media.get('type')
                     if type_ == 'application/vnd.lumberjack.manifest':
                         continue
-                    ext = determine_ext(media_url)
-                    if type_ == 'application/x-mpegURL' or ext == 'm3u8':
+                    ext = mimetype2ext(type_) or determine_ext(media_url)
+                    if ext == 'm3u8':
                         formats.extend(self._extract_m3u8_formats(
                             media_url, video_id, 'mp4', preference=-1,
                             m3u8_id='hls', fatal=False))
-                    elif type_ == 'application/f4m' or ext == 'f4m':
+                    elif ext == 'f4m':
                         formats.extend(self._extract_f4m_formats(
                             media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
                     else:
                         f = {
                             'url': media_url,
                             'format_id': 'http-%s' % quality,
+                            'ext': ext,
                         }
                         m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
                         if m:
@@ -322,7 +331,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
 
             for video_id in re.findall(r'data-xid="(.+?)"', webpage):
                 if video_id not in video_ids:
-                    yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
+                    yield self.url_result(
+                        'http://www.dailymotion.com/video/%s' % video_id,
+                        DailymotionIE.ie_key(), video_id)
                     video_ids.add(video_id)
 
             if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
index 86024a745661dda2da9d3fb883ccf4db017a722c..b5c310ccb8042c7bfa44c6a909ead398fc679dd4 100644 (file)
@@ -66,22 +66,32 @@ class DaumIE(InfoExtractor):
             'view_count': int,
             'comment_count': int,
         },
+    }, {
+        # Requires dte_type=WEB (#9972)
+        'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU',
+        'md5': 'a8917742069a4dd442516b86e7d66529',
+        'info_dict': {
+            'id': 's3794Uf1NZeZ1qMpGpeqeRU',
+            'ext': 'mp4',
+            'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611',
+            'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회',
+            'upload_date': '20160611',
+        },
     }]
 
     def _real_extract(self, url):
         video_id = compat_urllib_parse_unquote(self._match_id(url))
-        query = compat_urllib_parse_urlencode({'vid': video_id})
         movie_data = self._download_json(
-            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query,
-            video_id, 'Downloading video formats info')
+            'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json',
+            video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'})
 
         # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid
         if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id):
             return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id)
 
         info = self._download_xml(
-            'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
-            'Downloading video info')
+            'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id,
+            'Downloading video info', query={'vid': video_id})
 
         formats = []
         for format_el in movie_data['output_list']['output_list']:
index 133cdc50b8c8379021d6d7fffb7c7c28dd2ba30d..caff8842e9432f5737f7fee247c943c4a0685920 100644 (file)
@@ -4,78 +4,47 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
-    float_or_none,
-    int_or_none,
-    clean_html,
-)
 
 
 class DBTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:(?:lazyplayer|player)/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?'
+    _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:[^/]+/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?'
     _TESTS = [{
         'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen',
-        'md5': 'b89953ed25dacb6edb3ef6c6f430f8bc',
+        'md5': '2e24f67936517b143a234b4cadf792ec',
         'info_dict': {
-            'id': '33100',
+            'id': '3649835190001',
             'display_id': 'Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen',
             'ext': 'mp4',
             'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen',
             'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'timestamp': 1404039863.438,
+            'thumbnail': 're:https?://.*\.jpg',
+            'timestamp': 1404039863,
             'upload_date': '20140629',
             'duration': 69.544,
-            'view_count': int,
-            'categories': list,
-        }
+            'uploader_id': '1027729757001',
+        },
+        'add_ie': ['BrightcoveNew']
     }, {
         'url': 'http://dbtv.no/3649835190001',
         'only_matching': True,
     }, {
         'url': 'http://www.dbtv.no/lazyplayer/4631135248001',
         'only_matching': True,
+    }, {
+        'url': 'http://dbtv.no/vice/5000634109001',
+        'only_matching': True,
+    }, {
+        'url': 'http://dbtv.no/filmtrailer/3359293614001',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id') or video_id
-
-        data = self._download_json(
-            'http://api.dbtv.no/discovery/%s' % video_id, display_id)
-
-        video = data['playlist'][0]
-
-        formats = [{
-            'url': f['URL'],
-            'vcodec': f.get('container'),
-            'width': int_or_none(f.get('width')),
-            'height': int_or_none(f.get('height')),
-            'vbr': float_or_none(f.get('rate'), 1000),
-            'filesize': int_or_none(f.get('size')),
-        } for f in video['renditions'] if 'URL' in f]
-
-        if not formats:
-            for url_key, format_id in [('URL', 'mp4'), ('HLSURL', 'hls')]:
-                if url_key in video:
-                    formats.append({
-                        'url': video[url_key],
-                        'format_id': format_id,
-                    })
-
-        self._sort_formats(formats)
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
 
         return {
-            'id': compat_str(video['id']),
+            '_type': 'url_transparent',
+            'url': 'http://players.brightcove.net/1027729757001/default_default/index.html?videoId=%s' % video_id,
+            'id': video_id,
             'display_id': display_id,
-            'title': video['title'],
-            'description': clean_html(video['desc']),
-            'thumbnail': video.get('splash') or video.get('thumb'),
-            'timestamp': float_or_none(video.get('publishedAt'), 1000),
-            'duration': float_or_none(video.get('length'), 1000),
-            'view_count': int_or_none(video.get('views')),
-            'categories': video.get('tags'),
-            'formats': formats,
+            'ie_key': 'BrightcoveNew',
         }
index efb8585e825c9f5cd746e4601380aa90d09246f7..b8542820a36534fe548aac43ea7f64375396e3ba 100644 (file)
@@ -62,11 +62,9 @@ class DCNBaseIE(InfoExtractor):
                 r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8',
                 r'<a[^>]+href="rtsp(://[^"]+)"'
             ], webpage, 'format url')
-        # TODO: Current DASH formats are broken - $Time$ pattern in
-        # <SegmentTemplate> not implemented yet
-        # formats.extend(self._extract_mpd_formats(
-        #     format_url_base + '/manifest.mpd',
-        #     video_id, mpd_id='dash', fatal=False))
+        formats.extend(self._extract_mpd_formats(
+            format_url_base + '/manifest.mpd',
+            video_id, mpd_id='dash', fatal=False))
         formats.extend(self._extract_m3u8_formats(
             format_url_base + '/playlist.m3u8', video_id, 'mp4',
             m3u8_entry_protocol, m3u8_id='hls', fatal=False))
diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py
new file mode 100644 (file)
index 0000000..adb68b9
--- /dev/null
@@ -0,0 +1,98 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+    parse_age_limit,
+    unescapeHTML,
+)
+
+
+class DiscoveryGoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?discoverygo\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/',
+        'info_dict': {
+            'id': '57a33c536b66d1cd0345eeb1',
+            'ext': 'mp4',
+            'title': 'Kiss First, Ask Questions Later!',
+            'description': 'md5:fe923ba34050eae468bffae10831cb22',
+            'duration': 2579,
+            'series': 'Love at First Kiss',
+            'season_number': 1,
+            'episode_number': 1,
+            'age_limit': 14,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        container = extract_attributes(
+            self._search_regex(
+                r'(<div[^>]+class=["\']video-player-container[^>]+>)',
+                webpage, 'video container'))
+
+        video = self._parse_json(
+            unescapeHTML(container.get('data-video') or container.get('data-json')),
+            display_id)
+
+        title = video['name']
+
+        stream = video['stream']
+        STREAM_URL_SUFFIX = 'streamUrl'
+        formats = []
+        for stream_kind in ('', 'hds'):
+            suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX
+            stream_url = stream.get('%s%s' % (stream_kind, suffix))
+            if not stream_url:
+                continue
+            if stream_kind == '':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, display_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif stream_kind == 'hds':
+                formats.extend(self._extract_f4m_formats(
+                    stream_url, display_id, f4m_id=stream_kind, fatal=False))
+        self._sort_formats(formats)
+
+        video_id = video.get('id') or display_id
+        description = video.get('description', {}).get('detailed')
+        duration = int_or_none(video.get('duration'))
+
+        series = video.get('show', {}).get('name')
+        season_number = int_or_none(video.get('season', {}).get('number'))
+        episode_number = int_or_none(video.get('episodeNumber'))
+
+        tags = video.get('tags')
+        age_limit = parse_age_limit(video.get('parental', {}).get('rating'))
+
+        subtitles = {}
+        captions = stream.get('captions')
+        if isinstance(captions, list):
+            for caption in captions:
+                subtitle_url = caption.get('fileUrl')
+                if (not subtitle_url or not isinstance(subtitle_url, compat_str) or
+                        not subtitle_url.startswith('http')):
+                    continue
+                lang = caption.get('fileLang', 'en')
+                subtitles.setdefault(lang, []).append({'url': subtitle_url})
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'series': series,
+            'season_number': season_number,
+            'episode_number': episode_number,
+            'tags': tags,
+            'age_limit': age_limit,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
index 0040e70d4929828ebf2dc7dc74199ed639dcfebd..908c9e514c41ea72bac0e6f6ede41def4ba0b20b 100644 (file)
@@ -17,8 +17,12 @@ class DreiSatIE(ZDFIE):
                 'ext': 'mp4',
                 'title': 'Waidmannsheil',
                 'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
-                'uploader': '3sat',
+                'uploader': 'SCHWEIZWEIT',
+                'uploader_id': '100000210',
                 'upload_date': '20140913'
+            },
+            'params': {
+                'skip_download': True,  # m3u8 downloads
             }
         },
         {
index 639f9182c5484a22f0056e25fc6aa7e56f193df5..e8870c4607d4e4f0293f134610ed8f1f8d48e956 100644 (file)
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import str_to_int
+from ..utils import (
+    NO_DEFAULT,
+    str_to_int,
+)
 
 
 class DrTuberIE(InfoExtractor):
@@ -17,7 +20,6 @@ class DrTuberIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'hot perky blonde naked golf',
             'like_count': int,
-            'dislike_count': int,
             'comment_count': int,
             'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'],
             'thumbnail': 're:https?://.*\.jpg$',
@@ -36,25 +38,29 @@ class DrTuberIE(InfoExtractor):
             r'<source src="([^"]+)"', webpage, 'video URL')
 
         title = self._html_search_regex(
-            [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'],
+            (r'class="title_watch"[^>]*><p>([^<]+)<',
+             r'<p[^>]+class="title_substrate">([^<]+)</p>',
+             r'<title>([^<]+) - \d+'),
             webpage, 'title')
 
         thumbnail = self._html_search_regex(
             r'poster="([^"]+)"',
             webpage, 'thumbnail', fatal=False)
 
-        def extract_count(id_, name):
+        def extract_count(id_, name, default=NO_DEFAULT):
             return str_to_int(self._html_search_regex(
                 r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_,
-                webpage, '%s count' % name, fatal=False))
+                webpage, '%s count' % name, default=default, fatal=False))
 
         like_count = extract_count('rate_likes', 'like')
-        dislike_count = extract_count('rate_dislikes', 'dislike')
+        dislike_count = extract_count('rate_dislikes', 'dislike', default=None)
         comment_count = extract_count('comments_count', 'comment')
 
         cats_str = self._search_regex(
-            r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False)
-        categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str)
+            r'<div[^>]+class="categories_list">(.+?)</div>',
+            webpage, 'categories', fatal=False)
+        categories = [] if not cats_str else re.findall(
+            r'<a title="([^"]+)"', cats_str)
 
         return {
             'id': video_id,
index 113a4966fb3c444f5759e617e36b293ee88e6872..12d28d3b9f1e76f84f0f9fa322befd0bfa056f09 100644 (file)
@@ -50,6 +50,14 @@ class EaglePlatformIE(InfoExtractor):
         'skip': 'Georestricted',
     }]
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1',
+            webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
     @staticmethod
     def _handle_error(response):
         status = int_or_none(response.get('status', 200))
index 4c8190d68d712bf702b5e015c95bbdda4643cefd..74bbc5c51c576880465e76453604891b6b5f48ca 100644 (file)
@@ -6,12 +6,13 @@ import json
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    NO_DEFAULT,
 )
 
 
 class EllenTVIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.ellentv.com/videos/0-ipq1gsai/',
         'md5': '4294cf98bc165f218aaa0b89e0fd8042',
         'info_dict': {
@@ -22,24 +23,47 @@ class EllenTVIE(InfoExtractor):
             'timestamp': 1428035648,
             'upload_date': '20150403',
             'uploader_id': 'batchUser',
-        }
-    }
+        },
+    }, {
+        # not available via http://widgets.ellentube.com/
+        'url': 'http://www.ellentv.com/videos/1-szkgu2m2/',
+        'info_dict': {
+            'id': '1_szkgu2m2',
+            'ext': 'flv',
+            'title': "Ellen's Amazingly Talented Audience",
+            'description': 'md5:86ff1e376ff0d717d7171590e273f0a5',
+            'timestamp': 1255140900,
+            'upload_date': '20091010',
+            'uploader_id': 'ellenkaltura@gmail.com',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://widgets.ellentube.com/videos/%s' % video_id,
-            video_id)
+        URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url)
+
+        for num, url_ in enumerate(URLS, 1):
+            webpage = self._download_webpage(
+                url_, video_id, fatal=num == len(URLS))
+
+            default = NO_DEFAULT if num == len(URLS) else None
+
+            partner_id = self._search_regex(
+                r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id',
+                default=default)
 
-        partner_id = self._search_regex(
-            r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id')
+            kaltura_id = self._search_regex(
+                [r'id="kaltura_player_([^"]+)"',
+                 r"_wb_entry_id\s*:\s*'([^']+)",
+                 r'data-kaltura-entry-id="([^"]+)'],
+                webpage, 'kaltura id', default=default)
 
-        kaltura_id = self._search_regex(
-            [r'id="kaltura_player_([^"]+)"',
-             r"_wb_entry_id\s*:\s*'([^']+)",
-             r'data-kaltura-entry-id="([^"]+)'],
-            webpage, 'kaltura id')
+            if partner_id and kaltura_id:
+                break
 
         return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura')
 
index e5e57d48518d3dd3999dad650d0c32406079ce33..a39e9010d4c732a8bbb9d36f124a99fa2992990a 100644 (file)
@@ -4,9 +4,10 @@ from .common import InfoExtractor
 
 
 class EngadgetIE(InfoExtractor):
-    _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)'
+    _VALID_URL = r'https?://www.engadget.com/video/(?P<id>[^/?#]+)'
 
-    _TEST = {
+    _TESTS = [{
+        # video with 5min ID
         'url': 'http://www.engadget.com/video/518153925/',
         'md5': 'c6820d4828a5064447a4d9fc73f312c9',
         'info_dict': {
@@ -15,8 +16,12 @@ class EngadgetIE(InfoExtractor):
             'title': 'Samsung Galaxy Tab Pro 8.4 Review',
         },
         'add_ie': ['FiveMin'],
-    }
+    }, {
+        # video with vidible ID
+        'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        return self.url_result('5min:%s' % video_id)
+        return self.url_result('aol-video:%s' % video_id)
index ac5d0fe2426a8e3fb213b2bb6d8f59fa8acc5fba..f3734e9f8984ab5a1a723bbb0be171c3fd9cf7b5 100644 (file)
@@ -4,19 +4,23 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
+    encode_base_n,
+    ExtractorError,
+    int_or_none,
     parse_duration,
     str_to_int,
 )
 
 
 class EpornerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)'
+    _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?'
     _TESTS = [{
         'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
         'md5': '39d486f046212d8e1b911c52ab4691f8',
         'info_dict': {
-            'id': '95008',
+            'id': 'qlDUmNsj6VS',
             'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video',
             'ext': 'mp4',
             'title': 'Infamous Tiffany Teen Strip Tease Video',
@@ -28,34 +32,72 @@ class EpornerIE(InfoExtractor):
         # New (May 2016) URL layout
         'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',
         'only_matching': True,
+    }, {
+        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
+        display_id = mobj.group('display_id') or video_id
+
+        webpage, urlh = self._download_webpage_handle(url, display_id)
+
+        video_id = self._match_id(compat_str(urlh.geturl()))
 
-        webpage = self._download_webpage(url, display_id)
-        title = self._html_search_regex(
-            r'<title>(.*?) - EPORNER', webpage, 'title')
+        hash = self._search_regex(
+            r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash')
 
-        redirect_url = 'http://www.eporner.com/config5/%s' % video_id
-        player_code = self._download_webpage(
-            redirect_url, display_id, note='Downloading player config')
+        title = self._og_search_title(webpage, default=None) or self._html_search_regex(
+            r'<title>(.+?) - EPORNER', webpage, 'title')
 
-        sources = self._search_regex(
-            r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources')
+        # Reverse engineered from vjs.js
+        def calc_hash(s):
+            return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8)))
+
+        video = self._download_json(
+            'http://www.eporner.com/xhr/video/%s' % video_id,
+            display_id, note='Downloading video JSON',
+            query={
+                'hash': calc_hash(hash),
+                'device': 'generic',
+                'domain': 'www.eporner.com',
+                'fallback': 'false',
+            })
+
+        if video.get('available') is False:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, video['message']), expected=True)
+
+        sources = video['sources']
 
         formats = []
-        for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources):
-            fmt = {
-                'url': video_url,
-                'format_id': format_id,
-            }
-            m = re.search(r'^(\d+)', format_id)
-            if m:
-                fmt['height'] = int(m.group(1))
-            formats.append(fmt)
+        for kind, formats_dict in sources.items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_dict in formats_dict.items():
+                if not isinstance(format_dict, dict):
+                    continue
+                src = format_dict.get('src')
+                if not isinstance(src, compat_str) or not src.startswith('http'):
+                    continue
+                if kind == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        src, display_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id=kind, fatal=False))
+                else:
+                    height = int_or_none(self._search_regex(
+                        r'(\d+)[pP]', format_id, 'height', default=None))
+                    fps = int_or_none(self._search_regex(
+                        r'(\d+)fps', format_id, 'fps', default=None))
+
+                    formats.append({
+                        'url': src,
+                        'format_id': format_id,
+                        'height': height,
+                        'fps': fps,
+                    })
         self._sort_formats(formats)
 
         duration = parse_duration(self._html_search_meta('duration', webpage))
index 1585a03bb9235a520c63e84c306294f6958dfe13..971c918a419c0609f5dcb50a768d53505252fbc4 100644 (file)
@@ -1,7 +1,5 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
@@ -12,23 +10,22 @@ from ..utils import (
 class ExpoTVIE(InfoExtractor):
     _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])'
     _TEST = {
-        'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561',
-        'md5': '2985e6d7a392b2f7a05e0ca350fe41d0',
+        'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916',
+        'md5': 'fe1d728c3a813ff78f595bc8b7a707a8',
         'info_dict': {
-            'id': '17561',
+            'id': '667916',
             'ext': 'mp4',
-            'upload_date': '20060212',
-            'title': 'My Favorite Online Scrapbook Store',
-            'view_count': int,
-            'description': 'You\'ll find most everything you need at this virtual store front.',
-            'uploader': 'Anna T.',
+            'title': 'NYX Butter Lipstick Little Susie',
+            'description': 'Goes on like butter, but looks better!',
             'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'Stephanie S.',
+            'upload_date': '20150520',
+            'view_count': int,
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
         player_key = self._search_regex(
@@ -66,7 +63,7 @@ class ExpoTVIE(InfoExtractor):
             fatal=False)
         upload_date = unified_strdate(self._search_regex(
             r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date',
-            fatal=False))
+            fatal=False), day_first=False)
 
         return {
             'id': video_id,
index 6fc5a18f502a39fd6fc3e0561fa3b6f93cf575d2..55c639158dda61b53f290d880a47602aa0bb4616 100644 (file)
@@ -20,12 +20,16 @@ from .adobetv import (
     AdobeTVVideoIE,
 )
 from .adultswim import AdultSwimIE
-from .aenetworks import AENetworksIE
+from .aenetworks import (
+    AENetworksIE,
+    HistoryTopicIE,
+)
 from .afreecatv import AfreecaTVIE
 from .aftonbladet import AftonbladetIE
 from .airmozilla import AirMozillaIE
 from .aljazeera import AlJazeeraIE
 from .alphaporno import AlphaPornoIE
+from .amcnetworks import AMCNetworksIE
 from .animeondemand import AnimeOnDemandIE
 from .anitube import AnitubeIE
 from .anysex import AnySexIE
@@ -41,6 +45,7 @@ from .appletrailers import (
     AppleTrailersSectionIE,
 )
 from .archiveorg import ArchiveOrgIE
+from .arkena import ArkenaIE
 from .ard import (
     ARDIE,
     ARDMediathekIE,
@@ -136,9 +141,9 @@ from .chirbit import (
     ChirbitProfileIE,
 )
 from .cinchcast import CinchcastIE
-from .cliprs import ClipRsIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
+from .cliprs import ClipRsIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
 from .cloudy import CloudyIE
@@ -153,7 +158,12 @@ from .cnn import (
 )
 from .coub import CoubIE
 from .collegerama import CollegeRamaIE
-from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
+from .comedycentral import (
+    ComedyCentralIE,
+    ComedyCentralShortnameIE,
+    ComedyCentralTVIE,
+    ToshIE,
+)
 from .comcarcoff import ComCarCoffIE
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .commonprotocols import RtmpIE
@@ -168,6 +178,8 @@ from .crunchyroll import (
 )
 from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
+from .ctv import CTVIE
+from .ctvnews import CTVNewsIE
 from .cultureunplugged import CultureUnpluggedIE
 from .cwtv import CWTVIE
 from .dailymail import DailyMailIE
@@ -210,6 +222,7 @@ from .dvtv import DVTVIE
 from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
+from .discoverygo import DiscoveryGoIE
 from .dispeak import DigitallySpeakingIE
 from .dropbox import DropboxIE
 from .dw import (
@@ -251,6 +264,7 @@ from .fivemin import FiveMinIE
 from .fivetv import FiveTVIE
 from .fktv import FKTVIE
 from .flickr import FlickrIE
+from .flipagram import FlipagramIE
 from .folketinget import FolketingetIE
 from .footyroom import FootyRoomIE
 from .formula1 import Formula1IE
@@ -259,10 +273,7 @@ from .fox import FOXIE
 from .foxgay import FoxgayIE
 from .foxnews import FoxNewsIE
 from .foxsports import FoxSportsIE
-from .franceculture import (
-    FranceCultureIE,
-    FranceCultureEmissionIE,
-)
+from .franceculture import FranceCultureIE
 from .franceinter import FranceInterIE
 from .francetv import (
     PluzzIE,
@@ -276,8 +287,9 @@ from .freespeech import FreespeechIE
 from .freevideo import FreeVideoIE
 from .funimation import FunimationIE
 from .funnyordie import FunnyOrDieIE
+from .fusion import FusionIE
+from .fxnetworks import FXNetworksIE
 from .gameinformer import GameInformerIE
-from .gamekings import GamekingsIE
 from .gameone import (
     GameOneIE,
     GameOnePlaylistIE,
@@ -298,7 +310,6 @@ from .globo import (
 )
 from .godtube import GodTubeIE
 from .godtv import GodTVIE
-from .goldenmoustache import GoldenMoustacheIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
 from .googleplus import GooglePlusIE
@@ -313,6 +324,7 @@ from .heise import HeiseIE
 from .hellporno import HellPornoIE
 from .helsinki import HelsinkiIE
 from .hentaistigma import HentaiStigmaIE
+from .hgtv import HGTVIE
 from .historicfilms import HistoricFilmsIE
 from .hitbox import HitboxIE, HitboxLiveIE
 from .hornbunny import HornBunnyIE
@@ -320,6 +332,10 @@ from .hotnewhiphop import HotNewHipHopIE
 from .hotstar import HotStarIE
 from .howcast import HowcastIE
 from .howstuffworks import HowStuffWorksIE
+from .hrti import (
+    HRTiIE,
+    HRTiPlaylistIE,
+)
 from .huffpost import HuffPostIE
 from .hypem import HypemIE
 from .iconosquare import IconosquareIE
@@ -358,6 +374,7 @@ from .jove import JoveIE
 from .jwplatform import JWPlatformIE
 from .jpopsukitv import JpopsukiIE
 from .kaltura import KalturaIE
+from .kamcord import KamcordIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
@@ -381,6 +398,10 @@ from .kuwo import (
 )
 from .la7 import LA7IE
 from .laola1tv import Laola1TvIE
+from .lcp import (
+    LcpPlayIE,
+    LcpIE,
+)
 from .learnr import LearnrIE
 from .lecture2go import Lecture2GoIE
 from .lemonde import LemondeIE
@@ -422,6 +443,7 @@ from .makerschannel import MakersChannelIE
 from .makertv import MakerTVIE
 from .matchtv import MatchTVIE
 from .mdr import MDRIE
+from .meta import METAIE
 from .metacafe import MetacafeIE
 from .metacritic import MetacriticIE
 from .mgoon import MgoonIE
@@ -454,10 +476,10 @@ from .motherless import MotherlessIE
 from .motorsport import MotorsportIE
 from .movieclips import MovieClipsIE
 from .moviezine import MoviezineIE
+from .msn import MSNIE
 from .mtv import (
     MTVIE,
     MTVServicesEmbeddedIE,
-    MTVIggyIE,
     MTVDEIE,
 )
 from .muenchentv import MuenchenTVIE
@@ -469,8 +491,9 @@ from .myvi import MyviIE
 from .myvideo import MyVideoIE
 from .myvidster import MyVidsterIE
 from .nationalgeographic import (
+    NationalGeographicVideoIE,
     NationalGeographicIE,
-    NationalGeographicChannelIE,
+    NationalGeographicEpisodeGuideIE,
 )
 from .naver import NaverIE
 from .nba import NBAIE
@@ -507,7 +530,6 @@ from .nextmedia import (
     NextMediaActionNewsIE,
     AppleDailyIE,
 )
-from .nextmovie import NextMovieIE
 from .nfb import NFBIE
 from .nfl import NFLIE
 from .nhl import (
@@ -521,7 +543,10 @@ from .nick import (
     NickDeIE,
 )
 from .niconico import NiconicoIE, NiconicoPlaylistIE
+from .ninecninemedia import NineCNineMediaIE
 from .ninegag import NineGagIE
+from .ninenow import NineNowIE
+from .nintendo import NintendoIE
 from .noco import NocoIE
 from .normalboots import NormalbootsIE
 from .nosvideo import NosVideoIE
@@ -566,8 +591,13 @@ from .nytimes import (
     NYTimesArticleIE,
 )
 from .nuvid import NuvidIE
+from .odatv import OdaTVIE
 from .odnoklassniki import OdnoklassnikiIE
 from .oktoberfesttv import OktoberfestTVIE
+from .onet import (
+    OnetIE,
+    OnetChannelIE,
+)
 from .onionstudios import OnionStudiosIE
 from .ooyala import (
     OoyalaIE,
@@ -606,6 +636,8 @@ from .pluralsight import (
     PluralsightCourseIE,
 )
 from .podomatic import PodomaticIE
+from .pokemon import PokemonIE
+from .polskieradio import PolskieRadioIE
 from .porn91 import Porn91IE
 from .pornhd import PornHdIE
 from .pornhub import (
@@ -660,16 +692,19 @@ from .rice import RICEIE
 from .ringtv import RingTVIE
 from .ro220 import Ro220IE
 from .rockstargames import RockstarGamesIE
+from .roosterteeth import RoosterTeethIE
 from .rottentomatoes import RottenTomatoesIE
 from .roxwel import RoxwelIE
+from .rozhlas import RozhlasIE
 from .rtbf import RTBFIE
 from .rte import RteIE, RteRadioIE
 from .rtlnl import RtlNlIE
 from .rtl2 import RTL2IE
 from .rtp import RTPIE
 from .rts import RTSIE
-from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE
+from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE
 from .rtvnh import RTVNHIE
+from .rudo import RudoIE
 from .ruhd import RUHDIE
 from .ruleporn import RulePornIE
 from .rutube import (
@@ -704,10 +739,12 @@ from .shahid import ShahidIE
 from .shared import SharedIE
 from .sharesix import ShareSixIE
 from .sina import SinaIE
+from .sixplay import SixPlayIE
 from .skynewsarabia import (
     SkyNewsArabiaIE,
     SkyNewsArabiaArticleIE,
 )
+from .skysports import SkySportsIE
 from .slideshare import SlideshareIE
 from .slutload import SlutloadIE
 from .smotri import (
@@ -718,6 +755,7 @@ from .smotri import (
 )
 from .snotr import SnotrIE
 from .sohu import SohuIE
+from .sonyliv import SonyLIVIE
 from .soundcloud import (
     SoundcloudIE,
     SoundcloudSetIE,
@@ -757,6 +795,7 @@ from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
 from .steam import SteamIE
+from .streamable import StreamableIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
@@ -772,7 +811,6 @@ from .tagesschau import (
     TagesschauPlayerIE,
     TagesschauIE,
 )
-from .tapely import TapelyIE
 from .tass import TassIE
 from .tdslifeway import TDSLifewayIE
 from .teachertube import (
@@ -856,10 +894,14 @@ from .tvc import (
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
 from .tvp import (
+    TVPEmbedIE,
     TVPIE,
     TVPSeriesIE,
 )
-from .tvplay import TVPlayIE
+from .tvplay import (
+    TVPlayIE,
+    ViafreeIE,
+)
 from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
 from .twentymin import TwentyMinutenIE
@@ -888,7 +930,13 @@ from .udemy import (
 from .udn import UDNEmbedIE
 from .digiteka import DigitekaIE
 from .unistra import UnistraIE
+from .uol import UOLIE
+from .uplynk import (
+    UplynkIE,
+    UplynkPreplayIE,
+)
 from .urort import UrortIE
+from .urplay import URPlayIE
 from .usatoday import USATodayIE
 from .ustream import UstreamIE, UstreamChannelIE
 from .ustudio import (
@@ -915,6 +963,8 @@ from .vice import (
     ViceIE,
     ViceShowIE,
 )
+from .viceland import VicelandIE
+from .vidbit import VidbitIE
 from .viddler import ViddlerIE
 from .videodetective import VideoDetectiveIE
 from .videofyme import VideofyMeIE
@@ -963,9 +1013,11 @@ from .viki import (
 from .vk import (
     VKIE,
     VKUserVideosIE,
+    VKWallPostIE,
 )
 from .vlive import VLiveIE
 from .vodlocker import VodlockerIE
+from .vodplatform import VODPlatformIE
 from .voicerepublic import VoiceRepublicIE
 from .voxmedia import VoxMediaIE
 from .vporn import VpornIE
@@ -1048,6 +1100,7 @@ from .youtube import (
     YoutubeSearchDateIE,
     YoutubeSearchIE,
     YoutubeSearchURLIE,
+    YoutubeSharedVideoIE,
     YoutubeShowIE,
     YoutubeSubscriptionsIE,
     YoutubeTruncatedIDIE,
@@ -1061,4 +1114,3 @@ from .zingmp3 import (
     ZingMp3SongIE,
     ZingMp3AlbumIE,
 )
-from .zippcast import ZippCastIE
index 3403581fddf08a0928a8e4c5b22e740117646bd2..b4fd9334aeb7f3dfc3f45ce2da67e5236d5d4018 100644 (file)
@@ -1,22 +1,17 @@
 from __future__ import unicode_literals
 
-import re
+from ..utils import str_to_int
+from .keezmovies import KeezMoviesIE
 
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    sanitized_Request,
-    str_to_int,
-)
 
-
-class ExtremeTubeIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)'
+class ExtremeTubeIE(KeezMoviesIE):
+    _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?:(?P<display_id>[^/]+)-)(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431',
-        'md5': '344d0c6d50e2f16b06e49ca011d8ac69',
+        'md5': '1fb9228f5e3332ec8c057d6ac36f33e0',
         'info_dict': {
-            'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431',
+            'id': '652431',
+            'display_id': 'music-video-14-british-euro-brit-european-cumshots-swallow',
             'ext': 'mp4',
             'title': 'Music Video 14 british euro brit european cumshots swallow',
             'uploader': 'unknown',
@@ -35,58 +30,22 @@ class ExtremeTubeIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        webpage, info = self._extract_info(url)
 
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'age_verified=1')
-        webpage = self._download_webpage(req, video_id)
+        if not info['title']:
+            info['title'] = self._search_regex(
+                r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title')
 
-        video_title = self._html_search_regex(
-            r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title')
         uploader = self._html_search_regex(
             r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>',
             webpage, 'uploader', fatal=False)
-        view_count = str_to_int(self._html_search_regex(
+        view_count = str_to_int(self._search_regex(
             r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>',
             webpage, 'view count', fatal=False))
 
-        flash_vars = self._parse_json(
-            self._search_regex(
-                r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'),
-            video_id)
-
-        formats = []
-        for quality_key, video_url in flash_vars.items():
-            height = int_or_none(self._search_regex(
-                r'quality_(\d+)[pP]$', quality_key, 'height', default=None))
-            if not height:
-                continue
-            f = {
-                'url': video_url,
-            }
-            mobj = re.search(
-                r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
-            if mobj:
-                height = int(mobj.group('height'))
-                bitrate = int(mobj.group('bitrate'))
-                f.update({
-                    'format_id': '%dp-%dk' % (height, bitrate),
-                    'height': height,
-                    'tbr': bitrate,
-                })
-            else:
-                f.update({
-                    'format_id': '%dp' % height,
-                    'height': height,
-                })
-            formats.append(f)
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': video_title,
-            'formats': formats,
+        info.update({
             'uploader': uploader,
             'view_count': view_count,
-            'age_limit': 18,
-        }
+        })
+
+        return info
index 9b87b37ae54da724c360e85429de804f29413bc6..0fb781a733f4c19780ed88f8f5b24c1102b10a44 100644 (file)
@@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                 (?:
                     https?://
-                        (?:\w+\.)?facebook\.com/
+                        (?:[\w-]+\.)?facebook\.com/
                         (?:[^#]*?\#!/)?
                         (?:
                             (?:
@@ -127,8 +127,26 @@ class FacebookIE(InfoExtractor):
     }, {
         'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
         'only_matching': True,
+    }, {
+        'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
+        'only_matching': True,
     }]
 
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
+        # Facebook API embed
+        # see https://developers.facebook.com/docs/plugins/embedded-video-player
+        mobj = re.search(r'''(?x)<div[^>]+
+                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
+                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage)
+        if mobj is not None:
+            return mobj.group('url')
+
     def _login(self):
         (useremail, password) = self._get_login_info()
         if useremail is None:
@@ -204,12 +222,25 @@ class FacebookIE(InfoExtractor):
 
         BEFORE = '{swf.addParam(param[0], param[1]);});'
         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
-        m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage)
-        if m:
-            swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"')
+        PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER)
+
+        for m in re.findall(PATTERN, webpage):
+            swf_params = m.replace('\\\\', '\\').replace('\\"', '"')
             data = dict(json.loads(swf_params))
             params_raw = compat_urllib_parse_unquote(data['params'])
-            video_data = json.loads(params_raw)['video_data']
+            video_data_candidate = json.loads(params_raw)['video_data']
+            for _, f in video_data_candidate.items():
+                if not f:
+                    continue
+                if isinstance(f, dict):
+                    f = [f]
+                if not isinstance(f, list):
+                    continue
+                if f[0].get('video_id') == video_id:
+                    video_data = video_data_candidate
+                    break
+            if video_data:
+                break
 
         def video_data_list2dict(video_data):
             ret = {}
index 6b834541636533d808ce396ae456f980f989c731..f3f876ecda7fa3776f21013352833548f9be42c6 100644 (file)
@@ -1,24 +1,11 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..compat import (
-    compat_parse_qs,
-    compat_urllib_parse_urlencode,
-    compat_urllib_parse_urlparse,
-    compat_urlparse,
-)
-from ..utils import (
-    ExtractorError,
-    parse_duration,
-    replace_extension,
-)
 
 
 class FiveMinIE(InfoExtractor):
     IE_NAME = '5min'
-    _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))'
+    _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)'
 
     _TESTS = [
         {
@@ -29,8 +16,16 @@ class FiveMinIE(InfoExtractor):
                 'id': '518013791',
                 'ext': 'mp4',
                 'title': 'iPad Mini with Retina Display Review',
+                'description': 'iPad mini with Retina Display review',
                 'duration': 177,
+                'uploader': 'engadget',
+                'upload_date': '20131115',
+                'timestamp': 1384515288,
             },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
         },
         {
             # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247
@@ -44,108 +39,16 @@ class FiveMinIE(InfoExtractor):
             },
             'skip': 'no longer available',
         },
-    ]
-    _ERRORS = {
-        'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.',
-        'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.',
-        'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.',
-        'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.',
-        'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
-        'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.',
-    }
-    _QUALITIES = {
-        1: {
-            'width': 640,
-            'height': 360,
-        },
-        2: {
-            'width': 854,
-            'height': 480,
-        },
-        4: {
-            'width': 1280,
-            'height': 720,
-        },
-        8: {
-            'width': 1920,
-            'height': 1080,
-        },
-        16: {
-            'width': 640,
-            'height': 360,
-        },
-        32: {
-            'width': 854,
-            'height': 480,
-        },
-        64: {
-            'width': 1280,
-            'height': 720,
-        },
-        128: {
-            'width': 640,
-            'height': 360,
+        {
+            'url': 'http://embed.5min.com/518726732/',
+            'only_matching': True,
         },
-    }
+        {
+            'url': 'http://delivery.vidible.tv/aol?playList=518013791',
+            'only_matching': True,
+        }
+    ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        sid = mobj.group('sid')
-
-        if mobj.group('query'):
-            qs = compat_parse_qs(mobj.group('query'))
-            if not qs.get('playList'):
-                raise ExtractorError('Invalid URL', expected=True)
-            video_id = qs['playList'][0]
-            if qs.get('sid'):
-                sid = qs['sid'][0]
-
-        embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id
-        if not sid:
-            embed_page = self._download_webpage(embed_url, video_id,
-                                                'Downloading embed page')
-            sid = self._search_regex(r'sid=(\d+)', embed_page, 'sid')
-
-        response = self._download_json(
-            'https://syn.5min.com/handlers/SenseHandler.ashx?' +
-            compat_urllib_parse_urlencode({
-                'func': 'GetResults',
-                'playlist': video_id,
-                'sid': sid,
-                'isPlayerSeed': 'true',
-                'url': embed_url,
-            }),
-            video_id)
-        if not response['success']:
-            raise ExtractorError(
-                '%s said: %s' % (
-                    self.IE_NAME,
-                    self._ERRORS.get(response['errorMessage'], response['errorMessage'])),
-                expected=True)
-        info = response['binding'][0]
-
-        formats = []
-        parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs(
-            compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0])
-        for rendition in info['Renditions']:
-            if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8':
-                continue
-            else:
-                rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType'])))
-                quality = self._QUALITIES.get(rendition['ID'], {})
-                formats.append({
-                    'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']),
-                    'url': rendition_url,
-                    'width': quality.get('width'),
-                    'height': quality.get('height'),
-                })
-        self._sort_formats(formats)
-
-        return {
-            'id': video_id,
-            'title': info['Title'],
-            'thumbnail': info.get('ThumbURL'),
-            'duration': parse_duration(info.get('Duration')),
-            'formats': formats,
-        }
+        video_id = self._match_id(url)
+        return self.url_result('aol-video:%s' % video_id)
diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py
new file mode 100644 (file)
index 0000000..1902a23
--- /dev/null
@@ -0,0 +1,115 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    try_get,
+    unified_timestamp,
+)
+
+
+class FlipagramIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://flipagram.com/f/nyvTSJMKId',
+        'md5': '888dcf08b7ea671381f00fab74692755',
+        'info_dict': {
+            'id': 'nyvTSJMKId',
+            'ext': 'mp4',
+            'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+            'description': 'md5:d55e32edc55261cae96a41fa85ff630e',
+            'duration': 35.571,
+            'timestamp': 1461244995,
+            'upload_date': '20160421',
+            'uploader': 'kitty juria',
+            'uploader_id': 'sjuria101',
+            'creator': 'kitty juria',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
+            'comment_count': int,
+            'comments': list,
+            'formats': 'mincount:2',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        video_data = self._parse_json(
+            self._search_regex(
+                r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'),
+            video_id)
+
+        flipagram = video_data['flipagram']
+        video = flipagram['video']
+
+        json_ld = self._search_json_ld(webpage, video_id, default={})
+        title = json_ld.get('title') or flipagram['captionText']
+        description = json_ld.get('description') or flipagram.get('captionText')
+
+        formats = [{
+            'url': video['url'],
+            'width': int_or_none(video.get('width')),
+            'height': int_or_none(video.get('height')),
+            'filesize': int_or_none(video_data.get('size')),
+        }]
+
+        preview_url = try_get(
+            flipagram, lambda x: x['music']['track']['previewUrl'], compat_str)
+        if preview_url:
+            formats.append({
+                'url': preview_url,
+                'ext': 'm4a',
+                'vcodec': 'none',
+            })
+
+        self._sort_formats(formats)
+
+        counts = flipagram.get('counts', {})
+        user = flipagram.get('user', {})
+        video_data = flipagram.get('video', {})
+
+        thumbnails = [{
+            'url': self._proto_relative_url(cover['url']),
+            'width': int_or_none(cover.get('width')),
+            'height': int_or_none(cover.get('height')),
+            'filesize': int_or_none(cover.get('size')),
+        } for cover in flipagram.get('covers', []) if cover.get('url')]
+
+        # Note that this only retrieves comments that are initally loaded.
+        # For videos with large amounts of comments, most won't be retrieved.
+        comments = []
+        for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []):
+            text = comment.get('comment')
+            if not text or not isinstance(text, list):
+                continue
+            comments.append({
+                'author': comment.get('user', {}).get('name'),
+                'author_id': comment.get('user', {}).get('username'),
+                'id': comment.get('id'),
+                'text': text[0],
+                'timestamp': unified_timestamp(comment.get('created')),
+            })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': float_or_none(flipagram.get('duration'), 1000),
+            'thumbnails': thumbnails,
+            'timestamp': unified_timestamp(flipagram.get('iso8601Created')),
+            'uploader': user.get('name'),
+            'uploader_id': user.get('username'),
+            'creator': user.get('name'),
+            'view_count': int_or_none(counts.get('plays')),
+            'like_count': int_or_none(counts.get('likes')),
+            'repost_count': int_or_none(counts.get('reflips')),
+            'comment_count': int_or_none(counts.get('comments')),
+            'comments': comments,
+            'formats': formats,
+        }
index 322c41e5afcb20935f4f7841ccb03303d5476f65..8c417ab65b0478025ae92929830d03767448ebfa 100644 (file)
@@ -5,8 +5,8 @@ from .common import InfoExtractor
 
 
 class Formula1IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
+    _TESTS = [{
         'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
         'md5': '8c79e54be72078b26b89e0e111c0502b',
         'info_dict': {
@@ -15,7 +15,10 @@ class Formula1IE(InfoExtractor):
             'title': 'Race highlights - Spain 2016',
         },
         'add_ie': ['Ooyala'],
-    }
+    }, {
+        'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
index fc4a5a0fbf01801d598e20a9addd29ebef4a298e..9776c8422228f1f44b5b0a3bf0c40a125e1fa50a 100644 (file)
@@ -43,14 +43,14 @@ class FourTubeIE(InfoExtractor):
             'uploadDate', webpage))
         thumbnail = self._html_search_meta('thumbnailUrl', webpage)
         uploader_id = self._html_search_regex(
-            r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+            r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
             webpage, 'uploader id', fatal=False)
         uploader = self._html_search_regex(
-            r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+            r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
             webpage, 'uploader', fatal=False)
 
         categories_html = self._search_regex(
-            r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>',
+            r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>',
             webpage, 'categories', fatal=False)
         categories = None
         if categories_html:
@@ -59,10 +59,10 @@ class FourTubeIE(InfoExtractor):
                     r'(?s)<li><a.*?>(.*?)</a>', categories_html)]
 
         view_count = str_to_int(self._search_regex(
-            r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">',
+            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">',
             webpage, 'view count', fatal=False))
         like_count = str_to_int(self._search_regex(
-            r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">',
+            r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">',
             webpage, 'like count', fatal=False))
         duration = parse_duration(self._html_search_meta('duration', webpage))
 
index 95c1abf94bde4cf6ff9cbe5ffe383106db2a2237..9f406b17eebfab6248f0a6e0d4722151abd7add9 100644 (file)
@@ -2,7 +2,10 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..utils import (
+    smuggle_url,
+    update_url_query,
+)
 
 
 class FOXIE(InfoExtractor):
@@ -29,11 +32,12 @@ class FOXIE(InfoExtractor):
 
         release_url = self._parse_json(self._search_regex(
             r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'),
-            video_id)['release_url'] + '&switch=http'
+            video_id)['release_url']
 
         return {
             '_type': 'url_transparent',
             'ie_key': 'ThePlatform',
-            'url': smuggle_url(release_url, {'force_smil_url': True}),
+            'url': smuggle_url(update_url_query(
+                release_url, {'switch': 'http'}), {'force_smil_url': True}),
             'id': video_id,
         }
index e2ca962838932f682f0ac833bd64169ccaba8fc5..186da0d3ba7b0f1241bf761d9b64a3222ed33fec 100644 (file)
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urlparse,
-)
 from ..utils import (
     determine_ext,
-    int_or_none,
-    ExtractorError,
+    unified_strdate,
 )
 
 
 class FranceCultureIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
     _TEST = {
-        'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
+        'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
         'info_dict': {
-            'id': '4795174',
+            'id': 'rendez-vous-au-pays-des-geeks',
+            'display_id': 'rendez-vous-au-pays-des-geeks',
             'ext': 'mp3',
             'title': 'Rendez-vous au pays des geeks',
-            'alt_title': 'Carnet nomade | 13-14',
-            'vcodec': 'none',
+            'thumbnail': 're:^https?://.*\\.jpg$',
             'upload_date': '20140301',
-            'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
-            'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche',
-            'timestamp': 1393700400,
+            'vcodec': 'none',
         }
     }
 
-    def _extract_from_player(self, url, video_id):
-        webpage = self._download_webpage(url, video_id)
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
 
-        video_path = self._search_regex(
-            r'<a id="player".*?href="([^"]+)"', webpage, 'video path')
-        video_url = compat_urlparse.urljoin(url, video_path)
-        timestamp = int_or_none(self._search_regex(
-            r'<a id="player".*?data-date="([0-9]+)"',
-            webpage, 'upload date', fatal=False))
-        thumbnail = self._search_regex(
-            r'<a id="player".*?>\s+<img src="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
+        webpage = self._download_webpage(url, display_id)
 
-        display_id = self._search_regex(
-            r'<span class="path-diffusion">emission-(.*?)</span>', webpage, 'display_id')
+        video_url = self._search_regex(
+            r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<a[^>]+href="([^"]+)"',
+            webpage, 'video path')
 
-        title = self._html_search_regex(
-            r'<span class="title-diffusion">(.*?)</span>', webpage, 'title')
-        alt_title = self._html_search_regex(
-            r'<span class="title">(.*?)</span>',
-            webpage, 'alt_title', fatal=False)
-        description = self._html_search_regex(
-            r'<span class="description">(.*?)</span>',
-            webpage, 'description', fatal=False)
+        title = self._og_search_title(webpage)
 
+        upload_date = unified_strdate(self._search_regex(
+            '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<',
+            webpage, 'upload date', fatal=False))
+        thumbnail = self._search_regex(
+            r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-pagespeed-(?:lazy|high-res)-src="([^"]+)"',
+            webpage, 'thumbnail', fatal=False)
         uploader = self._html_search_regex(
             r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
             webpage, 'uploader', default=None)
         vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None
 
         return {
-            'id': video_id,
+            'id': display_id,
+            'display_id': display_id,
             'url': video_url,
-            'vcodec': vcodec,
-            'uploader': uploader,
-            'timestamp': timestamp,
             'title': title,
-            'alt_title': alt_title,
             'thumbnail': thumbnail,
-            'description': description,
-            'display_id': display_id,
+            'vcodec': vcodec,
+            'uploader': uploader,
+            'upload_date': upload_date,
         }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        return self._extract_from_player(url, video_id)
-
-
-class FranceCultureEmissionIE(FranceCultureIE):
-    _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P<id>[^?#]+)'
-    _TEST = {
-        'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
-        'info_dict': {
-            'title': 'Jean-Gabriel Périot, cinéaste',
-            'alt_title': 'Les Carnets de la création',
-            'id': '5093239',
-            'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
-            'ext': 'mp3',
-            'timestamp': 1444762500,
-            'upload_date': '20151013',
-            'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste',
-        },
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_path = self._html_search_regex(
-            r'<a class="rf-player-open".*?href="([^"]+)"', webpage, 'video path', 'no_path_player')
-        if video_path == 'no_path_player':
-            raise ExtractorError('no player : no sound in this page.', expected=True)
-        new_id = self._search_regex('play=(?P<id>[0-9]+)', video_path, 'new_id', group='id')
-        video_url = compat_urlparse.urljoin(url, video_path)
-        return self._extract_from_player(video_url, new_id)
index ad94e31f346cc97cd71ad1be9f6983a16b6df209..3233f66d5fe2efd8381b125948e4eed3d0e446ce 100644 (file)
@@ -14,7 +14,10 @@ from ..utils import (
     parse_duration,
     determine_ext,
 )
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionCloudIE,
+)
 
 
 class FranceTVBaseInfoExtractor(InfoExtractor):
@@ -128,7 +131,7 @@ class PluzzIE(FranceTVBaseInfoExtractor):
 
 class FranceTvInfoIE(FranceTVBaseInfoExtractor):
     IE_NAME = 'francetvinfo.fr'
-    _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html'
+    _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)'
 
     _TESTS = [{
         'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
@@ -188,6 +191,24 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # Dailymotion embed
+        'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html',
+        'md5': 'ee7f1828f25a648addc90cb2687b1f12',
+        'info_dict': {
+            'id': 'x4iiko0',
+            'ext': 'mp4',
+            'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen',
+            'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016',
+            'timestamp': 1467011958,
+            'upload_date': '20160627',
+            'uploader': 'France Inter',
+            'uploader_id': 'x2q2ez',
+        },
+        'add_ie': ['Dailymotion'],
+    }, {
+        'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -197,7 +218,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
 
         dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
         if dmcloud_url:
-            return self.url_result(dmcloud_url, 'DailymotionCloud')
+            return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key())
+
+        dailymotion_urls = DailymotionIE._extract_urls(webpage)
+        if dailymotion_urls:
+            return self.playlist_result([
+                self.url_result(dailymotion_url, DailymotionIE.ie_key())
+                for dailymotion_url in dailymotion_urls])
 
         video_id, catalogue = self._search_regex(
             (r'id-video=([^@]+@[^"]+)',
diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py
new file mode 100644 (file)
index 0000000..b4ab4cb
--- /dev/null
@@ -0,0 +1,35 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class FusionIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/',
+        'info_dict': {
+            'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P',
+            'ext': 'mp4',
+            'title': 'U.S. and Panamanian forces work together to stop a vessel smuggling drugs',
+            'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7',
+            'duration': 140.0,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://fusion.net/video/201781',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+
+        ooyala_code = self._search_regex(
+            r'data-video-id=(["\'])(?P<code>.+?)\1',
+            webpage, 'ooyala code', group='code')
+
+        return OoyalaIE._build_url_result(ooyala_code)
diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py
new file mode 100644 (file)
index 0000000..6298973
--- /dev/null
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .adobepass import AdobePassIE
+from ..utils import (
+    update_url_query,
+    extract_attributes,
+    parse_age_limit,
+    smuggle_url,
+)
+
+
+class FXNetworksIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://www.fxnetworks.com/video/719841347694',
+        'md5': '1447d4722e42ebca19e5232ab93abb22',
+        'info_dict': {
+            'id': '719841347694',
+            'ext': 'mp4',
+            'title': 'Vanpage',
+            'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.',
+            'age_limit': 14,
+            'uploader': 'NEWA-FNG-FX',
+            'upload_date': '20160706',
+            'timestamp': 1467844741,
+        },
+        'add_ie': ['ThePlatform'],
+    }, {
+        'url': 'http://www.simpsonsworld.com/video/716094019682',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        if 'The content you are trying to access is not available in your region.' in webpage:
+            self.raise_geo_restricted()
+        video_data = extract_attributes(self._search_regex(
+            r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data'))
+        player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None)
+        release_url = video_data['rel']
+        title = video_data['data-title']
+        rating = video_data.get('data-rating')
+        query = {
+            'mbr': 'true',
+        }
+        if player_type == 'movies':
+            query.update({
+                'manifest': 'm3u',
+            })
+        else:
+            query.update({
+                'switch': 'http',
+            })
+        if video_data.get('data-req-auth') == '1':
+            resource = self._get_mvpd_resource(
+                video_data['data-channel'], title,
+                video_data.get('data-guid'), rating)
+            query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'title': title,
+            'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
+            'thumbnail': video_data.get('data-large-thumb'),
+            'age_limit': parse_age_limit(rating),
+            'ie_key': 'ThePlatform',
+        }
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py
deleted file mode 100644 (file)
index cbcddcb..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
-    xpath_text,
-    xpath_with_ns,
-)
-from .youtube import YoutubeIE
-
-
-class GamekingsIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)'
-    _TESTS = [{
-        # YouTube embed video
-        'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/',
-        'md5': '5208d3a17adeaef829a7861887cb9029',
-        'info_dict': {
-            'id': 'HkSQKetlGOU',
-            'ext': 'mp4',
-            'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review',
-            'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'uploader_id': 'UCJugRGo4STYMeFr5RoOShtQ',
-            'uploader': 'Gamekings Vault',
-            'upload_date': '20151123',
-        },
-        'add_ie': ['Youtube'],
-    }, {
-        # vimeo video
-        'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/',
-        'md5': '12bf04dfd238e70058046937657ea68d',
-        'info_dict': {
-            'id': 'the-legend-of-zelda-majoras-mask',
-            'ext': 'mp4',
-            'title': 'The Legend of Zelda: Majora’s Mask',
-            'description': 'md5:9917825fe0e9f4057601fe1e38860de3',
-            'thumbnail': 're:^https?://.*\.jpg$',
-        },
-    }, {
-        'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        playlist_id = self._search_regex(
-            r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id')
-
-        # Check if a YouTube embed is used
-        if YoutubeIE.suitable(playlist_id):
-            return self.url_result(playlist_id, ie='Youtube')
-
-        playlist = self._download_xml(
-            'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id,
-            video_id)
-
-        NS_MAP = {
-            'jwplayer': 'http://rss.jwpcdn.com/'
-        }
-
-        item = playlist.find('./channel/item')
-
-        thumbnail = xpath_text(item, xpath_with_ns('./jwplayer:image', NS_MAP), 'thumbnail')
-        video_url = item.find(xpath_with_ns('./jwplayer:source', NS_MAP)).get('file')
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
-            'thumbnail': thumbnail,
-        }
index 621257c9f72383a5c6133001953a7e51cc5df435..4e859e09aa16b7608ee103e851f0d0928bbfdb30 100644 (file)
@@ -28,10 +28,13 @@ class GameSpotIE(OnceIE):
         'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/',
         'info_dict': {
             'id': 'gs-2300-6424837',
-            'ext': 'flv',
-            'title': 'The Witcher 3: Wild Hunt [Xbox ONE]  - Now Playing',
+            'ext': 'mp4',
+            'title': 'Now Playing - The Witcher 3: Wild Hunt',
             'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.',
         },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
+        },
     }]
 
     def _real_extract(self, url):
index 4aa24061c0cb97f8c36145e4a8b94ba83b8e33e6..197ab95319419d29971979b94464aa191ef04a04 100644 (file)
@@ -49,7 +49,10 @@ from .pornhub import PornHubIE
 from .xhamster import XHamsterEmbedIE
 from .tnaflix import TNAFlixNetworkEmbedIE
 from .vimeo import VimeoIE
-from .dailymotion import DailymotionCloudIE
+from .dailymotion import (
+    DailymotionIE,
+    DailymotionCloudIE,
+)
 from .onionstudios import OnionStudiosIE
 from .viewlift import ViewLiftEmbedIE
 from .screenwavemedia import ScreenwaveMediaIE
@@ -59,11 +62,17 @@ from .videomore import VideomoreIE
 from .googledrive import GoogleDriveIE
 from .jwplatform import JWPlatformIE
 from .digiteka import DigitekaIE
+from .arkena import ArkenaIE
 from .instagram import InstagramIE
 from .liveleak import LiveLeakIE
 from .threeqsdn import ThreeQSDNIE
 from .theplatform import ThePlatformIE
 from .vessel import VesselIE
+from .kaltura import KalturaIE
+from .eagleplatform import EaglePlatformIE
+from .facebook import FacebookIE
+from .soundcloud import SoundcloudIE
+from .vbox7 import Vbox7IE
 
 
 class GenericIE(InfoExtractor):
@@ -467,7 +476,7 @@ class GenericIE(InfoExtractor):
             'url': 'http://www.vestifinance.ru/articles/25753',
             'info_dict': {
                 'id': '25753',
-                'title': 'Ð\92еÑ\81Ñ\82и Ð­ÐºÐ¾Ð½Ð¾Ð¼Ð¸ÐºÐ° â\80\95 Ð\9fÑ\80Ñ\8fмÑ\8bе Ñ\82Ñ\80анÑ\81лÑ\8fÑ\86ии Ñ\81 Ð¤Ð¾Ñ\80Ñ\83ма-вÑ\8bÑ\81Ñ\82авки "Ð\93оÑ\81заказ-2013"',
+                'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"',
             },
             'playlist': [{
                 'info_dict': {
@@ -634,6 +643,8 @@ class GenericIE(InfoExtractor):
                 'ext': 'mp4',
                 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored',
                 'description': 'Two valets share their love for movie star Liam Neesons.',
+                'timestamp': 1349922600,
+                'upload_date': '20121011',
             },
         },
         # YouTube embed via <data-embed-url="">
@@ -775,6 +786,15 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20141029',
             }
         },
+        # Soundcloud multiple embeds
+        {
+            'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809',
+            'info_dict': {
+                'id': '52809',
+                'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance  | TAB + AUDIO',
+            },
+            'playlist_mincount': 7,
+        },
         # Livestream embed
         {
             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
@@ -850,6 +870,7 @@ class GenericIE(InfoExtractor):
                 'description': 'md5:601cb790edd05908957dae8aaa866465',
                 'upload_date': '20150220',
             },
+            'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/',
         },
         # jwplayer YouTube
         {
@@ -920,6 +941,24 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Kaltura'],
         },
+        {
+            # Kaltura embedded via quoted entry_id
+            'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures',
+            'info_dict': {
+                'id': '0_utuok90b',
+                'ext': 'mp4',
+                'title': '06_matthew_brender_raj_dutt',
+                'timestamp': 1466638791,
+                'upload_date': '20160622',
+            },
+            'add_ie': ['Kaltura'],
+            'expected_warnings': [
+                'Could not send HEAD request'
+            ],
+            'params': {
+                'skip_download': True,
+            }
+        },
         # Eagle.Platform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1091,12 +1130,17 @@ class GenericIE(InfoExtractor):
         # Dailymotion Cloud video
         {
             'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
-            'md5': '49444254273501a64675a7e68c502681',
+            'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
             'info_dict': {
-                'id': '5585de919473990de4bee11b',
+                'id': 'x2uy8t3',
                 'ext': 'mp4',
-                'title': 'Le débat',
+                'title': 'Sauvons les abeilles ! - Le débat',
+                'description': 'md5:d9082128b1c5277987825d684939ca26',
                 'thumbnail': 're:^https?://.*\.jpe?g$',
+                'timestamp': 1434970506,
+                'upload_date': '20150622',
+                'uploader': 'Public Sénat',
+                'uploader_id': 'xa9gza',
             }
         },
         # OnionStudios embed
@@ -1220,6 +1264,145 @@ class GenericIE(InfoExtractor):
                 'uploader': 'www.hudl.com',
             },
         },
+        # twitter:player:stream embed
+        {
+            'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288',
+            'info_dict': {
+                'id': 'master',
+                'ext': 'mp4',
+                'title': 'Une nouvelle espèce de dinosaure découverte en Argentine',
+                'uploader': 'www.rtl.be',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            },
+        },
+        # twitter:player embed
+        {
+            'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/',
+            'md5': 'a3e0df96369831de324f0778e126653c',
+            'info_dict': {
+                'id': '4909620399001',
+                'ext': 'mp4',
+                'title': 'What Do Black Holes Sound Like?',
+                'description': 'what do black holes sound like',
+                'upload_date': '20160524',
+                'uploader_id': '29913724001',
+                'timestamp': 1464107587,
+                'uploader': 'TheAtlantic',
+            },
+            'add_ie': ['BrightcoveLegacy'],
+        },
+        # Facebook <iframe> embed
+        {
+            'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
+            'md5': 'fbcde74f534176ecb015849146dd3aee',
+            'info_dict': {
+                'id': '599637780109885',
+                'ext': 'mp4',
+                'title': 'Facebook video #599637780109885',
+            },
+        },
+        # Facebook API embed
+        {
+            'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/',
+            'md5': 'a47372ee61b39a7b90287094d447d94e',
+            'info_dict': {
+                'id': '10153467542406923',
+                'ext': 'mp4',
+                'title': 'Facebook video #10153467542406923',
+            },
+        },
+        # Wordpress "YouTube Video Importer" plugin
+        {
+            'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/',
+            'md5': 'd16797741b560b485194eddda8121b48',
+            'info_dict': {
+                'id': 'HNTXWDXV9Is',
+                'ext': 'mp4',
+                'title': 'Blue Devils Drumline Stanford lot 2016',
+                'upload_date': '20160627',
+                'uploader_id': 'GENOCIDE8GENERAL10',
+                'uploader': 'cylus cyrus',
+            },
+        },
+        {
+            # video stored on custom kaltura server
+            'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv',
+            'md5': '537617d06e64dfed891fa1593c4b30cc',
+            'info_dict': {
+                'id': '0_1iotm5bh',
+                'ext': 'mp4',
+                'title': 'Elecciones británicas: 5 lecciones para Rajoy',
+                'description': 'md5:435a89d68b9760b92ce67ed227055f16',
+                'uploader_id': 'videos.expansion@el-mundo.net',
+                'upload_date': '20150429',
+                'timestamp': 1430303472,
+            },
+            'add_ie': ['Kaltura'],
+        },
+        {
+            # Non-standard Vimeo embed
+            'url': 'https://openclassrooms.com/courses/understanding-the-web',
+            'md5': '64d86f1c7d369afd9a78b38cbb88d80a',
+            'info_dict': {
+                'id': '148867247',
+                'ext': 'mp4',
+                'title': 'Understanding the web - Teaser',
+                'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.',
+                'upload_date': '20151214',
+                'uploader': 'OpenClassrooms',
+                'uploader_id': 'openclassrooms',
+            },
+            'add_ie': ['Vimeo'],
+        },
+        {
+            'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video',
+            'md5': 'b96f2f71b359a8ecd05ce4e1daa72365',
+            'info_dict': {
+                'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe',
+                'ext': 'mp4',
+                'title': 'Big Buck Bunny',
+                'description': 'Royalty free test video',
+                'timestamp': 1432816365,
+                'upload_date': '20150528',
+                'is_live': False,
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [ArkenaIE.ie_key()],
+        },
+        {
+            'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/',
+            'info_dict': {
+                'id': '1c7141f46c',
+                'ext': 'mp4',
+                'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'add_ie': [Vbox7IE.ie_key()],
+        },
+        # {
+        #     # TODO: find another test
+        #     # http://schema.org/VideoObject
+        #     'url': 'https://flipagram.com/f/nyvTSJMKId',
+        #     'md5': '888dcf08b7ea671381f00fab74692755',
+        #     'info_dict': {
+        #         'id': 'nyvTSJMKId',
+        #         'ext': 'mp4',
+        #         'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
+        #         'description': '#love for cats.',
+        #         'timestamp': 1461244995,
+        #         'upload_date': '20160421',
+        #     },
+        #     'params': {
+        #         'force_generic_extractor': True,
+        #     },
+        # }
     ]
 
     def report_following_redirect(self, new_url):
@@ -1576,12 +1759,16 @@ class GenericIE(InfoExtractor):
         if matches:
             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
 
-        # Look for embedded Dailymotion player
-        matches = re.findall(
-            r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
+        # Look for Wordpress "YouTube Video Importer" plugin
+        matches = re.findall(r'''(?x)<div[^>]+
+            class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+            data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
         if matches:
-            return _playlist_from_matches(
-                matches, lambda m: unescapeHTML(m[1]))
+            return _playlist_from_matches(matches, lambda m: m[-1])
+
+        matches = DailymotionIE._extract_urls(webpage)
+        if matches:
+            return _playlist_from_matches(matches)
 
         # Look for embedded Dailymotion playlist player (#3822)
         m = re.search(
@@ -1718,10 +1905,9 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'))
 
         # Look for embedded Facebook player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'Facebook')
+        facebook_url = FacebookIE._extract_url(webpage)
+        if facebook_url is not None:
+            return self.url_result(facebook_url, 'Facebook')
 
         # Look for embedded VK player
         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
@@ -1836,12 +2022,9 @@ class GenericIE(InfoExtractor):
             return self.url_result(myvi_url)
 
         # Look for embedded soundcloud player
-        mobj = re.search(
-            r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
-            webpage)
-        if mobj is not None:
-            url = unescapeHTML(mobj.group('url'))
-            return self.url_result(url)
+        soundcloud_urls = SoundcloudIE._extract_urls(webpage)
+        if soundcloud_urls:
+            return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
 
         # Look for embedded mtvservices player
         mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
@@ -1903,18 +2086,14 @@ class GenericIE(InfoExtractor):
             return self.url_result(mobj.group('url'), 'Zapiks')
 
         # Look for Kaltura embeds
-        mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or
-                re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage))
-        if mobj is not None:
-            return self.url_result(smuggle_url(
-                'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(),
-                {'source_url': url}), 'Kaltura')
+        kaltura_url = KalturaIE._extract_url(webpage)
+        if kaltura_url:
+            return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
 
         # Look for Eagle.Platform embeds
-        mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
-        if mobj is not None:
-            return self.url_result(mobj.group('url'), 'EaglePlatform')
+        eagleplatform_url = EaglePlatformIE._extract_url(webpage)
+        if eagleplatform_url:
+            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
 
         # Look for ClipYou (uses Eagle.Platform) embeds
         mobj = re.search(
@@ -2008,6 +2187,11 @@ class GenericIE(InfoExtractor):
         if digiteka_url:
             return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key())
 
+        # Look for Arkena embeds
+        arkena_url = ArkenaIE._extract_url(webpage)
+        if arkena_url:
+            return self.url_result(arkena_url, ArkenaIE.ie_key())
+
         # Look for Limelight embeds
         mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
         if mobj:
@@ -2036,6 +2220,14 @@ class GenericIE(InfoExtractor):
             return self.url_result(
                 self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine')
 
+        # Look for VODPlatform embeds
+        mobj = re.search(
+            r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vod-platform\.net/embed/[^/?#]+)',
+            webpage)
+        if mobj is not None:
+            return self.url_result(
+                self._proto_relative_url(unescapeHTML(mobj.group(1))), 'VODPlatform')
+
         # Look for Instagram embeds
         instagram_embed_url = InstagramIE._extract_embed_url(webpage)
         if instagram_embed_url is not None:
@@ -2060,6 +2252,24 @@ class GenericIE(InfoExtractor):
                 'uploader': video_uploader,
             }
 
+        # Look for VBOX7 embeds
+        vbox7_url = Vbox7IE._extract_url(webpage)
+        if vbox7_url:
+            return self.url_result(vbox7_url, Vbox7IE.ie_key())
+
+        # Looking for http://schema.org/VideoObject
+        json_ld = self._search_json_ld(
+            webpage, video_id, default={}, expected_type='VideoObject')
+        if json_ld.get('url'):
+            info_dict.update({
+                'title': video_title or info_dict['title'],
+                'description': video_description,
+                'thumbnail': video_thumbnail,
+                'age_limit': age_limit
+            })
+            info_dict.update(json_ld)
+            return info_dict
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
@@ -2103,6 +2313,9 @@ class GenericIE(InfoExtractor):
                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
         if not found:
             # Try to find twitter cards info
+            # twitter:player:stream should be checked before twitter:player since
+            # it is expected to contain a raw stream (see
+            # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
             found = filter_video(re.findall(
                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
         if not found:
@@ -2136,6 +2349,15 @@ class GenericIE(InfoExtractor):
                     '_type': 'url',
                     'url': new_url,
                 }
+
+        if not found:
+            # twitter:player is a https URL to iframe player that may or may not
+            # be supported by youtube-dl thus this is checked the very last (see
+            # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
+            embed_url = self._html_search_meta('twitter:player', webpage, default=None)
+            if embed_url:
+                return self.url_result(embed_url)
+
         if not found:
             raise UnsupportedError(url)
 
diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py
deleted file mode 100644 (file)
index 0fb5097..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class GoldenMoustacheIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?goldenmoustache\.com/(?P<display_id>[\w-]+)-(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'http://www.goldenmoustache.com/suricate-le-poker-3700/',
-        'md5': '0f904432fa07da5054d6c8beb5efb51a',
-        'info_dict': {
-            'id': '3700',
-            'ext': 'mp4',
-            'title': 'Suricate - Le Poker',
-            'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9',
-            'thumbnail': 're:^https?://.*\.jpg$',
-        }
-    }, {
-        'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/',
-        'md5': '27f0c50fb4dd5f01dc9082fc67cd5700',
-        'info_dict': {
-            'id': '55249',
-            'ext': 'mp4',
-            'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)',
-            'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a',
-            'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
-        }
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        video_url = self._html_search_regex(
-            r'data-src-type="mp4" data-src="([^"]+)"', webpage, 'video URL')
-        title = self._html_search_regex(
-            r'<title>(.*?)(?: - Golden Moustache)?</title>', webpage, 'title')
-        thumbnail = self._og_search_thumbnail(webpage)
-        description = self._og_search_description(webpage)
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'ext': 'mp4',
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-        }
diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py
new file mode 100644 (file)
index 0000000..c3f0733
--- /dev/null
@@ -0,0 +1,48 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    smuggle_url,
+)
+
+
+class HGTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hgtv\.ca/[^/]+/video/(?P<id>[^/]+)/video.html'
+    _TEST = {
+        'url': 'http://www.hgtv.ca/homefree/video/overnight-success/video.html?v=738081859718&p=1&s=da#video',
+        'md5': '',
+        'info_dict': {
+            'id': 'aFH__I_5FBOX',
+            'ext': 'mp4',
+            'title': 'Overnight Success',
+            'description': 'After weeks of hard work, high stakes, breakdowns and pep talks, the final 2 contestants compete to win the ultimate dream.',
+            'uploader': 'SHWM-NEW',
+            'timestamp': 1470320034,
+            'upload_date': '20160804',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        embed_vars = self._parse_json(self._search_regex(
+            r'(?s)embed_vars\s*=\s*({.*?});',
+            webpage, 'embed vars'), display_id, js_to_json)
+        return {
+            '_type': 'url_transparent',
+            'url': smuggle_url(
+                'http://link.theplatform.com/s/dtjsEC/%s?mbr=true&manifest=m3u' % embed_vars['pid'], {
+                    'force_smil_url': True
+                }),
+            'series': embed_vars.get('show'),
+            'season_number': int_or_none(embed_vars.get('season')),
+            'episode_number': int_or_none(embed_vars.get('episode')),
+            'ie_key': 'ThePlatform',
+        }
diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py
new file mode 100644 (file)
index 0000000..656ce6d
--- /dev/null
@@ -0,0 +1,202 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+    clean_html,
+    ExtractorError,
+    int_or_none,
+    parse_age_limit,
+    sanitized_Request,
+    try_get,
+)
+
+
+class HRTiBaseIE(InfoExtractor):
+    """
+        Base Information Extractor for Croatian Radiotelevision
+        video on demand site https://hrti.hrt.hr
+        Reverse engineered from the JavaScript app in app.min.js
+    """
+    _NETRC_MACHINE = 'hrti'
+
+    _APP_LANGUAGE = 'hr'
+    _APP_VERSION = '1.1'
+    _APP_PUBLICATION_ID = 'all_in_one'
+    _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+
+    def _initialize_api(self):
+        init_data = {
+            'application_publication_id': self._APP_PUBLICATION_ID
+        }
+
+        uuid = self._download_json(
+            self._API_URL, None, note='Downloading uuid',
+            errnote='Unable to download uuid',
+            data=json.dumps(init_data).encode('utf-8'))['uuid']
+
+        app_data = {
+            'uuid': uuid,
+            'application_publication_id': self._APP_PUBLICATION_ID,
+            'application_version': self._APP_VERSION
+        }
+
+        req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8'))
+        req.get_method = lambda: 'PUT'
+
+        resources = self._download_json(
+            req, None, note='Downloading session information',
+            errnote='Unable to download session information')
+
+        self._session_id = resources['session_id']
+
+        modules = resources['modules']
+
+        self._search_url = modules['vod_catalog']['resources']['search']['uri'].format(
+            language=self._APP_LANGUAGE,
+            application_id=self._APP_PUBLICATION_ID)
+
+        self._login_url = (modules['user']['resources']['login']['uri'] +
+                           '/format/json').format(session_id=self._session_id)
+
+        self._logout_url = modules['user']['resources']['logout']['uri']
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        # TODO: figure out authentication with cookies
+        if username is None or password is None:
+            self.raise_login_required()
+
+        auth_data = {
+            'username': username,
+            'password': password,
+        }
+
+        try:
+            auth_info = self._download_json(
+                self._login_url, None, note='Logging in', errnote='Unable to log in',
+                data=json.dumps(auth_data).encode('utf-8'))
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406:
+                auth_info = self._parse_json(e.cause.read().encode('utf-8'), None)
+            else:
+                raise
+
+        error_message = auth_info.get('error', {}).get('message')
+        if error_message:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error_message),
+                expected=True)
+
+        self._token = auth_info['secure_streaming_token']
+
+    def _real_initialize(self):
+        self._initialize_api()
+        self._login()
+
+
+class HRTiIE(HRTiBaseIE):
+    _VALID_URL = r'''(?x)
+                        (?:
+                            hrti:(?P<short_id>[0-9]+)|
+                            https?://
+                                hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?
+                        )
+                    '''
+    _TESTS = [{
+        'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd',
+        'info_dict': {
+            'id': '2181385',
+            'display_id': 'republika-dokumentarna-serija-16-hd',
+            'ext': 'mp4',
+            'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)',
+            'description': 'md5:48af85f620e8e0e1df4096270568544f',
+            'duration': 2922,
+            'view_count': int,
+            'average_rating': int,
+            'episode_number': int,
+            'season_number': int,
+            'age_limit': 12,
+        },
+        'skip': 'Requires account credentials',
+    }, {
+        'url': 'https://hrti.hrt.hr/#/video/show/2181385/',
+        'only_matching': True,
+    }, {
+        'url': 'hrti:2181385',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('short_id') or mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        video = self._download_json(
+            '%s/video_id/%s/format/json' % (self._search_url, video_id),
+            display_id, 'Downloading video metadata JSON')['video'][0]
+
+        title_info = video['title']
+        title = title_info['title_long']
+
+        movie = video['video_assets']['movie'][0]
+        m3u8_url = movie['url'].format(TOKEN=self._token)
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = clean_html(title_info.get('summary_long'))
+        age_limit = parse_age_limit(video.get('parental_control', {}).get('rating'))
+        view_count = int_or_none(video.get('views'))
+        average_rating = int_or_none(video.get('user_rating'))
+        duration = int_or_none(movie.get('duration'))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'view_count': view_count,
+            'average_rating': average_rating,
+            'age_limit': age_limit,
+            'formats': formats,
+        }
+
+
+class HRTiPlaylistIE(HRTiBaseIE):
+    _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?'
+    _TESTS = [{
+        'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena',
+        'info_dict': {
+            'id': '212',
+            'title': 'ekumena',
+        },
+        'playlist_mincount': 8,
+        'skip': 'Requires account credentials',
+    }, {
+        'url': 'https://hrti.hrt.hr/#/video/list/category/212/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        category_id = mobj.group('id')
+        display_id = mobj.group('display_id') or category_id
+
+        response = self._download_json(
+            '%s/category_id/%s/format/json' % (self._search_url, category_id),
+            display_id, 'Downloading video metadata JSON')
+
+        video_ids = try_get(
+            response, lambda x: x['video_listings'][0]['alternatives'][0]['list'],
+            list) or [video['id'] for video in response.get('videos', []) if video.get('id')]
+
+        entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids]
+
+        return self.playlist_result(entries, category_id, display_id)
index 85e9344aab18e22be204134ca846318367b54329..d23489dcff4464c1294d9c6f5dca30aa480ad3a8 100644 (file)
@@ -50,12 +50,10 @@ class ImgurIE(InfoExtractor):
         webpage = self._download_webpage(
             compat_urlparse.urljoin(url, video_id), video_id)
 
-        width = int_or_none(self._search_regex(
-            r'<param name="width" value="([0-9]+)"',
-            webpage, 'width', fatal=False))
-        height = int_or_none(self._search_regex(
-            r'<param name="height" value="([0-9]+)"',
-            webpage, 'height', fatal=False))
+        width = int_or_none(self._og_search_property(
+            'video:width', webpage, default=None))
+        height = int_or_none(self._og_search_property(
+            'video:height', webpage, default=None))
 
         video_elements = self._search_regex(
             r'(?s)<div class="video-elements">(.*?)</div>',
index fc0197ae19d2c3db4a504268256400d6ef81702d..8f7f232bea720ce0cfbf3c8e6aa9b38bddb93658 100644 (file)
@@ -36,7 +36,6 @@ class InstagramIE(InfoExtractor):
         'info_dict': {
             'id': 'BA-pQFBG8HZ',
             'ext': 'mp4',
-            'uploader_id': 'britneyspears',
             'title': 'Video by britneyspears',
             'thumbnail': 're:^https?://.*\.jpg',
             'timestamp': 1453760977,
index ddcb3c916e6a0610484dc5ceddbd84b507e761fd..01c7b30428f8a750c9932b0ea734f795c09866d6 100644 (file)
@@ -3,28 +3,22 @@ from __future__ import unicode_literals
 
 import hashlib
 import itertools
-import math
-import os
-import random
 import re
 import time
-import uuid
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_parse_qs,
     compat_str,
     compat_urllib_parse_urlencode,
-    compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    clean_html,
     decode_packed_codes,
+    get_element_by_id,
+    get_element_by_attribute,
     ExtractorError,
     ohdave_rsa_encrypt,
     remove_start,
-    sanitized_Request,
-    urlencode_postdata,
-    url_basename,
 )
 
 
@@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.iqiyi.com/v_19rrojlavg.html',
-        'md5': '2cb594dc2781e6c941a110d8f358118b',
+        # MD5 checksum differs on my machine and Travis CI
         'info_dict': {
             'id': '9c1fb1b99d192b21c559e5a1a2cb3c73',
+            'ext': 'mp4',
             'title': '美国德州空中惊现奇异云团 酷似UFO',
-            'ext': 'f4v',
         }
     }, {
         'url': 'http://www.iqiyi.com/v_19rrhnnclk.html',
+        'md5': '667171934041350c5de3f5015f7f1152',
         'info_dict': {
             'id': 'e3f585b550a280af23c98b6cb2be19fb',
-            'title': '名侦探柯南第752集',
-        },
-        'playlist': [{
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part1',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part2',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part3',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part4',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part5',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part6',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part7',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }, {
-            'info_dict': {
-                'id': 'e3f585b550a280af23c98b6cb2be19fb_part8',
-                'ext': 'f4v',
-                'title': '名侦探柯南第752集',
-            },
-        }],
-        'params': {
-            'skip_download': True,
+            'ext': 'mp4',
+            'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇',
         },
+        'skip': 'Geo-restricted to China',
     }, {
         'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html',
         'only_matching': True,
@@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor):
         'url': 'http://www.iqiyi.com/v_19rrny4w8w.html',
         'info_dict': {
             'id': 'f3cf468b39dddb30d676f89a91200dc1',
+            'ext': 'mp4',
             'title': '泰坦尼克号',
         },
-        'playlist': [{
-            'info_dict': {
-                'id': 'f3cf468b39dddb30d676f89a91200dc1_part1',
-                'ext': 'f4v',
-                'title': '泰坦尼克号',
-            },
-        }, {
-            'info_dict': {
-                'id': 'f3cf468b39dddb30d676f89a91200dc1_part2',
-                'ext': 'f4v',
-                'title': '泰坦尼克号',
-            },
-        }],
-        'expected_warnings': ['Needs a VIP account for full video'],
+        'skip': 'Geo-restricted to China',
     }, {
         'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
         'info_dict': {
@@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    _FORMATS_MAP = [
-        ('1', 'h6'),
-        ('2', 'h5'),
-        ('3', 'h4'),
-        ('4', 'h3'),
-        ('5', 'h2'),
-        ('10', 'h1'),
-    ]
-
-    AUTH_API_ERRORS = {
-        # No preview available (不允许试看鉴权失败)
-        'Q00505': 'This video requires a VIP account',
-        # End of preview time (试看结束鉴权失败)
-        'Q00506': 'Needs a VIP account for full video',
+    _FORMATS_MAP = {
+        '96': 1,    # 216p, 240p
+        '1': 2,     # 336p, 360p
+        '2': 3,     # 480p, 504p
+        '21': 4,    # 504p
+        '4': 5,     # 720p
+        '17': 5,    # 720p
+        '5': 6,     # 1072p, 1080p
+        '18': 7,    # 1080p
     }
 
     def _real_initialize(self):
@@ -352,177 +280,23 @@ class IqiyiIE(InfoExtractor):
 
         return True
 
-    def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning):
-        auth_params = {
-            # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as
-            'version': '2.0',
-            'platform': 'b6c13e26323c537d',
-            'aid': tvid,
-            'tvid': tvid,
-            'uid': '',
-            'deviceId': _uuid,
-            'playType': 'main',  # XXX: always main?
-            'filename': os.path.splitext(url_basename(api_video_url))[0],
-        }
+    def get_raw_data(self, tvid, video_id):
+        tm = int(time.time() * 1000)
 
-        qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query)
-        for key, val in qd_items.items():
-            auth_params[key] = val[0]
-
-        auth_req = sanitized_Request(
-            'http://api.vip.iqiyi.com/services/ckn.action',
-            urlencode_postdata(auth_params))
-        # iQiyi server throws HTTP 405 error without the following header
-        auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        auth_result = self._download_json(
-            auth_req, video_id,
-            note='Downloading video authentication JSON',
-            errnote='Unable to download video authentication JSON')
-
-        code = auth_result.get('code')
-        msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
-        if code == 'Q00506':
-            if do_report_warning:
-                self.report_warning(msg)
-            return False
-        if 'data' not in auth_result:
-            if msg is not None:
-                raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
-            raise ExtractorError('Unexpected error from Iqiyi auth API')
-
-        return auth_result['data']
-
-    def construct_video_urls(self, data, video_id, _uuid, tvid):
-        def do_xor(x, y):
-            a = y % 3
-            if a == 1:
-                return x ^ 121
-            if a == 2:
-                return x ^ 72
-            return x ^ 103
-
-        def get_encode_code(l):
-            a = 0
-            b = l.split('-')
-            c = len(b)
-            s = ''
-            for i in range(c - 1, -1, -1):
-                a = do_xor(int(b[c - i - 1], 16), i)
-                s += chr(a)
-            return s[::-1]
-
-        def get_path_key(x, format_id, segment_index):
-            mg = ')(*&^flash@#$%a'
-            tm = self._download_json(
-                'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id,
-                note='Download path key of segment %d for format %s' % (segment_index + 1, format_id)
-            )['t']
-            t = str(int(math.floor(int(tm) / (600.0))))
-            return md5_text(t + mg + x)
-
-        video_urls_dict = {}
-        need_vip_warning_report = True
-        for format_item in data['vp']['tkl'][0]['vs']:
-            if 0 < int(format_item['bid']) <= 10:
-                format_id = self.get_format(format_item['bid'])
-            else:
-                continue
-
-            video_urls = []
-
-            video_urls_info = format_item['fs']
-            if not format_item['fs'][0]['l'].startswith('/'):
-                t = get_encode_code(format_item['fs'][0]['l'])
-                if t.endswith('mp4'):
-                    video_urls_info = format_item['flvs']
-
-            for segment_index, segment in enumerate(video_urls_info):
-                vl = segment['l']
-                if not vl.startswith('/'):
-                    vl = get_encode_code(vl)
-                is_vip_video = '/vip/' in vl
-                filesize = segment['b']
-                base_url = data['vp']['du'].split('/')
-                if not is_vip_video:
-                    key = get_path_key(
-                        vl.split('/')[-1].split('.')[0], format_id, segment_index)
-                    base_url.insert(-1, key)
-                base_url = '/'.join(base_url)
-                param = {
-                    'su': _uuid,
-                    'qyid': uuid.uuid4().hex,
-                    'client': '',
-                    'z': '',
-                    'bt': '',
-                    'ct': '',
-                    'tn': str(int(time.time()))
-                }
-                api_video_url = base_url + vl
-                if is_vip_video:
-                    api_video_url = api_video_url.replace('.f4v', '.hml')
-                    auth_result = self._authenticate_vip_video(
-                        api_video_url, video_id, tvid, _uuid, need_vip_warning_report)
-                    if auth_result is False:
-                        need_vip_warning_report = False
-                        break
-                    param.update({
-                        't': auth_result['t'],
-                        # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
-                        'cid': 'afbe8fd3d73448c9',
-                        'vid': video_id,
-                        'QY00001': auth_result['u'],
-                    })
-                api_video_url += '?' if '?' not in api_video_url else '&'
-                api_video_url += compat_urllib_parse_urlencode(param)
-                js = self._download_json(
-                    api_video_url, video_id,
-                    note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
-                video_url = js['l']
-                video_urls.append(
-                    (video_url, filesize))
-
-            video_urls_dict[format_id] = video_urls
-        return video_urls_dict
-
-    def get_format(self, bid):
-        matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)]
-        return matched_format_ids[0] if len(matched_format_ids) else None
-
-    def get_bid(self, format_id):
-        matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id]
-        return matched_bids[0] if len(matched_bids) else None
-
-    def get_raw_data(self, tvid, video_id, enc_key, _uuid):
-        tm = str(int(time.time()))
-        tail = tm + tvid
-        param = {
-            'key': 'fvip',
-            'src': md5_text('youtube-dl'),
-            'tvId': tvid,
+        key = 'd5fb4bd9d50c4be6948c97edd7254b0e'
+        sc = md5_text(compat_str(tm) + key + tvid)
+        params = {
+            'tvid': tvid,
             'vid': video_id,
-            'vinfo': 1,
-            'tm': tm,
-            'enc': md5_text(enc_key + tail),
-            'qyid': _uuid,
-            'tn': random.random(),
-            # In iQiyi's flash player, um is set to 1 if there's a logged user
-            # Some 1080P formats are only available with a logged user.
-            # Here force um=1 to trick the iQiyi server
-            'um': 1,
-            'authkey': md5_text(md5_text('') + tail),
-            'k_tag': 1,
+            'src': '76f90cbd92f94a2e925d83e8ccd22cb7',
+            'sc': sc,
+            't': tm,
         }
 
-        api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
-            compat_urllib_parse_urlencode(param)
-        raw_data = self._download_json(api_url, video_id)
-        return raw_data
-
-    def get_enc_key(self, video_id):
-        # TODO: automatic key extraction
-        # last update at 2016-01-22 for Zombie::bite
-        enc_key = '4a1caba4b4465345366f28da7c117d20'
-        return enc_key
+        return self._download_json(
+            'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id),
+            video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='),
+            query=params, headers=self.geo_verification_headers())
 
     def _extract_playlist(self, webpage):
         PAGE_SIZE = 50
@@ -571,58 +345,41 @@ class IqiyiIE(InfoExtractor):
             r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
         video_id = self._search_regex(
             r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
-        _uuid = uuid.uuid4().hex
-
-        enc_key = self.get_enc_key(video_id)
-
-        raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)
-
-        if raw_data['code'] != 'A000000':
-            raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
-
-        data = raw_data['data']
-
-        title = data['vi']['vn']
-
-        # generate video_urls_dict
-        video_urls_dict = self.construct_video_urls(
-            data, video_id, _uuid, tvid)
-
-        # construct info
-        entries = []
-        for format_id in video_urls_dict:
-            video_urls = video_urls_dict[format_id]
-            for i, video_url_info in enumerate(video_urls):
-                if len(entries) < i + 1:
-                    entries.append({'formats': []})
-                entries[i]['formats'].append(
-                    {
-                        'url': video_url_info[0],
-                        'filesize': video_url_info[-1],
-                        'format_id': format_id,
-                        'preference': int(self.get_bid(format_id))
-                    }
-                )
-
-        for i in range(len(entries)):
-            self._sort_formats(entries[i]['formats'])
-            entries[i].update(
-                {
-                    'id': '%s_part%d' % (video_id, i + 1),
-                    'title': title,
-                }
-            )
-
-        if len(entries) > 1:
-            info = {
-                '_type': 'multi_video',
-                'id': video_id,
-                'title': title,
-                'entries': entries,
-            }
-        else:
-            info = entries[0]
-            info['id'] = video_id
-            info['title'] = title
-
-        return info
+
+        formats = []
+        for _ in range(5):
+            raw_data = self.get_raw_data(tvid, video_id)
+
+            if raw_data['code'] != 'A00000':
+                if raw_data['code'] == 'A00111':
+                    self.raise_geo_restricted()
+                raise ExtractorError('Unable to load data. Error code: ' + raw_data['code'])
+
+            data = raw_data['data']
+
+            for stream in data['vidl']:
+                if 'm3utx' not in stream:
+                    continue
+                vd = compat_str(stream['vd'])
+                formats.append({
+                    'url': stream['m3utx'],
+                    'format_id': vd,
+                    'ext': 'mp4',
+                    'preference': self._FORMATS_MAP.get(vd, -1),
+                    'protocol': 'm3u8_native',
+                })
+
+            if formats:
+                break
+
+            self._sleep(5, video_id)
+
+        self._sort_formats(formats)
+        title = (get_element_by_id('widget-videotitle', webpage) or
+                 clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
index e44e31104f55fc72daa6884f09bcf8faab390568..ce3126943939063bc65c4c710e10ea61153ac036 100644 (file)
@@ -4,10 +4,12 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     determine_ext,
     float_or_none,
     int_or_none,
+    mimetype2ext,
 )
 
 
@@ -28,74 +30,86 @@ class JWPlatformBaseIE(InfoExtractor):
         return self._parse_jwplayer_data(
             jwplayer_data, video_id, *args, **kwargs)
 
-    def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None):
+    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None):
         # JWPlayer backward compatibility: flattened playlists
         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
         if 'playlist' not in jwplayer_data:
             jwplayer_data = {'playlist': [jwplayer_data]}
 
-        video_data = jwplayer_data['playlist'][0]
-
-        # JWPlayer backward compatibility: flattened sources
-        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
-        if 'sources' not in video_data:
-            video_data['sources'] = [video_data]
-
-        formats = []
-        for source in video_data['sources']:
-            source_url = self._proto_relative_url(source['file'])
-            source_type = source.get('type') or ''
-            if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
-            elif source_type.startswith('audio'):
-                formats.append({
-                    'url': source_url,
-                    'vcodec': 'none',
-                })
-            else:
-                a_format = {
-                    'url': source_url,
-                    'width': int_or_none(source.get('width')),
-                    'height': int_or_none(source.get('height')),
-                }
-                if source_url.startswith('rtmp'):
-                    a_format['ext'] = 'flv',
-
-                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
-                    # of jwplayer.flash.swf
-                    rtmp_url_parts = re.split(
-                        r'((?:mp4|mp3|flv):)', source_url, 1)
-                    if len(rtmp_url_parts) == 3:
-                        rtmp_url, prefix, play_path = rtmp_url_parts
-                        a_format.update({
-                            'url': rtmp_url,
-                            'play_path': prefix + play_path,
-                        })
-                    if rtmp_params:
-                        a_format.update(rtmp_params)
-                formats.append(a_format)
-        self._sort_formats(formats)
-
-        subtitles = {}
-        tracks = video_data.get('tracks')
-        if tracks and isinstance(tracks, list):
-            for track in tracks:
-                if track.get('file') and track.get('kind') == 'captions':
-                    subtitles.setdefault(track.get('label') or 'en', []).append({
-                        'url': self._proto_relative_url(track['file'])
+        entries = []
+        for video_data in jwplayer_data['playlist']:
+            # JWPlayer backward compatibility: flattened sources
+            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
+            if 'sources' not in video_data:
+                video_data['sources'] = [video_data]
+
+            this_video_id = video_id or video_data['mediaid']
+
+            formats = []
+            for source in video_data['sources']:
+                source_url = self._proto_relative_url(source['file'])
+                if base_url:
+                    source_url = compat_urlparse.urljoin(base_url, source_url)
+                source_type = source.get('type') or ''
+                ext = mimetype2ext(source_type) or determine_ext(source_url)
+                if source_type == 'hls' or ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False))
+                # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
+                elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
+                    formats.append({
+                        'url': source_url,
+                        'vcodec': 'none',
+                        'ext': ext,
                     })
+                else:
+                    a_format = {
+                        'url': source_url,
+                        'width': int_or_none(source.get('width')),
+                        'height': int_or_none(source.get('height')),
+                        'ext': ext,
+                    }
+                    if source_url.startswith('rtmp'):
+                        a_format['ext'] = 'flv'
+
+                        # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
+                        # of jwplayer.flash.swf
+                        rtmp_url_parts = re.split(
+                            r'((?:mp4|mp3|flv):)', source_url, 1)
+                        if len(rtmp_url_parts) == 3:
+                            rtmp_url, prefix, play_path = rtmp_url_parts
+                            a_format.update({
+                                'url': rtmp_url,
+                                'play_path': prefix + play_path,
+                            })
+                        if rtmp_params:
+                            a_format.update(rtmp_params)
+                    formats.append(a_format)
+            self._sort_formats(formats)
+
+            subtitles = {}
+            tracks = video_data.get('tracks')
+            if tracks and isinstance(tracks, list):
+                for track in tracks:
+                    if track.get('file') and track.get('kind') == 'captions':
+                        subtitles.setdefault(track.get('label') or 'en', []).append({
+                            'url': self._proto_relative_url(track['file'])
+                        })
 
-        return {
-            'id': video_id,
-            'title': video_data['title'] if require_title else video_data.get('title'),
-            'description': video_data.get('description'),
-            'thumbnail': self._proto_relative_url(video_data.get('image')),
-            'timestamp': int_or_none(video_data.get('pubdate')),
-            'duration': float_or_none(jwplayer_data.get('duration')),
-            'subtitles': subtitles,
-            'formats': formats,
-        }
+            entries.append({
+                'id': this_video_id,
+                'title': video_data['title'] if require_title else video_data.get('title'),
+                'description': video_data.get('description'),
+                'thumbnail': self._proto_relative_url(video_data.get('image')),
+                'timestamp': int_or_none(video_data.get('pubdate')),
+                'duration': float_or_none(jwplayer_data.get('duration')),
+                'subtitles': subtitles,
+                'formats': formats,
+            })
+        if len(entries) == 1:
+            return entries[0]
+        else:
+            return self.playlist_result(entries)
 
 
 class JWPlatformIE(JWPlatformBaseIE):
index a65697ff558864f36cc5e8b8f82f959b19ea16fc..ddf1165ffb021005119622d3f162cbce9c637b55 100644 (file)
@@ -6,7 +6,6 @@ import base64
 
 from .common import InfoExtractor
 from ..compat import (
-    compat_urllib_parse_urlencode,
     compat_urlparse,
     compat_parse_qs,
 )
@@ -15,6 +14,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     unsmuggle_url,
+    smuggle_url,
 )
 
 
@@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor):
                         )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))?
                 )
                 '''
-    _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?'
+    _SERVICE_URL = 'http://cdnapi.kaltura.com'
+    _SERVICE_BASE = '/api_v3/index.php'
     _TESTS = [
         {
             'url': 'kaltura:269692:1_1jc2y3e4',
@@ -61,19 +62,58 @@ class KalturaIE(InfoExtractor):
         {
             'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342',
             'only_matching': True,
+        },
+        {
+            # video with subtitles
+            'url': 'kaltura:111032:1_cw786r8q',
+            'only_matching': True,
         }
     ]
 
-    def _kaltura_api_call(self, video_id, actions, *args, **kwargs):
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = (
+            re.search(
+                r"""(?xs)
+                    kWidget\.(?:thumb)?[Ee]mbed\(
+                    \{.*?
+                        (?P<q1>['\"])wid(?P=q1)\s*:\s*
+                        (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?
+                        (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*
+                        (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),
+                """, webpage) or
+            re.search(
+                r'''(?xs)
+                    (?P<q1>["\'])
+                        (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?
+                    (?P=q1).*?
+                    (?:
+                        entry_?[Ii]d|
+                        (?P<q2>["\'])entry_?[Ii]d(?P=q2)
+                    )\s*:\s*
+                    (?P<q3>["\'])(?P<id>.+?)(?P=q3)
+                ''', webpage))
+        if mobj:
+            embed_info = mobj.groupdict()
+            url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
+            escaped_pid = re.escape(embed_info['partner_id'])
+            service_url = re.search(
+                r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
+                webpage)
+            if service_url:
+                url = smuggle_url(url, {'service_url': service_url.group(1)})
+            return url
+
+    def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):
         params = actions[0]
         if len(actions) > 1:
             for i, a in enumerate(actions[1:], start=1):
                 for k, v in a.items():
                     params['%d:%s' % (i, k)] = v
 
-        query = compat_urllib_parse_urlencode(params)
-        url = self._API_BASE + query
-        data = self._download_json(url, video_id, *args, **kwargs)
+        data = self._download_json(
+            (service_url or self._SERVICE_URL) + self._SERVICE_BASE,
+            video_id, query=params, *args, **kwargs)
 
         status = data if len(actions) == 1 else data[0]
         if status.get('objectType') == 'KalturaAPIException':
@@ -82,7 +122,7 @@ class KalturaIE(InfoExtractor):
 
         return data
 
-    def _get_kaltura_signature(self, video_id, partner_id):
+    def _get_kaltura_signature(self, video_id, partner_id, service_url=None):
         actions = [{
             'apiVersion': '3.1',
             'expiry': 86400,
@@ -92,10 +132,9 @@ class KalturaIE(InfoExtractor):
             'widgetId': '_%s' % partner_id,
         }]
         return self._kaltura_api_call(
-            video_id, actions, note='Downloading Kaltura signature')['ks']
+            video_id, actions, service_url, note='Downloading Kaltura signature')['ks']
 
-    def _get_video_info(self, video_id, partner_id):
-        signature = self._get_kaltura_signature(video_id, partner_id)
+    def _get_video_info(self, video_id, partner_id, service_url=None):
         actions = [
             {
                 'action': 'null',
@@ -103,22 +142,34 @@ class KalturaIE(InfoExtractor):
                 'clientTag': 'kdp:v3.8.5',
                 'format': 1,  # JSON, 2 = XML, 3 = PHP
                 'service': 'multirequest',
-                'ks': signature,
+            },
+            {
+                'expiry': 86400,
+                'service': 'session',
+                'action': 'startWidgetSession',
+                'widgetId': '_%s' % partner_id,
             },
             {
                 'action': 'get',
                 'entryId': video_id,
                 'service': 'baseentry',
-                'version': '-1',
+                'ks': '{1:result:ks}',
             },
             {
                 'action': 'getbyentryid',
                 'entryId': video_id,
                 'service': 'flavorAsset',
+                'ks': '{1:result:ks}',
+            },
+            {
+                'action': 'list',
+                'filter:entryIdEqual': video_id,
+                'service': 'caption_captionasset',
+                'ks': '{1:result:ks}',
             },
         ]
         return self._kaltura_api_call(
-            video_id, actions, note='Downloading video info JSON')
+            video_id, actions, service_url, note='Downloading video info JSON')
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
@@ -126,8 +177,9 @@ class KalturaIE(InfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         partner_id, entry_id = mobj.group('partner_id', 'id')
         ks = None
+        captions = None
         if partner_id and entry_id:
-            info, flavor_assets = self._get_video_info(entry_id, partner_id)
+            _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'))
         else:
             path, query = mobj.group('path', 'query')
             if not path and not query:
@@ -146,7 +198,7 @@ class KalturaIE(InfoExtractor):
                 raise ExtractorError('Invalid URL', expected=True)
             if 'entry_id' in params:
                 entry_id = params['entry_id'][0]
-                info, flavor_assets = self._get_video_info(entry_id, partner_id)
+                _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id)
             elif 'uiconf_id' in params and 'flashvars[referenceId]' in params:
                 reference_id = params['flashvars[referenceId]'][0]
                 webpage = self._download_webpage(url, reference_id)
@@ -175,12 +227,17 @@ class KalturaIE(InfoExtractor):
                 unsigned_url += '?referrer=%s' % referrer
             return unsigned_url
 
+        data_url = info['dataUrl']
+        if '/flvclipper/' in data_url:
+            data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url)
+
         formats = []
         for f in flavor_assets:
             # Continue if asset is not ready
-            if f['status'] != 2:
+            if f.get('status') != 2:
                 continue
-            video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id']))
+            video_url = sign_url(
+                '%s/flavorId/%s' % (data_url, f['id']))
             formats.append({
                 'format_id': '%(fileExt)s-%(bitrate)s' % f,
                 'ext': f.get('fileExt'),
@@ -193,17 +250,31 @@ class KalturaIE(InfoExtractor):
                 'width': int_or_none(f.get('width')),
                 'url': video_url,
             })
-        m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp'))
-        formats.extend(self._extract_m3u8_formats(
-            m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        if '/playManifest/' in data_url:
+            m3u8_url = sign_url(data_url.replace(
+                'format/url', 'format/applehttp'))
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, entry_id, 'mp4', 'm3u8_native',
+                m3u8_id='hls', fatal=False))
 
-        self._check_formats(formats, entry_id)
         self._sort_formats(formats)
 
+        subtitles = {}
+        if captions:
+            for caption in captions.get('objects', []):
+                # Continue if caption is not ready
+                if f.get('status') != 2:
+                    continue
+                subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
+                    'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
+                    'ext': caption.get('fileExt'),
+                })
+
         return {
             'id': entry_id,
             'title': info['name'],
             'formats': formats,
+            'subtitles': subtitles,
             'description': clean_html(info.get('description')),
             'thumbnail': info.get('thumbnailUrl'),
             'duration': info.get('duration'),
diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py
new file mode 100644 (file)
index 0000000..b50120d
--- /dev/null
@@ -0,0 +1,71 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    qualities,
+)
+
+
+class KamcordIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.kamcord.com/v/hNYRduDgWb4',
+        'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c',
+        'info_dict': {
+            'id': 'hNYRduDgWb4',
+            'ext': 'mp4',
+            'title': 'Drinking Madness',
+            'uploader': 'jacksfilms',
+            'uploader_id': '3044562',
+            'view_count': int,
+            'like_count': int,
+            'comment_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video = self._parse_json(
+            self._search_regex(
+                r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)',
+                webpage, 'video'),
+            video_id)['video']
+
+        title = video['title']
+
+        formats = self._extract_m3u8_formats(
+            video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native')
+        self._sort_formats(formats)
+
+        uploader = video.get('user', {}).get('username')
+        uploader_id = video.get('user', {}).get('id')
+
+        view_count = int_or_none(video.get('viewCount'))
+        like_count = int_or_none(video.get('heartCount'))
+        comment_count = int_or_none(video.get('messageCount'))
+
+        preference_key = qualities(('small', 'medium', 'large'))
+
+        thumbnails = [{
+            'url': thumbnail_url,
+            'id': thumbnail_id,
+            'preference': preference_key(thumbnail_id),
+        } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items()
+            if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)]
+
+        return {
+            'id': video_id,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'view_count': view_count,
+            'like_count': like_count,
+            'comment_count': comment_count,
+            'thumbnails': thumbnails,
+            'formats': formats,
+        }
index 126ca13df1b8c30e9d94b204beb54eab03644fea..ad2f8a8c8dc880df28d61759dd567fd40f64b3c9 100644 (file)
@@ -3,64 +3,124 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..aes import aes_decrypt_text
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
 from ..utils import (
-    sanitized_Request,
-    url_basename,
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    strip_or_none,
 )
 
 
 class KeezMoviesIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P<id>[0-9]+)(?:[/?&]|$)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)'
+    _TESTS = [{
         'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711',
         'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0',
         'info_dict': {
             'id': '1214711',
+            'display_id': 'petite-asian-lady-mai-playing-in-bathtub',
             'ext': 'mp4',
             'title': 'Petite Asian Lady Mai Playing In Bathtub',
-            'age_limit': 18,
             'thumbnail': 're:^https?://.*\.jpg$',
+            'view_count': int,
+            'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://www.keezmovies.com/video/1214711',
+        'only_matching': True,
+    }]
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
+    def _extract_info(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
 
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'age_verified=1')
-        webpage = self._download_webpage(req, video_id)
+        webpage = self._download_webpage(
+            url, display_id, headers={'Cookie': 'age_verified=1'})
 
-        # embedded video
-        mobj = re.search(r'href="([^"]+)"></iframe>', webpage)
-        if mobj:
-            embedded_url = mobj.group(1)
-            return self.url_result(embedded_url)
+        formats = []
+        format_urls = set()
 
-        video_title = self._html_search_regex(
-            r'<h1 [^>]*>([^<]+)', webpage, 'title')
-        flashvars = self._parse_json(self._search_regex(
-            r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id)
+        title = None
+        thumbnail = None
+        duration = None
+        encrypted = False
 
-        formats = []
-        for height in (180, 240, 480):
-            if flashvars.get('quality_%dp' % height):
-                video_url = flashvars['quality_%dp' % height]
-                a_format = {
-                    'url': video_url,
-                    'height': height,
-                    'format_id': '%dp' % height,
-                }
-                filename_parts = url_basename(video_url).split('_')
-                if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]):
-                    a_format['tbr'] = int(filename_parts[1][:-1])
-                formats.append(a_format)
-
-        age_limit = self._rta_search(webpage)
-
-        return {
+        def extract_format(format_url, height=None):
+            if not isinstance(format_url, compat_str) or not format_url.startswith('http'):
+                return
+            if format_url in format_urls:
+                return
+            format_urls.add(format_url)
+            tbr = int_or_none(self._search_regex(
+                r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None))
+            if not height:
+                height = int_or_none(self._search_regex(
+                    r'[/_](\d+)[pP][/_]', format_url, 'height', default=None))
+            if encrypted:
+                format_url = aes_decrypt_text(
+                    video_url, title, 32).decode('utf-8')
+            formats.append({
+                'url': format_url,
+                'format_id': '%dp' % height if height else None,
+                'height': height,
+                'tbr': tbr,
+            })
+
+        flashvars = self._parse_json(
+            self._search_regex(
+                r'flashvars\s*=\s*({.+?});', webpage,
+                'flashvars', default='{}'),
+            display_id, fatal=False)
+
+        if flashvars:
+            title = flashvars.get('video_title')
+            thumbnail = flashvars.get('image_url')
+            duration = int_or_none(flashvars.get('video_duration'))
+            encrypted = flashvars.get('encrypted') is True
+            for key, value in flashvars.items():
+                mobj = re.search(r'quality_(\d+)[pP]', key)
+                if mobj:
+                    extract_format(value, int(mobj.group(1)))
+            video_url = flashvars.get('video_url')
+            if video_url and determine_ext(video_url, None):
+                extract_format(video_url)
+
+        video_url = self._html_search_regex(
+            r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1',
+            webpage, 'video url', default=None, group='url')
+        if video_url:
+            extract_format(compat_urllib_parse_unquote(video_url))
+
+        if not formats:
+            if 'title="This video is no longer available"' in webpage:
+                raise ExtractorError(
+                    'Video %s is no longer available' % video_id, expected=True)
+
+        self._sort_formats(formats)
+
+        if not title:
+            title = self._html_search_regex(
+                r'<h1[^>]*>([^<]+)', webpage, 'title')
+
+        return webpage, {
             'id': video_id,
-            'title': video_title,
+            'display_id': display_id,
+            'title': strip_or_none(title),
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'age_limit': 18,
             'formats': formats,
-            'age_limit': age_limit,
-            'thumbnail': flashvars.get('image_url')
         }
+
+    def _real_extract(self, url):
+        webpage, info = self._extract_info(url)
+        info['view_count'] = str_to_int(self._search_regex(
+            r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False))
+        return info
index 0221fb9191879cef1d57e42238b5d86737cdd621..0eeb9ffeba13d10a6b047331c4923dea3b194f80 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     get_element_by_id,
     clean_html,
@@ -26,11 +27,6 @@ class KuwoBaseIE(InfoExtractor):
     def _get_formats(self, song_id, tolerate_ip_deny=False):
         formats = []
         for file_format in self._FORMATS:
-            headers = {}
-            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-            if cn_verification_proxy:
-                headers['Ytdl-request-proxy'] = cn_verification_proxy
-
             query = {
                 'format': file_format['ext'],
                 'br': file_format.get('br', ''),
@@ -42,7 +38,7 @@ class KuwoBaseIE(InfoExtractor):
             song_url = self._download_webpage(
                 'http://antiserver.kuwo.cn/anti.s',
                 song_id, note='Download %s url info' % file_format['format'],
-                query=query, headers=headers,
+                query=query, headers=self.geo_verification_headers(),
             )
 
             if song_url == 'IPDeny' and not tolerate_ip_deny:
@@ -247,8 +243,9 @@ class KuwoSingerIE(InfoExtractor):
                 query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE})
 
             return [
-                self.url_result(song_url, 'Kuwo') for song_url in re.findall(
-                    r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)',
+                self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo')
+                for song_url in re.findall(
+                    r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)',
                     webpage)
             ]
 
index b08f6e3c9548de02217e43bebbf20b5f2ab871e8..da5a5de4ad7e65b995a257303096b4bc58061b67 100644 (file)
@@ -1,60 +1,65 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    parse_duration,
+    js_to_json,
+    smuggle_url,
 )
 
 
 class LA7IE(InfoExtractor):
-    IE_NAME = 'la7.tv'
-    _VALID_URL = r'''(?x)
-        https?://(?:www\.)?la7\.tv/
-        (?:
-            richplayer/\?assetid=|
-            \?contentId=
-        )
-        (?P<id>[0-9]+)'''
-
-    _TEST = {
-        'url': 'http://www.la7.tv/richplayer/?assetid=50355319',
-        'md5': 'ec7d1f0224d20ba293ab56cf2259651f',
+    IE_NAME = 'la7.it'
+    _VALID_URL = r'''(?x)(https?://)?(?:
+        (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/|
+        tg\.la7\.it/repliche-tgla7\?id=
+    )(?P<id>.+)'''
+
+    _TESTS = [{
+        # 'src' is a plain URL
+        'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
+        'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
         'info_dict': {
-            'id': '50355319',
+            'id': 'inccool8-02-10-2015-163722',
             'ext': 'mp4',
-            'title': 'IL DIVO',
-            'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti  e Flavio Bucci',
-            'duration': 6254,
+            'title': 'Inc.Cool8',
+            'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto  atletico',
+            'thumbnail': 're:^https?://.*',
+            'uploader_id': 'kdla7pillole@iltrovatore.it',
+            'timestamp': 1443814869,
+            'upload_date': '20151002',
         },
-        'skip': 'Blocked in the US',
-    }
+    }, {
+        # 'src' is a dictionary
+        'url': 'http://tg.la7.it/repliche-tgla7?id=189080',
+        'md5': '6b0d8888d286e39870208dfeceaf456b',
+        'info_dict': {
+            'id': '189080',
+            'ext': 'mp4',
+            'title': 'TG LA7',
+        },
+    }, {
+        'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id
-        doc = self._download_xml(xml_url, video_id)
-
-        video_title = doc.find('title').text
-        description = doc.find('description').text
-        duration = parse_duration(doc.find('duration').text)
-        thumbnail = doc.find('img').text
-        view_count = int(doc.find('views').text)
 
-        prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:')
+        webpage = self._download_webpage(url, video_id)
 
-        formats = [{
-            'format': vnode.find('quality').text,
-            'tbr': int(vnode.find('quality').text),
-            'url': vnode.find('fms').text.strip().replace('mp4:', prefix),
-        } for vnode in doc.findall('.//videos/video')]
-        self._sort_formats(formats)
+        player_data = self._parse_json(
+            self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'),
+            video_id, transform_source=js_to_json)
 
         return {
+            '_type': 'url_transparent',
+            'url': smuggle_url('kaltura:103:%s' % player_data['vid'], {
+                'service_url': 'http://kdam.iltrovatore.it',
+            }),
             'id': video_id,
-            'title': video_title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-            'view_count': view_count,
+            'title': player_data['title'],
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': player_data.get('poster'),
+            'ie_key': 'Kaltura',
         }
diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py
new file mode 100644 (file)
index 0000000..ade27a9
--- /dev/null
@@ -0,0 +1,90 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .arkena import ArkenaIE
+
+
+class LcpPlayIE(ArkenaIE):
+    _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+'
+    _TESTS = [{
+        'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0',
+        'md5': 'b8bd9298542929c06c1c15788b1f277a',
+        'info_dict': {
+            'id': '327336',
+            'ext': 'mp4',
+            'title': '327336',
+            'timestamp': 1456391602,
+            'upload_date': '20160225',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+
+class LcpIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P<id>[^/]+)'
+
+    _TESTS = [{
+        # arkena embed
+        'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire',
+        'md5': 'b8bd9298542929c06c1c15788b1f277a',
+        'info_dict': {
+            'id': 'd56d03e9',
+            'ext': 'mp4',
+            'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche',
+            'description': 'md5:96ad55009548da9dea19f4120c6c16a8',
+            'timestamp': 1456488895,
+            'upload_date': '20160226',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # dailymotion live stream
+        'url': 'http://www.lcp.fr/le-direct',
+        'info_dict': {
+            'id': 'xji3qy',
+            'ext': 'mp4',
+            'title': 'La Chaine Parlementaire (LCP), Live TNT',
+            'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b',
+            'uploader': 'LCP',
+            'uploader_id': 'xbz33d',
+            'timestamp': 1308923058,
+            'upload_date': '20110624',
+        },
+        'params': {
+            # m3u8 live stream
+            'skip_download': True,
+        },
+    }, {
+        'url': 'http://www.lcp.fr/emissions/277792-les-volontaires',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        play_url = self._search_regex(
+            r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL,
+            webpage, 'play iframe', default=None, group='url')
+
+        if not play_url:
+            return self.url_result(url, 'Generic')
+
+        title = self._og_search_title(webpage, default=None) or self._html_search_meta(
+            'twitter:title', webpage, fatal=True)
+        description = self._html_search_meta(
+            ('description', 'twitter:description'), webpage)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': LcpPlayIE.ie_key(),
+            'url': play_url,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+        }
index 63f581cd9fe44013ce8ec85ce540d04d0df73713..e9cc9aa5983967861b08a2d9ee79297ae3a1726e 100644 (file)
@@ -20,9 +20,10 @@ from ..utils import (
     int_or_none,
     orderedSet,
     parse_iso8601,
-    sanitized_Request,
     str_or_none,
     url_basename,
+    urshift,
+    update_url_query,
 )
 
 
@@ -74,15 +75,11 @@ class LeIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def urshift(val, n):
-        return val >> n if val >= 0 else (val + 0x100000000) >> n
-
     # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
     def ror(self, param1, param2):
         _loc3_ = 0
         while _loc3_ < param2:
-            param1 = self.urshift(param1, 1) + ((param1 & 1) << 31)
+            param1 = urshift(param1, 1) + ((param1 & 1) << 31)
             _loc3_ += 1
         return param1
 
@@ -93,6 +90,10 @@ class LeIE(InfoExtractor):
         _loc3_ = self.ror(_loc3_, _loc2_ % 17)
         return _loc3_
 
+    # reversed from http://jstatic.letvcdn.com/sdk/player.js
+    def get_mms_key(self, time):
+        return self.ror(time, 8) ^ 185025305
+
     # see M3U8Encryption class in KLetvPlayer.swf
     @staticmethod
     def decrypt_m3u8(encrypted_data):
@@ -113,28 +114,7 @@ class LeIE(InfoExtractor):
 
         return bytes(_loc7_)
 
-    def _real_extract(self, url):
-        media_id = self._match_id(url)
-        page = self._download_webpage(url, media_id)
-        params = {
-            'id': media_id,
-            'platid': 1,
-            'splatid': 101,
-            'format': 1,
-            'tkey': self.calc_time_key(int(time.time())),
-            'domain': 'www.le.com'
-        }
-        play_json_req = sanitized_Request(
-            'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params)
-        )
-        cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-        if cn_verification_proxy:
-            play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
-        play_json = self._download_json(
-            play_json_req,
-            media_id, 'Downloading playJson data')
-
+    def _check_errors(self, play_json):
         # Check for errors
         playstatus = play_json['playstatus']
         if playstatus['status'] == 0:
@@ -145,43 +125,99 @@ class LeIE(InfoExtractor):
                 msg = 'Generic error. flag = %d' % flag
             raise ExtractorError(msg, expected=True)
 
-        playurl = play_json['playurl']
-
-        formats = ['350', '1000', '1300', '720p', '1080p']
-        dispatch = playurl['dispatch']
+    def _real_extract(self, url):
+        media_id = self._match_id(url)
+        page = self._download_webpage(url, media_id)
 
-        urls = []
-        for format_id in formats:
-            if format_id in dispatch:
-                media_url = playurl['domain'][0] + dispatch[format_id][0]
-                media_url += '&' + compat_urllib_parse_urlencode({
-                    'm3v': 1,
+        play_json_h5 = self._download_json(
+            'http://api.le.com/mms/out/video/playJsonH5',
+            media_id, 'Downloading html5 playJson data', query={
+                'id': media_id,
+                'platid': 3,
+                'splatid': 304,
+                'format': 1,
+                'tkey': self.get_mms_key(int(time.time())),
+                'domain': 'www.le.com',
+                'tss': 'no',
+            },
+            headers=self.geo_verification_headers())
+        self._check_errors(play_json_h5)
+
+        play_json_flash = self._download_json(
+            'http://api.le.com/mms/out/video/playJson',
+            media_id, 'Downloading flash playJson data', query={
+                'id': media_id,
+                'platid': 1,
+                'splatid': 101,
+                'format': 1,
+                'tkey': self.calc_time_key(int(time.time())),
+                'domain': 'www.le.com',
+            },
+            headers=self.geo_verification_headers())
+        self._check_errors(play_json_flash)
+
+        def get_h5_urls(media_url, format_id):
+            location = self._download_json(
+                media_url, media_id,
+                'Download JSON metadata for format %s' % format_id, query={
                     'format': 1,
                     'expect': 3,
-                    'rateid': format_id,
-                })
+                    'tss': 'no',
+                })['location']
+
+            return {
+                'http': update_url_query(location, {'tss': 'no'}),
+                'hls': update_url_query(location, {'tss': 'ios'}),
+            }
 
-                nodes_data = self._download_json(
-                    media_url, media_id,
-                    'Download JSON metadata for format %s' % format_id)
+        def get_flash_urls(media_url, format_id):
+            media_url += '&' + compat_urllib_parse_urlencode({
+                'm3v': 1,
+                'format': 1,
+                'expect': 3,
+                'rateid': format_id,
+            })
 
-                req = self._request_webpage(
-                    nodes_data['nodelist'][0]['location'], media_id,
-                    note='Downloading m3u8 information for format %s' % format_id)
+            nodes_data = self._download_json(
+                media_url, media_id,
+                'Download JSON metadata for format %s' % format_id)
 
-                m3u8_data = self.decrypt_m3u8(req.read())
+            req = self._request_webpage(
+                nodes_data['nodelist'][0]['location'], media_id,
+                note='Downloading m3u8 information for format %s' % format_id)
 
-                url_info_dict = {
-                    'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
-                    'ext': determine_ext(dispatch[format_id][1]),
-                    'format_id': format_id,
-                    'protocol': 'm3u8',
-                }
+            m3u8_data = self.decrypt_m3u8(req.read())
 
-                if format_id[-1:] == 'p':
-                    url_info_dict['height'] = int_or_none(format_id[:-1])
+            return {
+                'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'),
+            }
 
-                urls.append(url_info_dict)
+        extracted_formats = []
+        formats = []
+        for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
+            playurl = play_json['playurl']
+            play_domain = playurl['domain'][0]
+
+            for format_id, format_data in playurl.get('dispatch', []).items():
+                if format_id in extracted_formats:
+                    continue
+                extracted_formats.append(format_id)
+
+                media_url = play_domain + format_data[0]
+                for protocol, format_url in get_urls(media_url, format_id).items():
+                    f = {
+                        'url': format_url,
+                        'ext': determine_ext(format_data[1]),
+                        'format_id': '%s-%s' % (protocol, format_id),
+                        'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+                        'quality': int_or_none(format_id),
+                    }
+
+                    if format_id[-1:] == 'p':
+                        f['height'] = int_or_none(format_id[:-1])
+
+                    formats.append(f)
+        self._sort_formats(formats, ('height', 'quality', 'format_id'))
 
         publish_time = parse_iso8601(self._html_search_regex(
             r'发布时间&nbsp;([^<>]+) ', page, 'publish time', default=None),
@@ -190,7 +226,7 @@ class LeIE(InfoExtractor):
 
         return {
             'id': media_id,
-            'formats': urls,
+            'formats': formats,
             'title': playurl['title'],
             'thumbnail': playurl['pic'],
             'description': description,
index c2b4490c49044bea3426dc48b873218aa98146b1..87120ecd1f40c8011269a5e80b6ab158d3d94df3 100644 (file)
@@ -4,7 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     determine_ext,
     ExtractorError,
@@ -96,7 +99,7 @@ class LifeNewsIE(InfoExtractor):
             r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
 
         iframe_links = re.findall(
-            r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']',
+            r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']',
             webpage)
 
         if not video_urls and not iframe_links:
@@ -164,9 +167,9 @@ class LifeNewsIE(InfoExtractor):
 
 class LifeEmbedIE(InfoExtractor):
     IE_NAME = 'life:embed'
-    _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})'
+    _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P<id>[\da-f]{32})'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291',
         'md5': 'b889715c9e49cb1981281d0e5458fbbe',
         'info_dict': {
@@ -175,30 +178,57 @@ class LifeEmbedIE(InfoExtractor):
             'title': 'e50c2dec2867350528e2574c899b8291',
             'thumbnail': 're:http://.*\.jpg',
         }
-    }
+    }, {
+        # with 1080p
+        'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
+        thumbnail = None
         formats = []
-        for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
-            video_url = compat_urlparse.urljoin(url, video_url)
-            ext = determine_ext(video_url)
-            if ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4',
-                    entry_protocol='m3u8_native', m3u8_id='m3u8'))
-            else:
-                formats.append({
-                    'url': video_url,
-                    'format_id': ext,
-                    'preference': 1,
-                })
+
+        def extract_m3u8(manifest_url):
+            formats.extend(self._extract_m3u8_formats(
+                manifest_url, video_id, 'mp4',
+                entry_protocol='m3u8_native', m3u8_id='m3u8'))
+
+        def extract_original(original_url):
+            formats.append({
+                'url': original_url,
+                'format_id': determine_ext(original_url, None),
+                'preference': 1,
+            })
+
+        playlist = self._parse_json(
+            self._search_regex(
+                r'options\s*=\s*({.+?});', webpage, 'options', default='{}'),
+            video_id).get('playlist', {})
+        if playlist:
+            master = playlist.get('master')
+            if isinstance(master, compat_str) and determine_ext(master) == 'm3u8':
+                extract_m3u8(compat_urlparse.urljoin(url, master))
+            original = playlist.get('original')
+            if isinstance(original, compat_str):
+                extract_original(original)
+            thumbnail = playlist.get('image')
+
+        # Old rendition fallback
+        if not formats:
+            for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage):
+                video_url = compat_urlparse.urljoin(url, video_url)
+                if determine_ext(video_url) == 'm3u8':
+                    extract_m3u8(video_url)
+                else:
+                    extract_original(video_url)
+
         self._sort_formats(formats)
 
-        thumbnail = self._search_regex(
+        thumbnail = thumbnail or self._search_regex(
             r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None)
 
         return {
index 5d2c3e256740d865e1c0be2c8e2808a9ee97ee43..a425bafe30ae5cffdbeb793ef27c4ffdb870dcc9 100644 (file)
@@ -37,11 +37,12 @@ class LimelightBaseIE(InfoExtractor):
 
         for stream in streams:
             stream_url = stream.get('url')
-            if not stream_url:
+            if not stream_url or stream.get('drmProtected'):
                 continue
-            if '.f4m' in stream_url:
+            ext = determine_ext(stream_url)
+            if ext == 'f4m':
                 formats.extend(self._extract_f4m_formats(
-                    stream_url, video_id, fatal=False))
+                    stream_url, video_id, f4m_id='hds', fatal=False))
             else:
                 fmt = {
                     'url': stream_url,
@@ -50,13 +51,19 @@ class LimelightBaseIE(InfoExtractor):
                     'fps': float_or_none(stream.get('videoFrameRate')),
                     'width': int_or_none(stream.get('videoWidthInPixels')),
                     'height': int_or_none(stream.get('videoHeightInPixels')),
-                    'ext': determine_ext(stream_url)
+                    'ext': ext,
                 }
-                rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
+                rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
                 if rtmp:
                     format_id = 'rtmp'
                     if stream.get('videoBitRate'):
                         format_id += '-%d' % int_or_none(stream['videoBitRate'])
+                    http_fmt = fmt.copy()
+                    http_fmt.update({
+                        'url': 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]),
+                        'format_id': format_id.replace('rtmp', 'http'),
+                    })
+                    formats.append(http_fmt)
                     fmt.update({
                         'url': rtmp.group('url'),
                         'play_path': rtmp.group('playpath'),
@@ -68,18 +75,23 @@ class LimelightBaseIE(InfoExtractor):
 
         for mobile_url in mobile_urls:
             media_url = mobile_url.get('mobileUrl')
-            if not media_url:
-                continue
             format_id = mobile_url.get('targetMediaPlatform')
-            if determine_ext(media_url) == 'm3u8':
+            if not media_url or format_id == 'Widevine':
+                continue
+            ext = determine_ext(media_url)
+            if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
                     media_url, video_id, 'mp4', 'm3u8_native',
                     m3u8_id=format_id, fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    stream_url, video_id, f4m_id=format_id, fatal=False))
             else:
                 formats.append({
                     'url': media_url,
                     'format_id': format_id,
                     'preference': -1,
+                    'ext': ext,
                 })
 
         self._sort_formats(formats)
@@ -145,7 +157,7 @@ class LimelightMediaIE(LimelightBaseIE):
         'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86',
         'info_dict': {
             'id': '3ffd040b522b4485b6d84effc750cd86',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'HaP and the HB Prince Trailer',
             'description': 'md5:8005b944181778e313d95c1237ddb640',
             'thumbnail': 're:^https?://.*\.jpeg$',
@@ -154,27 +166,23 @@ class LimelightMediaIE(LimelightBaseIE):
             'upload_date': '20090604',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         },
     }, {
         # video with subtitles
         'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335',
+        'md5': '2fa3bad9ac321e23860ca23bc2c69e3d',
         'info_dict': {
             'id': 'a3e00274d4564ec4a9b29b9466432335',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': '3Play Media Overview Video',
-            'description': '',
             'thumbnail': 're:^https?://.*\.jpeg$',
             'duration': 78.101,
             'timestamp': 1338929955,
             'upload_date': '20120605',
             'subtitles': 'mincount:9',
         },
-        'params': {
-            # rtmp download
-            'skip_download': True,
-        },
     }, {
         'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
         'only_matching': True,
index 2d50400326215e91ea0efc6130ec172437d7439c..a98c4c530ec4d62a18b437d5c17275daec5c3dfb 100644 (file)
@@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE):
 
     _TESTS = [{
         'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html',
-        'md5': 'ecfc6862da89489161fb9cd5f5a6fac1',
+        # md5 is unstable
         'info_dict': {
             'id': '114408',
             'ext': 'mp4',
index d5945ad66b3a784263fb1c5106534081b1f04913..39d2742c89282c2773ee1aca44ca14f047393bc5 100644 (file)
@@ -1,8 +1,6 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 
 
@@ -23,34 +21,5 @@ class M6IE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id,
-                                 'Downloading video RSS')
-
-        title = rss.find('./channel/item/title').text
-        description = rss.find('./channel/item/description').text
-        thumbnail = rss.find('./channel/item/visuel_clip_big').text
-        duration = int(rss.find('./channel/item/duration').text)
-        view_count = int(rss.find('./channel/item/nombre_vues').text)
-
-        formats = []
-        for format_id in ['lq', 'sd', 'hq', 'hd']:
-            video_url = rss.find('./channel/item/url_video_%s' % format_id)
-            if video_url is None:
-                continue
-            formats.append({
-                'url': video_url.text,
-                'format_id': format_id,
-            })
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'view_count': view_count,
-            'formats': formats,
-        }
+        video_id = self._match_id(url)
+        return self.url_result('6play:%s' % video_id, 'SixPlay', video_id)
diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py
new file mode 100644 (file)
index 0000000..cdb46e1
--- /dev/null
@@ -0,0 +1,73 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .pladform import PladformIE
+from ..utils import (
+    unescapeHTML,
+    int_or_none,
+    ExtractorError,
+)
+
+
+class METAIE(InfoExtractor):
+    _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://video.meta.ua/5502115.video',
+        'md5': '71b6f3ee274bef16f1ab410f7f56b476',
+        'info_dict': {
+            'id': '5502115',
+            'ext': 'mp4',
+            'title': 'Sony Xperia Z camera test [HQ]',
+            'description': 'Xperia Z shoots video in FullHD HDR.',
+            'uploader_id': 'nomobile',
+            'uploader': 'CHЁZA.TV',
+            'upload_date': '20130211',
+        },
+        'add_ie': ['Youtube'],
+    }, {
+        'url': 'http://video.meta.ua/iframe/5502115',
+        'only_matching': True,
+    }, {
+        # pladform embed
+        'url': 'http://video.meta.ua/7121015.video',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        st_html5 = self._search_regex(
+            r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None)
+
+        if st_html5:
+            # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js
+            json_str = ''
+            for i in range(0, len(st_html5), 3):
+                json_str += '&#x0%s;' % st_html5[i:i + 3]
+            uppod_data = self._parse_json(unescapeHTML(json_str), video_id)
+            error = uppod_data.get('customnotfound')
+            if error:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+            video_url = uppod_data['file']
+            info = {
+                'id': video_id,
+                'url': video_url,
+                'title': uppod_data.get('comment') or self._og_search_title(webpage),
+                'description': self._og_search_description(webpage, default=None),
+                'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage),
+                'duration': int_or_none(self._og_search_property(
+                    'video:duration', webpage, default=None)),
+            }
+            if 'youtube.com/' in video_url:
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': 'Youtube',
+                })
+            return info
+
+        pladform_url = PladformIE._extract_url(webpage)
+        if pladform_url:
+            return self.url_result(pladform_url)
index b6f00cc25ff9c2769176e551b33bd5901c121f64..e6e7659a1de0ebe86f48a4128192de5d14d6d586 100644 (file)
@@ -11,13 +11,14 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
-    sanitized_Request,
     urlencode_postdata,
+    get_element_by_attribute,
+    mimetype2ext,
 )
 
 
 class MetacafeIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
+    _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
     IE_NAME = 'metacafe'
@@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor):
                 'uploader': 'ign',
                 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
             },
+            'skip': 'Page is temporarily unavailable.',
         },
         # AnyClip video
         {
@@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor):
                 'id': 'an-dVVXnuY7Jh77J',
                 'ext': 'mp4',
                 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
-                'uploader': 'anyclip',
-                'description': 'md5:38c711dd98f5bb87acf973d573442e67',
+                'uploader': 'AnyClip',
+                'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
             },
         },
         # age-restricted video
@@ -110,28 +112,25 @@ class MetacafeIE(InfoExtractor):
     def report_disclaimer(self):
         self.to_screen('Retrieving disclaimer')
 
-    def _real_initialize(self):
+    def _confirm_age(self):
         # Retrieve disclaimer
         self.report_disclaimer()
         self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
 
         # Confirm age
-        disclaimer_form = {
-            'filters': '0',
-            'submit': "Continue - I'm over 18",
-        }
-        request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
         self.report_age_confirmation()
-        self._download_webpage(request, None, False, 'Unable to confirm age')
+        self._download_webpage(
+            self._FILTER_POST, None, False, 'Unable to confirm age',
+            data=urlencode_postdata({
+                'filters': '0',
+                'submit': "Continue - I'm over 18",
+            }), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+            })
 
     def _real_extract(self, url):
         # Extract id and simplified title from URL
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-
-        video_id = mobj.group(1)
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
 
         # the video may come from an external site
         m_external = re.match('^(\w{2})-(.*)$', video_id)
@@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor):
             if prefix == 'cb':
                 return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
 
-        # Retrieve video webpage to extract further information
-        req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id)
+        # self._confirm_age()
 
         # AnyClip videos require the flashversion cookie so that we get the link
         # to the mp4 file
-        mobj_an = re.match(r'^an-(.*?)$', video_id)
-        if mobj_an:
-            req.headers['Cookie'] = 'flashVersion=0;'
-        webpage = self._download_webpage(req, video_id)
+        headers = {}
+        if video_id.startswith('an-'):
+            headers['Cookie'] = 'flashVersion=0;'
+
+        # Retrieve video webpage to extract further information
+        webpage = self._download_webpage(url, video_id, headers=headers)
+
+        error = get_element_by_attribute(
+            'class', 'notfound-page-title', webpage)
+        if error:
+            raise ExtractorError(error, expected=True)
+
+        video_title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
 
         # Extract URL, uploader and title from webpage
         self.report_extraction(video_id)
@@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor):
                         'player_url': player_url,
                         'ext': play_path.partition(':')[0],
                     })
+        if video_url is None:
+            flashvars = self._parse_json(self._search_regex(
+                r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
+                default=None), video_id, fatal=False)
+            if flashvars:
+                video_url = []
+                for source in flashvars.get('sources'):
+                    source_url = source.get('src')
+                    if not source_url:
+                        continue
+                    ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                    if ext == 'm3u8':
+                        video_url.extend(self._extract_m3u8_formats(
+                            source_url, video_id, 'mp4',
+                            'm3u8_native', m3u8_id='hls', fatal=False))
+                    else:
+                        video_url.append({
+                            'url': source_url,
+                            'ext': ext,
+                        })
 
         if video_url is None:
             raise ExtractorError('Unsupported video type')
 
-        video_title = self._html_search_regex(
-            r'(?im)<title>(.*) - Video</title>', webpage, 'title')
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_meta(
+            ['og:description', 'twitter:description', 'description'],
+            webpage, 'title', fatal=False)
+        thumbnail = self._html_search_meta(
+            ['og:image', 'twitter:image'], webpage, 'title', fatal=False)
         video_uploader = self._html_search_regex(
             r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
             webpage, 'uploader nickname', fatal=False)
         duration = int_or_none(
-            self._html_search_meta('video:duration', webpage))
-
+            self._html_search_meta('video:duration', webpage, default=None))
         age_limit = (
             18
             if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
@@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor):
                 'url': video_url,
                 'ext': video_ext,
             }]
-
         self._sort_formats(formats)
+
         return {
             'id': video_id,
+            'display_id': display_id,
             'description': description,
             'uploader': video_uploader,
             'title': video_title,
index 9fbc74f5d52e819ca3213623a30b6d9cd2e6ec08..27bdff8b25cb63e628981ad3a8dbcc7f76db7653 100644 (file)
@@ -9,7 +9,7 @@ class MGTVIE(InfoExtractor):
     _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P<id>\d+)\.html'
     IE_DESC = '芒果TV'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
         'md5': '1bdadcf760a0b90946ca68ee9a2db41a',
         'info_dict': {
@@ -20,13 +20,18 @@ class MGTVIE(InfoExtractor):
             'duration': 7461,
             'thumbnail': 're:^https?://.*\.jpg$',
         },
-    }
+    }, {
+        # no tbr extracted from stream_url
+        'url': 'http://www.mgtv.com/v/1/1/f/3324755.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
         api_data = self._download_json(
             'http://v.api.mgtv.com/player/video', video_id,
-            query={'video_id': video_id})['data']
+            query={'video_id': video_id},
+            headers=self.geo_verification_headers())['data']
         info = api_data['info']
 
         formats = []
@@ -40,7 +45,8 @@ class MGTVIE(InfoExtractor):
             def extract_format(stream_url, format_id, idx, query={}):
                 format_info = self._download_json(
                     stream_url, video_id,
-                    note='Download video info for format %s' % format_id or '#%d' % idx, query=query)
+                    note='Download video info for format %s' % (format_id or '#%d' % idx),
+                    query=query)
                 return {
                     'format_id': format_id,
                     'url': format_info['info'],
index 170ebd9eb9e285f91e4b8bd85c05b13f745039a8..937ba0f28bc41799f6489f4e246c9fac8a7c86e2 100644 (file)
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import random
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     xpath_text,
     int_or_none,
@@ -18,13 +19,16 @@ class MioMioIE(InfoExtractor):
     _TESTS = [{
         # "type=video" in flashvars
         'url': 'http://www.miomio.tv/watch/cc88912/',
-        'md5': '317a5f7f6b544ce8419b784ca8edae65',
         'info_dict': {
             'id': '88912',
             'ext': 'flv',
             'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕',
             'duration': 5923,
         },
+        'params': {
+            # The server provides broken file
+            'skip_download': True,
+        }
     }, {
         'url': 'http://www.miomio.tv/watch/cc184024/',
         'info_dict': {
@@ -32,7 +36,7 @@ class MioMioIE(InfoExtractor):
             'title': '《动漫同人插画绘制》',
         },
         'playlist_mincount': 86,
-        'skip': 'This video takes time too long for retrieving the URL',
+        'skip': 'Unable to load videos',
     }, {
         'url': 'http://www.miomio.tv/watch/cc173113/',
         'info_dict': {
@@ -40,20 +44,23 @@ class MioMioIE(InfoExtractor):
             'title': 'The New Macbook 2015 上手试玩与简评'
         },
         'playlist_mincount': 2,
+        'skip': 'Unable to load videos',
+    }, {
+        # new 'h5' player
+        'url': 'http://www.miomio.tv/watch/cc273295/',
+        'md5': '',
+        'info_dict': {
+            'id': '273295',
+            'ext': 'mp4',
+            'title': 'アウト×デラックス 20160526',
+        },
+        'params': {
+            # intermittent HTTP 500
+            'skip_download': True,
+        },
     }]
 
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        title = self._html_search_meta(
-            'description', webpage, 'title', fatal=True)
-
-        mioplayer_path = self._search_regex(
-            r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path')
-
-        http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
-
+    def _extract_mioplayer(self, webpage, video_id, title, http_headers):
         xml_config = self._search_regex(
             r'flashvars="type=(?:sina|video)&amp;(.+?)&amp;',
             webpage, 'xml config')
@@ -92,10 +99,34 @@ class MioMioIE(InfoExtractor):
                 'http_headers': http_headers,
             })
 
+        return entries
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._html_search_meta(
+            'description', webpage, 'title', fatal=True)
+
+        mioplayer_path = self._search_regex(
+            r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path')
+
+        if '_h5' in mioplayer_path:
+            player_url = compat_urlparse.urljoin(url, mioplayer_path)
+            player_webpage = self._download_webpage(
+                player_url, video_id,
+                note='Downloading player webpage', headers={'Referer': url})
+            entries = self._parse_html5_media_entries(player_url, player_webpage)
+            http_headers = {'Referer': player_url}
+        else:
+            http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path}
+            entries = self._extract_mioplayer(webpage, video_id, title, http_headers)
+
         if len(entries) == 1:
             segment = entries[0]
             segment['id'] = video_id
             segment['title'] = title
+            segment['http_headers'] = http_headers
             return segment
 
         return {
index 5a00cd3975b46fbd0f191ecd8e82fb09ea78490a..cd169f3616729871fcad9e6c619e67f392f73312 100644 (file)
@@ -12,12 +12,69 @@ from ..utils import (
     get_element_by_attribute,
     int_or_none,
     remove_start,
+    extract_attributes,
+    determine_ext,
 )
 
 
-class MiTeleIE(InfoExtractor):
+class MiTeleBaseIE(InfoExtractor):
+    def _get_player_info(self, url, webpage):
+        player_data = extract_attributes(self._search_regex(
+            r'(?s)(<ms-video-player.+?</ms-video-player>)',
+            webpage, 'ms video player'))
+        video_id = player_data['data-media-id']
+        config_url = compat_urlparse.urljoin(url, player_data['data-config'])
+        config = self._download_json(
+            config_url, video_id, 'Downloading config JSON')
+        mmc_url = config['services']['mmc']
+
+        duration = None
+        formats = []
+        for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
+            mmc = self._download_json(
+                m_url, video_id, 'Downloading mmc JSON')
+            if not duration:
+                duration = int_or_none(mmc.get('duration'))
+            for location in mmc['locations']:
+                gat = self._proto_relative_url(location.get('gat'), 'http:')
+                bas = location.get('bas')
+                loc = location.get('loc')
+                ogn = location.get('ogn')
+                if None in (gat, bas, loc, ogn):
+                    continue
+                token_data = {
+                    'bas': bas,
+                    'icd': loc,
+                    'ogn': ogn,
+                    'sta': '0',
+                }
+                media = self._download_json(
+                    '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
+                    video_id, 'Downloading %s JSON' % location['loc'])
+                file_ = media.get('file')
+                if not file_:
+                    continue
+                ext = determine_ext(file_)
+                if ext == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+                        video_id, f4m_id='hds', fatal=False))
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
+            'duration': duration,
+        }
+
+
+class MiTeleIE(MiTeleBaseIE):
     IE_DESC = 'mitele.es'
-    _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'
+    _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/'
 
     _TESTS = [{
         'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
@@ -25,7 +82,7 @@ class MiTeleIE(InfoExtractor):
         'info_dict': {
             'id': '0NF1jJnxS1Wu3pHrmvFyw2',
             'display_id': 'programa-144',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Tor, la web invisible',
             'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
             'series': 'Diario de',
@@ -40,7 +97,7 @@ class MiTeleIE(InfoExtractor):
         'info_dict': {
             'id': 'eLZSwoEd1S3pVyUm8lc6F',
             'display_id': 'programa-226',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
             'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
             'series': 'Cuarto Milenio',
@@ -59,40 +116,7 @@ class MiTeleIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
-        config_url = self._search_regex(
-            r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
-        config_url = compat_urlparse.urljoin(url, config_url)
-
-        config = self._download_json(
-            config_url, display_id, 'Downloading config JSON')
-
-        mmc = self._download_json(
-            config['services']['mmc'], display_id, 'Downloading mmc JSON')
-
-        formats = []
-        for location in mmc['locations']:
-            gat = self._proto_relative_url(location.get('gat'), 'http:')
-            bas = location.get('bas')
-            loc = location.get('loc')
-            ogn = location.get('ogn')
-            if None in (gat, bas, loc, ogn):
-                continue
-            token_data = {
-                'bas': bas,
-                'icd': loc,
-                'ogn': ogn,
-                'sta': '0',
-            }
-            media = self._download_json(
-                '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
-                display_id, 'Downloading %s JSON' % location['loc'])
-            file_ = media.get('file')
-            if not file_:
-                continue
-            formats.extend(self._extract_f4m_formats(
-                file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
-                display_id, f4m_id=loc))
-        self._sort_formats(formats)
+        info = self._get_player_info(url, webpage)
 
         title = self._search_regex(
             r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
@@ -112,21 +136,12 @@ class MiTeleIE(InfoExtractor):
                 title = remove_start(self._search_regex(
                     r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')
 
-        video_id = self._search_regex(
-            r'data-media-id\s*=\s*"([^"]+)"', webpage,
-            'data media id', default=None) or display_id
-        thumbnail = config.get('poster', {}).get('imageUrl')
-        duration = int_or_none(mmc.get('duration'))
-
-        return {
-            'id': video_id,
+        info.update({
             'display_id': display_id,
             'title': title,
             'description': get_element_by_attribute('class', 'text', webpage),
             'series': series,
             'season': season,
             'episode': episode,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'formats': formats,
-        }
+        })
+        return info
index e47c8011924cb0f5ecddefd33b35debd0324d5a9..e3bbe5aa8997694f62a07d8a2e0c383aa64daae1 100644 (file)
@@ -1,53 +1,56 @@
 from __future__ import unicode_literals
 
-import os
-import re
-
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_unquote,
-    compat_urllib_parse_urlparse,
+from ..utils import (
+    int_or_none,
+    str_to_int,
+    unified_strdate,
 )
-from ..utils import sanitized_Request
+from .keezmovies import KeezMoviesIE
 
 
-class MofosexIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<id>[0-9]+)/.*?\.html)'
-    _TEST = {
-        'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
-        'md5': '1b2eb47ac33cc75d4a80e3026b613c5a',
+class MofosexIE(KeezMoviesIE):
+    _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html'
+    _TESTS = [{
+        'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html',
+        'md5': '39a15853632b7b2e5679f92f69b78e91',
         'info_dict': {
-            'id': '5018',
+            'id': '318131',
+            'display_id': 'amateur-teen-playing-and-masturbating-318131',
             'ext': 'mp4',
-            'title': 'Japanese Teen Music Video',
+            'title': 'amateur teen playing and masturbating',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20121114',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
             'age_limit': 18,
         }
-    }
+    }, {
+        # This video is no longer available
+        'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        url = 'http://www.' + mobj.group('url')
-
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'age_verified=1')
-        webpage = self._download_webpage(req, video_id)
-
-        video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title')
-        video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url'))
-        path = compat_urllib_parse_urlparse(video_url).path
-        extension = os.path.splitext(path)[1][1:]
-        format = path.split('/')[5].split('_')[:2]
-        format = '-'.join(format)
-
-        age_limit = self._rta_search(webpage)
-
-        return {
-            'id': video_id,
-            'title': video_title,
-            'url': video_url,
-            'ext': extension,
-            'format': format,
-            'format_id': format,
-            'age_limit': age_limit,
-        }
+        webpage, info = self._extract_info(url)
+
+        view_count = str_to_int(self._search_regex(
+            r'VIEWS:</span>\s*([\d,.]+)', webpage, 'view count', fatal=False))
+        like_count = int_or_none(self._search_regex(
+            r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage,
+            'like count', fatal=False))
+        dislike_count = int_or_none(self._search_regex(
+            r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage,
+            'like count', fatal=False))
+        upload_date = unified_strdate(self._html_search_regex(
+            r'Added:</span>([^<]+)', webpage, 'upload date', fatal=False))
+
+        info.update({
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'upload_date': upload_date,
+            'thumbnail': self._og_search_thumbnail(webpage),
+        })
+
+        return info
diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py
new file mode 100644 (file)
index 0000000..1ec8e0f
--- /dev/null
@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
+    unescapeHTML,
+)
+
+
+class MSNIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
+        'md5': '8442f66c116cbab1ff7098f986983458',
+        'info_dict': {
+            'id': 'BBqQYNE',
+            'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message',
+            'ext': 'mp4',
+            'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
+            'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
+            'duration': 104,
+            'uploader': 'CBS Entertainment',
+            'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v',
+        },
+    }, {
+        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH',
+        'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        video = self._parse_json(
+            self._search_regex(
+                r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
+                webpage, 'video data', default='{}', group='data'),
+            display_id, transform_source=unescapeHTML)
+
+        if not video:
+            error = unescapeHTML(self._search_regex(
+                r'data-error=(["\'])(?P<error>.+?)\1',
+                webpage, 'error', group='error'))
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+        title = video['title']
+
+        formats = []
+        for file_ in video.get('videoFiles', []):
+            format_url = file_.get('url')
+            if not format_url:
+                continue
+            ext = determine_ext(format_url)
+            # .ism is not yet supported (see
+            # https://github.com/rg3/youtube-dl/issues/8118)
+            if ext == 'ism':
+                continue
+            if 'm3u8' in format_url:
+                # m3u8_native should not be used here until
+                # https://github.com/rg3/youtube-dl/issues/9913 is fixed
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, display_id, 'mp4',
+                    m3u8_id='hls', fatal=False)
+                # Despite metadata in m3u8 all video+audio formats are
+                # actually video-only (no audio)
+                for f in m3u8_formats:
+                    if f.get('acodec') != 'none' and f.get('vcodec') != 'none':
+                        f['acodec'] = 'none'
+                formats.extend(m3u8_formats)
+            else:
+                formats.append({
+                    'url': format_url,
+                    'ext': 'mp4',
+                    'format_id': 'http',
+                    'width': int_or_none(file_.get('width')),
+                    'height': int_or_none(file_.get('height')),
+                })
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for file_ in video.get('files', []):
+            format_url = file_.get('url')
+            format_code = file_.get('formatCode')
+            if not format_url or not format_code:
+                continue
+            if compat_str(format_code) == '3100':
+                subtitles.setdefault(file_.get('culture', 'en'), []).append({
+                    'ext': determine_ext(format_url, 'ttml'),
+                    'url': format_url,
+                })
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': video.get('headlineImage', {}).get('url'),
+            'duration': int_or_none(video.get('durationSecs')),
+            'uploader': video.get('sourceFriendly'),
+            'uploader_id': video.get('providerId'),
+            'creator': video.get('creator'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
index dd06395891ac02ce81b40efd471776cb080d328d..2f455680ebba41074513f7dd33b76e5c269cb142 100644 (file)
@@ -15,6 +15,8 @@ from ..utils import (
     float_or_none,
     HEADRequest,
     sanitized_Request,
+    strip_or_none,
+    timeconvert,
     unescapeHTML,
     url_basename,
     RegexNotFoundError,
@@ -35,13 +37,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
         return uri.split(':')[-1]
 
     # This was originally implemented for ComedyCentral, but it also works here
-    @staticmethod
-    def _transform_rtmp_url(rtmp_video_url):
+    @classmethod
+    def _transform_rtmp_url(cls, rtmp_video_url):
         m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
         if not m:
-            return rtmp_video_url
+            return {'rtmp': rtmp_video_url}
         base = 'http://viacommtvstrmfs.fplive.net/'
-        return base + m.group('finalid')
+        return {'http': base + m.group('finalid')}
 
     def _get_feed_url(self, uri):
         return self._FEED_URL
@@ -85,14 +87,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
                 rtmp_video_url = rendition.find('./src').text
                 if rtmp_video_url.endswith('siteunavail.png'):
                     continue
-                new_url = self._transform_rtmp_url(rtmp_video_url)
-                formats.append({
+                new_urls = self._transform_rtmp_url(rtmp_video_url)
+                formats.extend([{
                     'ext': 'flv' if new_url.startswith('rtmp') else ext,
                     'url': new_url,
-                    'format_id': rendition.get('bitrate'),
+                    'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])),
                     'width': int(rendition.get('width')),
                     'height': int(rendition.get('height')),
-                })
+                } for kind, new_url in new_urls.items()])
             except (KeyError, TypeError):
                 raise ExtractorError('Invalid rendition field.')
         self._sort_formats(formats)
@@ -133,7 +135,9 @@ class MTVServicesInfoExtractor(InfoExtractor):
             message += item.text
             raise ExtractorError(message, expected=True)
 
-        description = xpath_text(itemdoc, 'description')
+        description = strip_or_none(xpath_text(itemdoc, 'description'))
+
+        timestamp = timeconvert(xpath_text(itemdoc, 'pubDate'))
 
         title_el = None
         if title_el is None:
@@ -167,6 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
             'thumbnail': self._get_thumbnail_url(uri, itemdoc),
             'description': description,
             'duration': float_or_none(content_el.attrib.get('duration')),
+            'timestamp': timestamp,
         }
 
     def _get_feed_query(self, uri):
@@ -185,8 +190,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
         idoc = self._download_xml(
             url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
+
+        title = xpath_text(idoc, './channel/title')
+        description = xpath_text(idoc, './channel/description')
+
         return self.playlist_result(
-            [self._get_video_info(item) for item in idoc.findall('.//item')])
+            [self._get_video_info(item) for item in idoc.findall('.//item')],
+            playlist_title=title, playlist_description=description)
 
     def _extract_mgid(self, webpage):
         try:
@@ -232,6 +242,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
             'ext': 'mp4',
             'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds',
             'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.',
+            'timestamp': 1400126400,
+            'upload_date': '20140515',
         },
     }
 
@@ -274,6 +286,8 @@ class MTVIE(MTVServicesInfoExtractor):
                 'ext': 'mp4',
                 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"',
                 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+                'timestamp': 1352610000,
+                'upload_date': '20121111',
             },
         },
     ]
@@ -300,20 +314,6 @@ class MTVIE(MTVServicesInfoExtractor):
         return self._get_videos_info(uri)
 
 
-class MTVIggyIE(MTVServicesInfoExtractor):
-    IE_NAME = 'mtviggy.com'
-    _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+'
-    _TEST = {
-        'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/',
-        'info_dict': {
-            'id': '984696',
-            'ext': 'mp4',
-            'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet',
-        }
-    }
-    _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/'
-
-
 class MTVDEIE(MTVServicesInfoExtractor):
     IE_NAME = 'mtv.de'
     _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$'
@@ -321,7 +321,7 @@ class MTVDEIE(MTVServicesInfoExtractor):
         'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
         'info_dict': {
             'id': 'music_video-a50bc5f0b3aa4b3190aa',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'MusicVideo_cro-traum',
             'description': 'Cro - Traum',
         },
@@ -329,20 +329,21 @@ class MTVDEIE(MTVServicesInfoExtractor):
             # rtmp download
             'skip_download': True,
         },
+        'skip': 'Blocked at Travis CI',
     }, {
         # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
         'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
         'info_dict': {
             'id': 'local_playlist-f5ae778b9832cc837189',
-            'ext': 'mp4',
+            'ext': 'flv',
             'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
         },
         'params': {
             # rtmp download
             'skip_download': True,
         },
+        'skip': 'Blocked at Travis CI',
     }, {
-        # single video in pagePlaylist with different id
         'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
         'info_dict': {
             'id': 'local_playlist-4e760566473c4c8c5344',
@@ -354,6 +355,7 @@ class MTVDEIE(MTVServicesInfoExtractor):
             # rtmp download
             'skip_download': True,
         },
+        'skip': 'Das Video kann zur Zeit nicht abgespielt werden.',
     }]
 
     def _real_extract(self, url):
@@ -366,11 +368,14 @@ class MTVDEIE(MTVServicesInfoExtractor):
                 r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'),
             video_id)
 
+        def _mrss_url(item):
+            return item['mrss'] + item.get('mrssvars', '')
+
         # news pages contain single video in playlist with different id
         if len(playlist) == 1:
-            return self._get_videos_info_from_url(playlist[0]['mrss'], video_id)
+            return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id)
 
         for item in playlist:
             item_id = item.get('id')
             if item_id and compat_str(item_id) == video_id:
-                return self._get_videos_info_from_url(item['mrss'], video_id)
+                return self._get_videos_info_from_url(_mrss_url(item), video_id)
index b4e8ad17e9003940e5753349e30b4857c4c8aa6a..d9f17613633d245283f5f5745acca2feb273cbf5 100644 (file)
@@ -36,7 +36,7 @@ class MuenchenTVIE(InfoExtractor):
         title = self._live_title(self._og_search_title(webpage))
 
         data_js = self._search_regex(
-            r'(?s)\nplaylist:\s*(\[.*?}\]),related:',
+            r'(?s)\nplaylist:\s*(\[.*?}\]),',
             webpage, 'playlist configuration')
         data_json = js_to_json(data_js)
         data = json.loads(data_json)[0]
index 72251866303885f6cd9b040ec9ad3f042d8add6a..1dcf27afef331ceb3b09d6ade66cde65e92e3cd6 100644 (file)
@@ -1,15 +1,19 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from .adobepass import AdobePassIE
 from ..utils import (
     smuggle_url,
     url_basename,
     update_url_query,
+    get_element_by_class,
 )
 
 
-class NationalGeographicIE(InfoExtractor):
-    IE_NAME = 'natgeo'
+class NationalGeographicVideoIE(InfoExtractor):
+    IE_NAME = 'natgeo:video'
     _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?'
 
     _TESTS = [
@@ -61,16 +65,16 @@ class NationalGeographicIE(InfoExtractor):
         }
 
 
-class NationalGeographicChannelIE(InfoExtractor):
-    IE_NAME = 'natgeo:channel'
-    _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)'
+class NationalGeographicIE(AdobePassIE):
+    IE_NAME = 'natgeo'
+    _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P<id>[^/?]+)'
 
     _TESTS = [
         {
             'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/',
             'md5': '518c9aa655686cf81493af5cc21e2a04',
             'info_dict': {
-                'id': 'nB5vIAfmyllm',
+                'id': 'vKInpacll2pC',
                 'ext': 'mp4',
                 'title': 'Uncovering a Universal Knowledge',
                 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a',
@@ -84,7 +88,7 @@ class NationalGeographicChannelIE(InfoExtractor):
             'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/',
             'md5': 'c4912f656b4cbe58f3e000c489360989',
             'info_dict': {
-                'id': '3TmMv9OvGwIR',
+                'id': 'Pok5lWCkiEFA',
                 'ext': 'mp4',
                 'title': 'The Stunning Red Bird of Paradise',
                 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c',
@@ -94,6 +98,10 @@ class NationalGeographicChannelIE(InfoExtractor):
             },
             'add_ie': ['ThePlatform'],
         },
+        {
+            'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
@@ -102,12 +110,59 @@ class NationalGeographicChannelIE(InfoExtractor):
         release_url = self._search_regex(
             r'video_auth_playlist_url\s*=\s*"([^"]+)"',
             webpage, 'release url')
+        query = {
+            'mbr': 'true',
+            'switch': 'http',
+        }
+        is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False)
+        if is_auth == 'auth':
+            auth_resource_id = self._search_regex(
+                r"video_auth_resourceId\s*=\s*'([^']+)'",
+                webpage, 'auth resource id')
+            query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id)
 
         return {
             '_type': 'url_transparent',
             'ie_key': 'ThePlatform',
             'url': smuggle_url(
-                update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}),
+                update_url_query(release_url, query),
                 {'force_smil_url': True}),
             'display_id': display_id,
         }
+
+
+class NationalGeographicEpisodeGuideIE(InfoExtractor):
+    IE_NAME = 'natgeo:episodeguide'
+    _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P<id>[^/]+)/episode-guide'
+    _TESTS = [
+        {
+            'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/',
+            'info_dict': {
+                'id': 'the-story-of-god-with-morgan-freeman-season-1',
+                'title': 'The Story of God with Morgan Freeman - Season 1',
+            },
+            'playlist_mincount': 6,
+        },
+        {
+            'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2',
+            'info_dict': {
+                'id': 'underworld-inc-season-2',
+                'title': 'Underworld, Inc. - Season 2',
+            },
+            'playlist_mincount': 7,
+        },
+    ]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        show = get_element_by_class('show', webpage)
+        selected_season = self._search_regex(
+            r'<div[^>]+class="select-seasons[^"]*".*?<a[^>]*>(.*?)</a>',
+            webpage, 'selected season')
+        entries = [
+            self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic')
+            for entry_url in re.findall('(?s)<div[^>]+class="col-inner"[^>]*?>.*?<a[^>]+href="([^"]+)"', webpage)]
+        return self.playlist_result(
+            entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')),
+            '%s - %s' % (show, selected_season))
index 6d6f69b440a4b91d95c42210b2e597aca99144f6..0891d2772cd53f9c686fed9e9cd50c77ddc80bb1 100644 (file)
@@ -4,12 +4,10 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_urlencode,
-    compat_urlparse,
-)
 from ..utils import (
     ExtractorError,
+    int_or_none,
+    update_url_query,
 )
 
 
@@ -51,48 +49,74 @@ class NaverIE(InfoExtractor):
             if error:
                 raise ExtractorError(error, expected=True)
             raise ExtractorError('couldn\'t extract vid and key')
-        vid = m_id.group(1)
-        key = m_id.group(2)
-        query = compat_urllib_parse_urlencode({'vid': vid, 'inKey': key, })
-        query_urls = compat_urllib_parse_urlencode({
-            'masterVid': vid,
-            'protocol': 'p2p',
-            'inKey': key,
-        })
-        info = self._download_xml(
-            'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
-            video_id, 'Downloading video info')
-        urls = self._download_xml(
-            'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
-            video_id, 'Downloading video formats info')
-
+        video_data = self._download_json(
+            'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1),
+            video_id, query={
+                'key': m_id.group(2),
+            })
+        meta = video_data['meta']
+        title = meta['subject']
         formats = []
-        for format_el in urls.findall('EncodingOptions/EncodingOption'):
-            domain = format_el.find('Domain').text
-            uri = format_el.find('uri').text
-            f = {
-                'url': compat_urlparse.urljoin(domain, uri),
-                'ext': 'mp4',
-                'width': int(format_el.find('width').text),
-                'height': int(format_el.find('height').text),
-            }
-            if domain.startswith('rtmp'):
-                # urlparse does not support custom schemes
-                # https://bugs.python.org/issue18828
-                f.update({
-                    'url': domain + uri,
-                    'ext': 'flv',
-                    'rtmp_protocol': '1',  # rtmpt
+
+        def extract_formats(streams, stream_type, query={}):
+            for stream in streams:
+                stream_url = stream.get('source')
+                if not stream_url:
+                    continue
+                stream_url = update_url_query(stream_url, query)
+                encoding_option = stream.get('encodingOption', {})
+                bitrate = stream.get('bitrate', {})
+                formats.append({
+                    'format_id': '%s_%s' % (stream.get('type') or stream_type, encoding_option.get('id') or encoding_option.get('name')),
+                    'url': stream_url,
+                    'width': int_or_none(encoding_option.get('width')),
+                    'height': int_or_none(encoding_option.get('height')),
+                    'vbr': int_or_none(bitrate.get('video')),
+                    'abr': int_or_none(bitrate.get('audio')),
+                    'filesize': int_or_none(stream.get('size')),
+                    'protocol': 'm3u8_native' if stream_type == 'HLS' else None,
                 })
-            formats.append(f)
+
+        extract_formats(video_data.get('videos', {}).get('list', []), 'H264')
+        for stream_set in video_data.get('streams', []):
+            query = {}
+            for param in stream_set.get('keys', []):
+                query[param['name']] = param['value']
+            stream_type = stream_set.get('type')
+            videos = stream_set.get('videos')
+            if videos:
+                extract_formats(videos, stream_type, query)
+            elif stream_type == 'HLS':
+                stream_url = stream_set.get('source')
+                if not stream_url:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    update_url_query(stream_url, query), video_id,
+                    'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False))
         self._sort_formats(formats)
 
+        subtitles = {}
+        for caption in video_data.get('captions', {}).get('list', []):
+            caption_url = caption.get('source')
+            if not caption_url:
+                continue
+            subtitles.setdefault(caption.get('language') or caption.get('locale'), []).append({
+                'url': caption_url,
+            })
+
+        upload_date = self._search_regex(
+            r'<span[^>]+class="date".*?(\d{4}\.\d{2}\.\d{2})',
+            webpage, 'upload date', fatal=False)
+        if upload_date:
+            upload_date = upload_date.replace('.', '')
+
         return {
             'id': video_id,
-            'title': info.find('Subject').text,
+            'title': title,
             'formats': formats,
+            'subtitles': subtitles,
             'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'upload_date': info.find('WriteDate').text.replace('.', ''),
-            'view_count': int(info.find('PlayCount').text),
+            'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage),
+            'view_count': int_or_none(meta.get('count')),
+            'upload_date': upload_date,
         }
diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py
deleted file mode 100644 (file)
index 9ccd7d7..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .mtv import MTVServicesInfoExtractor
-from ..compat import compat_urllib_parse_urlencode
-
-
-class NextMovieIE(MTVServicesInfoExtractor):
-    IE_NAME = 'nextmovie.com'
-    _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)'
-    _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm'
-    _TESTS = [{
-        'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/',
-        'md5': '09a9199f2f11f10107d04fcb153218aa',
-        'info_dict': {
-            'id': '961726',
-            'ext': 'mp4',
-            'title': 'The Muppets\' Gravity',
-        },
-    }]
-
-    def _get_feed_query(self, uri):
-        return compat_urllib_parse_urlencode({
-            'feed': '1505',
-            'mgid': uri,
-        })
-
-    def _real_extract(self, url):
-        mgid = self._match_id(url)
-        return self._get_videos_info(mgid)
index e960137913ff76f6c63d4d1e650e7d95d3747d3a..9c54846e14a86634d3e6b20deb8e1adb5e46de55 100644 (file)
@@ -7,8 +7,9 @@ from ..utils import update_url_query
 
 
 class NickIE(MTVServicesInfoExtractor):
+    # None of videos on the website are still alive?
     IE_NAME = 'nick.com'
-    _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)'
+    _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
     _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
     _TESTS = [{
         'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
@@ -52,6 +53,9 @@ class NickIE(MTVServicesInfoExtractor):
                 }
             },
         ],
+    }, {
+        'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/',
+        'only_matching': True,
     }]
 
     def _get_feed_query(self, uri):
diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py
new file mode 100644 (file)
index 0000000..d889245
--- /dev/null
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_iso8601,
+    parse_duration,
+    ExtractorError
+)
+
+
+class NineCNineMediaIE(InfoExtractor):
+    _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)'
+
+    def _real_extract(self, url):
+        destination_code, video_id = re.match(self._VALID_URL, url).groups()
+        api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id)
+        content = self._download_json(api_base_url, video_id, query={
+            '$include': '[contentpackages]',
+        })
+        title = content['Name']
+        if len(content['ContentPackages']) > 1:
+            raise ExtractorError('multiple content packages')
+        content_package = content['ContentPackages'][0]
+        stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id']
+        stacks = self._download_json(stacks_base_url, video_id)['Items']
+        if len(stacks) > 1:
+            raise ExtractorError('multiple stacks')
+        stack = stacks[0]
+        stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id'])
+        formats = []
+        formats.extend(self._extract_m3u8_formats(
+            stack_base_url + 'm3u8', video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False))
+        formats.extend(self._extract_f4m_formats(
+            stack_base_url + 'f4m', video_id,
+            f4m_id='hds', fatal=False))
+        mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False)
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
+                'format_id': 'mp4',
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': content.get('Desc') or content.get('ShortDesc'),
+            'timestamp': parse_iso8601(content.get('BroadcastDateTime')),
+            'duration': parse_duration(content.get('BroadcastTime')),
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py
new file mode 100644 (file)
index 0000000..faa5772
--- /dev/null
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    float_or_none,
+    ExtractorError,
+)
+
+
+class NineNowIE(InfoExtractor):
+    IE_NAME = '9now.com.au'
+    _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)'
+    _TESTS = [{
+        # clip
+        'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc',
+        'md5': '17cf47d63ec9323e562c9957a968b565',
+        'info_dict': {
+            'id': '16801',
+            'ext': 'mp4',
+            'title': 'St. Kilda\'s Joey Montagna on the potential for a player\'s strike',
+            'description': 'Is a boycott of the NAB Cup "on the table"?',
+            'uploader_id': '4460760524001',
+            'upload_date': '20160713',
+            'timestamp': 1468421266,
+        },
+        'skip': 'Only available in Australia',
+    }, {
+        # episode
+        'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19',
+        'only_matching': True,
+    }, {
+        # DRM protected
+        'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1',
+        'only_matching': True,
+    }]
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        page_data = self._parse_json(self._search_regex(
+            r'window\.__data\s*=\s*({.*?});', webpage,
+            'page data'), display_id)
+        common_data = page_data.get('episode', {}).get('episode') or page_data.get('clip', {}).get('clip')
+        video_data = common_data['video']
+
+        if video_data.get('drm'):
+            raise ExtractorError('This video is DRM protected.', expected=True)
+
+        brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId']
+        video_id = compat_str(video_data.get('id') or brightcove_id)
+        title = common_data['name']
+
+        thumbnails = [{
+            'id': thumbnail_id,
+            'url': thumbnail_url,
+            'width': int_or_none(thumbnail_id[1:])
+        } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()]
+
+        return {
+            '_type': 'url_transparent',
+            'url': self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+            'id': video_id,
+            'title': title,
+            'description': common_data.get('description'),
+            'duration': float_or_none(video_data.get('duration'), 1000),
+            'thumbnails': thumbnails,
+            'ie_key': 'BrightcoveNew',
+        }
diff --git a/youtube_dl/extractor/nintendo.py b/youtube_dl/extractor/nintendo.py
new file mode 100644 (file)
index 0000000..4b4e66b
--- /dev/null
@@ -0,0 +1,46 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import unescapeHTML
+
+
+class NintendoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nintendo\.com/games/detail/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'http://www.nintendo.com/games/detail/yEiAzhU2eQI1KZ7wOHhngFoAHc1FpHwj',
+        'info_dict': {
+            'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
+            'ext': 'flv',
+            'title': 'Duck Hunt Wii U VC NES - Trailer',
+            'duration': 60.326,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['Ooyala'],
+    }, {
+        'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u',
+        'info_dict': {
+            'id': 'tokyo-mirage-sessions-fe-wii-u',
+            'title': 'Tokyo Mirage Sessions ♯FE',
+        },
+        'playlist_count': 3,
+    }]
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+
+        entries = [
+            OoyalaIE._build_url_result(m.group('code'))
+            for m in re.finditer(
+                r'class=(["\'])embed-video\1[^>]+data-video-code=(["\'])(?P<code>(?:(?!\2).)+)\2',
+                webpage)]
+
+        return self.playlist_result(
+            entries, page_id, unescapeHTML(self._og_search_title(webpage, fatal=False)))
index 0895d7ea4cb88f805605a55cb0c1fe56ff1d475d..e8702ebcd72633f82a9bc15c55057d67d14d2401 100644 (file)
@@ -11,70 +11,64 @@ from ..utils import (
 
 class NTVRuIE(InfoExtractor):
     IE_NAME = 'ntv.ru'
-    _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 
-    _TESTS = [
-        {
-            'url': 'http://www.ntv.ru/novosti/863142/',
-            'md5': 'ba7ea172a91cb83eb734cad18c10e723',
-            'info_dict': {
-                'id': '746000',
-                'ext': 'mp4',
-                'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
-                'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
-                'thumbnail': 're:^http://.*\.jpg',
-                'duration': 136,
-            },
+    _TESTS = [{
+        'url': 'http://www.ntv.ru/novosti/863142/',
+        'md5': 'ba7ea172a91cb83eb734cad18c10e723',
+        'info_dict': {
+            'id': '746000',
+            'ext': 'mp4',
+            'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+            'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины',
+            'thumbnail': 're:^http://.*\.jpg',
+            'duration': 136,
         },
-        {
-            'url': 'http://www.ntv.ru/video/novosti/750370/',
-            'md5': 'adecff79691b4d71e25220a191477124',
-            'info_dict': {
-                'id': '750370',
-                'ext': 'mp4',
-                'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
-                'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
-                'thumbnail': 're:^http://.*\.jpg',
-                'duration': 172,
-            },
+    }, {
+        'url': 'http://www.ntv.ru/video/novosti/750370/',
+        'md5': 'adecff79691b4d71e25220a191477124',
+        'info_dict': {
+            'id': '750370',
+            'ext': 'mp4',
+            'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+            'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход',
+            'thumbnail': 're:^http://.*\.jpg',
+            'duration': 172,
         },
-        {
-            'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
-            'md5': '82dbd49b38e3af1d00df16acbeab260c',
-            'info_dict': {
-                'id': '747480',
-                'ext': 'mp4',
-                'title': '«Сегодня». 21 марта 2014 года. 16:00',
-                'description': '«Сегодня». 21 марта 2014 года. 16:00',
-                'thumbnail': 're:^http://.*\.jpg',
-                'duration': 1496,
-            },
+    }, {
+        'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416',
+        'md5': '82dbd49b38e3af1d00df16acbeab260c',
+        'info_dict': {
+            'id': '747480',
+            'ext': 'mp4',
+            'title': '«Сегодня». 21 марта 2014 года. 16:00',
+            'description': '«Сегодня». 21 марта 2014 года. 16:00',
+            'thumbnail': 're:^http://.*\.jpg',
+            'duration': 1496,
         },
-        {
-            'url': 'http://www.ntv.ru/kino/Koma_film',
-            'md5': 'f825770930937aa7e5aca0dc0d29319a',
-            'info_dict': {
-                'id': '1007609',
-                'ext': 'mp4',
-                'title': 'Остросюжетный фильм «Кома»',
-                'description': 'Остросюжетный фильм «Кома»',
-                'thumbnail': 're:^http://.*\.jpg',
-                'duration': 5592,
-            },
+    }, {
+        'url': 'http://www.ntv.ru/kino/Koma_film',
+        'md5': 'f825770930937aa7e5aca0dc0d29319a',
+        'info_dict': {
+            'id': '1007609',
+            'ext': 'mp4',
+            'title': 'Остросюжетный фильм «Кома»',
+            'description': 'Остросюжетный фильм «Кома»',
+            'thumbnail': 're:^http://.*\.jpg',
+            'duration': 5592,
         },
-        {
-            'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
-            'md5': '9320cd0e23f3ea59c330dc744e06ff3b',
-            'info_dict': {
-                'id': '751482',
-                'ext': 'mp4',
-                'title': '«Дело врачей»: «Деревце жизни»',
-                'description': '«Дело врачей»: «Деревце жизни»',
-                'thumbnail': 're:^http://.*\.jpg',
-                'duration': 2590,
-            },
+    }, {
+        'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/',
+        'md5': '9320cd0e23f3ea59c330dc744e06ff3b',
+        'info_dict': {
+            'id': '751482',
+            'ext': 'mp4',
+            'title': '«Дело врачей»: «Деревце жизни»',
+            'description': '«Дело врачей»: «Деревце жизни»',
+            'thumbnail': 're:^http://.*\.jpg',
+            'duration': 2590,
         },
-    ]
+    }]
 
     _VIDEO_ID_REGEXES = [
         r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)',
@@ -87,11 +81,21 @@ class NTVRuIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        video_id = self._html_search_regex(self._VIDEO_ID_REGEXES, webpage, 'video id')
+        video_url = self._og_search_property(
+            ('video', 'video:iframe'), webpage, default=None)
+        if video_url:
+            video_id = self._search_regex(
+                r'https?://(?:www\.)?ntv\.ru/video/(?:embed/)?(\d+)',
+                video_url, 'video id', default=None)
+
+        if not video_id:
+            video_id = self._html_search_regex(
+                self._VIDEO_ID_REGEXES, webpage, 'video id')
 
         player = self._download_xml(
             'http://www.ntv.ru/vi%s/' % video_id,
             video_id, 'Downloading video XML')
+
         title = clean_html(xpath_text(player, './data/title', 'title', fatal=True))
         description = clean_html(xpath_text(player, './data/description', 'description'))
 
diff --git a/youtube_dl/extractor/odatv.py b/youtube_dl/extractor/odatv.py
new file mode 100644 (file)
index 0000000..314527f
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    NO_DEFAULT,
+    remove_start
+)
+
+
+class OdaTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?odatv\.com/(?:mob|vid)_video\.php\?.*\bid=(?P<id>[^&]+)'
+    _TESTS = [{
+        'url': 'http://odatv.com/vid_video.php?id=8E388',
+        'md5': 'dc61d052f205c9bf2da3545691485154',
+        'info_dict': {
+            'id': '8E388',
+            'ext': 'mp4',
+            'title': 'Artık Davutoğlu ile devam edemeyiz'
+        }
+    }, {
+        # mobile URL
+        'url': 'http://odatv.com/mob_video.php?id=8E388',
+        'only_matching': True,
+    }, {
+        # no video
+        'url': 'http://odatv.com/mob_video.php?id=8E900',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        no_video = 'NO VIDEO!' in webpage
+
+        video_url = self._search_regex(
+            r'mp4\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, 'video url',
+            default=None if no_video else NO_DEFAULT, group='url')
+
+        if no_video:
+            raise ExtractorError('Video %s does not exist' % video_id, expected=True)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': remove_start(self._og_search_title(webpage), 'Video: '),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py
new file mode 100644 (file)
index 0000000..fc22ad5
--- /dev/null
@@ -0,0 +1,169 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError,
+    float_or_none,
+    get_element_by_class,
+    int_or_none,
+    js_to_json,
+    parse_iso8601,
+    remove_start,
+    strip_or_none,
+    url_basename,
+)
+
+
+class OnetBaseIE(InfoExtractor):
+    def _search_mvp_id(self, webpage):
+        return self._search_regex(
+            r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
+
+    def _extract_from_id(self, video_id, webpage):
+        response = self._download_json(
+            'http://qi.ckm.onetapi.pl/', video_id,
+            query={
+                'body[id]': video_id,
+                'body[jsonrpc]': '2.0',
+                'body[method]': 'get_asset_detail',
+                'body[params][ID_Publikacji]': video_id,
+                'body[params][Service]': 'www.onet.pl',
+                'content-type': 'application/jsonp',
+                'x-onet-app': 'player.front.onetapi.pl',
+            })
+
+        error = response.get('error')
+        if error:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)
+
+        video = response['result'].get('0')
+
+        formats = []
+        for _, formats_dict in video['formats'].items():
+            if not isinstance(formats_dict, dict):
+                continue
+            for format_id, format_list in formats_dict.items():
+                if not isinstance(format_list, list):
+                    continue
+                for f in format_list:
+                    video_url = f.get('url')
+                    if not video_url:
+                        continue
+                    ext = determine_ext(video_url)
+                    if format_id == 'ism':
+                        # TODO: Support Microsoft Smooth Streaming
+                        continue
+                    elif ext == 'mpd':
+                        formats.extend(self._extract_mpd_formats(
+                            video_url, video_id, mpd_id='dash', fatal=False))
+                    else:
+                        formats.append({
+                            'url': video_url,
+                            'format_id': format_id,
+                            'height': int_or_none(f.get('vertical_resolution')),
+                            'width': int_or_none(f.get('horizontal_resolution')),
+                            'abr': float_or_none(f.get('audio_bitrate')),
+                            'vbr': float_or_none(f.get('video_bitrate')),
+                        })
+        self._sort_formats(formats)
+
+        meta = video.get('meta', {})
+
+        title = self._og_search_title(webpage, default=None) or meta['title']
+        description = self._og_search_description(webpage, default=None) or meta.get('description')
+        duration = meta.get('length') or meta.get('lenght')
+        timestamp = parse_iso8601(meta.get('addDate'), ' ')
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+
+
+class OnetIE(OnetBaseIE):
+    _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+    IE_NAME = 'onet.tv'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+        'md5': 'e3ffbf47590032ac3f27249204173d50',
+        'info_dict': {
+            'id': 'qbpyqc',
+            'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
+            'ext': 'mp4',
+            'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd',
+            'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...',
+            'upload_date': '20160705',
+            'timestamp': 1467721580,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        display_id, video_id = mobj.group('display_id', 'id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        mvp_id = self._search_mvp_id(webpage)
+
+        info_dict = self._extract_from_id(mvp_id, webpage)
+        info_dict.update({
+            'id': video_id,
+            'display_id': display_id,
+        })
+
+        return info_dict
+
+
+class OnetChannelIE(OnetBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
+    IE_NAME = 'onet.tv:channel'
+
+    _TEST = {
+        'url': 'http://onet.tv/k/openerfestival',
+        'info_dict': {
+            'id': 'openerfestival',
+            'title': 'Open\'er Festival Live',
+            'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
+        },
+        'playlist_mincount': 46,
+    }
+
+    def _real_extract(self, url):
+        channel_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, channel_id)
+
+        current_clip_info = self._parse_json(self._search_regex(
+            r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id,
+            transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s)))
+        video_id = remove_start(current_clip_info['ckmId'], 'mvp:')
+        video_name = url_basename(current_clip_info['url'])
+
+        if self._downloader.params.get('noplaylist'):
+            self.to_screen(
+                'Downloading just video %s because of --no-playlist' % video_name)
+            return self._extract_from_id(video_id, webpage)
+
+        self.to_screen(
+            'Downloading channel %s - add --no-playlist to just download video %s' % (
+                channel_id, video_name))
+        matches = re.findall(
+            r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
+            webpage)
+        entries = [
+            self.url_result(video_link, OnetIE.ie_key())
+            for video_link in matches]
+
+        channel_title = strip_or_none(get_element_by_class('o_channelName', webpage))
+        channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage))
+        return self.playlist_result(entries, channel_id, channel_title, channel_description)
index d7b13a0f1fbc53deba74e47f6d1eefc0bdee8be2..6fb1a3fcc0bd565677b232adcb883b3649715dde 100644 (file)
@@ -7,6 +7,8 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     int_or_none,
+    float_or_none,
+    mimetype2ext,
 )
 
 
@@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
-        'md5': 'd4851405d31adfadf71cd7a487b765bb',
+        'md5': 'e49f947c105b8a78a675a0ee1bddedfe',
         'info_dict': {
             'id': '2937',
             'ext': 'mp4',
             'title': 'Hannibal charges forward, stops for a cocktail',
-            'description': 'md5:e786add7f280b7f0fe237b64cc73df76',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': 'The A.V. Club',
-            'uploader_id': 'TheAVClub',
+            'uploader_id': 'the-av-club',
         },
     }, {
         'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
@@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(
-            'http://www.onionstudios.com/embed?id=%s' % video_id, video_id)
+        video_data = self._download_json(
+            'http://www.onionstudios.com/video/%s.json' % video_id, video_id)
+
+        title = video_data['title']
 
         formats = []
-        for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage):
-            ext = determine_ext(src)
+        for source in video_data.get('sources', []):
+            source_url = source.get('url')
+            if not source_url:
+                continue
+            ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                    source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
             else:
-                height = int_or_none(self._search_regex(
-                    r'/(\d+)\.%s' % ext, src, 'height', default=None))
+                tbr = int_or_none(source.get('bitrate'))
                 formats.append({
-                    'format_id': ext + ('-%sp' % height if height else ''),
-                    'url': src,
-                    'height': height,
+                    'format_id': ext + ('-%d' % tbr if tbr else ''),
+                    'url': source_url,
+                    'width': int_or_none(source.get('width')),
+                    'tbr': tbr,
                     'ext': ext,
-                    'preference': 1,
                 })
         self._sort_formats(formats)
 
-        title = self._search_regex(
-            r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1',
-            webpage, 'title', group='title')
-        description = self._search_regex(
-            r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1',
-            webpage, 'description', default=None, group='description')
-        thumbnail = self._search_regex(
-            r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1',
-            webpage, 'thumbnail', default=False, group='thumbnail')
-
-        uploader_id = self._search_regex(
-            r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1',
-            webpage, 'uploader id', fatal=False, group='uploader_id')
-        uploader = self._search_regex(
-            r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1',
-            webpage, 'uploader', default=False, group='uploader')
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
+            'thumbnail': video_data.get('poster_url'),
+            'uploader': video_data.get('channel_name'),
+            'uploader_id': video_data.get('channel_slug'),
+            'duration': float_or_none(video_data.get('duration', 1000)),
+            'tags': video_data.get('tags'),
             'formats': formats,
         }
index 6415b8fdcb7451e6b499d1d4d3515af2775c66c2..4e80ca9ff03100e244b157f70c8ff6897256cd55 100644 (file)
@@ -1,15 +1,14 @@
 # coding: utf-8
-from __future__ import unicode_literals
+from __future__ import unicode_literals, division
 
-import re
+import math
 
 from .common import InfoExtractor
 from ..compat import compat_chr
 from ..utils import (
+    decode_png,
     determine_ext,
-    encode_base_n,
     ExtractorError,
-    mimetype2ext,
 )
 
 
@@ -41,60 +40,6 @@ class OpenloadIE(InfoExtractor):
         'only_matching': True,
     }]
 
-    @staticmethod
-    def openload_level2_debase(m):
-        radix, num = int(m.group(1)) + 27, int(m.group(2))
-        return '"' + encode_base_n(num, radix) + '"'
-
-    @classmethod
-    def openload_level2(cls, txt):
-        # The function name is ǃ \u01c3
-        # Using escaped unicode literals does not work in Python 3.2
-        return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '')
-
-    # Openload uses a variant of aadecode
-    # openload_decode and related functions are originally written by
-    # vitas@matfyz.cz and released with public domain
-    # See https://github.com/rg3/youtube-dl/issues/8489
-    @classmethod
-    def openload_decode(cls, txt):
-        symbol_table = [
-            ('_', '(゚Д゚) [゚Θ゚]'),
-            ('a', '(゚Д゚) [゚ω゚ノ]'),
-            ('b', '(゚Д゚) [゚Θ゚ノ]'),
-            ('c', '(゚Д゚) [\'c\']'),
-            ('d', '(゚Д゚) [゚ー゚ノ]'),
-            ('e', '(゚Д゚) [゚Д゚ノ]'),
-            ('f', '(゚Д゚) [1]'),
-
-            ('o', '(゚Д゚) [\'o\']'),
-            ('u', '(o゚ー゚o)'),
-            ('c', '(゚Д゚) [\'c\']'),
-
-            ('7', '((゚ー゚) + (o^_^o))'),
-            ('6', '((o^_^o) +(o^_^o) +(c^_^o))'),
-            ('5', '((゚ー゚) + (゚Θ゚))'),
-            ('4', '(-~3)'),
-            ('3', '(-~-~1)'),
-            ('2', '(-~1)'),
-            ('1', '(-~0)'),
-            ('0', '((c^_^o)-(c^_^o))'),
-        ]
-        delim = '(゚Д゚)[゚ε゚]+'
-        ret = ''
-        for aachar in txt.split(delim):
-            for val, pat in symbol_table:
-                aachar = aachar.replace(pat, val)
-            aachar = aachar.replace('+ ', '')
-            m = re.match(r'^\d+', aachar)
-            if m:
-                ret += compat_chr(int(m.group(0), 8))
-            else:
-                m = re.match(r'^u([\da-f]+)', aachar)
-                if m:
-                    ret += compat_chr(int(m.group(1), 16))
-        return cls.openload_level2(ret)
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
@@ -102,29 +47,77 @@ class OpenloadIE(InfoExtractor):
         if 'File not found' in webpage:
             raise ExtractorError('File not found', expected=True)
 
-        code = self._search_regex(
-            r'</video>\s*</div>\s*<script[^>]+>[^>]+</script>\s*<script[^>]+>([^<]+)</script>',
-            webpage, 'JS code')
-
-        decoded = self.openload_decode(code)
-
-        video_url = self._search_regex(
-            r'return\s+"(https?://[^"]+)"', decoded, 'video URL')
+        # The following extraction logic is proposed by @Belderak and @gdkchan
+        # and declared to be used freely in youtube-dl
+        # See https://github.com/rg3/youtube-dl/issues/9706
+
+        numbers_js = self._download_webpage(
+            'https://openload.co/assets/js/obfuscator/n.js', video_id,
+            note='Downloading signature numbers')
+        signums = self._search_regex(
+            r'window\.signatureNumbers\s*=\s*[\'"](?P<data>[a-z]+)[\'"]',
+            numbers_js, 'signature numbers', group='data')
+
+        linkimg_uri = self._search_regex(
+            r'<img[^>]+id="linkimg"[^>]+src="([^"]+)"', webpage, 'link image')
+        linkimg = self._request_webpage(
+            linkimg_uri, video_id, note=False).read()
+
+        width, height, pixels = decode_png(linkimg)
+
+        output = ''
+        for y in range(height):
+            for x in range(width):
+                r, g, b = pixels[y][3 * x:3 * x + 3]
+                if r == 0 and g == 0 and b == 0:
+                    break
+                else:
+                    output += compat_chr(r)
+                    output += compat_chr(g)
+                    output += compat_chr(b)
+
+        img_str_length = len(output) // 200
+        img_str = [[0 for x in range(img_str_length)] for y in range(10)]
+
+        sig_str_length = len(signums) // 260
+        sig_str = [[0 for x in range(sig_str_length)] for y in range(10)]
+
+        for i in range(10):
+            for j in range(img_str_length):
+                begin = i * img_str_length * 20 + j * 20
+                img_str[i][j] = output[begin:begin + 20]
+            for j in range(sig_str_length):
+                begin = i * sig_str_length * 26 + j * 26
+                sig_str[i][j] = signums[begin:begin + 26]
+
+        parts = []
+        # TODO: find better names for str_, chr_ and sum_
+        str_ = ''
+        for i in [2, 3, 5, 7]:
+            str_ = ''
+            sum_ = float(99)
+            for j in range(len(sig_str[i])):
+                for chr_idx in range(len(img_str[i][j])):
+                    if sum_ > float(122):
+                        sum_ = float(98)
+                    chr_ = compat_chr(int(math.floor(sum_)))
+                    if sig_str[i][j][chr_idx] == chr_ and j >= len(str_):
+                        sum_ += float(2.5)
+                        str_ += img_str[i][j][chr_idx]
+            parts.append(str_.replace(',', ''))
+
+        video_url = 'https://openload.co/stream/%s~%s~%s~%s' % (parts[3], parts[1], parts[2], parts[0])
 
         title = self._og_search_title(webpage, default=None) or self._search_regex(
             r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
             'title', default=None) or self._html_search_meta(
             'description', webpage, 'title', fatal=True)
 
-        ext = mimetype2ext(self._search_regex(
-            r'window\.vt\s*=\s*(["\'])(?P<mimetype>.+?)\1', decoded,
-            'mimetype', default=None, group='mimetype')) or determine_ext(
-            video_url, 'mp4')
-
         return {
             'id': video_id,
             'title': title,
-            'ext': ext,
             'thumbnail': self._og_search_thumbnail(webpage, default=None),
             'url': video_url,
+            # Seems all videos have extensions in their titles
+            'ext': determine_ext(title),
         }
index 4e3864f0d7bd1f6b63c560e8b04ae38844a283f9..6ae30679a0a226b0d242b2ef773fbab9a90920c5 100644 (file)
@@ -40,16 +40,16 @@ class ORFTVthekIE(InfoExtractor):
         'skip': 'Blocked outside of Austria / Germany',
     }, {
         'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256',
-        'playlist': [{
-            'md5': '68f543909aea49d621dfc7703a11cfaf',
-            'info_dict': {
-                'id': '7982259',
-                'ext': 'mp4',
-                'title': 'Best of Ingrid Thurnher',
-                'upload_date': '20140527',
-                'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
-            }
-        }],
+        'info_dict': {
+            'id': '7982259',
+            'ext': 'mp4',
+            'title': 'Best of Ingrid Thurnher',
+            'upload_date': '20140527',
+            'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".',
+        },
+        'params': {
+            'skip_download': True,  # rtsp downloads
+        },
         '_skip': 'Blocked outside of Austria / Germany',
     }]
 
@@ -137,13 +137,16 @@ class ORFTVthekIE(InfoExtractor):
 class ORFOE1IE(InfoExtractor):
     IE_NAME = 'orf:oe1'
     IE_DESC = 'Radio Österreich 1'
-    _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)'
 
     # Audios on ORF radio are only available for 7 days, so we can't add tests.
-    _TEST = {
+    _TESTS = [{
         'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
         'only_matching': True,
-    }
+    }, {
+        'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
index 81918ac6e455313c59a7b56b6c63a5bae4699e57..b490ef74c5fb768751d4598ff88e70a13d41c060 100644 (file)
@@ -4,13 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
 from ..utils import (
     ExtractorError,
     determine_ext,
     int_or_none,
     js_to_json,
     strip_jsonp,
+    strip_or_none,
     unified_strdate,
     US_RATINGS,
 )
@@ -201,7 +201,7 @@ class PBSIE(InfoExtractor):
                 'id': '2365006249',
                 'ext': 'mp4',
                 'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
-                'description': 'md5:36f341ae62e251b8f5bd2b754b95a071',
+                'description': 'md5:31b664af3c65fd07fa460d306b837d00',
                 'duration': 3190,
             },
         },
@@ -212,7 +212,7 @@ class PBSIE(InfoExtractor):
                 'id': '2365297690',
                 'ext': 'mp4',
                 'title': 'FRONTLINE - Losing Iraq',
-                'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9',
+                'description': 'md5:5979a4d069b157f622d02bff62fbe654',
                 'duration': 5050,
             },
         },
@@ -223,7 +223,7 @@ class PBSIE(InfoExtractor):
                 'id': '2201174722',
                 'ext': 'mp4',
                 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
-                'description': 'md5:95a19f568689d09a166dff9edada3301',
+                'description': 'md5:86ab9a3d04458b876147b355788b8781',
                 'duration': 801,
             },
         },
@@ -268,7 +268,7 @@ class PBSIE(InfoExtractor):
                 'display_id': 'player',
                 'ext': 'mp4',
                 'title': 'American Experience - Death and the Civil War, Chapter 1',
-                'description': 'md5:1b80a74e0380ed2a4fb335026de1600d',
+                'description': 'md5:67fa89a9402e2ee7d08f53b920674c18',
                 'duration': 682,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
@@ -294,13 +294,13 @@ class PBSIE(InfoExtractor):
             # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
             # https://github.com/rg3/youtube-dl/issues/7059)
             'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
-            'md5': '84ced42850d78f1d4650297356e95e6f',
+            'md5': '59b0ef5009f9ac8a319cc5efebcd865e',
             'info_dict': {
                 'id': '2365546844',
                 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
                 'ext': 'mp4',
                 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
-                'description': 'md5:54033c6baa1f9623607c6e2ed245888b',
+                'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
                 'duration': 1480,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
@@ -313,7 +313,7 @@ class PBSIE(InfoExtractor):
                 'display_id': 'the-atomic-artists',
                 'ext': 'mp4',
                 'title': 'FRONTLINE - The Atomic Artists',
-                'description': 'md5:1a2481e86b32b2e12ec1905dd473e2c1',
+                'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
                 'duration': 723,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
@@ -324,7 +324,7 @@ class PBSIE(InfoExtractor):
         {
             # Serves hd only via wigget/partnerplayer page
             'url': 'http://www.pbs.org/video/2365641075/',
-            'md5': 'acfd4c400b48149a44861cb16dd305cf',
+            'md5': 'fdf907851eab57211dd589cf12006666',
             'info_dict': {
                 'id': '2365641075',
                 'ext': 'mp4',
@@ -353,11 +353,16 @@ class PBSIE(InfoExtractor):
     def _extract_webpage(self, url):
         mobj = re.match(self._VALID_URL, url)
 
+        description = None
+
         presumptive_id = mobj.group('presumptive_id')
         display_id = presumptive_id
         if presumptive_id:
             webpage = self._download_webpage(url, display_id)
 
+            description = strip_or_none(self._og_search_description(
+                webpage, default=None) or self._html_search_meta(
+                'description', webpage, default=None))
             upload_date = unified_strdate(self._search_regex(
                 r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
                 webpage, 'upload date', default=None))
@@ -370,7 +375,7 @@ class PBSIE(InfoExtractor):
             for p in MULTI_PART_REGEXES:
                 tabbed_videos = re.findall(p, webpage)
                 if tabbed_videos:
-                    return tabbed_videos, presumptive_id, upload_date
+                    return tabbed_videos, presumptive_id, upload_date, description
 
             MEDIA_ID_REGEXES = [
                 r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'",  # frontline video embed
@@ -382,7 +387,7 @@ class PBSIE(InfoExtractor):
             media_id = self._search_regex(
                 MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
             if media_id:
-                return media_id, presumptive_id, upload_date
+                return media_id, presumptive_id, upload_date, description
 
             # Fronline video embedded via flp
             video_id = self._search_regex(
@@ -399,7 +404,7 @@ class PBSIE(InfoExtractor):
                     'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
                     presumptive_id, 'Downloading getdir JSON',
                     transform_source=strip_jsonp)
-                return getdir['mid'], presumptive_id, upload_date
+                return getdir['mid'], presumptive_id, upload_date, description
 
             for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
                 url = self._search_regex(
@@ -423,10 +428,10 @@ class PBSIE(InfoExtractor):
             video_id = mobj.group('id')
             display_id = video_id
 
-        return video_id, display_id, None
+        return video_id, display_id, None, description
 
     def _real_extract(self, url):
-        video_id, display_id, upload_date = self._extract_webpage(url)
+        video_id, display_id, upload_date, description = self._extract_webpage(url)
 
         if isinstance(video_id, list):
             entries = [self.url_result(
@@ -448,17 +453,6 @@ class PBSIE(InfoExtractor):
                     redirects.append(redirect)
                     redirect_urls.add(redirect_url)
 
-        try:
-            video_info = self._download_json(
-                'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id,
-                display_id, 'Downloading video info JSON')
-            extract_redirect_urls(video_info)
-            info = video_info
-        except ExtractorError as e:
-            # videoInfo API may not work for some videos
-            if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404:
-                raise
-
         # Player pages may also serve different qualities
         for page in ('widget/partnerplayer', 'portalplayer'):
             player = self._download_webpage(
@@ -511,14 +505,23 @@ class PBSIE(InfoExtractor):
             formats))
         if http_url:
             for m3u8_format in m3u8_formats:
-                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
-                # extract only the formats that we know that they will be available as http format.
-                # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
-                if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
+                bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None)
+                # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]),
+                # we won't try extracting them.
+                # Since summer 2016 higher quality formats (4500k and 6500k) are also available
+                # albeit they are not documented in [2].
+                # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656
+                # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
+                if not bitrate or int(bitrate) < 400:
+                    continue
+                f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url)
+                # This may produce invalid links sometimes (e.g.
+                # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
+                if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate):
                     continue
                 f = m3u8_format.copy()
                 f.update({
-                    'url': re.sub(r'\d+k|baseline', bitrate, http_url),
+                    'url': f_url,
                     'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                     'protocol': 'http',
                 })
@@ -557,11 +560,14 @@ class PBSIE(InfoExtractor):
         if alt_title:
             info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
 
+        description = info.get('description') or info.get(
+            'program', {}).get('description') or description
+
         return {
             'id': video_id,
             'display_id': display_id,
             'title': info['title'],
-            'description': info.get('description') or info.get('program', {}).get('description'),
+            'description': description,
             'thumbnail': info.get('image_url'),
             'duration': int_or_none(info.get('duration')),
             'age_limit': age_limit,
index c23b314e79df70e6b115fee6fb91386345d00ad4..75f5884a928cff177bdabfb3db430ed282d0a7a8 100644 (file)
@@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor):
         title = user.get('display_name') or user.get('username')
         description = user.get('description')
 
+        broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or
+                         data_store.get('BroadcastCache', {}).get('broadcastIds', []))
+
         entries = [
             self.url_result(
-                'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
-            for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])]
+                'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id))
+            for broadcast_id in broadcast_ids]
 
         return self.playlist_result(entries, user_id, title, description)
index bc559d1df289fca39b96f5cfc5519bf6acb8bbb3..77e1211d6095cf17464ce09a27b756157b4931e9 100644 (file)
@@ -49,7 +49,7 @@ class PladformIE(InfoExtractor):
     @staticmethod
     def _extract_url(webpage):
         mobj = re.search(
-            r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage)
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
index 2eb4fd96dcbc071c1c2ecfb596ab20c4526018bd..78d21929945741f1a1b6e2fd57371650d146ed8a 100644 (file)
@@ -15,7 +15,7 @@ from ..utils import (
 
 class PlayvidIE(InfoExtractor):
     _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.playvid.com/watch/RnmBNgtrrJu',
         'md5': 'ffa2f6b2119af359f544388d8c01eb6c',
         'info_dict': {
@@ -24,8 +24,19 @@ class PlayvidIE(InfoExtractor):
             'title': 'md5:9256d01c6317e3f703848b5906880dc8',
             'duration': 82,
             'age_limit': 18,
-        }
-    }
+        },
+        'skip': 'Video removed due to ToS',
+    }, {
+        'url': 'http://www.playvid.com/watch/hwb0GpNkzgH',
+        'md5': '39d49df503ad7b8f23a4432cbf046477',
+        'info_dict': {
+            'id': 'hwb0GpNkzgH',
+            'ext': 'mp4',
+            'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park',
+            'age_limit': 18,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py
new file mode 100644 (file)
index 0000000..2d87e7e
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    extract_attributes,
+    int_or_none,
+)
+
+
+class PokemonIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/[^/]+/\d+_\d+-(?P<display_id>[^/?#]+))'
+    _TESTS = [{
+        'url': 'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true',
+        'md5': '9fb209ae3a569aac25de0f5afc4ee08f',
+        'info_dict': {
+            'id': 'd0436c00c3ce4071ac6cee8130ac54a1',
+            'ext': 'mp4',
+            'title': 'From A to Z!',
+            'description': 'Bonnie makes a new friend, Ash runs into an old friend, and a terrifying premonition begins to unfold!',
+            'timestamp': 1460478136,
+            'upload_date': '20160412',
+        },
+        'add_id': ['LimelightMedia']
+    }, {
+        'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id, display_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, video_id or display_id)
+        video_data = extract_attributes(self._search_regex(
+            r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'),
+            webpage, 'video data element'))
+        video_id = video_data['data-video-id']
+        title = video_data['data-video-title']
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': 'limelight:media:%s' % video_id,
+            'title': title,
+            'description': video_data.get('data-video-summary'),
+            'thumbnail': video_data.get('data-video-poster'),
+            'series': 'Pokémon',
+            'season_number': int_or_none(video_data.get('data-video-season')),
+            'episode': title,
+            'episode_number': int_or_none(video_data.get('data-video-episode')),
+            'ie_key': 'LimelightMedia',
+        }
diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py
new file mode 100644 (file)
index 0000000..f559b89
--- /dev/null
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_str,
+    compat_urllib_parse_unquote,
+)
+from ..utils import (
+    int_or_none,
+    strip_or_none,
+    unified_timestamp,
+)
+
+
+class PolskieRadioIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie',
+        'info_dict': {
+            'id': '1587943',
+            'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie',
+            'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5',
+        },
+        'playlist': [{
+            'md5': '2984ee6ce9046d91fc233bc1a864a09a',
+            'info_dict': {
+                'id': '1540576',
+                'ext': 'mp3',
+                'title': 'md5:d4623290d4ac983bf924061c75c23a0d',
+                'timestamp': 1456594200,
+                'upload_date': '20160227',
+                'duration': 2364,
+                'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$'
+            },
+        }],
+    }, {
+        'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal',
+        'info_dict': {
+            'id': '1635803',
+            'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał',
+            'description': 'md5:01cb7d0cad58664095d72b51a1ebada2',
+        },
+        'playlist_mincount': 12,
+    }, {
+        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943',
+        'only_matching': True,
+    }, {
+        # with mp4 video
+        'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        playlist_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, playlist_id)
+
+        content = self._search_regex(
+            r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>',
+            webpage, 'content')
+
+        timestamp = unified_timestamp(self._html_search_regex(
+            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
+            webpage, 'timestamp', fatal=False))
+
+        thumbnail_url = self._og_search_thumbnail(webpage)
+
+        entries = []
+
+        media_urls = set()
+
+        for data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content):
+            media = self._parse_json(data_media, playlist_id, fatal=False)
+            if not media.get('file') or not media.get('desc'):
+                continue
+            media_url = self._proto_relative_url(media['file'], 'http:')
+            if media_url in media_urls:
+                continue
+            media_urls.add(media_url)
+            entries.append({
+                'id': compat_str(media['id']),
+                'url': media_url,
+                'title': compat_urllib_parse_unquote(media['desc']),
+                'duration': int_or_none(media.get('length')),
+                'vcodec': 'none' if media.get('provider') == 'audio' else None,
+                'timestamp': timestamp,
+                'thumbnail': thumbnail_url
+            })
+
+        title = self._og_search_title(webpage).strip()
+        description = strip_or_none(self._og_search_description(webpage))
+
+        return self.playlist_result(entries, playlist_id, title, description)
index 6d57e1d35eb567b26f65dea8cc74c653f955c08f..20976c101d97f53a58b3599ded30f7bfb1d4bcd5 100644 (file)
@@ -25,7 +25,15 @@ from ..aes import (
 
 
 class PornHubIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
+    IE_DESC = 'PornHub and Thumbzilla'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+                            (?:www\.)?thumbzilla\.com/video/
+                        )
+                        (?P<id>[0-9a-z]+)
+                    '''
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'md5': '1e19b41231a02eba417839222ac9d58e',
@@ -63,8 +71,24 @@ class PornHubIE(InfoExtractor):
         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
         'only_matching': True,
     }, {
+        # removed at the request of cam4.com
         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
         'only_matching': True,
+    }, {
+        # removed at the request of the copyright owner
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
+        'only_matching': True,
+    }, {
+        # removed by uploader
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
+        'only_matching': True,
+    }, {
+        # private video
+        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -87,8 +111,8 @@ class PornHubIE(InfoExtractor):
         webpage = self._download_webpage(req, video_id)
 
         error_msg = self._html_search_regex(
-            r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
-            webpage, 'error message', default=None)
+            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
+            webpage, 'error message', default=None, group='error')
         if error_msg:
             error_msg = re.sub(r'\s+', ' ', error_msg)
             raise ExtractorError(
index 5398e708b68337b76739282abf6c00e8a39745ab..63816c3588cebe889e77a24a928cc789ef07c7d5 100644 (file)
@@ -3,10 +3,7 @@ from __future__ import unicode_literals
 import json
 
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    sanitized_Request,
-)
+from ..utils import int_or_none
 
 
 class PornotubeIE(InfoExtractor):
@@ -31,59 +28,55 @@ class PornotubeIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        # Fetch origin token
-        js_config = self._download_webpage(
-            'http://www.pornotube.com/assets/src/app/config.js', video_id,
-            note='Download JS config')
-        originAuthenticationSpaceKey = self._search_regex(
-            r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'",
-            js_config, 'originAuthenticationSpaceKey')
+        token = self._download_json(
+            'https://api.aebn.net/auth/v2/origins/authenticate',
+            video_id, note='Downloading token',
+            data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'),
+            headers={
+                'Content-Type': 'application/json',
+                'Origin': 'http://www.pornotube.com',
+            })['tokenKey']
 
-        # Fetch actual token
-        token_req_data = {
-            'authenticationSpaceKey': originAuthenticationSpaceKey,
-            'credentials': 'Clip Application',
-        }
-        token_req = sanitized_Request(
-            'https://api.aebn.net/auth/v1/token/primal',
-            data=json.dumps(token_req_data).encode('utf-8'))
-        token_req.add_header('Content-Type', 'application/json')
-        token_req.add_header('Origin', 'http://www.pornotube.com')
-        token_answer = self._download_json(
-            token_req, video_id, note='Requesting primal token')
-        token = token_answer['tokenKey']
+        video_url = self._download_json(
+            'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id,
+            video_id, note='Downloading delivery information',
+            headers={'Authorization': token})['mediaUrl']
 
-        # Get video URL
-        delivery_req = sanitized_Request(
-            'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id)
-        delivery_req.add_header('Authorization', token)
-        delivery_info = self._download_json(
-            delivery_req, video_id, note='Downloading delivery information')
-        video_url = delivery_info['mediaUrl']
+        FIELDS = (
+            'title', 'description', 'startSecond', 'endSecond', 'publishDate',
+            'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber'
+        )
 
-        # Get additional info (title etc.)
-        info_req = sanitized_Request(
-            'https://api.aebn.net/content/v1/clips/%s?expand='
-            'title,description,primaryImageNumber,startSecond,endSecond,'
-            'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,'
-            'movie.studios,stars.name,studios.name,categories.name,'
-            'clipActive,movieActive,publishDate,orientations' % video_id)
-        info_req.add_header('Authorization', token)
         info = self._download_json(
-            info_req, video_id, note='Downloading metadata')
+            'https://api.aebn.net/content/v2/clips/%s?fields=%s'
+            % (video_id, ','.join(FIELDS)), video_id,
+            note='Downloading metadata',
+            headers={'Authorization': token})
+
+        if isinstance(info, list):
+            info = info[0]
+
+        title = info['title']
 
         timestamp = int_or_none(info.get('publishDate'), scale=1000)
         uploader = info.get('studios', [{}])[0].get('name')
-        movie_id = info['movie']['movieId']
-        thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % (
-            movie_id, movie_id, info['primaryImageNumber'])
-        categories = [c['name'] for c in info.get('categories')]
+        movie_id = info.get('movieId')
+        primary_image_number = info.get('primaryImageNumber')
+        thumbnail = None
+        if movie_id and primary_image_number:
+            thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % (
+                movie_id, movie_id, primary_image_number)
+        start = int_or_none(info.get('startSecond'))
+        end = int_or_none(info.get('endSecond'))
+        duration = end - start if start and end else None
+        categories = [c['name'] for c in info.get('categories', []) if c.get('name')]
 
         return {
             'id': video_id,
             'url': video_url,
-            'title': info['title'],
+            'title': title,
             'description': info.get('description'),
+            'duration': duration,
             'timestamp': timestamp,
             'uploader': uploader,
             'thumbnail': thumbnail,
index 07d49d489d6779b0f6bb7bd12bc610497c576c2e..c6eee3b72a6e428012644ee7c10caad9be78fa86 100644 (file)
@@ -5,7 +5,7 @@ import re
 
 from hashlib import sha1
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip',
@@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip',
@@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge',
@@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge',
@@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor):
                 # rtmp download
                 'skip_download': True,
             },
+            'skip': 'This video is unavailable',
         },
         {
             'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
@@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor):
     ]
 
     def _extract_clip(self, url, webpage):
-        clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id')
+        clip_id = self._html_search_regex(
+            self._CLIPID_REGEXES, webpage, 'clip id')
 
         access_token = 'prosieben'
         client_name = 'kolibri-2.0.19-splec4'
         client_location = url
 
-        videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({
-            'access_token': access_token,
-            'client_location': client_location,
-            'client_name': client_name,
-            'ids': clip_id,
-        })
-
-        video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0]
+        video = self._download_json(
+            'http://vas.sim-technik.de/vas/live/v2/videos',
+            clip_id, 'Downloading videos JSON', query={
+                'access_token': access_token,
+                'client_location': client_location,
+                'client_name': client_name,
+                'ids': clip_id,
+            })[0]
 
         if video.get('is_protected') is True:
             raise ExtractorError('This video is DRM protected.', expected=True)
 
         duration = float_or_none(video.get('duration'))
-        source_ids = [source['id'] for source in video['sources']]
-        source_ids_str = ','.join(map(str, source_ids))
+        source_ids = [compat_str(source['id']) for source in video['sources']]
 
         g = '01!8d8F_)r9]4s[qeuXfP%'
+        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest()
 
-        client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name])
-                                 .encode('utf-8')).hexdigest()
-
-        sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({
-            'access_token': access_token,
-            'client_id': client_id,
-            'client_location': client_location,
-            'client_name': client_name,
-        }))
-
-        sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON')
+        sources = self._download_json(
+            'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id,
+            clip_id, 'Downloading sources JSON', query={
+                'access_token': access_token,
+                'client_id': client_id,
+                'client_location': client_location,
+                'client_name': client_name,
+            })
         server_id = sources['server_id']
 
-        client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id,
-                                          client_location, source_ids_str, g, client_name])
-                                 .encode('utf-8')).hexdigest()
-
-        url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({
-            'access_token': access_token,
-            'client_id': client_id,
-            'client_location': client_location,
-            'client_name': client_name,
-            'server_id': server_id,
-            'source_ids': source_ids_str,
-        }))
-
-        urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON')
-
         title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title')
-        description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
-        thumbnail = self._og_search_thumbnail(webpage)
-
-        upload_date = unified_strdate(self._html_search_regex(
-            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
-
-        formats = []
-
-        urls_sources = urls['sources']
-        if isinstance(urls_sources, dict):
-            urls_sources = urls_sources.values()
 
         def fix_bitrate(bitrate):
             bitrate = int_or_none(bitrate)
@@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor):
                 return None
             return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate
 
-        for source in urls_sources:
-            protocol = source['protocol']
-            source_url = source['url']
-            if protocol == 'rtmp' or protocol == 'rtmpe':
-                mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
-                if not mobj:
-                    continue
-                path = mobj.group('path')
-                mp4colon_index = path.rfind('mp4:')
-                app = path[:mp4colon_index]
-                play_path = path[mp4colon_index:]
-                formats.append({
-                    'url': '%s/%s' % (mobj.group('url'), app),
-                    'app': app,
-                    'play_path': play_path,
-                    'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
-                    'page_url': 'http://www.prosieben.de',
-                    'vbr': fix_bitrate(source['bitrate']),
-                    'ext': 'mp4',
-                    'format_id': '%s_%s' % (source['cdn'], source['bitrate']),
-                })
-            elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
-                formats.extend(self._extract_f4m_formats(source_url, clip_id))
-            else:
-                formats.append({
-                    'url': source_url,
-                    'vbr': fix_bitrate(source['bitrate']),
+        formats = []
+        for source_id in source_ids:
+            client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest()
+            urls = self._download_json(
+                'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id,
+                clip_id, 'Downloading urls JSON', fatal=False, query={
+                    'access_token': access_token,
+                    'client_id': client_id,
+                    'client_location': client_location,
+                    'client_name': client_name,
+                    'server_id': server_id,
+                    'source_ids': source_id,
                 })
-
+            if not urls:
+                continue
+            if urls.get('status_code') != 0:
+                raise ExtractorError('This video is unavailable', expected=True)
+            urls_sources = urls['sources']
+            if isinstance(urls_sources, dict):
+                urls_sources = urls_sources.values()
+            for source in urls_sources:
+                source_url = source.get('url')
+                if not source_url:
+                    continue
+                protocol = source.get('protocol')
+                mimetype = source.get('mimetype')
+                if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(
+                        source_url, clip_id, f4m_id='hds', fatal=False))
+                elif mimetype == 'application/x-mpegURL':
+                    formats.extend(self._extract_m3u8_formats(
+                        source_url, clip_id, 'mp4', 'm3u8_native',
+                        m3u8_id='hls', fatal=False))
+                else:
+                    tbr = fix_bitrate(source['bitrate'])
+                    if protocol in ('rtmp', 'rtmpe'):
+                        mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url)
+                        if not mobj:
+                            continue
+                        path = mobj.group('path')
+                        mp4colon_index = path.rfind('mp4:')
+                        app = path[:mp4colon_index]
+                        play_path = path[mp4colon_index:]
+                        formats.append({
+                            'url': '%s/%s' % (mobj.group('url'), app),
+                            'app': app,
+                            'play_path': play_path,
+                            'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf',
+                            'page_url': 'http://www.prosieben.de',
+                            'tbr': tbr,
+                            'ext': 'flv',
+                            'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''),
+                        })
+                    else:
+                        formats.append({
+                            'url': source_url,
+                            'tbr': tbr,
+                            'format_id': 'http%s' % ('-%d' % tbr if tbr else ''),
+                        })
         self._sort_formats(formats)
 
+        description = self._html_search_regex(
+            self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._html_search_regex(
+            self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None))
+
         return {
             'id': clip_id,
             'title': title,
index 4f05bbddc6d35fc8ec709fcb005638abcbe77779..8ec402646767a22f8a1e7cedbc89eb8576b59804 100644 (file)
@@ -12,6 +12,7 @@ from ..utils import (
     unified_strdate,
     xpath_element,
     ExtractorError,
+    determine_protocol,
 )
 
 
@@ -22,13 +23,13 @@ class RadioCanadaIE(InfoExtractor):
         'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
         'info_dict': {
             'id': '7184272',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Le parcours du tireur capté sur vidéo',
             'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
             'upload_date': '20141023',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         },
     }
@@ -36,11 +37,14 @@ class RadioCanadaIE(InfoExtractor):
     def _real_extract(self, url):
         app_code, video_id = re.match(self._VALID_URL, url).groups()
 
+        device_types = ['ipad', 'android']
+        if app_code != 'toutv':
+            device_types.append('flash')
+
         formats = []
-        # TODO: extract m3u8 and f4m formats
-        # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements
+        # TODO: extract f4m formats
         # f4m formats can be extracted using flashhd device_type but they produce unplayable file
-        for device_type in ('flash',):
+        for device_type in device_types:
             v_data = self._download_xml(
                 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
                 video_id, note='Downloading %s XML' % device_type, query={
@@ -52,7 +56,7 @@ class RadioCanadaIE(InfoExtractor):
                     # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
                     'paysJ391wsHjbOJwvCs26toz': 'CA',
                     'bypasslock': 'NZt5K62gRqfc',
-                })
+                }, fatal=False)
             v_url = xpath_text(v_data, 'url')
             if not v_url:
                 continue
@@ -64,7 +68,8 @@ class RadioCanadaIE(InfoExtractor):
                 formats.extend(self._extract_m3u8_formats(
                     v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
             elif ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False))
+                formats.extend(self._extract_f4m_formats(
+                    v_url, video_id, f4m_id='hds', fatal=False))
             else:
                 ext = determine_ext(v_url)
                 bitrates = xpath_element(v_data, 'bitrates')
@@ -72,15 +77,28 @@ class RadioCanadaIE(InfoExtractor):
                     tbr = int_or_none(url_e.get('bitrate'))
                     if not tbr:
                         continue
+                    f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url)
+                    protocol = determine_protocol({'url': f_url})
                     formats.append({
-                        'format_id': 'rtmp-%d' % tbr,
-                        'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url),
-                        'ext': 'flv',
-                        'protocol': 'rtmp',
+                        'format_id': '%s-%d' % (protocol, tbr),
+                        'url': f_url,
+                        'ext': 'flv' if protocol == 'rtmp' else ext,
+                        'protocol': protocol,
                         'width': int_or_none(url_e.get('width')),
                         'height': int_or_none(url_e.get('height')),
                         'tbr': tbr,
                     })
+                    if protocol == 'rtsp':
+                        base_url = self._search_regex(
+                            r'rtsp://([^?]+)', f_url, 'base url', default=None)
+                        if base_url:
+                            base_url = 'http://' + base_url
+                            formats.extend(self._extract_m3u8_formats(
+                                base_url + '/playlist.m3u8', video_id, 'mp4',
+                                'm3u8_native', m3u8_id='hls', fatal=False))
+                            formats.extend(self._extract_f4m_formats(
+                                base_url + '/manifest.f4m', video_id,
+                                f4m_id='hds', fatal=False))
         self._sort_formats(formats)
 
         metadata = self._download_xml(
@@ -115,13 +133,13 @@ class RadioCanadaAudioVideoIE(InfoExtractor):
         'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
         'info_dict': {
             'id': '7527184',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Barack Obama au Vietnam',
             'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
             'upload_date': '20160523',
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         },
     }
index e36ce1aa1940deafd5a633bec814e7462008c3b1..dc640b1bcb58ddb79c89e5f2346a5bc5c63a3547 100644 (file)
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse,
-    compat_urlparse,
-)
+from ..compat import compat_urlparse
 from ..utils import (
-    ExtractorError,
     determine_ext,
+    ExtractorError,
+    find_xpath_attr,
+    fix_xml_ampersands,
+    int_or_none,
     parse_duration,
     unified_strdate,
-    int_or_none,
+    update_url_query,
     xpath_text,
 )
 
 
-class RaiTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
+class RaiBaseIE(InfoExtractor):
+    def _extract_relinker_formats(self, relinker_url, video_id):
+        formats = []
+
+        for platform in ('mon', 'flash', 'native'):
+            relinker = self._download_xml(
+                relinker_url, video_id,
+                note='Downloading XML metadata for platform %s' % platform,
+                transform_source=fix_xml_ampersands,
+                query={'output': 45, 'pl': platform},
+                headers=self.geo_verification_headers())
+
+            media_url = find_xpath_attr(relinker, './url', 'type', 'content').text
+            if media_url == 'http://download.rai.it/video_no_available.mp4':
+                self.raise_geo_restricted()
+
+            ext = determine_ext(media_url)
+            if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
+                continue
+
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    media_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                manifest_url = update_url_query(
+                    media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'),
+                    {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url, video_id, f4m_id='hds', fatal=False))
+            else:
+                bitrate = int_or_none(xpath_text(relinker, 'bitrate'))
+                formats.append({
+                    'url': media_url,
+                    'tbr': bitrate if bitrate > 0 else None,
+                    'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
+                })
+
+        return formats
+
+    def _extract_from_content_id(self, content_id, base_url):
+        media = self._download_json(
+            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
+            content_id, 'Downloading video JSON')
+
+        thumbnails = []
+        for image_type in ('image', 'image_medium', 'image_300'):
+            thumbnail_url = media.get(image_type)
+            if thumbnail_url:
+                thumbnails.append({
+                    'url': compat_urlparse.urljoin(base_url, thumbnail_url),
+                })
+
+        formats = []
+        media_type = media['type']
+        if 'Audio' in media_type:
+            formats.append({
+                'format_id': media.get('formatoAudio'),
+                'url': media['audioUrl'],
+                'ext': media.get('formatoAudio'),
+            })
+        elif 'Video' in media_type:
+            formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
+            self._sort_formats(formats)
+        else:
+            raise ExtractorError('not a media file')
+
+        subtitles = {}
+        captions = media.get('subtitlesUrl')
+        if captions:
+            STL_EXT = '.stl'
+            SRT_EXT = '.srt'
+            if captions.endswith(STL_EXT):
+                captions = captions[:-len(STL_EXT)] + SRT_EXT
+            subtitles['it'] = [{
+                'ext': 'srt',
+                'url': captions,
+            }]
+
+        return {
+            'id': content_id,
+            'title': media['name'],
+            'description': media.get('desc'),
+            'thumbnails': thumbnails,
+            'uploader': media.get('author'),
+            'upload_date': unified_strdate(media.get('date')),
+            'duration': parse_duration(media.get('length')),
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class RaiTVIE(RaiBaseIE):
+    _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
     _TESTS = [
         {
             'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
-            'md5': '96382709b61dd64a6b88e0f791e6df4c',
+            'md5': '8970abf8caf8aef4696e7b1f2adfc696',
             'info_dict': {
                 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Report del 07/04/2014',
                 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
                 'upload_date': '20140407',
                 'duration': 6160,
+                'thumbnail': 're:^https?://.*\.jpg$',
             }
         },
         {
+            # no m3u8 stream
             'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
-            'md5': 'd9751b78eac9710d62c2447b224dea39',
+            # HDS download, MD5 is unstable
             'info_dict': {
                 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
                 'ext': 'flv',
                 'title': 'TG PRIMO TEMPO',
                 'upload_date': '20140612',
                 'duration': 1758,
+                'thumbnail': 're:^https?://.*\.jpg$',
             },
+            'skip': 'Geo-restricted to Italy',
         },
         {
             'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
@@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor):
         },
         {
             'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
-            'md5': '496ab63e420574447f70d02578333437',
+            'md5': 'e57493e1cb8bc7c564663f363b171847',
             'info_dict': {
                 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Il Candidato - Primo episodio: "Le Primarie"',
                 'description': 'md5:364b604f7db50594678f483353164fb8',
                 'upload_date': '20140923',
                 'duration': 386,
+                'thumbnail': 're:^https?://.*\.jpg$',
             }
         },
     ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        media = self._download_json(
-            'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id,
-            video_id, 'Downloading video JSON')
-
-        thumbnails = []
-        for image_type in ('image', 'image_medium', 'image_300'):
-            thumbnail_url = media.get(image_type)
-            if thumbnail_url:
-                thumbnails.append({
-                    'url': thumbnail_url,
-                })
-
-        subtitles = []
-        formats = []
-        media_type = media['type']
-        if 'Audio' in media_type:
-            formats.append({
-                'format_id': media.get('formatoAudio'),
-                'url': media['audioUrl'],
-                'ext': media.get('formatoAudio'),
-            })
-        elif 'Video' in media_type:
-            def fix_xml(xml):
-                return xml.replace(' tag elementi', '').replace('>/', '</')
-
-            relinker = self._download_xml(
-                media['mediaUri'] + '&output=43',
-                video_id, transform_source=fix_xml)
-
-            has_subtitle = False
-
-            for element in relinker.findall('element'):
-                media_url = xpath_text(element, 'url')
-                ext = determine_ext(media_url)
-                content_type = xpath_text(element, 'content-type')
-                if ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        media_url, video_id, 'mp4', 'm3u8_native',
-                        m3u8_id='hls', fatal=False))
-                elif ext == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
-                        media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
-                        video_id, f4m_id='hds', fatal=False))
-                elif ext == 'stl':
-                    has_subtitle = True
-                elif content_type.startswith('video/'):
-                    bitrate = int_or_none(xpath_text(element, 'bitrate'))
-                    formats.append({
-                        'url': media_url,
-                        'tbr': bitrate if bitrate > 0 else None,
-                        'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
-                    })
-                elif content_type.startswith('image/'):
-                    thumbnails.append({
-                        'url': media_url,
-                    })
-
-            self._sort_formats(formats)
 
-            if has_subtitle:
-                webpage = self._download_webpage(url, video_id)
-                subtitles = self._get_subtitles(video_id, webpage)
-        else:
-            raise ExtractorError('not a media file')
+        return self._extract_from_content_id(video_id, url)
 
-        return {
-            'id': video_id,
-            'title': media['name'],
-            'description': media.get('desc'),
-            'thumbnails': thumbnails,
-            'uploader': media.get('author'),
-            'upload_date': unified_strdate(media.get('date')),
-            'duration': parse_duration(media.get('length')),
-            'formats': formats,
-            'subtitles': subtitles,
-        }
 
-    def _get_subtitles(self, video_id, webpage):
-        subtitles = {}
-        m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage)
-        if m:
-            captions = m.group('captions')
-            STL_EXT = '.stl'
-            SRT_EXT = '.srt'
-            if captions.endswith(STL_EXT):
-                captions = captions[:-len(STL_EXT)] + SRT_EXT
-            subtitles['it'] = [{
-                'ext': 'srt',
-                'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions),
-            }]
-        return subtitles
-
-
-class RaiIE(InfoExtractor):
+class RaiIE(RaiBaseIE):
     _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
     _TESTS = [
         {
             'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
-            'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7',
+            'md5': '2dd727e61114e1ee9c47f0da6914e178',
             'info_dict': {
                 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Il pacco',
                 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
                 'upload_date': '20141221',
             },
-        }
+        },
+        {
+            # Direct relinker URL
+            'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+            # HDS live stream, MD5 is unstable
+            'info_dict': {
+                'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+                'ext': 'flv',
+                'title': 'EuroNews',
+            },
+            'skip': 'Geo-restricted to Italy',
+        },
+        {
+            # Embedded content item ID
+            'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
+            'md5': '84c1135ce960e8822ae63cec34441d63',
+            'info_dict': {
+                'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
+                'ext': 'mp4',
+                'title': 'TG1 ore 20:00 del 02/07/2016',
+                'upload_date': '20160702',
+            },
+        },
+        {
+            'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
+            # HDS live stream, MD5 is unstable
+            'info_dict': {
+                'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
+                'ext': 'flv',
+                'title': 'La diretta di Rainews24',
+            },
+        },
     ]
 
     @classmethod
@@ -201,7 +238,30 @@ class RaiIE(InfoExtractor):
         iframe_url = self._search_regex(
             [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
              r'drawMediaRaiTV\(["\'](.+?)["\']'],
-            webpage, 'iframe')
-        if not iframe_url.startswith('http'):
-            iframe_url = compat_urlparse.urljoin(url, iframe_url)
-        return self.url_result(iframe_url)
+            webpage, 'iframe', default=None)
+        if iframe_url:
+            if not iframe_url.startswith('http'):
+                iframe_url = compat_urlparse.urljoin(url, iframe_url)
+            return self.url_result(iframe_url)
+
+        content_item_id = self._search_regex(
+            r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
+            webpage, 'content item ID', group='content_id', default=None)
+        if content_item_id:
+            return self._extract_from_content_id(content_item_id, url)
+
+        relinker_url = compat_urlparse.urljoin(url, self._search_regex(
+            r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
+            webpage, 'relinker URL', group='url'))
+        formats = self._extract_relinker_formats(relinker_url, video_id)
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
+            webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+        }
index 7932af6ef7c599fdcce5c95bcdbf4e77f162d45d..471928ef86b5d434953fc694eef0bb7da8edd334 100644 (file)
@@ -1,55 +1,71 @@
-# encoding: utf-8
 from __future__ import unicode_literals
 
-import json
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
-    ExtractorError,
+    clean_html,
+    int_or_none,
+    unified_timestamp,
+    update_url_query,
 )
 
 
 class RBMARadioIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
+    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)'
     _TEST = {
-        'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011',
+        'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011',
         'md5': '6bc6f9bcb18994b4c983bc3bf4384d95',
         'info_dict': {
             'id': 'ford-lopatin-live-at-primavera-sound-2011',
             'ext': 'mp3',
-            'uploader_id': 'ford-lopatin',
-            'location': 'Spain',
-            'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.',
-            'uploader': 'Ford & Lopatin',
-            'title': 'Live at Primavera Sound 2011',
+            'title': 'Main Stage - Ford & Lopatin',
+            'description': 'md5:4f340fb48426423530af5a9d87bd7b91',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 2452,
+            'timestamp': 1307103164,
+            'upload_date': '20110603',
         },
     }
 
     def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('videoID')
+        mobj = re.match(self._VALID_URL, url)
+        show_id = mobj.group('show_id')
+        episode_id = mobj.group('id')
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, episode_id)
 
-        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
-                                       webpage, 'json data', flags=re.MULTILINE)
+        episode = self._parse_json(
+            self._search_regex(
+                r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>',
+                webpage, 'json data'),
+            episode_id)['episodes'][show_id][episode_id]
 
-        try:
-            data = json.loads(json_data)
-        except ValueError as e:
-            raise ExtractorError('Invalid JSON: ' + str(e))
+        title = episode['title']
 
-        video_url = data['akamai_url'] + '&cbr=256'
+        show_title = episode.get('showTitle')
+        if show_title:
+            title = '%s - %s' % (show_title, title)
+
+        formats = [{
+            'url': update_url_query(episode['audioURL'], query={'cbr': abr}),
+            'format_id': compat_str(abr),
+            'abr': abr,
+            'vcodec': 'none',
+        } for abr in (96, 128, 256)]
+
+        description = clean_html(episode.get('longTeaser'))
+        thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape'))
+        duration = int_or_none(episode.get('duration'))
+        timestamp = unified_timestamp(episode.get('publishedAt'))
 
         return {
-            'id': video_id,
-            'url': video_url,
-            'title': data['title'],
-            'description': data.get('teaser_text'),
-            'location': data.get('country_of_origin'),
-            'uploader': data.get('host', {}).get('name'),
-            'uploader_id': data.get('host', {}).get('slug'),
-            'thumbnail': data.get('image', {}).get('large_url_2x'),
-            'duration': data.get('duration'),
+            'id': episode_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'timestamp': timestamp,
+            'formats': formats,
         }
index 796adfdf9dab7f07481328026bb21591a7aa5612..bf200ea4d3f8b17f171bcce01c930b5d183fcc2e 100644 (file)
@@ -1,23 +1,23 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
     parse_duration,
     parse_iso8601,
+    js_to_json,
 )
+from ..compat import compat_str
 
 
 class RDSIE(InfoExtractor):
     IE_DESC = 'RDS.ca'
-    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)'
+    _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+'
 
     _TESTS = [{
         'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799',
         'info_dict': {
-            'id': '3.1132799',
+            'id': '604333',
             'display_id': 'fowler-jr-prend-la-direction-de-jacksonville',
             'ext': 'mp4',
             'title': 'Fowler Jr. prend la direction de Jacksonville',
@@ -33,22 +33,17 @@ class RDSIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
+        display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
 
-        # TODO: extract f4m from 9c9media.com
-        video_url = self._search_regex(
-            r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"',
-            webpage, 'video url')
-
-        title = self._og_search_title(webpage) or self._html_search_meta(
+        item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json)
+        video_id = compat_str(item['id'])
+        title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta(
             'title', webpage, 'title', fatal=True)
         description = self._og_search_description(webpage) or self._html_search_meta(
             'description', webpage, 'description')
-        thumbnail = self._og_search_thumbnail(webpage) or self._search_regex(
+        thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex(
             [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"',
              r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'],
             webpage, 'thumbnail', fatal=False)
@@ -61,13 +56,15 @@ class RDSIE(InfoExtractor):
         age_limit = self._family_friendly_search(webpage)
 
         return {
+            '_type': 'url_transparent',
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
+            'url': '9c9media:rds_web:%s' % video_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
             'timestamp': timestamp,
             'duration': duration,
             'age_limit': age_limit,
+            'ie_key': 'NineCNineMedia',
         }
diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py
new file mode 100644 (file)
index 0000000..f5b2f56
--- /dev/null
@@ -0,0 +1,148 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    strip_or_none,
+    unescapeHTML,
+    urlencode_postdata,
+)
+
+
+class RoosterTeethIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)'
+    _LOGIN_URL = 'https://roosterteeth.com/login'
+    _NETRC_MACHINE = 'roosterteeth'
+    _TESTS = [{
+        'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+        'md5': 'e2bd7764732d785ef797700a2489f212',
+        'info_dict': {
+            'id': '26576',
+            'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement',
+            'ext': 'mp4',
+            'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement',
+            'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5',
+            'thumbnail': 're:^https?://.*\.png$',
+            'series': 'Million Dollars, But...',
+            'episode': 'Million Dollars, But... The Game Announcement',
+            'comment_count': int,
+        },
+    }, {
+        'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31',
+        'only_matching': True,
+    }, {
+        'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts',
+        'only_matching': True,
+    }, {
+        'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow',
+        'only_matching': True,
+    }, {
+        'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better',
+        'only_matching': True,
+    }, {
+        # only available for FIRST members
+        'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one',
+        'only_matching': True,
+    }]
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None,
+            note='Downloading login page',
+            errnote='Unable to download login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'username': username,
+            'password': password,
+        })
+
+        login_request = self._download_webpage(
+            self._LOGIN_URL, None,
+            note='Logging in as %s' % username,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Referer': self._LOGIN_URL,
+            })
+
+        if not any(re.search(p, login_request) for p in (
+                r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"',
+                r'>Sign Out<')):
+            error = self._html_search_regex(
+                r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>',
+                login_request, 'alert', default=None, group='error')
+            if error:
+                raise ExtractorError('Unable to login: %s' % error, expected=True)
+            raise ExtractorError('Unable to log in')
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        episode = strip_or_none(unescapeHTML(self._search_regex(
+            (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<title>(?P<title>[^<]+)</title>'), webpage, 'title',
+            default=None, group='title')))
+
+        title = strip_or_none(self._og_search_title(
+            webpage, default=None)) or episode
+
+        m3u8_url = self._search_regex(
+            r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1',
+            webpage, 'm3u8 url', default=None, group='url')
+
+        if not m3u8_url:
+            if re.search(r'<div[^>]+class=["\']non-sponsor', webpage):
+                self.raise_login_required(
+                    '%s is only available for FIRST members' % display_id)
+
+            if re.search(r'<div[^>]+class=["\']golive-gate', webpage):
+                self.raise_login_required('%s is not available yet' % display_id)
+
+            raise ExtractorError('Unable to extract m3u8 URL')
+
+        formats = self._extract_m3u8_formats(
+            m3u8_url, display_id, ext='mp4',
+            entry_protocol='m3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        description = strip_or_none(self._og_search_description(webpage))
+        thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage))
+
+        series = self._search_regex(
+            (r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'),
+            webpage, 'series', fatal=False)
+
+        comment_count = int_or_none(self._search_regex(
+            r'>Comments \((\d+)\)<', webpage,
+            'comment count', fatal=False))
+
+        video_id = self._search_regex(
+            (r'containerId\s*=\s*["\']episode-(\d+)\1',
+             r'<div[^<]+id=["\']episode-(\d+)'), webpage,
+            'video id', default=display_id)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'series': series,
+            'episode': episode,
+            'comment_count': comment_count,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py
new file mode 100644 (file)
index 0000000..f8eda8d
--- /dev/null
@@ -0,0 +1,50 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    remove_start,
+)
+
+
+class RozhlasIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)'
+    _TESTS = [{
+        'url': 'http://prehravac.rozhlas.cz/audio/3421320',
+        'md5': '504c902dbc9e9a1fd50326eccf02a7e2',
+        'info_dict': {
+            'id': '3421320',
+            'ext': 'mp3',
+            'title': 'Echo Pavla Klusáka (30.06.2015 21:00)',
+            'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let'
+        }
+    }, {
+        'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed',
+        'skip_download': True,
+    }]
+
+    def _real_extract(self, url):
+        audio_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id)
+
+        title = self._html_search_regex(
+            r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track',
+            webpage, 'title', default=None) or remove_start(
+            self._og_search_title(webpage), 'Radio Wave - ')
+        description = self._html_search_regex(
+            r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track',
+            webpage, 'description', fatal=False, group='url')
+        duration = int_or_none(self._search_regex(
+            r'data-duration=["\'](\d+)', webpage, 'duration', default=None))
+
+        return {
+            'id': audio_id,
+            'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'vcodec': 'none',
+        }
index 4d612b5e3c1c1bfceec034d476cd5c0952da5b8c..f0250af8a4c0e06e2642c9cced9f6b330dc1d4f6 100644 (file)
@@ -14,7 +14,7 @@ class RtlNlIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         https?://(?:www\.)?
         (?:
-            rtlxl\.nl/\#!/[^/]+/|
+            rtlxl\.nl/[^\#]*\#!/[^/]+/|
             rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid=
         )
         (?P<id>[0-9a-f-]+)'''
@@ -67,6 +67,9 @@ class RtlNlIE(InfoExtractor):
     }, {
         'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0',
         'only_matching': True,
+    }, {
+        'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index f11e3588b0796718e2ecbe316b53b968a08df98c..34f9c4a991263572d935a2bfc79194f8594ecb78 100644 (file)
@@ -113,9 +113,9 @@ class RTVEALaCartaIE(InfoExtractor):
         png = self._download_webpage(png_request, video_id, 'Downloading url information')
         video_url = _decrypt_url(png)
         if not video_url.endswith('.f4m'):
-            video_url = video_url.replace(
-                'resources/', 'auth/resources/'
-            ).replace('.net.rtve', '.multimedia.cdn.rtve')
+            if '?' not in video_url:
+                video_url = video_url.replace('resources/', 'auth/resources/')
+            video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve')
 
         subtitles = None
         if info.get('sbtFile') is not None:
@@ -222,3 +222,34 @@ class RTVELiveIE(InfoExtractor):
             'formats': formats,
             'is_live': True,
         }
+
+
+class RTVETelevisionIE(InfoExtractor):
+    IE_NAME = 'rtve.es:television'
+    _VALID_URL = r'https?://www\.rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml'
+
+    _TEST = {
+        'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml',
+        'info_dict': {
+            'id': '3069778',
+            'ext': 'mp4',
+            'title': 'Documentos TV - La revolución del móvil',
+            'duration': 3496.948,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+
+        alacarta_url = self._search_regex(
+            r'data-location="alacarta_videos"[^<]+url&quot;:&quot;(http://www\.rtve\.es/alacarta.+?)&',
+            webpage, 'alacarta url', default=None)
+        if alacarta_url is None:
+            raise ExtractorError(
+                'The webpage doesn\'t contain any video', expected=True)
+
+        return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key())
index 4896d09d666e687010ae3cb6ebe0e2bfaec537d6..f6454c6b0082ed431fa74de49dd5881d3b0b7a0f 100644 (file)
@@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.rtvnh.nl/video/131946',
-        'md5': '6e1d0ab079e2a00b6161442d3ceacfc1',
+        'md5': 'cdbec9f44550763c8afc96050fa747dc',
         'info_dict': {
             'id': '131946',
             'ext': 'mp4',
@@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor):
             raise ExtractorError(
                 '%s returned error code %d' % (self.IE_NAME, status), expected=True)
 
-        formats = self._extract_smil_formats(
-            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False)
-
-        for item in meta['source']['fb']:
-            if item.get('type') == 'hls':
-                formats.extend(self._extract_m3u8_formats(
-                    item['file'], video_id, ext='mp4', entry_protocol='m3u8_native'))
-            elif item.get('type') == '':
-                formats.append({'url': item['file']})
+        formats = []
+        rtmp_formats = self._extract_smil_formats(
+            'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id)
+        formats.extend(rtmp_formats)
+
+        for rtmp_format in rtmp_formats:
+            rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+            rtsp_format = rtmp_format.copy()
+            del rtsp_format['play_path']
+            del rtsp_format['ext']
+            rtsp_format.update({
+                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                'url': rtmp_url.replace('rtmp://', 'rtsp://'),
+                'protocol': 'rtsp',
+            })
+            formats.append(rtsp_format)
+            http_base_url = rtmp_url.replace('rtmp://', 'http://')
+            formats.extend(self._extract_m3u8_formats(
+                http_base_url + '/playlist.m3u8', video_id, 'mp4',
+                'm3u8_native', m3u8_id='hls', fatal=False))
+            formats.extend(self._extract_f4m_formats(
+                http_base_url + '/manifest.f4m',
+                video_id, f4m_id='hds', fatal=False))
         self._sort_formats(formats)
 
         return {
diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py
new file mode 100644 (file)
index 0000000..38366b7
--- /dev/null
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .jwplatform import JWPlatformBaseIE
+from ..utils import (
+    js_to_json,
+    get_element_by_class,
+    unified_strdate,
+)
+
+
+class RudoIE(JWPlatformBaseIE):
+    _VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)'
+
+    _TEST = {
+        'url': 'http://rudo.video/vod/oTzw0MGnyG',
+        'md5': '2a03a5b32dd90a04c83b6d391cf7b415',
+        'info_dict': {
+            'id': 'oTzw0MGnyG',
+            'ext': 'mp4',
+            'title': 'Comentario Tomás Mosciatti',
+            'upload_date': '20160617',
+        },
+    }
+
+    @classmethod
+    def _extract_url(self, webpage):
+        mobj = re.search(
+            '<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id, encoding='iso-8859-1')
+
+        jwplayer_data = self._parse_json(self._search_regex(
+            r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data'), video_id,
+            transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s)))
+
+        info_dict = self._parse_jwplayer_data(
+            jwplayer_data, video_id, require_title=False, m3u8_id='hls')
+
+        info_dict.update({
+            'title': self._og_search_title(webpage),
+            'upload_date': unified_strdate(get_element_by_class('date', webpage)),
+        })
+
+        return info_dict
index 6ba91f202baadbfd72160cc739efde868a60d421..08ddbe3c4222879cade93fd9a5433e1d9c4e6e49 100644 (file)
@@ -75,7 +75,7 @@ class SafariBaseIE(InfoExtractor):
 class SafariIE(SafariBaseIE):
     IE_NAME = 'safari'
     IE_DESC = 'safaribooksonline.com online video'
-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?#&]+)\.html'
 
     _TESTS = [{
         'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
@@ -92,6 +92,9 @@ class SafariIE(SafariBaseIE):
         # non-digits in course id
         'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html',
         'only_matching': True,
+    }, {
+        'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -132,12 +135,15 @@ class SafariIE(SafariBaseIE):
 
 class SafariApiIE(SafariBaseIE):
     IE_NAME = 'safari:api'
-    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
         'only_matching': True,
-    }
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index 759898a492f43c67179409c563be42e864deae5f..96e43af849bc9e6b90bbba71a38e7155ef864268 100644 (file)
@@ -1,18 +1,12 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import itertools
 import json
-import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
-    js_to_json,
     mimetype2ext,
-    sanitized_Request,
-    unified_strdate,
 )
 
 
@@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Xyce Software Training - Section 1',
             'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
-            'upload_date': '20120904',
+            'upload_date': '20120409',
+            'timestamp': 1333983600,
             'duration': 7794,
         }
     }
@@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
-        webpage = self._download_webpage(req, video_id)
+        presentation_data = self._download_json(
+            'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions',
+            video_id, data=json.dumps({
+                'getPlayerOptionsRequest': {
+                    'ResourceId': video_id,
+                    'QueryString': '',
+                }
+            }), headers={
+                'Content-Type': 'application/json; charset=utf-8',
+            })['d']['Presentation']
 
-        js_path = self._search_regex(
-            r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
-            webpage, 'JS code URL')
-        js_url = compat_urlparse.urljoin(url, js_path)
-
-        js_code = self._download_webpage(
-            js_url, video_id, note='Downloading player')
-
-        def extract_str(key, **args):
-            return self._search_regex(
-                r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
-                js_code, key, **args)
-
-        def extract_data(key, **args):
-            data_json = extract_str(key, **args)
-            if data_json is None:
-                return data_json
-            return self._parse_json(
-                data_json, video_id, transform_source=js_to_json)
+        title = presentation_data['Title']
 
         formats = []
-        for i in itertools.count():
-            fd = extract_data('VideoUrls[%d]' % i, default=None)
-            if fd is None:
-                break
-            formats.append({
-                'format_id': '%s' % i,
-                'format_note': fd['MimeType'].partition('/')[2],
-                'ext': mimetype2ext(fd['MimeType']),
-                'url': fd['Location'],
-                'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
-            })
+        for stream in presentation_data.get('Streams', []):
+            for fd in stream.get('VideoUrls', []):
+                formats.append({
+                    'format_id': fd['MediaType'],
+                    'format_note': fd['MimeType'].partition('/')[2],
+                    'ext': mimetype2ext(fd['MimeType']),
+                    'url': fd['Location'],
+                    'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
+                })
         self._sort_formats(formats)
 
-        slide_baseurl = compat_urlparse.urljoin(
-            url, extract_data('SlideBaseUrl'))
-        slide_template = slide_baseurl + re.sub(
-            r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
-        slides = []
-        last_slide_time = 0
-        for i in itertools.count(1):
-            sd = extract_str('Slides[%d]' % i, default=None)
-            if sd is None:
-                break
-            timestamp = int_or_none(self._search_regex(
-                r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
-                sd, 'slide %s timestamp' % i, fatal=False))
-            slides.append({
-                'url': slide_template % i,
-                'duration': timestamp - last_slide_time,
-            })
-            last_slide_time = timestamp
-        formats.append({
-            'format_id': 'slides',
-            'protocol': 'slideshow',
-            'url': json.dumps(slides),
-            'preference': -10000,  # Downloader not yet written
-        })
-        self._sort_formats(formats)
-
-        title = extract_data('Title')
-        description = extract_data('Description', fatal=False)
-        duration = int_or_none(extract_data(
-            'Duration', fatal=False), scale=1000)
-        upload_date = unified_strdate(extract_data('AirDate', fatal=False))
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
+            'description': presentation_data.get('Description'),
             'formats': formats,
-            'upload_date': upload_date,
-            'duration': duration,
+            'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000),
+            'duration': int_or_none(presentation_data.get('Duration'), 1000),
         }
index 1c636f67237af84d29b36d8340579811b692ef4f..2dbe490bba7717a7719290113f26ed5c795ae218 100644 (file)
@@ -4,33 +4,43 @@ from __future__ import unicode_literals
 import re
 
 from .jwplatform import JWPlatformBaseIE
-from ..compat import compat_parse_qs
 from ..utils import (
-    ExtractorError,
-    parse_duration,
+    float_or_none,
+    parse_iso8601,
+    update_url_query,
 )
 
 
 class SendtoNewsIE(JWPlatformBaseIE):
-    _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)'
+    _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)'
 
     _TEST = {
         # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/
-        'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes',
+        'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES',
         'info_dict': {
-            'id': 'GxfCe0Zo7D-175909-5588',
-            'ext': 'mp4',
-            'title': 'Recap: CLE 15, CIN 6',
-            'description': '5/16/16: Indians\' bats explode for 15 runs in a win',
-            'duration': 49,
+            'id': 'GxfCe0Zo7D-175909-5588'
         },
+        'playlist_count': 9,
+        # test the first video only to prevent lengthy tests
+        'playlist': [{
+            'info_dict': {
+                'id': '198180',
+                'ext': 'mp4',
+                'title': 'Recap: CLE 5, LAA 4',
+                'description': '8/14/16: Naquin, Almonte lead Indians in 5-4 win',
+                'duration': 57.343,
+                'thumbnail': 're:https?://.*\.jpg$',
+                'upload_date': '20160815',
+                'timestamp': 1471221961,
+            },
+        }],
         'params': {
             # m3u8 download
             'skip_download': True,
         },
     }
 
-    _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s'
+    _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s'
 
     @classmethod
     def _extract_url(cls, webpage):
@@ -39,48 +49,41 @@ class SendtoNewsIE(JWPlatformBaseIE):
                 .*\bSC=(?P<SC>[0-9a-zA-Z-]+).*
             \1>''', webpage)
         if mobj:
-            sk, mk, pk = mobj.group('SC').split('-')
-            return cls._URL_TEMPLATE % (sk, mk, pk)
+            sc = mobj.group('SC')
+            return cls._URL_TEMPLATE % sc
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        params = compat_parse_qs(mobj.group('query'))
-
-        if 'SK' not in params or 'MK' not in params or 'PK' not in params:
-            raise ExtractorError('Invalid URL', expected=True)
-
-        video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]])
-
-        webpage = self._download_webpage(url, video_id)
-
-        jwplayer_data_str = self._search_regex(
-            r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data')
-        js_vars = {
-            'w': 1024,
-            'h': 768,
-            'modeVar': 'html5',
-        }
-        for name, val in js_vars.items():
-            js_val = '%d' % val if isinstance(val, int) else '"%s"' % val
-            jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val)
-
-        info_dict = self._parse_jwplayer_data(
-            self._parse_json(jwplayer_data_str, video_id),
-            video_id, require_title=False, rtmp_params={'no_resume': True})
-
-        title = self._html_search_regex(
-            r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title')
-        description = self._html_search_regex(
-            r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage,
-            'description', fatal=False)
-        duration = parse_duration(self._html_search_regex(
-            r'<div[^>]+class="embedDetails">([0-9:]+)', webpage,
-            'duration', fatal=False))
-
-        info_dict.update({
-            'title': title,
-            'description': description,
-            'duration': duration,
-        })
-
-        return info_dict
+        playlist_id = self._match_id(url)
+
+        data_url = update_url_query(
+            url.replace('embedplayer.php', 'data_read.php'),
+            {'cmd': 'loadInitial'})
+        playlist_data = self._download_json(data_url, playlist_id)
+
+        entries = []
+        for video in playlist_data['playlistData'][0]:
+            info_dict = self._parse_jwplayer_data(
+                video['jwconfiguration'],
+                require_title=False, rtmp_params={'no_resume': True})
+
+            thumbnails = []
+            if video.get('thumbnailUrl'):
+                thumbnails.append({
+                    'id': 'normal',
+                    'url': video['thumbnailUrl'],
+                })
+            if video.get('smThumbnailUrl'):
+                thumbnails.append({
+                    'id': 'small',
+                    'url': video['smThumbnailUrl'],
+                })
+            info_dict.update({
+                'title': video['S_headLine'],
+                'description': video.get('S_fullStory'),
+                'thumbnails': thumbnails,
+                'duration': float_or_none(video.get('SM_length')),
+                'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '),
+            })
+            entries.append(info_dict)
+
+        return self.playlist_result(entries, playlist_id)
index d95ea06be56844deb3c34276fb1c4e7e18d64173..ca286abb1f7df74c46a62f995475fa771245f282 100644 (file)
@@ -2,11 +2,11 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlencode
 from ..utils import (
     ExtractorError,
     int_or_none,
     parse_iso8601,
+    str_or_none,
 )
 
 
@@ -33,45 +33,27 @@ class ShahidIE(InfoExtractor):
         'only_matching': True
     }]
 
-    def _handle_error(self, response):
-        if not isinstance(response, dict):
-            return
-        error = response.get('error')
+    def _call_api(self, path, video_id, note):
+        data = self._download_json(
+            'http://api.shahid.net/api/v1_1/' + path, video_id, note, query={
+                'apiKey': 'sh@hid0nlin3',
+                'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
+            }).get('data', {})
+
+        error = data.get('error')
         if error:
             raise ExtractorError(
                 '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())),
                 expected=True)
 
-    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
-        response = super(ShahidIE, self)._download_json(url, video_id, note)['data']
-        self._handle_error(response)
-        return response
+        return data
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
-
-        api_vars = {
-            'id': video_id,
-            'type': 'player',
-            'url': 'http://api.shahid.net/api/v1_1',
-            'playerType': 'episode',
-        }
-
-        flashvars = self._search_regex(
-            r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None)
-        if flashvars:
-            for key in api_vars.keys():
-                value = self._search_regex(
-                    r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key,
-                    flashvars, 'type', default=None, group='value')
-                if value:
-                    api_vars[key] = value
-
-        player = self._download_json(
-            'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html'
-            % (video_id, api_vars['type']), video_id, 'Downloading player JSON')
+        player = self._call_api(
+            'Content/Episode/%s' % video_id,
+            video_id, 'Downloading player JSON')
 
         if player.get('drm'):
             raise ExtractorError('This video is DRM protected.', expected=True)
@@ -79,22 +61,11 @@ class ShahidIE(InfoExtractor):
         formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4')
         self._sort_formats(formats)
 
-        video = self._download_json(
-            '%s/%s/%s?%s' % (
-                api_vars['url'], api_vars['playerType'], api_vars['id'],
-                compat_urllib_parse_urlencode({
-                    'apiKey': 'sh@hid0nlin3',
-                    'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=',
-                })),
-            video_id, 'Downloading video JSON')
-
-        video = video[api_vars['playerType']]
+        video = self._call_api(
+            'episode/%s' % video_id, video_id,
+            'Downloading video JSON')['episode']
 
         title = video['title']
-        description = video.get('description')
-        thumbnail = video.get('thumbnailUrl')
-        duration = int_or_none(video.get('duration'))
-        timestamp = parse_iso8601(video.get('referenceDate'))
         categories = [
             category['name']
             for category in video.get('genres', []) if 'name' in category]
@@ -102,10 +73,16 @@ class ShahidIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'duration': duration,
-            'timestamp': timestamp,
+            'description': video.get('description'),
+            'thumbnail': video.get('thumbnailUrl'),
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('referenceDate')),
             'categories': categories,
+            'series': video.get('showTitle') or video.get('showName'),
+            'season': video.get('seasonTitle'),
+            'season_number': int_or_none(video.get('seasonNumber')),
+            'season_id': str_or_none(video.get('seasonId')),
+            'episode_number': int_or_none(video.get('number')),
+            'episode_id': video_id,
             'formats': formats,
         }
index e7e5f653eb2117936f568195e508f9e7778b1085..d592dfeb8ed99d45d7c9b8245f5570e06f260a20 100644 (file)
@@ -6,7 +6,6 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
-    sanitized_Request,
     urlencode_postdata,
 )
 
@@ -37,28 +36,33 @@ class SharedIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+
+        webpage, urlh = self._download_webpage_handle(url, video_id)
 
         if '>File does not exist<' in webpage:
             raise ExtractorError(
                 'Video %s does not exist' % video_id, expected=True)
 
         download_form = self._hidden_inputs(webpage)
-        request = sanitized_Request(
-            url, urlencode_postdata(download_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 
         video_page = self._download_webpage(
-            request, video_id, 'Downloading video page')
+            urlh.geturl(), video_id, 'Downloading video page',
+            data=urlencode_postdata(download_form),
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Referer': urlh.geturl(),
+            })
 
         video_url = self._html_search_regex(
-            r'data-url="([^"]+)"', video_page, 'video URL')
+            r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            video_page, 'video URL', group='url')
         title = base64.b64decode(self._html_search_meta(
             'full:title', webpage, 'title').encode('utf-8')).decode('utf-8')
         filesize = int_or_none(self._html_search_meta(
             'full:size', webpage, 'file size', fatal=False))
         thumbnail = self._html_search_regex(
-            r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None)
+            r'data-poster=(["\'])(?P<url>(?:(?!\1).)+)\1',
+            video_page, 'thumbnail', default=None, group='url')
 
         return {
             'id': video_id,
diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py
new file mode 100644 (file)
index 0000000..d3aba58
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    qualities,
+    int_or_none,
+    mimetype2ext,
+    determine_ext,
+)
+
+
+class SixPlayIE(InfoExtractor):
+    _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320',
+        'md5': '42310bffe4ba3982db112b9cd3467328',
+        'info_dict': {
+            'id': '11495320',
+            'ext': 'mp4',
+            'title': 'Jamel et ses amis au Marrakech du rire 2015',
+            'description': 'md5:ba2149d5c321d5201b78070ee839d872',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        clip_data = self._download_json(
+            'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id,
+            video_id)
+        video_data = clip_data['videoInfo']
+
+        quality_key = qualities(['lq', 'sd', 'hq', 'hd'])
+        formats = []
+        for source in clip_data['sources']:
+            source_type, source_url = source.get('type'), source.get('src')
+            if not source_url or source_type == 'hls/primetime':
+                continue
+            ext = mimetype2ext(source_type) or determine_ext(source_url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    source_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_f4m_formats(
+                    source_url.replace('.m3u8', '.f4m'),
+                    video_id, f4m_id='hds', fatal=False))
+            elif ext == 'mp4':
+                quality = source.get('quality')
+                formats.append({
+                    'url': source_url,
+                    'format_id': quality,
+                    'quality': quality_key(quality),
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': video_data['title'].strip(),
+            'description': video_data.get('description'),
+            'duration': int_or_none(video_data.get('duration')),
+            'series': video_data.get('titlePgm'),
+            'formats': formats,
+        }
index 05e1b02ada567c8d88c0dbbba41171d0df1ac680..fffc9aa2277e3a0f35d24c585eec5be1e59d5c17 100644 (file)
@@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE):
 
 
 class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE):
-    IE_NAME = 'skynewsarabia:video'
+    IE_NAME = 'skynewsarabia:article'
     _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9',
diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py
new file mode 100644 (file)
index 0000000..9dc78c7
--- /dev/null
@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SkySportsIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
+        'md5': 'c44a1db29f27daf9a0003e010af82100',
+        'info_dict': {
+            'id': '10328419',
+            'ext': 'flv',
+            'title': 'Bale: Its our time to shine',
+            'description': 'md5:9fd1de3614d525f5addda32ac3c482c9',
+        },
+        'add_ie': ['Ooyala'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        return {
+            '_type': 'url_transparent',
+            'id': video_id,
+            'url': 'ooyala:%s' % self._search_regex(
+                r'data-video-id="([^"]+)"', webpage, 'ooyala id'),
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'ie_key': 'Ooyala',
+        }
index 0b717a1e42b8dd2c3d8a88d602f001876cf99e03..4967c1b7752e4ebfd0c1aac9b0d079c2dc843363 100644 (file)
@@ -9,6 +9,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    get_element_by_id,
 )
 
 
@@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor):
         bucket = info['jsplayer']['video_bucket']
         ext = info['jsplayer']['video_extension']
         video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext)
-        description = self._html_search_regex(
+        description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex(
             r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage,
             'description', fatal=False)
 
@@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor):
             'ext': ext,
             'url': video_url,
             'thumbnail': info['slideshow']['pin_image_url'],
-            'description': description,
+            'description': description.strip() if description else None,
         }
index 5c3fd0fece8dc8b32a3d05bea8ed4dedf430f1c5..1143587868c842704901df4484482e7def3e64cc 100644 (file)
@@ -13,20 +13,21 @@ from ..utils import (
     sanitized_Request,
     unified_strdate,
     urlencode_postdata,
+    xpath_text,
 )
 
 
 class SmotriIE(InfoExtractor):
     IE_DESC = 'Smotri.com'
     IE_NAME = 'smotri'
-    _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
+    _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
     _NETRC_MACHINE = 'smotri'
 
     _TESTS = [
         # real video id 2610366
         {
             'url': 'http://smotri.com/video/view/?id=v261036632ab',
-            'md5': '2a7b08249e6f5636557579c368040eb9',
+            'md5': '02c0dfab2102984e9c5bb585cc7cc321',
             'info_dict': {
                 'id': 'v261036632ab',
                 'ext': 'mp4',
@@ -174,11 +175,11 @@ class SmotriIE(InfoExtractor):
         if video_password:
             video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
 
-        request = sanitized_Request(
-            'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form))
-        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-
-        video = self._download_json(request, video_id, 'Downloading video JSON')
+        video = self._download_json(
+            'http://smotri.com/video/view/url/bot/',
+            video_id, 'Downloading video JSON',
+            data=urlencode_postdata(video_form),
+            headers={'Content-Type': 'application/x-www-form-urlencoded'})
 
         video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
 
@@ -196,11 +197,11 @@ class SmotriIE(InfoExtractor):
                 raise ExtractorError(msg, expected=True)
 
         title = video['title']
-        thumbnail = video['_imgURL']
-        upload_date = unified_strdate(video['added'])
-        uploader = video['userNick']
-        uploader_id = video['userLogin']
-        duration = int_or_none(video['duration'])
+        thumbnail = video.get('_imgURL')
+        upload_date = unified_strdate(video.get('added'))
+        uploader = video.get('userNick')
+        uploader_id = video.get('userLogin')
+        duration = int_or_none(video.get('duration'))
 
         # Video JSON does not provide enough meta data
         # We will extract some from the video web page instead
@@ -209,7 +210,7 @@ class SmotriIE(InfoExtractor):
 
         # Warning if video is unavailable
         warning = self._html_search_regex(
-            r'<div class="videoUnModer">(.*?)</div>', webpage,
+            r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage,
             'warning message', default=None)
         if warning is not None:
             self._downloader.report_warning(
@@ -217,20 +218,22 @@ class SmotriIE(InfoExtractor):
                 (video_id, warning))
 
         # Adult content
-        if re.search('EroConfirmText">', webpage) is not None:
+        if 'EroConfirmText">' in webpage:
             self.report_age_confirmation()
             confirm_string = self._html_search_regex(
-                r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
+                r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id,
                 webpage, 'confirm string')
             confirm_url = webpage_url + '&confirm=%s' % confirm_string
-            webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)')
+            webpage = self._download_webpage(
+                confirm_url, video_id,
+                'Downloading video page (age confirmed)')
             adult_content = True
         else:
             adult_content = False
 
         view_count = self._html_search_regex(
-            'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
-            webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL)
+            r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>',
+            webpage, 'view count', fatal=False)
 
         return {
             'id': video_id,
@@ -249,37 +252,33 @@ class SmotriIE(InfoExtractor):
 class SmotriCommunityIE(InfoExtractor):
     IE_DESC = 'Smotri.com community videos'
     IE_NAME = 'smotri:community'
-    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
+    _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)'
     _TEST = {
         'url': 'http://smotri.com/community/video/kommuna',
         'info_dict': {
             'id': 'kommuna',
-            'title': 'КПРФ',
         },
         'playlist_mincount': 4,
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        community_id = mobj.group('communityid')
+        community_id = self._match_id(url)
 
-        url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id
-        rss = self._download_xml(url, community_id, 'Downloading community RSS')
+        rss = self._download_xml(
+            'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id,
+            community_id, 'Downloading community RSS')
 
-        entries = [self.url_result(video_url.text, 'Smotri')
-                   for video_url in rss.findall('./channel/item/link')]
+        entries = [
+            self.url_result(video_url.text, SmotriIE.ie_key())
+            for video_url in rss.findall('./channel/item/link')]
 
-        description_text = rss.find('./channel/description').text
-        community_title = self._html_search_regex(
-            '^Видео сообщества "([^"]+)"$', description_text, 'community title')
-
-        return self.playlist_result(entries, community_id, community_title)
+        return self.playlist_result(entries, community_id)
 
 
 class SmotriUserIE(InfoExtractor):
     IE_DESC = 'Smotri.com user videos'
     IE_NAME = 'smotri:user'
-    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
+    _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)'
     _TESTS = [{
         'url': 'http://smotri.com/user/inspector',
         'info_dict': {
@@ -290,19 +289,19 @@ class SmotriUserIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        user_id = mobj.group('userid')
+        user_id = self._match_id(url)
 
-        url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id
-        rss = self._download_xml(url, user_id, 'Downloading user RSS')
+        rss = self._download_xml(
+            'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id,
+            user_id, 'Downloading user RSS')
 
         entries = [self.url_result(video_url.text, 'Smotri')
                    for video_url in rss.findall('./channel/item/link')]
 
-        description_text = rss.find('./channel/description').text
-        user_nickname = self._html_search_regex(
-            '^Видео режиссера (.*)$', description_text,
-            'user nickname')
+        description_text = xpath_text(rss, './channel/description') or ''
+        user_nickname = self._search_regex(
+            '^Видео режиссера (.+)$', description_text,
+            'user nickname', fatal=False)
 
         return self.playlist_result(entries, user_id, user_nickname)
 
@@ -310,11 +309,11 @@ class SmotriUserIE(InfoExtractor):
 class SmotriBroadcastIE(InfoExtractor):
     IE_DESC = 'Smotri.com broadcasts'
     IE_NAME = 'smotri:broadcast'
-    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*'
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        broadcast_id = mobj.group('broadcastid')
+        broadcast_id = mobj.group('id')
 
         broadcast_url = 'http://' + mobj.group('url')
         broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
@@ -328,7 +327,8 @@ class SmotriBroadcastIE(InfoExtractor):
 
             (username, password) = self._get_login_info()
             if username is None:
-                self.raise_login_required('Erotic broadcasts allowed only for registered users')
+                self.raise_login_required(
+                    'Erotic broadcasts allowed only for registered users')
 
             login_form = {
                 'login-hint53': '1',
@@ -343,8 +343,9 @@ class SmotriBroadcastIE(InfoExtractor):
             broadcast_page = self._download_webpage(
                 request, broadcast_id, 'Logging in and confirming age')
 
-            if re.search('>Неверный логин или пароль<', broadcast_page) is not None:
-                raise ExtractorError('Unable to log in: bad username or password', expected=True)
+            if '>Неверный логин или пароль<' in broadcast_page:
+                raise ExtractorError(
+                    'Unable to log in: bad username or password', expected=True)
 
             adult_content = True
         else:
@@ -383,11 +384,11 @@ class SmotriBroadcastIE(InfoExtractor):
 
             broadcast_playpath = broadcast_json['_streamName']
             broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
-            broadcast_thumbnail = broadcast_json['_imgURL']
+            broadcast_thumbnail = broadcast_json.get('_imgURL')
             broadcast_title = self._live_title(broadcast_json['title'])
-            broadcast_description = broadcast_json['description']
-            broadcaster_nick = broadcast_json['nick']
-            broadcaster_login = broadcast_json['login']
+            broadcast_description = broadcast_json.get('description')
+            broadcaster_nick = broadcast_json.get('nick')
+            broadcaster_login = broadcast_json.get('login')
             rtmp_conn = 'S:%s' % uuid.uuid4().hex
         except KeyError:
             if protected_broadcast:
index 49e5d09ae450d11bb567a2fe95ecba55998c8b42..48e2ba2dd16b0df190956c4d5e71b9206d30e531 100644 (file)
@@ -8,19 +8,16 @@ from ..compat import (
     compat_str,
     compat_urllib_parse_urlencode,
 )
-from ..utils import (
-    ExtractorError,
-    sanitized_Request,
-)
+from ..utils import ExtractorError
 
 
 class SohuIE(InfoExtractor):
     _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?'
 
+    # Sohu videos give different MD5 sums on Travis CI and my machine
     _TESTS = [{
         'note': 'This video is available only in Mainland China',
         'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super',
-        'md5': '29175c8cadd8b5cc4055001e85d6b372',
         'info_dict': {
             'id': '382479172',
             'ext': 'mp4',
@@ -29,7 +26,6 @@ class SohuIE(InfoExtractor):
         'skip': 'On available in China',
     }, {
         'url': 'http://tv.sohu.com/20150305/n409385080.shtml',
-        'md5': '699060e75cf58858dd47fb9c03c42cfb',
         'info_dict': {
             'id': '409385080',
             'ext': 'mp4',
@@ -37,7 +33,6 @@ class SohuIE(InfoExtractor):
         }
     }, {
         'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml',
-        'md5': '9bf34be48f2f4dadcb226c74127e203c',
         'info_dict': {
             'id': '78693464',
             'ext': 'mp4',
@@ -51,7 +46,6 @@ class SohuIE(InfoExtractor):
             'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
         },
         'playlist': [{
-            'md5': 'bdbfb8f39924725e6589c146bc1883ad',
             'info_dict': {
                 'id': '78910339_part1',
                 'ext': 'mp4',
@@ -59,7 +53,6 @@ class SohuIE(InfoExtractor):
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': '3e1f46aaeb95354fd10e7fca9fc1804e',
             'info_dict': {
                 'id': '78910339_part2',
                 'ext': 'mp4',
@@ -67,7 +60,6 @@ class SohuIE(InfoExtractor):
                 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆',
             }
         }, {
-            'md5': '8407e634175fdac706766481b9443450',
             'info_dict': {
                 'id': '78910339_part3',
                 'ext': 'mp4',
@@ -96,15 +88,10 @@ class SohuIE(InfoExtractor):
             else:
                 base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid='
 
-            req = sanitized_Request(base_data_url + vid_id)
-
-            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-            if cn_verification_proxy:
-                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
             return self._download_json(
-                req, video_id,
-                'Downloading JSON data for %s' % vid_id)
+                base_data_url + vid_id, video_id,
+                'Downloading JSON data for %s' % vid_id,
+                headers=self.geo_verification_headers())
 
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('id')
diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py
new file mode 100644 (file)
index 0000000..accd112
--- /dev/null
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class SonyLIVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)'
+    _TESTS = [{
+        'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight",
+        'info_dict': {
+            'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight",
+            'id': '5024612095001',
+            'ext': 'mp4',
+            'upload_date': '20160707',
+            'description': 'md5:7f28509a148d5be9d0782b4d5106410d',
+            'uploader_id': '4338955589001',
+            'timestamp': 1467870968,
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': ['BrightcoveNew'],
+    }, {
+        'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)',
+        'only_matching': True,
+    }]
+
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        brightcove_id = self._match_id(url)
+        return self.url_result(
+            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
index 194dabc71d84072fc64afd50baa3b80467c0808f..aeae931a20774ff9bc5c3988273a0d008e293954 100644 (file)
@@ -119,6 +119,12 @@ class SoundcloudIE(InfoExtractor):
     _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea'
     _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
 
+    @staticmethod
+    def _extract_urls(webpage):
+        return [m.group('url') for m in re.finditer(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
+            webpage)]
+
     def report_resolve(self, video_id):
         """Report information extraction."""
         self.to_screen('%s: Resolving id' % video_id)
index 87b6504682a4feb50ce65c3eea43d07041aae31c..a147f7db19a11c27df55f783f1faf965af8bf644 100644 (file)
@@ -17,6 +17,8 @@ class SouthParkIE(MTVServicesInfoExtractor):
             'ext': 'mp4',
             'title': 'South Park|Bat Daded',
             'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.',
+            'timestamp': 1112760000,
+            'upload_date': '20050406',
         },
     }]
 
@@ -28,6 +30,10 @@ class SouthParkEsIE(SouthParkIE):
 
     _TESTS = [{
         'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+        'info_dict': {
+            'title': 'Cartman Consigue Una Sonda Anal',
+            'description': 'Cartman Consigue Una Sonda Anal',
+        },
         'playlist_count': 4,
     }]
 
@@ -42,17 +48,27 @@ class SouthParkDeIE(SouthParkIE):
         'info_dict': {
             'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2',
             'ext': 'mp4',
-            'title': 'The Government Won\'t Respect My Privacy',
+            'title': 'South Park|The Government Won\'t Respect My Privacy',
             'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+            'timestamp': 1380160800,
+            'upload_date': '20130926',
         },
     }, {
         # non-ASCII characters in initial URL
         'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen',
-        'playlist_count': 4,
+        'info_dict': {
+            'title': 'Hashtag „Aufwärmen“',
+            'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
+        },
+        'playlist_count': 3,
     }, {
         # non-ASCII characters in redirect URL
         'url': 'http://www.southpark.de/alle-episoden/s18e09',
-        'playlist_count': 4,
+        'info_dict': {
+            'title': 'Hashtag „Aufwärmen“',
+            'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.',
+        },
+        'playlist_count': 3,
     }]
 
 
@@ -63,7 +79,11 @@ class SouthParkNlIE(SouthParkIE):
 
     _TESTS = [{
         'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free',
-        'playlist_count': 4,
+        'info_dict': {
+            'title': 'Freemium Isn\'t Free',
+            'description': 'Stan is addicted to the new Terrance and Phillip mobile game.',
+        },
+        'playlist_mincount': 3,
     }]
 
 
@@ -74,5 +94,9 @@ class SouthParkDkIE(SouthParkIE):
 
     _TESTS = [{
         'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop',
-        'playlist_count': 4,
+        'info_dict': {
+            'title': 'Grounded Vindaloop',
+            'description': 'Butters is convinced he\'s living in a virtual reality.',
+        },
+        'playlist_mincount': 3,
     }]
index 39a7aaf9d630203dc1796b3b5621aad3c433f575..3c552807e268bb50a6a7d178e61d0834b0c48a42 100644 (file)
@@ -4,8 +4,13 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urlparse
 from .spiegeltv import SpiegeltvIE
+from ..compat import compat_urlparse
+from ..utils import (
+    extract_attributes,
+    unified_strdate,
+    get_element_by_attribute,
+)
 
 
 class SpiegelIE(InfoExtractor):
@@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor):
             'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
             'description': 'md5:8029d8310232196eb235d27575a8b9f4',
             'duration': 49,
+            'upload_date': '20130311',
         },
     }, {
         'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
@@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor):
             'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
             'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
             'duration': 983,
+            'upload_date': '20131115',
         },
     }, {
         'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
@@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
             'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
+            'upload_date': '20140904',
         }
     }, {
         'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
@@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor):
         if SpiegeltvIE.suitable(handle.geturl()):
             return self.url_result(handle.geturl(), 'Spiegeltv')
 
-        title = re.sub(r'\s+', ' ', self._html_search_regex(
-            r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>',
-            webpage, 'title'))
-        description = self._html_search_meta('description', webpage, 'description')
+        video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default=''))
+
+        title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage)
+        description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description')
 
         base_url = self._search_regex(
             [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'],
@@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'description': description,
+            'description': description.strip() if description else None,
             'duration': duration,
+            'upload_date': unified_strdate(video_data.get('data-video-date')),
             'formats': formats,
         }
 
@@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
             'description': 're:^Patrick Kämnitz gehört.{100,}',
+            'upload_date': '20140825',
         },
     }, {
         'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
index 182f286dfefc4023483c422fbf6c6a73203b86ff..218785ee4e11045bcbb09416cd3bc6862a757ac0 100644 (file)
@@ -4,26 +4,31 @@ from .mtv import MTVServicesInfoExtractor
 
 
 class SpikeIE(MTVServicesInfoExtractor):
-    _VALID_URL = r'''(?x)https?://
-        (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+|
-         m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+))
-        '''
-    _TEST = {
+    _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
+    _TESTS = [{
         'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle',
         'md5': '1a9265f32b0c375793d6c4ce45255256',
         'info_dict': {
             'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba',
             'ext': 'mp4',
-            'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?',
+            'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?',
             'description': 'md5:fbed7e82ed5fad493615b3094a9499cb',
+            'timestamp': 1388120400,
+            'upload_date': '20131227',
         },
-    }
+    }, {
+        'url': 'http://www.spike.com/video-clips/lhtu8m/',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.spike.com/video-clips/lhtu8m',
+        'only_matching': True,
+    }, {
+        'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg',
+        'only_matching': True,
+    }, {
+        'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page',
+        'only_matching': True,
+    }]
 
     _FEED_URL = 'http://www.spike.com/feeds/mrss/'
     _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s'
-
-    def _real_extract(self, url):
-        mobile_id = self._match_id(url)
-        if mobile_id:
-            url = 'http://www.spike.com/video-clips/%s' % mobile_id
-        return super(SpikeIE, self)._real_extract(url)
index 74d01183f5f396fb9499a8426775886faed5961d..409d5030422652e26fff1102c7fee1302f2b07b9 100644 (file)
@@ -9,8 +9,9 @@ from ..utils import (
 
 
 class SRMediathekIE(ARDMediathekIE):
+    IE_NAME = 'sr:mediathek'
     IE_DESC = 'Saarländischer Rundfunk'
-    _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)'
 
     _TESTS = [{
         'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455',
@@ -34,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE):
             # m3u8 download
             'skip_download': True,
         },
-        'expected_warnings': ['Unable to download f4m manifest']
+    }, {
+        'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index d5c852f5207bdad9510a720d69b0cd70527f9f3f..0f8782d038c9fdadf903b05479ff468a039c6aa4 100644 (file)
@@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor):
 
         episode = self._parse_json(
             js_to_json(self._search_regex(
-                r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')),
+                r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')),
             display_id)['config']['episode']
 
         title = unescapeHTML(episode['title'])
diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py
new file mode 100644 (file)
index 0000000..1c61437
--- /dev/null
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+)
+
+
+class StreamableIE(InfoExtractor):
+    _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)'
+    _TESTS = [
+        {
+            'url': 'https://streamable.com/dnd1',
+            'md5': '3e3bc5ca088b48c2d436529b64397fef',
+            'info_dict': {
+                'id': 'dnd1',
+                'ext': 'mp4',
+                'title': 'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'uploader': 'teabaker',
+                'timestamp': 1454964157.35115,
+                'upload_date': '20160208',
+                'duration': 61.516,
+                'view_count': int,
+            }
+        },
+        # older video without bitrate, width/height, etc. info
+        {
+            'url': 'https://streamable.com/moo',
+            'md5': '2cf6923639b87fba3279ad0df3a64e73',
+            'info_dict': {
+                'id': 'moo',
+                'ext': 'mp4',
+                'title': '"Please don\'t eat me!"',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'timestamp': 1426115495,
+                'upload_date': '20150311',
+                'duration': 12,
+                'view_count': int,
+            }
+        },
+        {
+            'url': 'https://streamable.com/e/dnd1',
+            'only_matching': True,
+        }
+    ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        # Note: Using the ajax API, as the public Streamable API doesn't seem
+        # to return video info like the title properly sometimes, and doesn't
+        # include info like the video duration
+        video = self._download_json(
+            'https://streamable.com/ajax/videos/%s' % video_id, video_id)
+
+        # Format IDs:
+        # 0 The video is being uploaded
+        # 1 The video is being processed
+        # 2 The video has at least one file ready
+        # 3 The video is unavailable due to an error
+        status = video.get('status')
+        if status != 2:
+            raise ExtractorError(
+                'This video is currently unavailable. It may still be uploading or processing.',
+                expected=True)
+
+        title = video.get('reddit_title') or video['title']
+
+        formats = []
+        for key, info in video['files'].items():
+            if not info.get('url'):
+                continue
+            formats.append({
+                'format_id': key,
+                'url': self._proto_relative_url(info['url']),
+                'width': int_or_none(info.get('width')),
+                'height': int_or_none(info.get('height')),
+                'filesize': int_or_none(info.get('size')),
+                'fps': int_or_none(info.get('framerate')),
+                'vbr': float_or_none(info.get('bitrate'), 1000)
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video.get('description'),
+            'thumbnail': self._proto_relative_url(video.get('thumbnail_url')),
+            'uploader': video.get('owner', {}).get('user_name'),
+            'timestamp': float_or_none(video.get('date_added')),
+            'duration': float_or_none(video.get('duration')),
+            'view_count': int_or_none(video.get('plays')),
+            'formats': formats
+        }
index e527aa97188b1860e054f8af7c7bd7a33301729e..ef9be7926866f6420d802f14cfdf83b3a9e4f69b 100644 (file)
@@ -12,25 +12,29 @@ from ..utils import (
 
 
 class SunPornoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)'
+    _TESTS = [{
         'url': 'http://www.sunporno.com/videos/807778/',
-        'md5': '6457d3c165fd6de062b99ef6c2ff4c86',
+        'md5': '507887e29033502f29dba69affeebfc9',
         'info_dict': {
             'id': '807778',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'md5:0a400058e8105d39e35c35e7c5184164',
             'description': 'md5:a31241990e1bd3a64e72ae99afb325fb',
             'thumbnail': 're:^https?://.*\.jpg$',
             'duration': 302,
             'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://embeds.sunporno.com/embed/807778',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://www.sunporno.com/videos/%s' % video_id, video_id)
 
         title = self._html_search_regex(
             r'<title>([^<]+)</title>', webpage, 'title')
@@ -40,7 +44,8 @@ class SunPornoIE(InfoExtractor):
             r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False)
 
         duration = parse_duration(self._search_regex(
-            r'itemprop="duration">\s*(\d+:\d+)\s*<',
+            (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<',
+             r'>Duration:\s*<span[^>]+>\s*(\d+:\d+)\s*<'),
             webpage, 'duration', fatal=False))
 
         view_count = int_or_none(self._html_search_regex(
@@ -48,7 +53,7 @@ class SunPornoIE(InfoExtractor):
             webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
             r'(\d+)</b> Comments?',
-            webpage, 'comment count', fatal=False))
+            webpage, 'comment count', fatal=False, default=None))
 
         formats = []
         quality = qualities(['mp4', 'flv'])
index 67f56fab8cb299d6644c8765fbeaaec1562e4b7f..1c04dfb7bf757477d134cc7caa223ab47d0800ba 100644 (file)
@@ -120,7 +120,7 @@ class SVTIE(SVTBaseIE):
 
 class SVTPlayIE(SVTBaseIE):
     IE_DESC = 'SVT Play and Öppet arkiv'
-    _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
         'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
@@ -141,6 +141,9 @@ class SVTPlayIE(SVTBaseIE):
         # geo restricted to Sweden
         'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
         'only_matching': True,
+    }, {
+        'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 5ca079f880717933a4216de6399046a44970d29b..cc81f60036794da7ba9f0f3f584e632aefb557ee 100644 (file)
@@ -1,46 +1,58 @@
 from __future__ import unicode_literals
 
-import re
+from .adobepass import AdobePassIE
+from ..utils import (
+    update_url_query,
+    smuggle_url,
+)
 
-from .common import InfoExtractor
-
-
-class SyfyIE(InfoExtractor):
-    _VALID_URL = r'https?://www\.syfy\.com/(?:videos/.+?vid:(?P<id>[0-9]+)|(?!videos)(?P<video_name>[^/]+)(?:$|[?#]))'
 
+class SyfyIE(AdobePassIE):
+    _VALID_URL = r'https?://www\.syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)'
     _TESTS = [{
-        'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458',
+        'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer',
         'info_dict': {
-            'id': 'NmqMrGnXvmO1',
-            'ext': 'flv',
-            'title': 'George Lucas has Advice for his Daughter',
-            'description': 'Listen to what insights George Lucas give his daughter Amanda.',
+            'id': '2968097',
+            'ext': 'mp4',
+            'title': 'The Internet Ruined My Life: Season 1 Trailer',
+            'description': 'One tweet, one post, one click, can destroy everything.',
+            'uploader': 'NBCU-MPAT',
+            'upload_date': '20170113',
+            'timestamp': 1484345640,
         },
-        'add_ie': ['ThePlatform'],
-    }, {
-        'url': 'http://www.syfy.com/wilwheaton',
-        'md5': '94dfa54ee3ccb63295b276da08c415f6',
-        'info_dict': {
-            'id': '4yoffOOXC767',
-            'ext': 'flv',
-            'title': 'The Wil Wheaton Project - Premiering May 27th at 10/9c.',
-            'description': 'The Wil Wheaton Project premieres May 27th at 10/9c. Don\'t miss it.',
+        'params': {
+            # m3u8 download
+            'skip_download': True,
         },
         'add_ie': ['ThePlatform'],
-        'skip': 'Blocked outside the US',
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_name = mobj.group('video_name')
-        if video_name:
-            generic_webpage = self._download_webpage(url, video_name)
-            video_id = self._search_regex(
-                r'<iframe.*?class="video_iframe_page"\s+src="/_utils/video/thP_video_controller.php.*?_vid([0-9]+)">',
-                generic_webpage, 'video ID')
-            url = 'http://www.syfy.com/videos/%s/%s/vid:%s' % (
-                video_name, video_name, video_id)
-        else:
-            video_id = mobj.group('id')
-        webpage = self._download_webpage(url, video_id)
-        return self.url_result(self._og_search_video_url(webpage))
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        syfy_mpx = list(self._parse_json(self._search_regex(
+            r'jQuery\.extend\([^,]+,\s*({.+})\);', webpage, 'drupal settings'),
+            display_id)['syfy']['syfy_mpx'].values())[0]
+        video_id = syfy_mpx['mpxGUID']
+        title = syfy_mpx['episodeTitle']
+        query = {
+            'mbr': 'true',
+            'manifest': 'm3u',
+        }
+        if syfy_mpx.get('entitlement') == 'auth':
+            resource = self._get_mvpd_resource(
+                'syfy', title, video_id,
+                syfy_mpx.get('mpxRating', 'TV-14'))
+            query['auth'] = self._extract_mvpd_auth(
+                url, video_id, 'syfy', resource)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'ThePlatform',
+            'url': smuggle_url(update_url_query(
+                self._proto_relative_url(syfy_mpx['releaseURL']), query),
+                {'force_smil_url': True}),
+            'title': title,
+            'id': video_id,
+            'display_id': display_id,
+        }
diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py
deleted file mode 100644 (file)
index ed560bd..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    clean_html,
-    ExtractorError,
-    float_or_none,
-    parse_iso8601,
-    sanitized_Request,
-)
-
-
-class TapelyIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?'
-    _API_URL = 'http://tape.ly/showtape?id={0:}'
-    _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}'
-    _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}'
-    _TESTS = [
-        {
-            'url': 'http://tape.ly/my-grief-as-told-by-water',
-            'info_dict': {
-                'id': 23952,
-                'title': 'my grief as told by water',
-                'thumbnail': 're:^https?://.*\.png$',
-                'uploader_id': 16484,
-                'timestamp': 1411848286,
-                'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.',
-            },
-            'playlist_count': 13,
-        },
-        {
-            'url': 'http://tape.ly/my-grief-as-told-by-water/1',
-            'md5': '79031f459fdec6530663b854cbc5715c',
-            'info_dict': {
-                'id': 258464,
-                'title': 'Dreaming Awake  (My Brightest Diamond)',
-                'ext': 'm4a',
-            },
-        },
-        {
-            'url': 'https://tapely.com/my-grief-as-told-by-water',
-            'only_matching': True,
-        },
-    ]
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        display_id = mobj.group('id')
-
-        playlist_url = self._API_URL.format(display_id)
-        request = sanitized_Request(playlist_url)
-        request.add_header('X-Requested-With', 'XMLHttpRequest')
-        request.add_header('Accept', 'application/json')
-        request.add_header('Referer', url)
-
-        playlist = self._download_json(request, display_id)
-
-        tape = playlist['tape']
-
-        entries = []
-        for s in tape['songs']:
-            song = s['song']
-            entry = {
-                'id': song['id'],
-                'duration': float_or_none(song.get('songduration'), 1000),
-                'title': song['title'],
-            }
-            if song['source'] == 'S3':
-                entry.update({
-                    'url': self._S3_SONG_URL.format(song['filename']),
-                })
-                entries.append(entry)
-            elif song['source'] == 'YT':
-                self.to_screen('YouTube video detected')
-                yt_id = song['filename'].replace('/youtube/', '')
-                entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id))
-                entries.append(entry)
-            elif song['source'] == 'SC':
-                self.to_screen('SoundCloud song detected')
-                sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename'])
-                entry.update(self.url_result(sc_url, 'Soundcloud'))
-                entries.append(entry)
-            else:
-                self.report_warning('Unknown song source: %s' % song['source'])
-
-        if mobj.group('songnr'):
-            songnr = int(mobj.group('songnr')) - 1
-            try:
-                return entries[songnr]
-            except IndexError:
-                raise ExtractorError(
-                    'No song with index: %s' % mobj.group('songnr'),
-                    expected=True)
-
-        return {
-            '_type': 'playlist',
-            'id': tape['id'],
-            'display_id': display_id,
-            'title': tape['name'],
-            'entries': entries,
-            'thumbnail': tape.get('image_url'),
-            'description': clean_html(tape.get('subtext')),
-            'like_count': tape.get('likescount'),
-            'uploader_id': tape.get('user_id'),
-            'timestamp': parse_iso8601(tape.get('published_at')),
-        }
index 4b4b740b44d325ffb8a8a5c6cba848b0c99ced13..2ecfd0405afa27d78f81fb0c4ba604d022798850 100644 (file)
@@ -1,50 +1,41 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
+from .mitele import MiTeleBaseIE
 
-from .common import InfoExtractor
-from ..compat import (
-    compat_urllib_parse_unquote,
-    compat_urllib_parse_urlencode,
-    compat_urlparse,
-)
-from ..utils import (
-    get_element_by_attribute,
-    parse_duration,
-    strip_jsonp,
-)
 
-
-class TelecincoIE(InfoExtractor):
+class TelecincoIE(MiTeleBaseIE):
     IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
     _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
 
     _TESTS = [{
         'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
-        'md5': '5cbef3ad5ef17bf0d21570332d140729',
+        'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
         'info_dict': {
-            'id': 'MDSVID20141015_0058',
+            'id': 'JEA5ijCnF6p5W08A1rNKn7',
             'ext': 'mp4',
-            'title': 'Con Martín Berasategui, hacer un bacalao al ...',
+            'title': 'Bacalao con kokotxas al pil-pil',
+            'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
             'duration': 662,
         },
     }, {
         'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
-        'md5': '0a5b9f3cc8b074f50a0578f823a12694',
+        'md5': '284393e5387b3b947b77c613ef04749a',
         'info_dict': {
-            'id': 'MDSVID20150916_0128',
+            'id': 'jn24Od1zGLG4XUZcnUnZB6',
             'ext': 'mp4',
-            'title': '¿Quién es este ex futbolista con el que hablan ...',
+            'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?',
+            'description': 'md5:a62ecb5f1934fc787107d7b9a2262805',
             'duration': 79,
         },
     }, {
         'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
-        'md5': 'ad1bfaaba922dd4a295724b05b68f86a',
+        'md5': '749afab6ea5a136a8806855166ae46a2',
         'info_dict': {
-            'id': 'MDSVID20150513_0220',
+            'id': 'aywerkD2Sv1vGNqq9b85Q2',
             'ext': 'mp4',
             'title': '#DOYLACARA. Con la trata no hay trato',
+            'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
             'duration': 50,
         },
     }, {
@@ -56,40 +47,16 @@ class TelecincoIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        episode = self._match_id(url)
-        webpage = self._download_webpage(url, episode)
-        embed_data_json = self._search_regex(
-            r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data',
-        ).replace('\'', '"')
-        embed_data = json.loads(embed_data_json)
-
-        domain = embed_data['mediaUrl']
-        if not domain.startswith('http'):
-            # only happens in telecinco.es videos
-            domain = 'http://' + domain
-        info_url = compat_urlparse.urljoin(
-            domain,
-            compat_urllib_parse_unquote(embed_data['flashvars']['host'])
-        )
-        info_el = self._download_xml(info_url, episode).find('./video/info')
-
-        video_link = info_el.find('videoUrl/link').text
-        token_query = compat_urllib_parse_urlencode({'id': video_link})
-        token_info = self._download_json(
-            embed_data['flashvars']['ov_tk'] + '?' + token_query,
-            episode,
-            transform_source=strip_jsonp
-        )
-        formats = self._extract_m3u8_formats(
-            token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native')
-        self._sort_formats(formats)
-
-        return {
-            'id': embed_data['videoId'],
-            'display_id': episode,
-            'title': info_el.find('title').text,
-            'formats': formats,
-            'description': get_element_by_attribute('class', 'text', webpage),
-            'thumbnail': info_el.find('thumb').text,
-            'duration': parse_duration(info_el.find('duration').text),
-        }
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage, 'title')
+        info = self._get_player_info(url, webpage)
+        info.update({
+            'display_id': display_id,
+            'title': title,
+            'description': self._html_search_meta(
+                ['og:description', 'twitter:description'],
+                webpage, 'title', fatal=False),
+        })
+        return info
index 9092e9b853637d14d54a2c15c524b12e35e91a30..58078c531d151e319fb7e707d8116a730507962b 100644 (file)
@@ -47,11 +47,10 @@ class TelegraafIE(InfoExtractor):
             ext = determine_ext(manifest_url)
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    manifest_url, video_id, ext='mp4', m3u8_id='hls'))
+                    manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
             elif ext == 'mpd':
-                # TODO: Current DASH formats are broken - $Time$ pattern in
-                # <SegmentTemplate> not implemented yet
-                continue
+                formats.extend(self._extract_mpd_formats(
+                    manifest_url, video_id, mpd_id='dash', fatal=False))
             else:
                 self.report_warning('Unknown adaptive format %s' % ext)
         for location in locations.get('progressive', []):
index 07d222ae3c1b53dba7c3886922886d44a09a0d8b..23067e8c6510ffec73f4036f122e8d810291efef 100644 (file)
@@ -9,6 +9,7 @@ import hashlib
 
 
 from .once import OnceIE
+from .adobepass import AdobePassIE
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_urlparse,
@@ -62,19 +63,20 @@ class ThePlatformBaseIE(OnceIE):
 
         return formats, subtitles
 
-    def get_metadata(self, path, video_id):
+    def _download_theplatform_metadata(self, path, video_id):
         info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
-        info = self._download_json(info_url, video_id)
+        return self._download_json(info_url, video_id)
 
+    def _parse_theplatform_metadata(self, info):
         subtitles = {}
         captions = info.get('captions')
         if isinstance(captions, list):
             for caption in captions:
                 lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
-                subtitles[lang] = [{
+                subtitles.setdefault(lang, []).append({
                     'ext': mimetype2ext(mime),
                     'url': src,
-                }]
+                })
 
         return {
             'title': info['title'],
@@ -86,8 +88,12 @@ class ThePlatformBaseIE(OnceIE):
             'uploader': info.get('billingCode'),
         }
 
+    def _extract_theplatform_metadata(self, path, video_id):
+        info = self._download_theplatform_metadata(path, video_id)
+        return self._parse_theplatform_metadata(info)
 
-class ThePlatformIE(ThePlatformBaseIE):
+
+class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
     _VALID_URL = r'''(?x)
         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
            (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
@@ -265,7 +271,7 @@ class ThePlatformIE(ThePlatformBaseIE):
         formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
         self._sort_formats(formats)
 
-        ret = self.get_metadata(path, video_id)
+        ret = self._extract_theplatform_metadata(path, video_id)
         combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
         ret.update({
             'id': video_id,
@@ -339,7 +345,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
         timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
         categories = [item['media$name'] for item in entry.get('media$categories', [])]
 
-        ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+        ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
         subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
         ret.update({
             'id': video_id,
index c77a07989a97629a715a3b164b06c60151f71a3e..f26937da1e3eaecf008d28bd6a661d0a22657ddd 100644 (file)
@@ -24,16 +24,20 @@ class ThreeQSDNIE(InfoExtractor):
             'title': '0280d6b9-1215-11e6-b427-0cc47a188158',
             'is_live': False,
         },
-        'expected_warnings': ['Failed to download MPD manifest'],
+        'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'],
     }, {
         # live video stream
         'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true',
         'info_dict': {
             'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
             'ext': 'mp4',
-            'title': 'd755d94b-4ab9-11e3-9162-0025907ad44f',
-            'is_live': False,
+            'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'is_live': True,
         },
+        'params': {
+            'skip_download': True,  # m3u8 downloads
+        },
+        'expected_warnings': ['Failed to download MPD manifest'],
     }, {
         # live audio stream
         'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48',
@@ -92,12 +96,11 @@ class ThreeQSDNIE(InfoExtractor):
             if not item_url or item_url in urls:
                 return
             urls.add(item_url)
-            type_ = item.get('type')
-            ext = determine_ext(item_url, default_ext=None)
-            if type_ == 'application/dash+xml' or ext == 'mpd':
+            ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None)
+            if ext == 'mpd':
                 formats.extend(self._extract_mpd_formats(
                     item_url, video_id, mpd_id='mpd', fatal=False))
-            elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8':
+            elif ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
                     item_url, video_id, 'mp4',
                     entry_protocol='m3u8' if live else 'm3u8_native',
@@ -111,11 +114,11 @@ class ThreeQSDNIE(InfoExtractor):
                 formats.append({
                     'url': item_url,
                     'format_id': item.get('quality'),
-                    'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext,
+                    'ext': 'mp4' if item_url.startswith('rtsp') else ext,
                     'vcodec': 'none' if stream_type == 'audio' else None,
                 })
 
-        for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js):
+        for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js):
             f = self._parse_json(
                 item_js, video_id, transform_source=js_to_json, fatal=False)
             if not f:
index 7dbe68b5c228ea6d3fa20d835a60ecf38b820816..979856e9a6663332a5eed7be8c2f9a30a9495d40 100644 (file)
@@ -5,31 +5,27 @@ from .common import InfoExtractor
 
 
 class TMZIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/]+)/?'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)'
+    _TESTS = [{
         'url': 'http://www.tmz.com/videos/0_okj015ty/',
-        'md5': '791204e3bf790b1426cb2db0706184c0',
+        'md5': '4d22a51ef205b6c06395d8394f72d560',
         'info_dict': {
             'id': '0_okj015ty',
-            'url': 'http://tmz.vo.llnwd.net/o28/2014-03/13/0_okj015ty_0_rt8ro3si_2.mp4',
             'ext': 'mp4',
             'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!',
             'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie???  Or is she just showing off her amazing boobs?',
-            'thumbnail': r're:http://cdnbakmi\.kaltura\.com/.*thumbnail.*',
+            'timestamp': 1394747163,
+            'uploader_id': 'batchUser',
+            'upload_date': '20140313',
         }
-    }
+    }, {
+        'url': 'http://www.tmz.com/videos/0-cegprt2p/',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        return {
-            'id': video_id,
-            'url': self._html_search_meta('VideoURL', webpage, fatal=True),
-            'title': self._og_search_title(webpage),
-            'description': self._og_search_description(webpage),
-            'thumbnail': self._html_search_meta('ThumbURL', webpage),
-        }
+        video_id = self._match_id(url).replace('-', '_')
+        return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id)
 
 
 class TMZArticleIE(InfoExtractor):
index 78174178e6ef69362462f96f997b7a37a640a275..7ddf77767d804faff4097d85d796760bc4473b77 100644 (file)
@@ -118,8 +118,12 @@ class TNAFlixNetworkBaseIE(InfoExtractor):
             xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:')
         thumbnails = self._extract_thumbnails(cfg_xml)
 
-        title = self._html_search_regex(
-            self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage)
+        title = None
+        if self._TITLE_REGEX:
+            title = self._html_search_regex(
+                self._TITLE_REGEX, webpage, 'title', default=None)
+        if not title:
+            title = self._og_search_title(webpage)
 
         age_limit = self._rta_search(webpage) or 18
 
@@ -189,9 +193,9 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE):
 class TNAFlixIE(TNAFlixNetworkBaseIE):
     _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
-    _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
-    _DESCRIPTION_REGEX = r'<meta[^>]+name="description"[^>]+content="([^"]+)"'
-    _UPLOADER_REGEX = r'<i>\s*Verified Member\s*</i>\s*<h1>(.+?)</h1>'
+    _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)</title>'
+    _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<'
+    _UPLOADER_REGEX = r'<i>\s*Verified Member\s*</i>\s*<h\d+>(.+?)<'
     _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>'
 
     _TESTS = [{
index 4797d1310aaeec2664d822c052f26be5ea5210af..54c2d0aa6c0d234f9f747550ab841c409bfbc079 100644 (file)
@@ -1,74 +1,41 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    unified_strdate,
-)
+from ..utils import int_or_none
 
 
 class TouTvIE(InfoExtractor):
     IE_NAME = 'tou.tv'
-    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+    _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)'
 
     _TEST = {
-        'url': 'http://www.tou.tv/30-vies/S04E41',
+        'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17',
         'info_dict': {
-            'id': '30-vies_S04E41',
+            'id': '122017',
             'ext': 'mp4',
-            'title': '30 vies Saison 4 / Épisode 41',
-            'description': 'md5:da363002db82ccbe4dafeb9cab039b09',
-            'age_limit': 8,
-            'uploader': 'Groupe des Nouveaux Médias',
-            'duration': 1296,
-            'upload_date': '20131118',
-            'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+            'title': 'Saison 2015 Épisode 17',
+            'description': 'La photo de famille 2',
+            'upload_date': '20100717',
         },
         'params': {
-            'skip_download': True,  # Requires rtmpdump
+            # m3u8 download
+            'skip_download': True,
         },
-        'skip': 'Only available in Canada'
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        webpage = self._download_webpage(url, video_id)
-
-        mediaId = self._search_regex(
-            r'"idMedia":\s*"([^"]+)"', webpage, 'media ID')
-
-        streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId
-        streams_doc = self._download_xml(
-            streams_url, video_id, note='Downloading stream list')
-
-        video_url = next(n.text
-                         for n in streams_doc.findall('.//choice/url')
-                         if '//ad.doubleclick' not in n.text)
-        if video_url.endswith('/Unavailable.flv'):
-            raise ExtractorError(
-                'Access to this video is blocked from outside of Canada',
-                expected=True)
-
-        duration_str = self._html_search_meta(
-            'video:duration', webpage, 'duration')
-        duration = int(duration_str) if duration_str else None
-        upload_date_str = self._html_search_meta(
-            'video:release_date', webpage, 'upload date')
-        upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+        path = self._match_id(url)
+        metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path)
+        video_id = metadata['IdMedia']
+        details = metadata['Details']
+        title = details['OriginalTitle']
 
         return {
+            '_type': 'url_transparent',
+            'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id),
             'id': video_id,
-            'title': self._og_search_title(webpage),
-            'url': video_url,
-            'description': self._og_search_description(webpage),
-            'uploader': self._dc_search_uploader(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
-            'age_limit': self._media_rating_search(webpage),
-            'duration': duration,
-            'upload_date': upload_date,
-            'ext': 'mp4',
+            'title': title,
+            'thumbnail': details.get('ImageUrl'),
+            'duration': int_or_none(details.get('LengthInSeconds')),
         }
index 1d9271d1e70c3f2b5ad30a6c69f6602bd58ac89a..4053f6c2150ff536cce19a7fede105f0fee5d8d8 100644 (file)
@@ -1,18 +1,13 @@
 from __future__ import unicode_literals
 
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     int_or_none,
-    sanitized_Request,
     str_to_int,
 )
-from ..aes import aes_decrypt_text
+from .keezmovies import KeezMoviesIE
 
 
-class Tube8IE(InfoExtractor):
+class Tube8IE(KeezMoviesIE):
     _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://www.tube8.com/teen/kasia-music-video/229795/',
@@ -33,47 +28,17 @@ class Tube8IE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'age_verified=1')
-        webpage = self._download_webpage(req, display_id)
-
-        flashvars = self._parse_json(
-            self._search_regex(
-                r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'),
-            video_id)
-
-        formats = []
-        for key, video_url in flashvars.items():
-            if not isinstance(video_url, compat_str) or not video_url.startswith('http'):
-                continue
-            height = self._search_regex(
-                r'quality_(\d+)[pP]', key, 'height', default=None)
-            if not height:
-                continue
-            if flashvars.get('encrypted') is True:
-                video_url = aes_decrypt_text(
-                    video_url, flashvars['video_title'], 32).decode('utf-8')
-            formats.append({
-                'url': video_url,
-                'format_id': '%sp' % height,
-                'height': int(height),
-            })
-        self._sort_formats(formats)
+        webpage, info = self._extract_info(url)
 
-        thumbnail = flashvars.get('image_url')
+        if not info['title']:
+            info['title'] = self._html_search_regex(
+                r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
 
-        title = self._html_search_regex(
-            r'videoTitle\s*=\s*"([^"]+)', webpage, 'title')
         description = self._html_search_regex(
             r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False)
         uploader = self._html_search_regex(
             r'<span class="username">\s*(.+?)\s*<',
             webpage, 'uploader', fatal=False)
-        duration = int_or_none(flashvars.get('video_duration'))
 
         like_count = int_or_none(self._search_regex(
             r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False))
@@ -86,18 +51,13 @@ class Tube8IE(InfoExtractor):
             r'<span id="allCommentsCount">(\d+)</span>',
             webpage, 'comment count', fatal=False))
 
-        return {
-            'id': video_id,
-            'display_id': display_id,
-            'title': title,
+        info.update({
             'description': description,
-            'thumbnail': thumbnail,
             'uploader': uploader,
-            'duration': duration,
             'view_count': view_count,
             'like_count': like_count,
             'dislike_count': dislike_count,
             'comment_count': comment_count,
-            'age_limit': 18,
-            'formats': formats,
-        }
+        })
+
+        return info
index 86bb7915db170ecf4c75fdc2b960160a65daa1c0..f225ec68448271eabbcca0b63ef367f37e7e908c 100644 (file)
@@ -8,6 +8,7 @@ from ..utils import (
     determine_ext,
     int_or_none,
     float_or_none,
+    js_to_json,
     parse_iso8601,
     remove_end,
 )
@@ -54,10 +55,11 @@ class TV2IE(InfoExtractor):
                 ext = determine_ext(video_url)
                 if ext == 'f4m':
                     formats.extend(self._extract_f4m_formats(
-                        video_url, video_id, f4m_id=format_id))
+                        video_url, video_id, f4m_id=format_id, fatal=False))
                 elif ext == 'm3u8':
                     formats.extend(self._extract_m3u8_formats(
-                        video_url, video_id, 'mp4', m3u8_id=format_id))
+                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                        m3u8_id=format_id, fatal=False))
                 elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
                     pass
                 else:
@@ -105,7 +107,7 @@ class TV2ArticleIE(InfoExtractor):
         'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542',
         'info_dict': {
             'id': '6930542',
-            'title': 'Russen hetses etter pingvintyveri  innrømmer å ha åpnet luken på buret',
+            'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret',
             'description': 'md5:339573779d3eea3542ffe12006190954',
         },
         'playlist_count': 2,
@@ -119,9 +121,23 @@ class TV2ArticleIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
 
+        # Old embed pattern (looks unused nowadays)
+        assets = re.findall(r'data-assetid=["\'](\d+)', webpage)
+
+        if not assets:
+            # New embed pattern
+            for v in re.findall('TV2ContentboxVideo\(({.+?})\)', webpage):
+                video = self._parse_json(
+                    v, playlist_id, transform_source=js_to_json, fatal=False)
+                if not video:
+                    continue
+                asset = video.get('assetId')
+                if asset:
+                    assets.append(asset)
+
         entries = [
-            self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2')
-            for video_id in re.findall(r'data-assetid="(\d+)"', webpage)]
+            self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2')
+            for asset_id in assets]
 
         title = remove_end(self._og_search_title(webpage), ' - TV2.no')
         description = remove_end(self._og_search_description(webpage), ' - TV2.no')
index b73279dec8f433163e4d5d272e25e794c2fa74a8..cb76a2a583912d120faa81bcbcb17fa136a95eeb 100644 (file)
@@ -9,56 +9,23 @@ class TVLandIE(MTVServicesInfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)'
     _FEED_URL = 'http://www.tvland.com/feeds/mrss/'
     _TESTS = [{
+        # Geo-restricted. Without a proxy metadata are still there. With a
+        # proxy it redirects to http://m.tvland.com/app/
         'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048',
-        'playlist': [
-            {
-                'md5': '227e9723b9669c05bf51098b10287aa7',
-                'info_dict': {
-                    'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd',
-                    'ext': 'mp4',
-                    'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5',
-                }
-            },
-            {
-                'md5': '9fa2b764ec0e8194fb3ebb01a83df88b',
-                'info_dict': {
-                    'id': 'f4279548-6e13-40dd-92e8-860d27289197',
-                    'ext': 'mp4',
-                    'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5',
-                }
-            },
-            {
-                'md5': 'fde4c3bccd7cc7e3576b338734153cec',
-                'info_dict': {
-                    'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334',
-                    'ext': 'mp4',
-                    'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5',
-                }
-            },
-            {
-                'md5': '247f6780cda6891f2e49b8ae2b10e017',
-                'info_dict': {
-                    'id': '9146ecf5-b15a-4d78-879c-6679b77f4960',
-                    'ext': 'mp4',
-                    'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5',
-                }
-            },
-            {
-                'md5': 'fd269f33256e47bad5eb6c40de089ff6',
-                'info_dict': {
-                    'id': '04334a2e-9a47-4214-a8c2-ae5792e2fab7',
-                    'ext': 'mp4',
-                    'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5',
-                }
-            }
-        ],
+        'info_dict': {
+            'description': 'md5:80973e81b916a324e05c14a3fb506d29',
+            'title': 'The Invasion',
+        },
+        'playlist': [],
     }, {
         'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies',
         'md5': 'e2c6389401cf485df26c79c247b08713',
         'info_dict': {
             'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88',
             'ext': 'mp4',
-            'title': 'Younger|Younger: Hilary Duff - Little Lies',
-            'description': 'md5:7d192f56ca8d958645c83f0de8ef0269'
+            'title': 'Younger|December 28, 2015|2|NO-EPISODE#|Younger: Hilary Duff - Little Lies',
+            'description': 'md5:7d192f56ca8d958645c83f0de8ef0269',
+            'upload_date': '20151228',
+            'timestamp': 1451289600,
         },
     }]
index 5070082da7ba3b34078a01bd214d02a9e8dcac33..2abfb78307186a2007e7f375bccb7a26ff0713ca 100644 (file)
@@ -24,6 +24,7 @@ class TVPIE(InfoExtractor):
             'id': '194536',
             'ext': 'mp4',
             'title': 'Czas honoru, I seria – odc. 13',
+            'description': 'md5:76649d2014f65c99477be17f23a4dead',
         },
     }, {
         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
@@ -32,6 +33,16 @@ class TVPIE(InfoExtractor):
             'id': '17916176',
             'ext': 'mp4',
             'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
+            'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
+        },
+    }, {
+        # page id is not the same as video id(#7799)
+        'url': 'http://vod.tvp.pl/22704887/08122015-1500',
+        'md5': 'cf6a4705dfd1489aef8deb168d6ba742',
+        'info_dict': {
+            'id': '22680786',
+            'ext': 'mp4',
+            'title': 'Wiadomości, 08.12.2015, 15:00',
         },
     }, {
         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
@@ -53,6 +64,39 @@ class TVPIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    def _real_extract(self, url):
+        page_id = self._match_id(url)
+        webpage = self._download_webpage(url, page_id)
+        video_id = self._search_regex([
+            r'<iframe[^>]+src="[^"]*?object_id=(\d+)',
+            "object_id\s*:\s*'(\d+)'"], webpage, 'video id')
+        return {
+            '_type': 'url_transparent',
+            'url': 'tvp:' + video_id,
+            'description': self._og_search_description(webpage, default=None),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'ie_key': 'TVPEmbed',
+        }
+
+
+class TVPEmbedIE(InfoExtractor):
+    IE_NAME = 'tvp:embed'
+    IE_DESC = 'Telewizja Polska'
+    _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)'
+
+    _TESTS = [{
+        'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268',
+        'md5': '8c9cd59d16edabf39331f93bf8a766c7',
+        'info_dict': {
+            'id': '22670268',
+            'ext': 'mp4',
+            'title': 'Panorama, 07.12.2015, 15:40',
+        },
+    }, {
+        'url': 'tvp:22670268',
+        'only_matching': True,
+    }]
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -89,8 +133,8 @@ class TVPIE(InfoExtractor):
             r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)',
             video_url, 'video base url', default=None)
         if video_url_base:
-            # TODO: Current DASH formats are broken - $Time$ pattern in
-            # <SegmentTemplate> not implemented yet
+            # TODO: <Group> found instead of <AdaptationSet> in MPD manifest.
+            # It's not mentioned in MPEG-DASH standard. Figure that out.
             # formats.extend(self._extract_mpd_formats(
             #     video_url_base + '.ism/video.mpd',
             #     video_id, mpd_id='dash', fatal=False))
index df70a6b230a4217261f3f69a3e1213a88f07afbf..4186e82db170300c6addeb0d0054c75b3d512e48 100644 (file)
@@ -4,47 +4,58 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
+    determine_ext,
+    ExtractorError,
+    int_or_none,
     parse_iso8601,
     qualities,
+    try_get,
+    update_url_query,
 )
 
 
 class TVPlayIE(InfoExtractor):
-    IE_DESC = 'TV3Play and related services'
-    _VALID_URL = r'''(?x)https?://(?:www\.)?
-        (?:tvplay\.lv/parraides|
-           tv3play\.lt/programos|
-           play\.tv3\.lt/programos|
-           tv3play\.ee/sisu|
-           tv3play\.se/program|
-           tv6play\.se/program|
-           tv8play\.se/program|
-           tv10play\.se/program|
-           tv3play\.no/programmer|
-           viasat4play\.no/programmer|
-           tv6play\.no/programmer|
-           tv3play\.dk/programmer|
-           play\.novatv\.bg/programi
-        )/[^/]+/(?P<id>\d+)
-        '''
+    IE_NAME = 'mtg'
+    IE_DESC = 'MTG services'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        mtg:|
+                        https?://
+                            (?:www\.)?
+                            (?:
+                                tvplay(?:\.skaties)?\.lv/parraides|
+                                (?:tv3play|play\.tv3)\.lt/programos|
+                                tv3play(?:\.tv3)?\.ee/sisu|
+                                (?:tv(?:3|6|8|10)play|viafree)\.se/program|
+                                (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer|
+                                play\.novatv\.bg/programi
+                            )
+                            /(?:[^/]+/)+
+                        )
+                        (?P<id>\d+)
+                    '''
     _TESTS = [
         {
             'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true',
+            'md5': 'a1612fe0849455423ad8718fe049be21',
             'info_dict': {
                 'id': '418113',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Kādi ir īri? - Viņas melo labāk',
                 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.',
+                'series': 'Viņas melo labāk',
+                'season': '2.sezona',
+                'season_number': 2,
                 'duration': 25,
                 'timestamp': 1406097056,
                 'upload_date': '20140723',
             },
-            'params': {
-                # rtmp download
-                'skip_download': True,
-            },
         },
         {
             'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true',
@@ -53,6 +64,10 @@ class TVPlayIE(InfoExtractor):
                 'ext': 'flv',
                 'title': 'Moterys meluoja geriau',
                 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e',
+                'series': 'Moterys meluoja geriau',
+                'episode_number': 47,
+                'season': '1 sezonas',
+                'season_number': 1,
                 'duration': 1330,
                 'timestamp': 1403769181,
                 'upload_date': '20140626',
@@ -82,7 +97,7 @@ class TVPlayIE(InfoExtractor):
             'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true',
             'info_dict': {
                 'id': '395385',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Husräddarna S02E07',
                 'description': 'md5:f210c6c89f42d4fc39faa551be813777',
                 'duration': 2574,
@@ -90,7 +105,6 @@ class TVPlayIE(InfoExtractor):
                 'upload_date': '20140520',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -98,7 +112,7 @@ class TVPlayIE(InfoExtractor):
             'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true',
             'info_dict': {
                 'id': '266636',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Den sista dokusåpan S01E08',
                 'description': 'md5:295be39c872520221b933830f660b110',
                 'duration': 1492,
@@ -107,7 +121,6 @@ class TVPlayIE(InfoExtractor):
                 'age_limit': 18,
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -115,7 +128,7 @@ class TVPlayIE(InfoExtractor):
             'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true',
             'info_dict': {
                 'id': '282756',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Antikjakten S01E10',
                 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8',
                 'duration': 2646,
@@ -123,7 +136,6 @@ class TVPlayIE(InfoExtractor):
                 'upload_date': '20120925',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -131,7 +143,7 @@ class TVPlayIE(InfoExtractor):
             'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true',
             'info_dict': {
                 'id': '230898',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Anna Anka søker assistent - Ep. 8',
                 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474',
                 'duration': 2656,
@@ -139,7 +151,6 @@ class TVPlayIE(InfoExtractor):
                 'upload_date': '20100628',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -147,7 +158,7 @@ class TVPlayIE(InfoExtractor):
             'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true',
             'info_dict': {
                 'id': '21873',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Budbringerne program 10',
                 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d',
                 'duration': 1297,
@@ -155,7 +166,6 @@ class TVPlayIE(InfoExtractor):
                 'upload_date': '20090929',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -163,7 +173,7 @@ class TVPlayIE(InfoExtractor):
             'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true',
             'info_dict': {
                 'id': '361883',
-                'ext': 'flv',
+                'ext': 'mp4',
                 'title': 'Hotelinspektør Alex Polizzi - Ep. 10',
                 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81',
                 'duration': 2594,
@@ -171,7 +181,6 @@ class TVPlayIE(InfoExtractor):
                 'upload_date': '20140224',
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -191,59 +200,173 @@ class TVPlayIE(InfoExtractor):
                 'skip_download': True,
             },
         },
+        {
+            'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
+            'only_matching': True,
+        },
+        {
+            # views is null
+            'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869',
+            'only_matching': True,
+        },
+        {
+            'url': 'mtg:418113',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         video = self._download_json(
-            'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON')
+            'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
 
-        if video['is_geo_blocked']:
-            self.report_warning(
-                'This content might not be available in your country due to copyright reasons')
+        title = video['title']
 
-        streams = self._download_json(
-            'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON')
+        try:
+            streams = self._download_json(
+                'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id,
+                video_id, 'Downloading streams JSON')
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                msg = self._parse_json(e.cause.read().decode('utf-8'), video_id)
+                raise ExtractorError(msg['msg'], expected=True)
+            raise
 
         quality = qualities(['hls', 'medium', 'high'])
         formats = []
-        for format_id, video_url in streams['streams'].items():
+        for format_id, video_url in streams.get('streams', {}).items():
             if not video_url or not isinstance(video_url, compat_str):
                 continue
-            fmt = {
-                'format_id': format_id,
-                'preference': quality(format_id),
-            }
-            if video_url.startswith('rtmp'):
-                m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url)
-                if not m:
-                    continue
-                fmt.update({
-                    'ext': 'flv',
-                    'url': m.group('url'),
-                    'app': m.group('app'),
-                    'play_path': m.group('playpath'),
-                })
-            elif video_url.endswith('.f4m'):
+            ext = determine_ext(video_url)
+            if ext == 'f4m':
                 formats.extend(self._extract_f4m_formats(
-                    video_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id))
-                continue
+                    update_url_query(video_url, {
+                        'hdcore': '3.5.0',
+                        'plugin': 'aasp-3.5.0.151.81'
+                    }), video_id, f4m_id='hds', fatal=False))
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    video_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
             else:
-                fmt.update({
-                    'url': video_url,
-                })
-            formats.append(fmt)
+                fmt = {
+                    'format_id': format_id,
+                    'quality': quality(format_id),
+                    'ext': ext,
+                }
+                if video_url.startswith('rtmp'):
+                    m = re.search(
+                        r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url)
+                    if not m:
+                        continue
+                    fmt.update({
+                        'ext': 'flv',
+                        'url': m.group('url'),
+                        'app': m.group('app'),
+                        'play_path': m.group('playpath'),
+                    })
+                else:
+                    fmt.update({
+                        'url': video_url,
+                    })
+                formats.append(fmt)
+
+        if not formats and video.get('is_geo_blocked'):
+            self.raise_geo_restricted(
+                'This content might not be available in your country due to copyright reasons')
 
         self._sort_formats(formats)
 
+        # TODO: webvtt in m3u8
+        subtitles = {}
+        sami_path = video.get('sami_path')
+        if sami_path:
+            lang = self._search_regex(
+                r'_([a-z]{2})\.xml', sami_path, 'lang',
+                default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1])
+            subtitles[lang] = [{
+                'url': sami_path,
+            }]
+
+        series = video.get('format_title')
+        episode_number = int_or_none(video.get('format_position', {}).get('episode'))
+        season = video.get('_embedded', {}).get('season', {}).get('title')
+        season_number = int_or_none(video.get('format_position', {}).get('season'))
+
         return {
             'id': video_id,
-            'title': video['title'],
-            'description': video['description'],
-            'duration': video['duration'],
-            'timestamp': parse_iso8601(video['created_at']),
-            'view_count': video['views']['total'],
-            'age_limit': video.get('age_limit', 0),
+            'title': title,
+            'description': video.get('description'),
+            'series': series,
+            'episode_number': episode_number,
+            'season': season,
+            'season_number': season_number,
+            'duration': int_or_none(video.get('duration')),
+            'timestamp': parse_iso8601(video.get('created_at')),
+            'view_count': try_get(video, lambda x: x['views']['total'], int),
+            'age_limit': int_or_none(video.get('age_limit', 0)),
             'formats': formats,
+            'subtitles': subtitles,
         }
+
+
+class ViafreeIE(InfoExtractor):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        viafree\.
+                        (?:
+                            (?:dk|no)/programmer|
+                            se/program
+                        )
+                        /(?:[^/]+/)+(?P<id>[^/?#&]+)
+                    '''
+    _TESTS = [{
+        'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2',
+        'info_dict': {
+            'id': '395375',
+            'ext': 'mp4',
+            'title': 'Husräddarna S02E02',
+            'description': 'md5:4db5c933e37db629b5a2f75dfb34829e',
+            'series': 'Husräddarna',
+            'season': 'Säsong 2',
+            'season_number': 2,
+            'duration': 2576,
+            'timestamp': 1400596321,
+            'upload_date': '20140520',
+        },
+        'params': {
+            'skip_download': True,
+        },
+        'add_ie': [TVPlayIE.ie_key()],
+    }, {
+        'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5',
+        'only_matching': True,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_id = self._search_regex(
+            r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](?P<id>\d{6,})',
+            webpage, 'video id')
+
+        return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key())
index f3198fb85adb29b8081b9735899dd574cb504c67..7a9386cde3d9e0e5d78bfd368d47819430c53e85 100644 (file)
@@ -1,25 +1,62 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_ext,
+    mimetype2ext,
+)
 
 
 class TweakersIE(InfoExtractor):
     _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)'
     _TEST = {
         'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html',
-        'md5': '3147e4ddad366f97476a93863e4557c8',
+        'md5': 'fe73e417c093a788e0160c4025f88b15',
         'info_dict': {
             'id': '9926',
             'ext': 'mp4',
             'title': 'New Nintendo 3DS XL - Op alle fronten beter',
-            'description': 'md5:f97324cc71e86e11c853f0763820e3ba',
+            'description': 'md5:3789b21fed9c0219e9bcaacd43fab280',
             'thumbnail': 're:^https?://.*\.jpe?g$',
             'duration': 386,
+            'uploader_id': 's7JeEm',
         }
     }
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-        entries = self._extract_xspf_playlist(
-            'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id)
-        return self.playlist_result(entries, playlist_id)
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id,
+            video_id)['items'][0]
+
+        title = video_data['title']
+
+        formats = []
+        for location in video_data.get('locations', {}).get('progressive', []):
+            format_id = location.get('label')
+            width = int_or_none(location.get('width'))
+            height = int_or_none(location.get('height'))
+            for source in location.get('sources', []):
+                source_url = source.get('src')
+                if not source_url:
+                    continue
+                ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
+                formats.append({
+                    'format_id': format_id,
+                    'url': source_url,
+                    'width': width,
+                    'height': height,
+                    'ext': ext,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('poster'),
+            'duration': int_or_none(video_data.get('duration')),
+            'uploader_id': video_data.get('account'),
+            'formats': formats,
+        }
index 4025edf02b4ca5b8c42e3324fb094fe43141ce68..af92b713b08e22343f84a282d3db59b355623f04 100644 (file)
@@ -12,32 +12,32 @@ from ..utils import (
 
 class TwentyFourVideoIE(InfoExtractor):
     IE_NAME = '24video'
-    _VALID_URL = r'https?://(?:www\.)?24video\.net/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)'
 
-    _TESTS = [
-        {
-            'url': 'http://www.24video.net/video/view/1044982',
-            'md5': 'e09fc0901d9eaeedac872f154931deeb',
-            'info_dict': {
-                'id': '1044982',
-                'ext': 'mp4',
-                'title': 'Эротика каменного века',
-                'description': 'Как смотрели порно в каменном веке.',
-                'thumbnail': 're:^https?://.*\.jpg$',
-                'uploader': 'SUPERTELO',
-                'duration': 31,
-                'timestamp': 1275937857,
-                'upload_date': '20100607',
-                'age_limit': 18,
-                'like_count': int,
-                'dislike_count': int,
-            },
+    _TESTS = [{
+        'url': 'http://www.24video.net/video/view/1044982',
+        'md5': 'e09fc0901d9eaeedac872f154931deeb',
+        'info_dict': {
+            'id': '1044982',
+            'ext': 'mp4',
+            'title': 'Эротика каменного века',
+            'description': 'Как смотрели порно в каменном веке.',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'SUPERTELO',
+            'duration': 31,
+            'timestamp': 1275937857,
+            'upload_date': '20100607',
+            'age_limit': 18,
+            'like_count': int,
+            'dislike_count': int,
         },
-        {
-            'url': 'http://www.24video.net/player/new24_play.swf?id=1044982',
-            'only_matching': True,
-        }
-    ]
+    }, {
+        'url': 'http://www.24video.net/player/new24_play.swf?id=1044982',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.24video.me/video/view/1044982',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -64,7 +64,7 @@ class TwentyFourVideoIE(InfoExtractor):
             r'<span class="video-views">(\d+) просмотр',
             webpage, 'view count', fatal=False))
         comment_count = int_or_none(self._html_search_regex(
-            r'<div class="comments-title" id="comments-count">(\d+) комментари',
+            r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари',
             webpage, 'comment count', fatal=False))
 
         # Sets some cookies
index 20919774d0bcf65705e1f379a433667e2a06b7e9..890f551800d011c423636530dfd8d70e4e926290 100644 (file)
@@ -29,7 +29,7 @@ class TwitchBaseIE(InfoExtractor):
     _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv'
 
     _API_BASE = 'https://api.twitch.tv'
-    _USHER_BASE = 'http://usher.twitch.tv'
+    _USHER_BASE = 'https://usher.ttvnw.net'
     _LOGIN_URL = 'http://www.twitch.tv/login'
     _NETRC_MACHINE = 'twitch'
 
@@ -461,7 +461,7 @@ class TwitchClipsIE(InfoExtractor):
     IE_NAME = 'twitch:clips'
     _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 
-    _TEST = {
+    _TESTS = [{
         'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound',
         'md5': '761769e1eafce0ffebfb4089cb3847cd',
         'info_dict': {
@@ -473,7 +473,11 @@ class TwitchClipsIE(InfoExtractor):
             'uploader': 'stereotype_',
             'uploader_id': 'stereotype_',
         },
-    }
+    }, {
+        # multiple formats
+        'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -485,15 +489,27 @@ class TwitchClipsIE(InfoExtractor):
                 r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'),
             video_id, transform_source=js_to_json)
 
-        video_url = clip['clip_video_url']
-        title = clip['channel_title']
+        title = clip.get('channel_title') or self._og_search_title(webpage)
+
+        formats = [{
+            'url': option['source'],
+            'format_id': option.get('quality'),
+            'height': int_or_none(option.get('quality')),
+        } for option in clip.get('quality_options', []) if option.get('source')]
+
+        if not formats:
+            formats = [{
+                'url': clip['clip_video_url'],
+            }]
+
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'url': video_url,
             'title': title,
             'thumbnail': self._og_search_thumbnail(webpage),
             'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'),
             'uploader': clip.get('curator_login'),
             'uploader_id': clip.get('curator_display_name'),
+            'formats': formats,
         }
diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py
new file mode 100644 (file)
index 0000000..c27c643
--- /dev/null
@@ -0,0 +1,128 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    int_or_none,
+    parse_duration,
+    update_url_query,
+    str_or_none,
+)
+
+
+class UOLIE(InfoExtractor):
+    IE_NAME = 'uol.com.br'
+    _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)'
+    _TESTS = [{
+        'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931',
+        'md5': '25291da27dc45e0afb5718a8603d3816',
+        'info_dict': {
+            'id': '15951931',
+            'ext': 'mp4',
+            'title': 'Miss simpatia é encontrada morta',
+            'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2',
+        }
+    }, {
+        'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326',
+        'md5': 'e41a2fb7b7398a3a46b6af37b15c00c9',
+        'info_dict': {
+            'id': '15954259',
+            'ext': 'mp4',
+            'title': 'Incêndio destrói uma das maiores casas noturnas de Londres',
+            'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. Não há informações sobre vítimas.',
+        }
+    }, {
+        'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/15954259',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html',
+        'only_matching': True,
+    }, {
+        'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326',
+        'only_matching': True,
+    }, {
+        'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326',
+        'only_matching': True,
+    }, {
+        'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470',
+        'only_matching': True,
+    }]
+
+    _FORMATS = {
+        '2': {
+            'width': 640,
+            'height': 360,
+        },
+        '5': {
+            'width': 1080,
+            'height': 720,
+        },
+        '6': {
+            'width': 426,
+            'height': 240,
+        },
+        '7': {
+            'width': 1920,
+            'height': 1080,
+        },
+        '8': {
+            'width': 192,
+            'height': 144,
+        },
+        '9': {
+            'width': 568,
+            'height': 320,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        if not video_id.isdigit():
+            embed_page = self._download_webpage('https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, video_id)
+            video_id = self._search_regex(r'mediaId=(\d+)', embed_page, 'media id')
+        video_data = self._download_json(
+            'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % video_id,
+            video_id)['item']
+        title = video_data['title']
+
+        query = {
+            'ver': video_data.get('numRevision', 2),
+            'r': 'http://mais.uol.com.br',
+        }
+        formats = []
+        for f in video_data.get('formats', []):
+            f_url = f.get('url') or f.get('secureUrl')
+            if not f_url:
+                continue
+            format_id = str_or_none(f.get('id'))
+            fmt = {
+                'format_id': format_id,
+                'url': update_url_query(f_url, query),
+            }
+            fmt.update(self._FORMATS.get(format_id, {}))
+            formats.append(fmt)
+        self._sort_formats(formats)
+
+        tags = []
+        for tag in video_data.get('tags', []):
+            tag_description = tag.get('description')
+            if not tag_description:
+                continue
+            tags.append(tag_description)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': clean_html(video_data.get('desMedia')),
+            'thumbnail': video_data.get('thumbnail'),
+            'duration': int_or_none(video_data.get('durationSeconds')) or parse_duration(video_data.get('duration')),
+            'tags': tags,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py
new file mode 100644 (file)
index 0000000..ae529f6
--- /dev/null
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    float_or_none,
+    ExtractorError,
+)
+
+
+class UplynkIE(InfoExtractor):
+    IE_NAME = 'uplynk'
+    _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?'
+    _TEST = {
+        'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8',
+        'info_dict': {
+            'id': 'e89eaf2ce9054aa89d92ddb2d817a52e',
+            'ext': 'mp4',
+            'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4',
+            'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }
+
+    def _extract_uplynk_info(self, uplynk_content_url):
+        path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
+        display_id = video_id or external_id
+        formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4')
+        if session_id:
+            for f in formats:
+                f['extra_param_to_segment_url'] = {
+                    'pbs': session_id,
+                }
+        self._sort_formats(formats)
+        asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id)
+        if asset.get('error') == 1:
+            raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True)
+
+        return {
+            'id': asset['asset'],
+            'title': asset['desc'],
+            'thumbnail': asset.get('default_poster_url'),
+            'duration': float_or_none(asset.get('duration')),
+            'uploader_id': asset.get('owner'),
+            'formats': formats,
+        }
+
+    def _real_extract(self, url):
+        return self._extract_uplynk_info(url)
+
+
+class UplynkPreplayIE(UplynkIE):
+    IE_NAME = 'uplynk:preplay'
+    _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json'
+    _TEST = None
+
+    def _real_extract(self, url):
+        path, external_id, video_id = re.match(self._VALID_URL, url).groups()
+        display_id = video_id or external_id
+        preplay = self._download_json(url, display_id)
+        content_url = 'http://content.uplynk.com/%s.m3u8' % path
+        session_id = preplay.get('sid')
+        if session_id:
+            content_url += '?pbs=' + session_id
+        return self._extract_uplynk_info(content_url)
diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py
new file mode 100644 (file)
index 0000000..ce3bf6b
--- /dev/null
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class URPlayIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde',
+        'md5': '15ca67b63fd8fb320ac2bcd854bad7b6',
+        'info_dict': {
+            'id': '190031',
+            'ext': 'mp4',
+            'title': 'Tripp, Trapp, Träd : Sovkudde',
+            'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        urplayer_data = self._parse_json(self._search_regex(
+            r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
+        host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
+
+        formats = []
+        for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
+            file_rtmp = urplayer_data.get('file_rtmp' + quality_attr)
+            if file_rtmp:
+                formats.append({
+                    'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp),
+                    'format_id': quality + '-rtmp',
+                    'ext': 'flv',
+                    'preference': preference,
+                })
+            file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
+            if file_http:
+                file_http_base_url = 'http://%s/%s' % (host, file_http)
+                formats.extend(self._extract_f4m_formats(
+                    file_http_base_url + 'manifest.f4m', video_id,
+                    preference, '%s-hds' % quality, fatal=False))
+                formats.extend(self._extract_m3u8_formats(
+                    file_http_base_url + 'playlist.m3u8', video_id, 'mp4',
+                    'm3u8_native', preference, '%s-hls' % quality, fatal=False))
+        self._sort_formats(formats)
+
+        subtitles = {}
+        for subtitle in urplayer_data.get('subtitles', []):
+            subtitle_url = subtitle.get('file')
+            kind = subtitle.get('kind')
+            if subtitle_url or kind and kind != 'captions':
+                continue
+            subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
+                'url': subtitle_url,
+            })
+
+        return {
+            'id': video_id,
+            'title': urplayer_data['title'],
+            'description': self._og_search_description(webpage),
+            'thumbnail': urplayer_data.get('image'),
+            'series': urplayer_data.get('series_title'),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
index dff1bb70281b9a1cd69d9507b963e4c622a29044..e1798857364c0d9d9ecc8ff48583880cb5cff697 100644 (file)
@@ -1,18 +1,23 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
-    ExtractorError,
-    sanitized_Request,
-    urlencode_postdata,
-)
+from ..utils import urlencode_postdata
 
 
 class Vbox7IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?vbox7\.com/(?:play:|emb/external\.php\?.*?\bvid=)(?P<id>[\da-fA-F]+)'
+    _TESTS = [{
+        'url': 'http://vbox7.com/play:0946fff23c',
+        'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf',
+        'info_dict': {
+            'id': '0946fff23c',
+            'ext': 'mp4',
+            'title': 'Борисов: Притеснен съм за бъдещето на България',
+        },
+    }, {
         'url': 'http://vbox7.com/play:249bb972c2',
         'md5': '99f65c0c9ef9b682b97313e052734c3f',
         'info_dict': {
@@ -20,43 +25,50 @@ class Vbox7IE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Смях! Чудо - чист за секунди - Скрита камера',
         },
-    }
+        'skip': 'georestricted',
+    }, {
+        'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        mobj = re.search(
+            '<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)',
+            webpage)
+        if mobj:
+            return mobj.group('url')
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        # need to get the page 3 times for the correct jsSecretToken cookie
-        # which is necessary for the correct title
-        def get_session_id():
-            redirect_page = self._download_webpage(url, video_id)
-            session_id_url = self._search_regex(
-                r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page,
-                'session id url')
-            self._download_webpage(
-                compat_urlparse.urljoin(url, session_id_url), video_id,
-                'Getting session id')
-
-        get_session_id()
-        get_session_id()
-
-        webpage = self._download_webpage(url, video_id,
-                                         'Downloading redirect page')
-
-        title = self._html_search_regex(r'<title>(.*)</title>',
-                                        webpage, 'title').split('/')[0].strip()
-
-        info_url = 'http://vbox7.com/play/magare.do'
-        data = urlencode_postdata({'as3': '1', 'vid': video_id})
-        info_request = sanitized_Request(info_url, data)
-        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage')
-        if info_response is None:
-            raise ExtractorError('Unable to extract the media url')
-        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
+        webpage = self._download_webpage(
+            'http://vbox7.com/play:%s' % video_id, video_id)
+
+        title = self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title').split('/')[0].strip()
+
+        video_url = self._search_regex(
+            r'src\s*:\s*(["\'])(?P<url>.+?.mp4.*?)\1',
+            webpage, 'video url', default=None, group='url')
+
+        thumbnail_url = self._og_search_thumbnail(webpage)
+
+        if not video_url:
+            info_response = self._download_webpage(
+                'http://vbox7.com/play/magare.do', video_id,
+                'Downloading info webpage',
+                data=urlencode_postdata({'as3': '1', 'vid': video_id}),
+                headers={'Content-Type': 'application/x-www-form-urlencoded'})
+            final_url, thumbnail_url = map(
+                lambda x: x.split('=')[1], info_response.split('&'))
+
+        if '/na.mp4' in video_url:
+            self.raise_geo_restricted()
 
         return {
             'id': video_id,
-            'url': final_url,
+            'url': self._proto_relative_url(video_url, 'http:'),
             'title': title,
             'thumbnail': thumbnail_url,
         }
index b11cd254c7da9c8c780dedd2b2db120f8025c74b..185756301c3a9b0afe440254d3d2051ca97730d7 100644 (file)
@@ -8,6 +8,7 @@ from .xstream import XstreamIE
 from ..utils import (
     ExtractorError,
     float_or_none,
+    try_get,
 )
 
 
@@ -129,6 +130,11 @@ class VGTVIE(XstreamIE):
             'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil',
             'only_matching': True,
         },
+        {
+            # geoblocked
+            'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):
@@ -196,6 +202,12 @@ class VGTVIE(XstreamIE):
 
         info['formats'].extend(formats)
 
+        if not info['formats']:
+            properties = try_get(
+                data, lambda x: x['streamConfiguration']['properties'], list)
+            if properties and 'geoblocked' in properties:
+                raise self.raise_geo_restricted()
+
         self._sort_formats(info['formats'])
 
         info.update({
diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py
new file mode 100644 (file)
index 0000000..8742b60
--- /dev/null
@@ -0,0 +1,107 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import time
+import hashlib
+import json
+
+from .adobepass import AdobePassIE
+from ..compat import compat_HTTPError
+from ..utils import (
+    int_or_none,
+    parse_age_limit,
+    str_or_none,
+    parse_duration,
+    ExtractorError,
+    extract_attributes,
+)
+
+
+class VicelandIE(AdobePassIE):
+    _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)'
+    _TEST = {
+        'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e',
+        'info_dict': {
+            'id': '57608447973ee7705f6fbd4e',
+            'ext': 'mp4',
+            'title': 'CYBERWAR (Trailer)',
+            'description': 'Tapping into the geopolitics of hacking and surveillance, Ben Makuch travels the world to meet with hackers, government officials, and dissidents to investigate the ecosystem of cyberwarfare.',
+            'age_limit': 14,
+            'timestamp': 1466008539,
+            'upload_date': '20160615',
+            'uploader_id': '11',
+            'uploader': 'Viceland',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['UplynkPreplay'],
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        watch_hub_data = extract_attributes(self._search_regex(
+            r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))
+        video_id = watch_hub_data['vms-id']
+        title = watch_hub_data['video-title']
+
+        query = {}
+        if watch_hub_data.get('video-locked') == '1':
+            resource = self._get_mvpd_resource(
+                'VICELAND', title, video_id,
+                watch_hub_data.get('video-rating'))
+            query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource)
+
+        # signature generation algorithm is reverse engineered from signatureGenerator in
+        # webpack:///../shared/~/vice-player/dist/js/vice-player.js in
+        # https://www.viceland.com/assets/common/js/web.vendor.bundle.js
+        exp = int(time.time()) + 14400
+        query.update({
+            'exp': exp,
+            'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
+        })
+
+        try:
+            preplay = self._download_json('https://www.viceland.com/en_us/preplay/%s' % video_id, video_id, query=query)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                error = json.loads(e.cause.read().decode())
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True)
+            raise
+
+        video_data = preplay['video']
+        base = video_data['base']
+        uplynk_preplay_url = preplay['preplayURL']
+        episode = video_data.get('episode', {})
+        channel = video_data.get('channel', {})
+
+        subtitles = {}
+        cc_url = preplay.get('ccURL')
+        if cc_url:
+            subtitles['en'] = [{
+                'url': cc_url,
+            }]
+
+        return {
+            '_type': 'url_transparent',
+            'url': uplynk_preplay_url,
+            'id': video_id,
+            'title': title,
+            'description': base.get('body'),
+            'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'),
+            'duration': parse_duration(video_data.get('video_duration') or watch_hub_data.get('video-duration')),
+            'timestamp': int_or_none(video_data.get('created_at')),
+            'age_limit': parse_age_limit(video_data.get('video_rating')),
+            'series': video_data.get('show_title') or watch_hub_data.get('show-title'),
+            'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')),
+            'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
+            'season_number': int_or_none(watch_hub_data.get('season')),
+            'season_id': str_or_none(episode.get('season_id')),
+            'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'),
+            'uploader_id': str_or_none(channel.get('id')),
+            'subtitles': subtitles,
+            'ie_key': 'UplynkPreplay',
+        }
diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py
new file mode 100644 (file)
index 0000000..e7ac5a8
--- /dev/null
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    remove_end,
+    unified_strdate,
+)
+
+
+class VidbitIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)'
+    _TESTS = [{
+        'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2',
+        'md5': '1a34b7f14defe3b8fafca9796892924d',
+        'info_dict': {
+            'id': 'jkL2yDOEq2',
+            'ext': 'mp4',
+            'title': 'Intro to VidBit',
+            'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7',
+            'thumbnail': 're:https?://.*\.jpg$',
+            'upload_date': '20160618',
+            'view_count': int,
+            'comment_count': int,
+        }
+    }, {
+        'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id)
+
+        video_url, title = [None] * 2
+
+        config = self._parse_json(self._search_regex(
+            r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'),
+            video_id, transform_source=js_to_json)
+        if config:
+            if config.get('file'):
+                video_url = compat_urlparse.urljoin(url, config['file'])
+            title = config.get('title')
+
+        if not video_url:
+            video_url = compat_urlparse.urljoin(url, self._search_regex(
+                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+                webpage, 'video URL', group='url'))
+
+        if not title:
+            title = remove_end(
+                self._html_search_regex(
+                    (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'),
+                    webpage, 'title', default=None) or self._og_search_title(webpage),
+                ' - VidBit')
+
+        description = self._html_search_meta(
+            ('description', 'og:description', 'twitter:description'),
+            webpage, 'description')
+
+        upload_date = unified_strdate(self._html_search_meta(
+            'datePublished', webpage, 'upload date'))
+
+        view_count = int_or_none(self._search_regex(
+            r'<strong>(\d+)</strong> views',
+            webpage, 'view count', fatal=False))
+        comment_count = int_or_none(self._search_regex(
+            r'id=["\']cmt_num["\'][^>]*>\((\d+)\)',
+            webpage, 'comment count', fatal=False))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'upload_date': upload_date,
+            'view_count': view_count,
+            'comment_count': comment_count,
+        }
index 3c78fb3d5a071a6f49dec7467e620c0b8a01ded9..d49cc6cbc567a8a0219f304a52707bd4129d1119 100644 (file)
@@ -9,8 +9,8 @@ from ..utils import (
 
 
 class VidziIE(JWPlatformBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
         'url': 'http://vidzi.tv/cghql9yq6emu.html',
         'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
         'info_dict': {
@@ -22,12 +22,16 @@ class VidziIE(JWPlatformBaseIE):
             # m3u8 download
             'skip_download': True,
         },
-    }
+    }, {
+        'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html',
+        'skip_download': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'http://vidzi.tv/%s' % video_id, video_id)
         title = self._html_search_regex(
             r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
 
index efa15e0b633f56dc24c15eeb4e63889cbb084ab3..4351ac4571935fa3c3ace915c0b97f20e67ec18d 100644 (file)
@@ -130,7 +130,7 @@ class VikiIE(VikiBaseIE):
     }, {
         # clip
         'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
-        'md5': 'feea2b1d7b3957f70886e6dfd8b8be84',
+        'md5': '86c0b5dbd4d83a6611a79987cc7a1989',
         'info_dict': {
             'id': '1067139v',
             'ext': 'mp4',
@@ -156,15 +156,11 @@ class VikiIE(VikiBaseIE):
             'like_count': int,
             'age_limit': 13,
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
         'skip': 'Blocked in the US',
     }, {
         # episode
         'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
-        'md5': '1f54697dabc8f13f31bf06bb2e4de6db',
+        'md5': '5fa476a902e902783ac7a4d615cdbc7a',
         'info_dict': {
             'id': '44699v',
             'ext': 'mp4',
@@ -200,7 +196,7 @@ class VikiIE(VikiBaseIE):
     }, {
         # non-English description
         'url': 'http://www.viki.com/videos/158036v-love-in-magic',
-        'md5': '013dc282714e22acf9447cad14ff1208',
+        'md5': '1713ae35df5a521b31f6dc40730e7c9c',
         'info_dict': {
             'id': '158036v',
             'ext': 'mp4',
@@ -281,9 +277,16 @@ class VikiIE(VikiBaseIE):
                 r'^(\d+)[pP]$', format_id, 'height', default=None))
             for protocol, format_dict in stream_dict.items():
                 if format_id == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        format_dict['url'], video_id, 'mp4', 'm3u8_native',
-                        m3u8_id='m3u8-%s' % protocol, fatal=False))
+                    m3u8_formats = self._extract_m3u8_formats(
+                        format_dict['url'], video_id, 'mp4',
+                        entry_protocol='m3u8_native', preference=-1,
+                        m3u8_id='m3u8-%s' % protocol, fatal=False)
+                    # Despite CODECS metadata in m3u8 all video-only formats
+                    # are actually video+audio
+                    for f in m3u8_formats:
+                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+                            f['acodec'] = None
+                    formats.extend(m3u8_formats)
                 else:
                     formats.append({
                         'url': format_dict['url'],
index d9c9852d463d2ee533e3447f536a1a665e06ddd8..7e854f3265eac3312f1b63199ce8633a17eb7d04 100644 (file)
@@ -364,6 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
         if mobj:
             return mobj.group(1)
+        # Look more for non-standard embedded Vimeo player
+        mobj = re.search(
+            r'<video[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage)
+        if mobj:
+            return mobj.group('url')
 
     def _verify_player_video_password(self, url, video_id):
         password = self._downloader.params.get('videopassword')
index 5b801849cae1c26cd40049e625c65715c7e58010..0183f052a599f411a48053360ee41670e758f7af 100644 (file)
@@ -90,10 +90,12 @@ class VineIE(InfoExtractor):
 
         data = self._parse_json(
             self._search_regex(
-                r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id,
+                r'window\.POST_DATA\s*=\s*({.+?});\s*</script>',
                 webpage, 'vine data'),
             video_id)
 
+        data = data[list(data.keys())[0]]
+
         formats = [{
             'format_id': '%(format)s-%(rate)s' % f,
             'vcodec': f.get('format'),
index cfc5ffd8b02082d1f6d3035ba7b02dc230b8b2ed..3ee66e23e22b1d5350148d5cb21c2f55e4885f75 100644 (file)
@@ -6,11 +6,18 @@ import json
 import sys
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
+    clean_html,
     ExtractorError,
+    get_element_by_class,
     int_or_none,
     orderedSet,
+    parse_duration,
+    remove_start,
     str_to_int,
     unescapeHTML,
     unified_strdate,
@@ -20,26 +27,72 @@ from .vimeo import VimeoIE
 from .pladform import PladformIE
 
 
-class VKIE(InfoExtractor):
+class VKBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'vk'
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        login_page, url_handle = self._download_webpage_handle(
+            'https://vk.com', None, 'Downloading login page')
+
+        login_form = self._hidden_inputs(login_page)
+
+        login_form.update({
+            'email': username.encode('cp1251'),
+            'pass': password.encode('cp1251'),
+        })
+
+        # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header
+        # and expects the first one to be set rather than second (see
+        # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
+        # As of RFC6265 the newer one cookie should be set into cookie store
+        # what actually happens.
+        # We will workaround this VK issue by resetting the remixlhk cookie to
+        # the first one manually.
+        cookies = url_handle.headers.get('Set-Cookie')
+        if cookies:
+            if sys.version_info[0] >= 3:
+                cookies = cookies.encode('iso-8859-1')
+            cookies = cookies.decode('utf-8')
+            remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
+            if remixlhk:
+                value, domain = remixlhk.groups()
+                self._set_cookie(domain, 'remixlhk', value)
+
+        login_page = self._download_webpage(
+            'https://login.vk.com/?act=login', None,
+            note='Logging in as %s' % username,
+            data=urlencode_postdata(login_form))
+
+        if re.search(r'onLoginFailed', login_page):
+            raise ExtractorError(
+                'Unable to login, incorrect username and/or password', expected=True)
+
+    def _real_initialize(self):
+        self._login()
+
+
+class VKIE(VKBaseIE):
     IE_NAME = 'vk'
     IE_DESC = 'VK'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
                             (?:
-                                (?:m\.)?vk\.com/video_|
+                                (?:(?:m|new)\.)?vk\.com/video_|
                                 (?:www\.)?daxab.com/
                             )
                             ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
                             (?:
-                                (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
+                                (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video|
                                 (?:www\.)?daxab.com/embed/
                             )
                             (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
                         )
                     '''
-    _NETRC_MACHINE = 'vk'
-
     _TESTS = [
         {
             'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521',
@@ -182,52 +235,13 @@ class VKIE(InfoExtractor):
             # pladform embed
             'url': 'https://vk.com/video-76116461_171554880',
             'only_matching': True,
+        },
+        {
+            'url': 'http://new.vk.com/video205387401_165548505',
+            'only_matching': True,
         }
     ]
 
-    def _login(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            return
-
-        login_page, url_handle = self._download_webpage_handle(
-            'https://vk.com', None, 'Downloading login page')
-
-        login_form = self._hidden_inputs(login_page)
-
-        login_form.update({
-            'email': username.encode('cp1251'),
-            'pass': password.encode('cp1251'),
-        })
-
-        # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header
-        # and expects the first one to be set rather than second (see
-        # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201).
-        # As of RFC6265 the newer one cookie should be set into cookie store
-        # what actually happens.
-        # We will workaround this VK issue by resetting the remixlhk cookie to
-        # the first one manually.
-        cookies = url_handle.headers.get('Set-Cookie')
-        if sys.version_info[0] >= 3:
-            cookies = cookies.encode('iso-8859-1')
-        cookies = cookies.decode('utf-8')
-        remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies)
-        if remixlhk:
-            value, domain = remixlhk.groups()
-            self._set_cookie(domain, 'remixlhk', value)
-
-        login_page = self._download_webpage(
-            'https://login.vk.com/?act=login', None,
-            note='Logging in as %s' % username,
-            data=urlencode_postdata(login_form))
-
-        if re.search(r'onLoginFailed', login_page):
-            raise ExtractorError(
-                'Unable to login, incorrect username and/or password', expected=True)
-
-    def _real_initialize(self):
-        self._login()
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('videoid')
@@ -351,10 +365,10 @@ class VKIE(InfoExtractor):
         }
 
 
-class VKUserVideosIE(InfoExtractor):
+class VKUserVideosIE(VKBaseIE):
     IE_NAME = 'vk:uservideos'
     IE_DESC = "VK - User's Videos"
-    _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
+    _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
     _TEMPLATE_URL = 'https://vk.com/videos'
     _TESTS = [{
         'url': 'http://vk.com/videos205387401',
@@ -369,6 +383,12 @@ class VKUserVideosIE(InfoExtractor):
     }, {
         'url': 'http://vk.com/videos-97664626?section=all',
         'only_matching': True,
+    }, {
+        'url': 'http://m.vk.com/videos205387401',
+        'only_matching': True,
+    }, {
+        'url': 'http://new.vk.com/videos205387401',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -386,3 +406,121 @@ class VKUserVideosIE(InfoExtractor):
             webpage, 'title', default=page_id))
 
         return self.playlist_result(entries, page_id, title)
+
+
+class VKWallPostIE(VKBaseIE):
+    IE_NAME = 'vk:wallpost'
+    _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))'
+    _TESTS = [{
+        # public page URL, audio playlist
+        'url': 'https://vk.com/bs.official?w=wall-23538238_35',
+        'info_dict': {
+            'id': '23538238_35',
+            'title': 'Black Shadow - Wall post 23538238_35',
+            'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
+        },
+        'playlist': [{
+            'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
+            'info_dict': {
+                'id': '135220665_111806521',
+                'ext': 'mp3',
+                'title': 'Black Shadow - Слепое Верование',
+                'duration': 370,
+                'uploader': 'Black Shadow',
+                'artist': 'Black Shadow',
+                'track': 'Слепое Верование',
+            },
+        }, {
+            'md5': '4cc7e804579122b17ea95af7834c9233',
+            'info_dict': {
+                'id': '135220665_111802303',
+                'ext': 'mp3',
+                'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
+                'duration': 423,
+                'uploader': 'Black Shadow',
+                'artist': 'Black Shadow',
+                'track': 'Война - Негасимое Бездны Пламя!',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        }],
+        'skip': 'Requires vk account credentials',
+    }, {
+        # single YouTube embed, no leading -
+        'url': 'https://vk.com/wall85155021_6319',
+        'info_dict': {
+            'id': '85155021_6319',
+            'title': 'Sergey Gorbunov - Wall post 85155021_6319',
+        },
+        'playlist_count': 1,
+        'skip': 'Requires vk account credentials',
+    }, {
+        # wall page URL
+        'url': 'https://vk.com/wall-23538238_35',
+        'only_matching': True,
+    }, {
+        # mobile wall page URL
+        'url': 'https://m.vk.com/wall-23538238_35',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        post_id = self._match_id(url)
+
+        wall_url = 'https://vk.com/wall%s' % post_id
+
+        post_id = remove_start(post_id, '-')
+
+        webpage = self._download_webpage(wall_url, post_id)
+
+        error = self._html_search_regex(
+            r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)',
+            webpage, 'error', default=None)
+        if error:
+            raise ExtractorError('VK said: %s' % error, expected=True)
+
+        description = clean_html(get_element_by_class('wall_post_text', webpage))
+        uploader = clean_html(get_element_by_class(
+            'fw_post_author', webpage)) or self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        entries = []
+
+        for audio in re.finditer(r'''(?sx)
+                            <input[^>]+
+                                id=(?P<q1>["\'])audio_info(?P<id>\d+_\d+).*?(?P=q1)[^>]+
+                                value=(?P<q2>["\'])(?P<url>http.+?)(?P=q2)
+                                .+?
+                            </table>''', webpage):
+            audio_html = audio.group(0)
+            audio_id = audio.group('id')
+            duration = parse_duration(get_element_by_class('duration', audio_html))
+            track = self._html_search_regex(
+                r'<span[^>]+id=["\']title%s[^>]*>([^<]+)' % audio_id,
+                audio_html, 'title', default=None)
+            artist = self._html_search_regex(
+                r'>([^<]+)</a></b>\s*&ndash', audio_html,
+                'artist', default=None)
+            entries.append({
+                'id': audio_id,
+                'url': audio.group('url'),
+                'title': '%s - %s' % (artist, track) if artist and track else audio_id,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'uploader': uploader,
+                'artist': artist,
+                'track': track,
+            })
+
+        for video in re.finditer(
+                r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):
+            entries.append(self.url_result(
+                compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key()))
+
+        title = 'Wall post %s' % post_id
+
+        return self.playlist_result(
+            orderedSet(entries), post_id,
+            '%s - %s' % (uploader, title) if uploader else title,
+            description)
diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py
new file mode 100644 (file)
index 0000000..b49542b
--- /dev/null
@@ -0,0 +1,58 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import unescapeHTML
+
+
+class VODPlatformIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/embed/(?P<id>[^/?#]+)'
+    _TEST = {
+        # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar
+        'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw',
+        'md5': '1db2b7249ce383d6be96499006e951fc',
+        'info_dict': {
+            'id': 'RufMcytHDolTH1MuKHY9Fw',
+            'ext': 'mp4',
+            'title': 'LBCi News_ النصرة في ضيافة الـ "سي.أن.أن"',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = unescapeHTML(self._og_search_title(webpage))
+        hidden_inputs = self._hidden_inputs(webpage)
+
+        base_url = self._search_regex(
+            '(.*/)(?:playlist.m3u8|manifest.mpd)',
+            hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'],
+            'base url')
+        formats = self._extract_m3u8_formats(
+            base_url + 'playlist.m3u8', video_id, 'mp4',
+            'm3u8_native', m3u8_id='hls', fatal=False)
+        formats.extend(self._extract_mpd_formats(
+            base_url + 'manifest.mpd', video_id,
+            mpd_id='dash', fatal=False))
+        rtmp_formats = self._extract_smil_formats(
+            base_url + 'jwplayer.smil', video_id, fatal=False)
+        for rtmp_format in rtmp_formats:
+            rtsp_format = rtmp_format.copy()
+            rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+            del rtsp_format['play_path']
+            del rtsp_format['ext']
+            rtsp_format.update({
+                'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+                'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                'protocol': 'rtsp',
+            })
+            formats.extend([rtmp_format, rtsp_format])
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': hidden_inputs.get('HiddenThumbnail') or self._og_search_thumbnail(webpage),
+            'formats': formats,
+        }
index 8e35f24e81e7e61240898ea3259914737365fd2f..bec7ab327008803f8609ea0e78e7d70577556940 100644 (file)
@@ -25,7 +25,8 @@ class VRTIE(InfoExtractor):
                 'timestamp': 1414271750.949,
                 'upload_date': '20141025',
                 'duration': 929,
-            }
+            },
+            'skip': 'HTTP Error 404: Not Found',
         },
         # sporza.be
         {
@@ -39,7 +40,8 @@ class VRTIE(InfoExtractor):
                 'timestamp': 1413835980.560,
                 'upload_date': '20141020',
                 'duration': 3238,
-            }
+            },
+            'skip': 'HTTP Error 404: Not Found',
         },
         # cobra.be
         {
@@ -53,16 +55,39 @@ class VRTIE(InfoExtractor):
                 'timestamp': 1413967500.494,
                 'upload_date': '20141022',
                 'duration': 661,
-            }
+            },
+            'skip': 'HTTP Error 404: Not Found',
         },
         {
             # YouTube video
             'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957',
-            'only_matching': True,
+            'md5': 'b8b93da1df1cea6c8556255a796b7d61',
+            'info_dict': {
+                'id': 'Wji-BZ0oCwg',
+                'ext': 'mp4',
+                'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer',
+                'description': 'md5:8e468944dce15567a786a67f74262583',
+                'uploader': 'Star Wars',
+                'uploader_id': 'starwars',
+                'upload_date': '20160407',
+            },
+            'add_ie': ['Youtube'],
         },
         {
             'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055',
-            'only_matching': True,
+            'md5': '',
+            'info_dict': {
+                'id': '2377055',
+                'ext': 'mp4',
+                'title': 'Cafe Derby',
+                'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.',
+                'upload_date': '20150626',
+                'timestamp': 1435305240.769,
+            },
+            'params': {
+                # m3u8 download
+                'skip_download': True,
+            }
         }
     ]
 
@@ -98,6 +123,32 @@ class VRTIE(InfoExtractor):
                 formats.extend(self._extract_m3u8_formats(
                     src, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_f4m_formats(
+                    src.replace('playlist.m3u8', 'manifest.f4m'),
+                    video_id, f4m_id='hds', fatal=False))
+                if 'data-video-geoblocking="true"' not in webpage:
+                    rtmp_formats = self._extract_smil_formats(
+                        src.replace('playlist.m3u8', 'jwplayer.smil'),
+                        video_id, fatal=False)
+                    formats.extend(rtmp_formats)
+                    for rtmp_format in rtmp_formats:
+                        rtmp_format_c = rtmp_format.copy()
+                        rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+                        del rtmp_format_c['play_path']
+                        del rtmp_format_c['ext']
+                        http_format = rtmp_format_c.copy()
+                        http_format.update({
+                            'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''),
+                            'format_id': rtmp_format['format_id'].replace('rtmp', 'http'),
+                            'protocol': 'http',
+                        })
+                        rtsp_format = rtmp_format_c.copy()
+                        rtsp_format.update({
+                            'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+                            'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                            'protocol': 'rtsp',
+                        })
+                        formats.extend([http_format, rtsp_format])
             else:
                 formats.extend(self._extract_f4m_formats(
                     '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False))
index eaa888f005cc61c53b8f45c3e3b93633083b17ed..b73da5cd073a6f4803c070b1cde3c4a3879dbaf1 100644 (file)
@@ -9,7 +9,7 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     parse_duration,
-    qualities,
+    remove_end,
 )
 
 
@@ -22,7 +22,7 @@ class VuClipIE(InfoExtractor):
             'id': '922692425',
             'ext': '3gp',
             'title': 'The Toy Soldiers - Hollywood Movie Trailer',
-            'duration': 180,
+            'duration': 177,
         }
     }
 
@@ -46,34 +46,21 @@ class VuClipIE(InfoExtractor):
                 '%s said: %s' % (self.IE_NAME, error_msg), expected=True)
 
         # These clowns alternate between two page types
-        links_code = self._search_regex(
-            r'''(?xs)
-                (?:
-                    <img\s+src="[^"]*/play.gif".*?>|
-                    <!--\ player\ end\ -->\s*</div><!--\ thumb\ end-->
-                )
-                (.*?)
-                (?:
-                    <a\s+href="fblike|<div\s+class="social">
-                )
-            ''', webpage, 'links')
-        title = self._html_search_regex(
-            r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip()
+        video_url = self._search_regex(
+            r'<a[^>]+href="([^"]+)"[^>]*><img[^>]+src="[^"]*/play\.gif',
+            webpage, 'video URL', default=None)
+        if video_url:
+            formats = [{
+                'url': video_url,
+            }]
+        else:
+            formats = self._parse_html5_media_entries(url, webpage)[0]['formats']
 
-        quality_order = qualities(['Reg', 'Hi'])
-        formats = []
-        for url, q in re.findall(
-                r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code):
-            format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q
-            formats.append({
-                'format_id': format_id,
-                'url': url,
-                'quality': quality_order(q),
-            })
-        self._sort_formats(formats)
+        title = remove_end(self._html_search_regex(
+            r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip(), ' - Video')
 
-        duration = parse_duration(self._search_regex(
-            r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False))
+        duration = parse_duration(self._html_search_regex(
+            r'[(>]([0-9]+:[0-9]+)(?:<span|\))', webpage, 'duration', fatal=False))
 
         return {
             'id': video_id,
index de7d6b55935cd5fd8edb4c83c581505c2f0f4214..9f1b8b4b5f1bbe6f5d34d5e31bcc5596c35543f7 100644 (file)
@@ -9,7 +9,7 @@ from ..utils import (
     ExtractorError,
     unified_strdate,
     HEADRequest,
-    float_or_none,
+    int_or_none,
 )
 
 
@@ -31,48 +31,58 @@ class WatIE(InfoExtractor):
         },
         {
             'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html',
-            'md5': 'fbc84e4378165278e743956d9c1bf16b',
+            'md5': '34bdfa5ca9fd3c7eb88601b635b0424c',
             'info_dict': {
                 'id': '11713075',
                 'ext': 'mp4',
                 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
-                'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3',
                 'upload_date': '20140816',
-                'duration': 2910,
             },
-            'skip': "Ce contenu n'est pas disponible pour l'instant.",
+            'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."],
         },
     ]
 
+    _FORMATS = (
+        (200, 416, 234),
+        (400, 480, 270),
+        (600, 640, 360),
+        (1200, 640, 360),
+        (1800, 960, 540),
+        (2500, 1280, 720),
+    )
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
 
         # 'contentv4' is used in the website, but it also returns the related
         # videos, we don't need them
-        video_info = self._download_json(
-            'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media']
+        video_data = self._download_json(
+            'http://www.wat.tv/interface/contentv4s/' + video_id, video_id)
+        video_info = video_data['media']
 
         error_desc = video_info.get('error_desc')
         if error_desc:
-            raise ExtractorError(
-                '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True)
+            self.report_warning(
+                '%s returned error: %s' % (self.IE_NAME, error_desc))
 
         chapters = video_info['chapters']
-        first_chapter = chapters[0]
+        if chapters:
+            first_chapter = chapters[0]
 
-        def video_id_for_chapter(chapter):
-            return chapter['tc_start'].split('-')[0]
+            def video_id_for_chapter(chapter):
+                return chapter['tc_start'].split('-')[0]
 
-        if video_id_for_chapter(first_chapter) != video_id:
-            self.to_screen('Multipart video detected')
-            entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
-            return self.playlist_result(entries, video_id, video_info['title'])
-        # Otherwise we can continue and extract just one part, we have to use
-        # the video id for getting the video url
+            if video_id_for_chapter(first_chapter) != video_id:
+                self.to_screen('Multipart video detected')
+                entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
+                return self.playlist_result(entries, video_id, video_info['title'])
+            # Otherwise we can continue and extract just one part, we have to use
+            # the video id for getting the video url
+        else:
+            first_chapter = video_info
 
-        date_diffusion = first_chapter.get('date_diffusion')
-        upload_date = unified_strdate(date_diffusion) if date_diffusion else None
+        title = first_chapter['title']
 
         def extract_url(path_template, url_type):
             req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
@@ -84,45 +94,61 @@ class WatIE(InfoExtractor):
                     expected=True)
             return red_url
 
-        m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
-        http_url = extract_url('android5/%s.mp4', 'http')
-
         formats = []
-        m3u8_formats = self._extract_m3u8_formats(
-            m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
-        formats.extend(m3u8_formats)
-        formats.extend(self._extract_f4m_formats(
-            m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
-            video_id, f4m_id='hds', fatal=False))
-        for m3u8_format in m3u8_formats:
-            mobj = re.search(
-                r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url'])
-            if not mobj:
-                continue
-            abr, vbr = mobj.groups()
-            abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
-            m3u8_format.update({
-                'vbr': vbr,
-                'abr': abr,
-            })
-            if not vbr or not abr:
-                continue
-            f = m3u8_format.copy()
-            f.update({
-                'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url),
-                'format_id': f['format_id'].replace('hls', 'http'),
-                'protocol': 'http',
-            })
-            formats.append(f)
-        self._sort_formats(formats)
+        try:
+            http_url = extract_url('android5/%s.mp4', 'http')
+            m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
+            m3u8_formats = self._extract_m3u8_formats(
+                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+            formats.extend(m3u8_formats)
+            formats.extend(self._extract_f4m_formats(
+                m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
+                video_id, f4m_id='hds', fatal=False))
+            for m3u8_format in m3u8_formats:
+                vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr')
+                if not vbr or not abr:
+                    continue
+                format_id = m3u8_format['format_id'].replace('hls', 'http')
+                fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url)
+                if self._is_valid_url(fmt_url, video_id, format_id):
+                    f = m3u8_format.copy()
+                    f.update({
+                        'url': fmt_url,
+                        'format_id': format_id,
+                        'protocol': 'http',
+                    })
+                    formats.append(f)
+            self._sort_formats(formats)
+        except ExtractorError:
+            abr = 64
+            for vbr, width, height in self._FORMATS:
+                tbr = vbr + abr
+                format_id = 'http-%s' % tbr
+                fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr)
+                if self._is_valid_url(fmt_url, video_id, format_id):
+                    formats.append({
+                        'format_id': format_id,
+                        'url': fmt_url,
+                        'vbr': vbr,
+                        'abr': abr,
+                        'width': width,
+                        'height': height,
+                    })
+
+        date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4')
+        upload_date = unified_strdate(date_diffusion) if date_diffusion else None
+        duration = None
+        files = video_info['files']
+        if files:
+            duration = int_or_none(files[0].get('duration'))
 
         return {
             'id': video_id,
-            'title': first_chapter['title'],
-            'thumbnail': first_chapter['preview'],
-            'description': first_chapter['description'],
-            'view_count': video_info['views'],
+            'title': title,
+            'thumbnail': first_chapter.get('preview'),
+            'description': first_chapter.get('description'),
+            'view_count': int_or_none(video_info.get('views')),
             'upload_date': upload_date,
-            'duration': video_info['files'][0]['duration'],
+            'duration': duration,
             'formats': formats,
         }
index a6dfc4af98e6baafbe254579ebe060c208d6cd06..86abef25704bbc9e8a1494ecc2146b5c5bfabe32 100644 (file)
@@ -13,6 +13,7 @@ class XiamiBaseIE(InfoExtractor):
         webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs)
         if '>Xiami is currently not available in your country.<' in webpage:
             self.raise_geo_restricted('Xiami is currently not available in your country')
+        return webpage
 
     def _extract_track(self, track, track_id=None):
         title = track['title']
index 4075b8a4f8a705cf29aa1430656146350a8d07aa..83bc1fef2095b322a67199c60e27fcc6f8f1bcbc 100644 (file)
@@ -4,17 +4,23 @@ import itertools
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
     orderedSet,
+    parse_duration,
     sanitized_Request,
     str_to_int,
 )
 
 
 class XTubeIE(InfoExtractor):
-    _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)'
+    _VALID_URL = r'''(?x)
+                        (?:
+                            xtube:|
+                            https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-)
+                        )
+                        (?P<id>[^/?&#]+)
+                    '''
 
     _TESTS = [{
         # old URL schema
@@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor):
             'description': 'contains:an ET kind of thing',
             'uploader': 'greenshowers',
             'duration': 450,
+            'view_count': int,
+            'comment_count': int,
             'age_limit': 18,
         }
     }, {
@@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor):
         req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1')
         webpage = self._download_webpage(req, display_id)
 
-        flashvars = self._parse_json(
-            self._search_regex(
-                r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'),
-            video_id)['flashvars']
-
-        title = flashvars.get('title') or self._search_regex(
-            r'<h1>([^<]+)</h1>', webpage, 'title')
-        video_url = compat_urllib_parse_unquote(flashvars['video_url'])
-        duration = int_or_none(flashvars.get('video_duration'))
-
-        uploader = self._search_regex(
-            r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
-            webpage, 'uploader', fatal=False)
+        sources = self._parse_json(self._search_regex(
+            r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id)
+
+        formats = []
+        for format_id, format_url in sources.items():
+            formats.append({
+                'url': format_url,
+                'format_id': format_id,
+                'height': int_or_none(format_id),
+            })
+        self._sort_formats(formats)
+
+        title = self._search_regex(
+            (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+            webpage, 'title', group='title')
         description = self._search_regex(
             r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
+        uploader = self._search_regex(
+            (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
+             r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
+            webpage, 'uploader', fatal=False)
+        duration = parse_duration(self._search_regex(
+            r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>',
+            webpage, 'duration', fatal=False))
         view_count = str_to_int(self._search_regex(
             r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>',
             webpage, 'view count', fatal=False))
@@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor):
         return {
             'id': video_id,
             'display_id': display_id,
-            'url': video_url,
             'title': title,
             'description': description,
             'uploader': uploader,
@@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor):
             'view_count': view_count,
             'comment_count': comment_count,
             'age_limit': 18,
+            'formats': formats,
         }
 
 
index 0be8932ad36fbc02615bf6081972a621048283b3..a66daee46ebc0324152f69eefa2b66d79bfb513d 100644 (file)
@@ -67,6 +67,20 @@ class XuiteIE(InfoExtractor):
             'categories': ['電玩動漫'],
         },
         'skip': 'Video removed',
+    }, {
+        # Video with encoded media id
+        # from http://forgetfulbc.blogspot.com/2016/06/date.html
+        'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
+        'info_dict': {
+            'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==',
+            'ext': 'mp4',
+            'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
+            'description': 'md5:f0abdcb69df300f522a5442ef3146f2a',
+            'timestamp': 1466160960,
+            'upload_date': '20160617',
+            'uploader': 'B.C. & Lowy',
+            'uploader_id': '232279340',
+        },
     }, {
         'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9',
         'only_matching': True,
@@ -80,10 +94,9 @@ class XuiteIE(InfoExtractor):
     def base64_encode_utf8(data):
         return base64.b64encode(data.encode('utf-8')).decode('utf-8')
 
-    def _extract_flv_config(self, media_id):
-        base64_media_id = self.base64_encode_utf8(media_id)
+    def _extract_flv_config(self, encoded_media_id):
         flv_config = self._download_xml(
-            'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
+            'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,
             'flv config')
         prop_dict = {}
         for prop in flv_config.findall('./property'):
@@ -108,9 +121,14 @@ class XuiteIE(InfoExtractor):
                 '%s returned error: %s' % (self.IE_NAME, error_msg),
                 expected=True)
 
-        video_id = self._html_search_regex(
-            r'data-mediaid="(\d+)"', webpage, 'media id')
-        flv_config = self._extract_flv_config(video_id)
+        encoded_media_id = self._search_regex(
+            r'attributes\.name\s*=\s*"([^"]+)"', webpage,
+            'encoded media id', default=None)
+        if encoded_media_id is None:
+            video_id = self._html_search_regex(
+                r'data-mediaid="(\d+)"', webpage, 'media id')
+            encoded_media_id = self.base64_encode_utf8(video_id)
+        flv_config = self._extract_flv_config(encoded_media_id)
 
         FORMATS = {
             'audio': 'mp3',
index 1dfe031cab8e9ccf4fa6fb23fbd90d40759ac930..30825daae956e12f450e32bc66aea737a5c293e0 100644 (file)
@@ -15,10 +15,10 @@ class XVideosIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)'
     _TEST = {
         'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl',
-        'md5': '4b46ae6ea5e6e9086e714d883313c0c9',
+        'md5': '14cea69fcb84db54293b1e971466c2e1',
         'info_dict': {
             'id': '4588838',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Biker Takes his Girl',
             'age_limit': 18,
         }
@@ -42,24 +42,24 @@ class XVideosIE(InfoExtractor):
         video_url = compat_urllib_parse_unquote(self._search_regex(
             r'flv_url=(.+?)&', webpage, 'video URL', default=''))
         if video_url:
-            formats.append({'url': video_url})
+            formats.append({
+                'url': video_url,
+                'format_id': 'flv',
+            })
 
-        player_args = self._search_regex(
-            r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None)
-        if player_args:
-            for arg in player_args.split(','):
-                format_url = self._search_regex(
-                    r'(["\'])(?P<url>https?://.+?)\1', arg, 'url',
-                    default=None, group='url')
-                if not format_url:
-                    continue
-                ext = determine_ext(format_url)
-                if ext == 'mp4':
-                    formats.append({'url': format_url})
-                elif ext == 'm3u8':
-                    formats.extend(self._extract_m3u8_formats(
-                        format_url, video_id, 'mp4',
-                        entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+        for kind, _, format_url in re.findall(
+                r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage):
+            format_id = kind.lower()
+            if format_id == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            elif format_id in ('urllow', 'urlhigh'):
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]),
+                    'quality': -2 if format_id.endswith('low') else None,
+                })
 
         self._sort_formats(formats)
 
index 927a964a4c5036094d657a634c6df8f8874e00af..b0679dfb70868f39b1190b8b04696787c02d75b1 100644 (file)
@@ -19,6 +19,7 @@ from ..utils import (
     mimetype2ext,
 )
 
+from .brightcove import BrightcoveNewIE
 from .nbc import NBCSportsVPlayerIE
 
 
@@ -227,7 +228,12 @@ class YahooIE(InfoExtractor):
         # Look for NBCSports iframes
         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
         if nbc_sports_url:
-            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+            return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key())
+
+        # Look for Brightcove New Studio embeds
+        bc_url = BrightcoveNewIE._extract_url(webpage)
+        if bc_url:
+            return self.url_result(bc_url, BrightcoveNewIE.ie_key())
 
         # Query result is often embedded in webpage as JSON. Sometimes explicit requests
         # to video API results in a failure with geo restriction reason therefore using
index b37d0eab66b53ab45ff38dcac91598079f08a275..fd6268ba4119d02988172a4771514cc34603db1a 100644 (file)
@@ -75,6 +75,12 @@ class YandexMusicTrackIE(YandexMusicBaseIE):
             % storage_dir,
             track_id, 'Downloading track location JSON')
 
+        # Each string is now wrapped in a list, this is probably only temporarily thus
+        # supporting both scenarios (see https://github.com/rg3/youtube-dl/issues/10193)
+        for k, v in data.items():
+            if v and isinstance(v, list):
+                data[k] = v[0]
+
         key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest()
         storage = storage_dir.split('.')
 
index 4150b28daffad5c8cae227c4bf76a125733ced73..31e2f9263ada15aa4932b66347b12640b1a46621 100644 (file)
@@ -9,8 +9,8 @@ from ..utils import (
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])'
-    _TEST = {
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+    _TESTS = [{
         'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
         'md5': '07e15fa469ba384c7693fd246905547c',
         'info_dict': {
@@ -19,7 +19,10 @@ class YouJizzIE(InfoExtractor):
             'title': 'Zeichentrick 1',
             'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://www.youjizz.com/videos/-2189178.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 147608ebebbbe8b71dda64e2daf974362868c50c..e37f237c76c6880eb1d442e8302dcef558d0e9d8 100644 (file)
@@ -16,7 +16,6 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     get_element_by_attribute,
-    sanitized_Request,
 )
 
 
@@ -218,14 +217,10 @@ class YoukuIE(InfoExtractor):
             headers = {
                 'Referer': req_url,
             }
+            headers.update(self.geo_verification_headers())
             self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')
-            req = sanitized_Request(req_url, headers=headers)
 
-            cn_verification_proxy = self._downloader.params.get('cn_verification_proxy')
-            if cn_verification_proxy:
-                req.add_header('Ytdl-request-proxy', cn_verification_proxy)
-
-            raw_data = self._download_json(req, video_id, note=note)
+            raw_data = self._download_json(req_url, video_id, note=note, headers=headers)
 
             return raw_data['data']
 
index c8d54f22a80ac447179305f50a85b42c483b946e..268080ba6c7f5833bd9f5da3201563cd164fa12e 100644 (file)
@@ -53,6 +53,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     """Provide base functions for Youtube extractors"""
     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
     _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
+    _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password'
     _NETRC_MACHINE = 'youtube'
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
@@ -116,12 +117,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             'hl': 'en_US',
         }
 
-        login_data = urlencode_postdata(login_form_strs)
-
-        req = sanitized_Request(self._LOGIN_URL, login_data)
         login_results = self._download_webpage(
-            req, None,
-            note='Logging in', errnote='unable to log in', fatal=False)
+            self._PASSWORD_CHALLENGE_URL, None,
+            note='Logging in', errnote='unable to log in', fatal=False,
+            data=urlencode_postdata(login_form_strs))
         if login_results is False:
             return False
 
@@ -137,7 +136,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         # Two-Factor
         # TODO add SMS and phone call support - these require making a request and then prompting the user
 
-        if re.search(r'(?i)<form[^>]id="challenge"', login_results) is not None:
+        if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None:
             tfa_code = self._get_tfa_info('2-step verification code')
 
             if not tfa_code:
@@ -165,17 +164,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             if tfa_results is False:
                 return False
 
-            if re.search(r'(?i)<form[^>]id="challenge"', tfa_results) is not None:
+            if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
                 self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
                 return False
-            if re.search(r'(?i)<form[^>]id="gaia_loginform"', tfa_results) is not None:
+            if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
                 self._downloader.report_warning('unable to log in - did the page structure change?')
                 return False
             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
                 return False
 
-        if re.search(r'(?i)<form[^>]id="gaia_loginform"', login_results) is not None:
+        if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None:
             self._downloader.report_warning('unable to log in: bad username or password')
             return False
         return True
@@ -858,6 +857,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         {
             'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;amp;v=V36LpHqtcDY',
             'only_matching': True,
+        },
+        {
+            # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059)
+            'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo',
+            'only_matching': True,
         }
     ]
 
@@ -1730,6 +1734,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         }
 
 
+class YoutubeSharedVideoIE(InfoExtractor):
+    _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P<id>[0-9A-Za-z_-]{11})'
+    IE_NAME = 'youtube:shared'
+
+    _TEST = {
+        'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
+        'info_dict': {
+            'id': 'uPDB5I9wfp8',
+            'ext': 'webm',
+            'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
+            'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
+            'upload_date': '20160219',
+            'uploader': 'Pocoyo - Português (BR)',
+            'uploader_id': 'PocoyoBrazil',
+        },
+        'add_ie': ['Youtube'],
+        'params': {
+            # There are already too many Youtube downloads
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        real_video_id = self._html_search_meta(
+            'videoId', webpage, 'YouTube video id', fatal=True)
+
+        return self.url_result(real_video_id, YoutubeIE.ie_key())
+
+
 class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com playlists'
     _VALID_URL = r"""(?x)(?:
@@ -1945,10 +1982,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
         return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
                 else super(YoutubeChannelIE, cls).suitable(url))
 
+    def _build_template_url(self, url, channel_id):
+        return self._TEMPLATE_URL % channel_id
+
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
-        url = self._TEMPLATE_URL % channel_id
+        url = self._build_template_url(url, channel_id)
 
         # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
         # Workaround by extracting as a playlist if managed to obtain channel playlist URL
@@ -1962,9 +2002,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
             channel_playlist_id = self._html_search_meta(
                 'channelId', channel_page, 'channel id', default=None)
             if not channel_playlist_id:
-                channel_playlist_id = self._search_regex(
-                    r'data-(?:channel-external-|yt)id="([^"]+)"',
-                    channel_page, 'channel id', default=None)
+                channel_url = self._html_search_meta(
+                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
+                    channel_page, 'channel url', default=None)
+                if channel_url:
+                    channel_playlist_id = self._search_regex(
+                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
+                        channel_url, 'channel id', default=None)
         if channel_playlist_id and channel_playlist_id.startswith('UC'):
             playlist_id = 'UU' + channel_playlist_id[2:]
             return self.url_result(
@@ -1987,20 +2031,39 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
                 for video_id, video_title in self.extract_videos_from_page(channel_page)]
             return self.playlist_result(entries, channel_id)
 
+        try:
+            next(self._entries(channel_page, channel_id))
+        except StopIteration:
+            alert_message = self._html_search_regex(
+                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
+                channel_page, 'alert', default=None, group='alert')
+            if alert_message:
+                raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
+
         return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
 
 
 class YoutubeUserIE(YoutubeChannelIE):
     IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
-    _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos'
+    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
     IE_NAME = 'youtube:user'
 
     _TESTS = [{
         'url': 'https://www.youtube.com/user/TheLinuxFoundation',
         'playlist_mincount': 320,
         'info_dict': {
-            'title': 'TheLinuxFoundation',
+            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
+            'title': 'Uploads from The Linux Foundation',
+        }
+    }, {
+        # Only available via https://www.youtube.com/c/12minuteathlete/videos
+        # but not https://www.youtube.com/user/12minuteathlete/videos
+        'url': 'https://www.youtube.com/c/12minuteathlete/videos',
+        'playlist_mincount': 249,
+        'info_dict': {
+            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
+            'title': 'Uploads from 12 Minute Athlete',
         }
     }, {
         'url': 'ytuser:phihag',
@@ -2008,6 +2071,13 @@ class YoutubeUserIE(YoutubeChannelIE):
     }, {
         'url': 'https://www.youtube.com/c/gametrailers',
         'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/gametrailers',
+        'only_matching': True,
+    }, {
+        # This channel is not available.
+        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -2020,6 +2090,10 @@ class YoutubeUserIE(YoutubeChannelIE):
         else:
             return super(YoutubeUserIE, cls).suitable(url)
 
+    def _build_template_url(self, url, channel_id):
+        mobj = re.match(self._VALID_URL, url)
+        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+
 
 class YoutubeLiveIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com live streams'
diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py
deleted file mode 100644 (file)
index de81937..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
-    determine_ext,
-    str_to_int,
-)
-
-
-class ZippCastIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)'
-    _TESTS = [{
-        # m3u8, hq direct link
-        'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81',
-        'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6',
-        'info_dict': {
-            'id': 'c9cfd5c7e44dbc29c81',
-            'ext': 'mp4',
-            'title': '[Vinesauce] Vinny - Digital Space Traveler',
-            'description': 'Muted on youtube, but now uploaded in it\'s original form.',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'uploader': 'vinesauce',
-            'view_count': int,
-            'categories': ['Entertainment'],
-            'tags': list,
-        },
-    }, {
-        # f4m, lq ipod direct link
-        'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(
-            'http://www.zippcast.com/video/%s' % video_id, video_id)
-
-        formats = []
-        video_url = self._search_regex(
-            r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage,
-            'video url', default=None, group='url')
-        if video_url:
-            formats.append({
-                'url': video_url,
-                'format_id': 'http',
-                'preference': 0,  # direct link is almost always of worse quality
-            })
-        src_url = self._search_regex(
-            r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1',
-            webpage, 'src', default=None, group='url')
-        ext = determine_ext(src_url)
-        if ext == 'm3u8':
-            formats.extend(self._extract_m3u8_formats(
-                src_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls', fatal=False))
-        elif ext == 'f4m':
-            formats.extend(self._extract_f4m_formats(
-                src_url, video_id, f4m_id='hds', fatal=False))
-        self._sort_formats(formats)
-
-        title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage) or self._html_search_meta(
-            'description', webpage)
-        uploader = self._search_regex(
-            r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>',
-            webpage, 'uploader', fatal=False)
-        thumbnail = self._og_search_thumbnail(webpage)
-        view_count = str_to_int(self._search_regex(
-            r'>([\d,.]+) views!', webpage, 'view count', fatal=False))
-
-        categories = re.findall(
-            r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<',
-            webpage)
-        tags = re.findall(
-            r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<',
-            webpage)
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'view_count': view_count,
-            'categories': categories,
-            'tags': tags,
-            'formats': formats,
-        }
index 99ce4131fdfaacbbbca07805fbc86c8563ea95fe..d32a9e32cd0549a7faacc9ee638a03c0413834d5 100644 (file)
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import os.path
 import optparse
+import re
 import sys
 
 from .downloader.external import list_external_downloaders
@@ -26,9 +27,11 @@ def parseOpts(overrideArguments=None):
         except IOError:
             return default  # silently skip if file is not present
         try:
-            res = []
-            for l in optionf:
-                res += compat_shlex_split(l, comments=True)
+            # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
+            contents = optionf.read()
+            if sys.version_info < (3,):
+                contents = contents.decode(preferredencoding())
+            res = compat_shlex_split(contents, comments=True)
         finally:
             optionf.close()
         return res
@@ -91,8 +94,18 @@ def parseOpts(overrideArguments=None):
         setattr(parser.values, option.dest, value.split(','))
 
     def _hide_login_info(opts):
-        opts = list(opts)
-        for private_opt in ['-p', '--password', '-u', '--username', '--video-password']:
+        PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password']
+        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+        def _scrub_eq(o):
+            m = eqre.match(o)
+            if m:
+                return m.group('key') + '=PRIVATE'
+            else:
+                return o
+
+        opts = list(map(_scrub_eq, opts))
+        for private_opt in PRIVATE_OPTS:
             try:
                 i = opts.index(private_opt)
                 opts[i + 1] = 'PRIVATE'
@@ -211,11 +224,16 @@ def parseOpts(overrideArguments=None):
         action='store_const', const='::', dest='source_address',
         help='Make all connections via IPv6 (experimental)',
     )
+    network.add_option(
+        '--geo-verification-proxy',
+        dest='geo_verification_proxy', default=None, metavar='URL',
+        help='Use this proxy to verify the IP address for some geo-restricted sites. '
+        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)'
+    )
     network.add_option(
         '--cn-verification-proxy',
         dest='cn_verification_proxy', default=None, metavar='URL',
-        help='Use this proxy to verify the IP address for some Chinese sites. '
-        'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)'
+        help=optparse.SUPPRESS_HELP,
     )
 
     selection = optparse.OptionGroup(parser, 'Video Selection')
@@ -481,9 +499,20 @@ def parseOpts(overrideArguments=None):
         dest='bidi_workaround', action='store_true',
         help='Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH')
     workarounds.add_option(
-        '--sleep-interval', metavar='SECONDS',
+        '--sleep-interval', '--min-sleep-interval', metavar='SECONDS',
         dest='sleep_interval', type=float,
-        help='Number of seconds to sleep before each download.')
+        help=(
+            'Number of seconds to sleep before each download when used alone '
+            'or a lower bound of a range for randomized sleep before each download '
+            '(minimum possible number of seconds to sleep) when used along with '
+            '--max-sleep-interval.'))
+    workarounds.add_option(
+        '--max-sleep-interval', metavar='SECONDS',
+        dest='max_sleep_interval', type=float,
+        help=(
+            'Upper bound of a range for randomized sleep before each download '
+            '(maximum possible number of seconds to sleep). Must only be used '
+            'along with --min-sleep-interval.'))
 
     verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
     verbosity.add_option(
@@ -809,11 +838,11 @@ def parseOpts(overrideArguments=None):
             system_conf = []
             user_conf = []
         else:
-            system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf'))
+            system_conf = _readOptions('/etc/youtube-dl.conf')
             if '--ignore-config' in system_conf:
                 user_conf = []
             else:
-                user_conf = compat_conf(_readUserConf())
+                user_conf = _readUserConf()
         argv = system_conf + user_conf + command_line_conf
 
         opts, args = parser.parse_args(argv)
index 42377fa0f0bde0d3fa6ae578c6eaa0fe4a73474d..920573da9d8f472b8fdd8681cab0be1c6331afb7 100644 (file)
@@ -3,11 +3,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import PostProcessor
-from ..utils import PostProcessingError
-
-
-class MetadataFromTitlePPError(PostProcessingError):
-    pass
 
 
 class MetadataFromTitlePP(PostProcessor):
@@ -38,7 +33,8 @@ class MetadataFromTitlePP(PostProcessor):
         title = info['title']
         match = re.match(self._titleregex, title)
         if match is None:
-            raise MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat)
+            self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat)
+            return [], info
         for attribute, value in match.groupdict().items():
             value = match.group(attribute)
             info[attribute] = value
index fd49d74352c7d50e15956169f686c3ffbb107b5c..104807242bd3b0f35e0423faa096540be29d0a45 100644 (file)
@@ -76,7 +76,7 @@ class Socks4Error(ProxyError):
 
     CODES = {
         91: 'request rejected or failed',
-        92: 'request rejected becasue SOCKS server cannot connect to identd on the client',
+        92: 'request rejected because SOCKS server cannot connect to identd on the client',
         93: 'request rejected because the client program and identd report different user-ids'
     }
 
index 562031fe110a9a6962c6e5efed2bb311f3059979..b3b687a314681de17547104f7f99e883a312c322 100644 (file)
@@ -47,6 +47,7 @@ from .compat import (
     compat_socket_create_connection,
     compat_str,
     compat_struct_pack,
+    compat_struct_unpack,
     compat_urllib_error,
     compat_urllib_parse,
     compat_urllib_parse_urlencode,
@@ -110,6 +111,50 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙ
                         itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                         'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 
+DATE_FORMATS = (
+    '%d %B %Y',
+    '%d %b %Y',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b %dst %Y %I:%M',
+    '%b %dnd %Y %I:%M',
+    '%b %dth %Y %I:%M',
+    '%Y %m %d',
+    '%Y-%m-%d',
+    '%Y/%m/%d',
+    '%Y/%m/%d %H:%M',
+    '%Y/%m/%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S.%f',
+    '%d.%m.%Y %H:%M',
+    '%d.%m.%Y %H.%M',
+    '%Y-%m-%dT%H:%M:%SZ',
+    '%Y-%m-%dT%H:%M:%S.%fZ',
+    '%Y-%m-%dT%H:%M:%S.%f0Z',
+    '%Y-%m-%dT%H:%M:%S',
+    '%Y-%m-%dT%H:%M:%S.%f',
+    '%Y-%m-%dT%H:%M',
+)
+
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+    '%d-%m-%Y',
+    '%d.%m.%Y',
+    '%d.%m.%y',
+    '%d/%m/%Y',
+    '%d/%m/%y',
+    '%d/%m/%Y %H:%M:%S',
+])
+
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+    '%m-%d-%Y',
+    '%m.%d.%Y',
+    '%m/%d/%Y',
+    '%m/%d/%y',
+    '%m/%d/%Y %H:%M:%S',
+])
+
 
 def preferredencoding():
     """Get preferred encoding.
@@ -267,9 +312,17 @@ def get_element_by_id(id, html):
     return get_element_by_attribute('id', id, html)
 
 
-def get_element_by_attribute(attribute, value, html):
+def get_element_by_class(class_name, html):
+    return get_element_by_attribute(
+        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
+        html, escape_value=False)
+
+
+def get_element_by_attribute(attribute, value, html, escape_value=True):
     """Return the content of the tag with the specified attribute in the passed HTML document"""
 
+    value = re.escape(value) if escape_value else value
+
     m = re.search(r'''(?xs)
         <([a-zA-Z0-9:._-]+)
          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
@@ -278,7 +331,7 @@ def get_element_by_attribute(attribute, value, html):
         \s*>
         (?P<content>.*?)
         </\1>
-    ''' % (re.escape(attribute), re.escape(value)), html)
+    ''' % (re.escape(attribute), value), html)
 
     if not m:
         return None
@@ -975,6 +1028,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
     https_response = http_response
 
 
+def extract_timezone(date_str):
+    m = re.search(
+        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        date_str)
+    if not m:
+        timezone = datetime.timedelta()
+    else:
+        date_str = date_str[:-len(m.group('tz'))]
+        if not m.group('sign'):
+            timezone = datetime.timedelta()
+        else:
+            sign = 1 if m.group('sign') == '+' else -1
+            timezone = datetime.timedelta(
+                hours=sign * int(m.group('hours')),
+                minutes=sign * int(m.group('minutes')))
+    return timezone, date_str
+
+
 def parse_iso8601(date_str, delimiter='T', timezone=None):
     """ Return a UNIX timestamp from the given date """
 
@@ -984,20 +1055,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
     date_str = re.sub(r'\.[0-9]+', '', date_str)
 
     if timezone is None:
-        m = re.search(
-            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
-            date_str)
-        if not m:
-            timezone = datetime.timedelta()
-        else:
-            date_str = date_str[:-len(m.group(0))]
-            if not m.group('sign'):
-                timezone = datetime.timedelta()
-            else:
-                sign = 1 if m.group('sign') == '+' else -1
-                timezone = datetime.timedelta(
-                    hours=sign * int(m.group('hours')),
-                    minutes=sign * int(m.group('minutes')))
+        timezone, date_str = extract_timezone(date_str)
+
     try:
         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
         dt = datetime.datetime.strptime(date_str, date_format) - timezone
@@ -1006,6 +1065,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
         pass
 
 
+def date_formats(day_first=True):
+    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
+
+
 def unified_strdate(date_str, day_first=True):
     """Return a string with the date in the format YYYYMMDD"""
 
@@ -1014,53 +1077,11 @@ def unified_strdate(date_str, day_first=True):
     upload_date = None
     # Replace commas
     date_str = date_str.replace(',', ' ')
-    # %z (UTC offset) is only supported in python>=3.2
-    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
-        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
     # Remove AM/PM + timezone
     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+    _, date_str = extract_timezone(date_str)
 
-    format_expressions = [
-        '%d %B %Y',
-        '%d %b %Y',
-        '%B %d %Y',
-        '%b %d %Y',
-        '%b %dst %Y %I:%M',
-        '%b %dnd %Y %I:%M',
-        '%b %dth %Y %I:%M',
-        '%Y %m %d',
-        '%Y-%m-%d',
-        '%Y/%m/%d',
-        '%Y/%m/%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S.%f',
-        '%d.%m.%Y %H:%M',
-        '%d.%m.%Y %H.%M',
-        '%Y-%m-%dT%H:%M:%SZ',
-        '%Y-%m-%dT%H:%M:%S.%fZ',
-        '%Y-%m-%dT%H:%M:%S.%f0Z',
-        '%Y-%m-%dT%H:%M:%S',
-        '%Y-%m-%dT%H:%M:%S.%f',
-        '%Y-%m-%dT%H:%M',
-    ]
-    if day_first:
-        format_expressions.extend([
-            '%d-%m-%Y',
-            '%d.%m.%Y',
-            '%d.%m.%y',
-            '%d/%m/%Y',
-            '%d/%m/%y',
-            '%d/%m/%Y %H:%M:%S',
-        ])
-    else:
-        format_expressions.extend([
-            '%m-%d-%Y',
-            '%m.%d.%Y',
-            '%m/%d/%Y',
-            '%m/%d/%y',
-            '%m/%d/%Y %H:%M:%S',
-        ])
-    for expression in format_expressions:
+    for expression in date_formats(day_first):
         try:
             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
         except ValueError:
@@ -1076,6 +1097,29 @@ def unified_strdate(date_str, day_first=True):
         return compat_str(upload_date)
 
 
+def unified_timestamp(date_str, day_first=True):
+    if date_str is None:
+        return None
+
+    date_str = date_str.replace(',', ' ')
+
+    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
+    timezone, date_str = extract_timezone(date_str)
+
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+    for expression in date_formats(day_first):
+        try:
+            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
+            return calendar.timegm(dt.timetuple())
+        except ValueError:
+            pass
+    timetuple = email.utils.parsedate_tz(date_str)
+    if timetuple:
+        return calendar.timegm(timetuple) + pm_delta * 3600
+
+
 def determine_ext(url, default_ext='unknown_video'):
     if url is None:
         return default_ext
@@ -1410,6 +1454,8 @@ def shell_quote(args):
 def smuggle_url(url, data):
     """ Pass additional data in a URL for internal use. """
 
+    url, idata = unsmuggle_url(url, {})
+    data.update(idata)
     sdata = compat_urllib_parse_urlencode(
         {'__youtubedl_smuggle': json.dumps(data)})
     return url + '#' + sdata
@@ -1591,6 +1637,11 @@ class HEADRequest(compat_urllib_request.Request):
         return 'HEAD'
 
 
+class PUTRequest(compat_urllib_request.Request):
+    def get_method(self):
+        return 'PUT'
+
+
 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
     if get_attr:
         if v is not None:
@@ -1626,6 +1677,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
         return default
 
 
+def strip_or_none(v):
+    return None if v is None else v.strip()
+
+
 def parse_duration(s):
     if not isinstance(s, compat_basestring):
         return None
@@ -1882,7 +1937,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
     req_headers.update(headers)
     req_data = data or req.data
     req_url = update_url_query(url or req.get_full_url(), query)
-    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = compat_urllib_request.Request
     new_req = req_type(
         req_url, data=req_data, headers=req_headers,
         origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
@@ -1924,11 +1985,27 @@ US_RATINGS = {
 }
 
 
+TV_PARENTAL_GUIDELINES = {
+    'TV-Y': 0,
+    'TV-Y7': 7,
+    'TV-G': 0,
+    'TV-PG': 0,
+    'TV-14': 14,
+    'TV-MA': 17,
+}
+
+
 def parse_age_limit(s):
-    if s is None:
+    if type(s) == int:
+        return s if 0 <= s <= 21 else None
+    if not isinstance(s, compat_basestring):
         return None
     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
-    return int(m.group('age')) if m else US_RATINGS.get(s)
+    if m:
+        return int(m.group('age'))
+    if s in US_RATINGS:
+        return US_RATINGS[s]
+    return TV_PARENTAL_GUIDELINES.get(s)
 
 
 def strip_jsonp(code):
@@ -2046,6 +2123,7 @@ def mimetype2ext(mt):
         return ext
 
     _, _, res = mt.rpartition('/')
+    res = res.lower()
 
     return {
         '3gpp': '3gp',
@@ -2057,9 +2135,53 @@ def mimetype2ext(mt):
         'x-flv': 'flv',
         'x-mp4-fragmented': 'mp4',
         'x-ms-wmv': 'wmv',
+        'mpegurl': 'm3u8',
+        'x-mpegurl': 'm3u8',
+        'vnd.apple.mpegurl': 'm3u8',
+        'dash+xml': 'mpd',
+        'f4m': 'f4m',
+        'f4m+xml': 'f4m',
+        'hds+xml': 'f4m',
+        'vnd.ms-sstr+xml': 'ism',
     }.get(res, res)
 
 
+def parse_codecs(codecs_str):
+    # http://tools.ietf.org/html/rfc6381
+    if not codecs_str:
+        return {}
+    splited_codecs = list(filter(None, map(
+        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
+    vcodec, acodec = None, None
+    for full_codec in splited_codecs:
+        codec = full_codec.split('.')[0]
+        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
+            if not vcodec:
+                vcodec = full_codec
+        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
+            if not acodec:
+                acodec = full_codec
+        else:
+            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+    if not vcodec and not acodec:
+        if len(splited_codecs) == 2:
+            return {
+                'vcodec': vcodec,
+                'acodec': acodec,
+            }
+        elif len(splited_codecs) == 1:
+            return {
+                'vcodec': 'none',
+                'acodec': vcodec,
+            }
+    else:
+        return {
+            'vcodec': vcodec or 'none',
+            'acodec': acodec or 'none',
+        }
+    return {}
+
+
 def urlhandle_detect_ext(url_handle):
     getheader = url_handle.headers.get
 
@@ -2288,6 +2410,8 @@ def dfxp2srt(dfxp_data):
 
 def cli_option(params, command_option, param):
     param = params.get(param)
+    if param:
+        param = compat_str(param)
     return [command_option, param] if param is not None else []
 
 
@@ -2861,3 +2985,114 @@ def parse_m3u8_attributes(attrib):
             val = val[1:-1]
         info[key] = val
     return info
+
+
+def urshift(val, n):
+    return val >> n if val >= 0 else (val + 0x100000000) >> n
+
+
+# Based on png2str() written by @gdkchan and improved by @yokrysty
+# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
+def decode_png(png_data):
+    # Reference: https://www.w3.org/TR/PNG/
+    header = png_data[8:]
+
+    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
+        raise IOError('Not a valid PNG file.')
+
+    int_map = {1: '>B', 2: '>H', 4: '>I'}
+    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
+
+    chunks = []
+
+    while header:
+        length = unpack_integer(header[:4])
+        header = header[4:]
+
+        chunk_type = header[:4]
+        header = header[4:]
+
+        chunk_data = header[:length]
+        header = header[length:]
+
+        header = header[4:]  # Skip CRC
+
+        chunks.append({
+            'type': chunk_type,
+            'length': length,
+            'data': chunk_data
+        })
+
+    ihdr = chunks[0]['data']
+
+    width = unpack_integer(ihdr[:4])
+    height = unpack_integer(ihdr[4:8])
+
+    idat = b''
+
+    for chunk in chunks:
+        if chunk['type'] == b'IDAT':
+            idat += chunk['data']
+
+    if not idat:
+        raise IOError('Unable to read PNG data.')
+
+    decompressed_data = bytearray(zlib.decompress(idat))
+
+    stride = width * 3
+    pixels = []
+
+    def _get_pixel(idx):
+        x = idx % stride
+        y = idx // stride
+        return pixels[y][x]
+
+    for y in range(height):
+        basePos = y * (1 + stride)
+        filter_type = decompressed_data[basePos]
+
+        current_row = []
+
+        pixels.append(current_row)
+
+        for x in range(stride):
+            color = decompressed_data[1 + basePos + x]
+            basex = y * stride + x
+            left = 0
+            up = 0
+
+            if x > 2:
+                left = _get_pixel(basex - 3)
+            if y > 0:
+                up = _get_pixel(basex - stride)
+
+            if filter_type == 1:  # Sub
+                color = (color + left) & 0xff
+            elif filter_type == 2:  # Up
+                color = (color + up) & 0xff
+            elif filter_type == 3:  # Average
+                color = (color + ((left + up) >> 1)) & 0xff
+            elif filter_type == 4:  # Paeth
+                a = left
+                b = up
+                c = 0
+
+                if x > 2 and y > 0:
+                    c = _get_pixel(basex - stride - 3)
+
+                p = a + b - c
+
+                pa = abs(p - a)
+                pb = abs(p - b)
+                pc = abs(p - c)
+
+                if pa <= pb and pa <= pc:
+                    color = (color + a) & 0xff
+                elif pb <= pc:
+                    color = (color + b) & 0xff
+                else:
+                    color = (color + c) & 0xff
+
+            current_row.append(color)
+
+    return width, height, pixels
index 2b7a4c98dd2ded70a49a2455ec1309ad921c512e..cf59501177dfde41c61dcd6d3d7418b3b4468c7a 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.06.25'
+__version__ = '2016.08.17'