New upstream version 2017.03.26

[youtubedl] / youtube_dl / extractor / archiveorg.py
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py

index 7efd1d82324c5397bb6d6f10e1bfa993a2531584..e21045bed9f7eb161d2dbacd02008e8ac11f9ec3 100644 (file)
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -1,67 +1,65 @@
-import json
-import re
+from __future__ import unicode_literals
  
  from .common import InfoExtractor
  from ..utils import (
  
  from .common import InfoExtractor
  from ..utils import (
-    determine_ext,
      unified_strdate,
      unified_strdate,
+    clean_html,
  )
  
  
  class ArchiveOrgIE(InfoExtractor):
      IE_NAME = 'archive.org'
      IE_DESC = 'archive.org videos'
  )
  
  
  class ArchiveOrgIE(InfoExtractor):
      IE_NAME = 'archive.org'
      IE_DESC = 'archive.org videos'
-    _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
-    _TEST = {
-        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
-        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
-        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
-        u'info_dict': {
-            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
-            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
-            u"upload_date": u"19681210",
-            u"uploader": u"SRI International"
+    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
+    _TESTS = [{
+        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'md5': '8af1d4cf447933ed3c7f4871162602db',
+        'info_dict': {
+            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+            'ext': 'ogg',
+            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
+            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
+            'upload_date': '19681210',
+            'uploader': 'SRI International'
          }
          }
-    }
-
+    }, {
+        'url': 'https://archive.org/details/Cops1922',
+        'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
+        'info_dict': {
+            'id': 'Cops1922',
+            'ext': 'mp4',
+            'title': 'Buster Keaton\'s "Cops" (1922)',
+            'description': 'md5:b4544662605877edd99df22f9620d858',
+        }
+    }, {
+        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
+        'only_matching': True,
+    }]
  
      def _real_extract(self, url):
  
      def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-
-        json_url = url + (u'?' if u'?' in url else '&') + u'output=json'
-        json_data = self._download_webpage(json_url, video_id)
-        data = json.loads(json_data)
-
-        title = data['metadata']['title'][0]
-        description = data['metadata']['description'][0]
-        uploader = data['metadata']['creator'][0]
-        upload_date = unified_strdate(data['metadata']['date'][0])
-
-        formats = [{
-                'format': fdata['format'],
-                'url': 'http://' + data['server'] + data['dir'] + fn,
-                'file_size': int(fdata['size']),
-            }
-            for fn,fdata in data['files'].items()
-            if 'Video' in fdata['format']]
-        formats.sort(key=lambda fdata: fdata['file_size'])
-
-        info = {
-            '_type': 'video',
-            'id': video_id,
-            'title': title,
-            'formats': formats,
-            'description': description,
-            'uploader': uploader,
-            'upload_date': upload_date,
-        }
-        thumbnail = data.get('misc', {}).get('image')
-        if thumbnail:
-            info['thumbnail'] = thumbnail
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            'http://archive.org/embed/' + video_id, video_id)
+        jwplayer_playlist = self._parse_json(self._search_regex(
+            r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);",
+            webpage, 'jwplayer playlist'), video_id)
+        info = self._parse_jwplayer_data(
+            {'playlist': jwplayer_playlist}, video_id, base_url=url)
  
  
-        # TODO: Remove when #980 has been merged
-        info['url'] = formats[-1]['url']
-        info['ext'] = determine_ext(formats[-1]['url'])
+        def get_optional(metadata, field):
+            return metadata.get(field, [None])[0]
  
  
-        return info
-\ No newline at end of file
+        metadata = self._download_json(
+            'http://archive.org/details/' + video_id, video_id, query={
+                'output': 'json',
+            })['metadata']
+        info.update({
+            'title': get_optional(metadata, 'title') or info.get('title'),
+            'description': clean_html(get_optional(metadata, 'description')),
+        })
+        if info.get('_type') != 'playlist':
+            info.update({
+                'uploader': get_optional(metadata, 'creator'),
+                'upload_date': unified_strdate(get_optional(metadata, 'date')),
+            })
+        return info