Update changelog.

[youtubedl] / youtube_dl / extractor / collegehumor.py
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py

index 7ae0972e501ef641ea5e1eb43ec9124fba5bc39c..d10b7bd0cda2161838e83da741fe7a2f6c88f575 100644 (file)
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,74 +1,76 @@
+from __future__ import unicode_literals
+
+import json
  import re
-import socket
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
-from ..utils import (
-    compat_http_client,
-    compat_str,
-    compat_urllib_error,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
-
-    ExtractorError,
-)
  
  
  class CollegeHumorIE(InfoExtractor):
-    _WORKING = False
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
  
-    def report_manifest(self, video_id):
-        """Report information extraction."""
-        self.to_screen(u'%s: Downloading XML manifest' % video_id)
+    _TESTS = [{
+        'url': 'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
+        'file': '6902724.mp4',
+        'md5': 'dcc0f5c1c8be98dc33889a191f4c26bd',
+        'info_dict': {
+            'title': 'Comic-Con Cosplay Catastrophe',
+            'description': 'Fans get creative this year at San Diego.  Too',
+            'age_limit': 13,
+        },
+    },
+    {
+        'url': 'http://www.collegehumor.com/video/3505939/font-conference',
+        'file': '3505939.mp4',
+        'md5': '72fa701d8ef38664a4dbb9e2ab721816',
+        'info_dict': {
+            'title': 'Font Conference',
+            'description': 'This video wasn\'t long enough, so we made it double-spaced.',
+            'age_limit': 10,
+        },
+    }]
  
      def _real_extract(self, url):
+        """Build the info dict for a video from its moogaloop JSON document.
+
+        Returns a dict with the keys id, title, description, thumbnail,
+        formats and age_limit.
+        """
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
          video_id = mobj.group('videoid')
  
-        info = {
-            'id': video_id,
-            'uploader': None,
-            'upload_date': None,
-        }
-
-        self.report_extraction(video_id)
-        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
-        try:
-            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
+        jsonUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id + '.json'
+        data = json.loads(self._download_webpage(
+            jsonUrl, video_id, 'Downloading info JSON'))
+        vdata = data['video']
  
-        mdoc = xml.etree.ElementTree.fromstring(metaXml)
-        try:
-            videoNode = mdoc.findall('./video')[0]
-            info['description'] = videoNode.findall('./description')[0].text
-            info['title'] = videoNode.findall('./caption')[0].text
-            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
-            manifest_url = videoNode.findall('./file')[0].text
-        except IndexError:
-            raise ExtractorError(u'Invalid metadata XML file')
+        # Rating label -> age limit; labels missing from the table give None.
+        AGE_LIMITS = {'nc17': 18, 'r': 18, 'pg13': 13, 'pg': 10, 'g': 0}
+        rating = vdata.get('rating')
+        if rating:
+            age_limit = AGE_LIMITS.get(rating.lower())
+        else:
+            age_limit = None  # None = No idea
  
-        manifest_url += '?hdcore=2.10.3'
-        self.report_manifest(video_id)
-        try:
-            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
+        # Quality names not listed here get a preference of None.
+        PREFS = {'high_quality': 2, 'low_quality': 0}
+        formats = []
+        for format_key in ('mp4', 'webm'):
+            # A container may be absent or null; skip it, don't raise.
+            for qname, qurl in (vdata.get(format_key) or {}).items():
+                formats.append({
+                    'format_id': format_key + '_' + qname,
+                    'url': qurl,
+                    'format': format_key,
+                    'preference': PREFS.get(qname),
+                })
+        self._sort_formats(formats)
  
-        adoc = xml.etree.ElementTree.fromstring(manifestXml)
-        try:
-            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
-            node_id = media_node.attrib['url']
-            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
-        except IndexError as err:
-            raise ExtractorError(u'Invalid manifest file')
-
-        url_pr = compat_urllib_parse_urlparse(manifest_url)
-        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
-
-        info['url'] = url
-        info['ext'] = 'f4f'
-        return [info]
+        return {
+            'id': video_id,
+            'title': vdata['title'],
+            'description': vdata.get('description'),
+            'thumbnail': vdata.get('thumbnail'),
+            'formats': formats,
+            'age_limit': age_limit,
+        }