2 import xml
.etree
.ElementTree
4 from .common
import InfoExtractor
6 compat_urllib_parse_urlparse
,
# Extractor class for collegehumor.com URLs; it derives from InfoExtractor.
# NOTE(review): this listing is fragmented (tokens are split across lines,
# leading integers are embedded, and some statements are not visible), so the
# comments below describe only what can be seen here.
13 class CollegeHumorIE(InfoExtractor
):
# URL pattern: optional scheme and "www.", the collegehumor.com host, one of
# /video/, /embed/ or /e/, a numeric id captured as `videoid`, then an
# optional slash and a trailing `shorttitle` group.
14 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/?(?P<shorttitle>.*)$'
# First sample entry, with 'url', 'file', 'md5', 'title' and 'description' keys.
# NOTE(review): the container holding these entries is not visible in this
# listing - presumably a test-case definition; confirm against the full file.
17 u
'url': u
'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
18 u
'file': u
'6902724.mp4',
19 u
'md5': u
'1264c12ad95dca142a9f0bf7968105a0',
21 u
'title': u
'Comic-Con Cosplay Catastrophe',
22 u
'description': u
'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.',
# Second sample entry, using the same keys as the first.
26 u
'url': u
'http://www.collegehumor.com/video/3505939/font-conference',
27 u
'file': u
'3505939.mp4',
28 u
'md5': u
'c51ca16b82bb456a4397987791a835f5',
30 u
'title': u
'Font Conference',
31 u
'description': u
'This video wasn\'t long enough, so we made it double-spaced.',
# Extract video information for a collegehumor.com URL.
# Visible flow: match the URL, download the moogaloop metadata XML, hand off
# to the 'Youtube' extractor when a youtubeID node exists, otherwise collect
# description/title/thumbnail and resolve the media URL either through an f4m
# manifest or as a direct link.
# NOTE(review): no return statement is visible at the end of this listing; the
# function presumably continues past the last line shown - confirm.
35 def _real_extract(self
, url
):
# Match the incoming URL against the class-level pattern.
36 mobj
= re
.match(self
._VALID
_URL
, url
)
# NOTE(review): the condition guarding this raise is not visible here -
# presumably it fires only when the match failed; confirm.
38 raise ExtractorError(u
'Invalid URL: %s' % url
)
# The id is taken from the `videoid` named group of the match.
39 video_id
= mobj
.group('videoid')
47 self
.report_extraction(video_id
)
# The metadata URL is the moogaloop endpoint followed by the video id.
48 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video/' + video_id
# Download the metadata document; the two strings are passed as the third and
# fourth arguments (presumably the progress note and the error note - confirm).
49 metaXml
= self
._download
_webpage
(xmlUrl
, video_id
,
50 u
'Downloading info XML',
51 u
'Unable to download video info XML')
# Parse the downloaded text as XML.
53 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
# Indexing findall(...)[0] raises IndexError when no matching element exists.
55 videoNode
= mdoc
.findall('./video')[0]
# When a youtubeID node is present, its text is passed to url_result with the
# 'Youtube' extractor name and that result is returned.
56 youtubeIdNode
= videoNode
.find('./youtubeID')
57 if youtubeIdNode
is not None:
58 return self
.url_result(youtubeIdNode
.text
, 'Youtube')
# NOTE(review): `info` is not initialised in the visible lines - its creation
# is presumably among the elided statements; confirm.
59 info
['description'] = videoNode
.findall('./description')[0].text
60 info
['title'] = videoNode
.findall('./caption')[0].text
61 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
# Text of the <file> node: either an f4m manifest URL or a direct media link.
62 next_url
= videoNode
.findall('./file')[0].text
# NOTE(review): the handler this raise belongs to is not visible - presumably
# an IndexError handler covering the lookups above; confirm.
64 raise ExtractorError(u
'Invalid metadata XML file')
# Manifest path: append the hdcore query parameter and download the manifest.
66 if next_url
.endswith(u
'manifest.f4m'):
67 manifest_url
= next_url
+ '?hdcore=2.10.3'
68 manifestXml
= self
._download
_webpage
(manifest_url
, video_id
,
69 u
'Downloading XML manifest',
70 u
'Unable to download video info XML')
72 adoc
= xml
.etree
.ElementTree
.fromstring(manifestXml
)
# Elements are looked up under the Adobe f4m 1.0 XML namespace.
74 media_node
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
# node_id is assigned here but not read again in the visible lines.
75 node_id
= media_node
.attrib
['url']
# video_id is overwritten with the text of the manifest's <id> element.
76 video_id
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
# `err` is bound but not used by the raise that follows.
77 except IndexError as err
:
78 raise ExtractorError(u
'Invalid manifest file')
# Build the media URL from the thumbnail's scheme and host, followed by the
# manifest id with its last two characters dropped and '.csmil' and commas
# removed.
79 url_pr
= compat_urllib_parse_urlparse(info
['thumbnail'])
80 info
['url'] = url_pr
.scheme
+ '://' + url_pr
.netloc
+ video_id
[:-2].replace('.csmil','').replace(',','')
# NOTE(review): the branch keyword introducing the direct-link path is not
# visible - presumably the else of the manifest check above; confirm.
83 # Old-style direct links
84 info
['url'] = next_url
# The extension is derived from the chosen URL via determine_ext.
85 info
['ext'] = determine_ext(info
['url'])