3 import xml
.etree
.ElementTree
5 from .common
import InfoExtractor
10 compat_urllib_parse_urlparse
,
11 compat_urllib_request
,
17 class CollegeHumorIE(InfoExtractor
):
19 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
def report_manifest(self, video_id):
    """Announce that the XML manifest for a video is being downloaded.

    The notice ``'<video_id>: Downloading XML manifest'`` is passed to
    ``self.to_screen``; nothing is returned.
    """
    notice = u'%s: Downloading XML manifest' % video_id
    self.to_screen(notice)
25 def _real_extract(self
, url
):
26 mobj
= re
.match(self
._VALID
_URL
, url
)
28 raise ExtractorError(u
'Invalid URL: %s' % url
)
29 video_id
= mobj
.group('videoid')
37 self
.report_extraction(video_id
)
38 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video/' + video_id
40 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
41 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
42 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
44 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
46 videoNode
= mdoc
.findall('./video')[0]
47 info
['description'] = videoNode
.findall('./description')[0].text
48 info
['title'] = videoNode
.findall('./caption')[0].text
49 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
50 manifest_url
= videoNode
.findall('./file')[0].text
52 raise ExtractorError(u
'Invalid metadata XML file')
54 manifest_url
+= '?hdcore=2.10.3'
55 self
.report_manifest(video_id
)
57 manifestXml
= compat_urllib_request
.urlopen(manifest_url
).read()
58 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
59 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
61 adoc
= xml
.etree
.ElementTree
.fromstring(manifestXml
)
63 media_node
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
64 node_id
= media_node
.attrib
['url']
65 video_id
= adoc
.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
66 except IndexError as err
:
67 raise ExtractorError(u
'Invalid manifest file')
69 url_pr
= compat_urllib_parse_urlparse(manifest_url
)
70 url
= url_pr
.scheme
+ '://' + url_pr
.netloc
+ '/z' + video_id
[:-2] + '/' + node_id
+ 'Seg1-Frag1'