2 from __future__
import unicode_literals
10 from .common
import InfoExtractor
11 from ..compat
import (
class GloboIE(InfoExtractor):
    """Extractor for a single Globo video.

    Accepts the internal ``globo:<id>`` scheme as well as ``*.globo.com``
    pages whose path ends in ``v/.../<id>`` or ``videos/<id>``, where the id
    is a run of at least seven digits.
    """

    _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})'
    _NETRC_MACHINE = 'globo'
    # NOTE(review): the reviewed listing drops some ``info_dict`` lines of the
    # two download tests. 'id' is taken from the URL and 'ext' from the HLS
    # extraction below; restore any further expected fields from the
    # upstream file.
    _TESTS = [{
        'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/',
        'md5': 'b3ccc801f75cd04a914d51dadb83a78d',
        'info_dict': {
            'id': '3607726',
            'ext': 'mp4',
            'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa',
            'uploader': 'Globo.com',
        },
    }, {
        'url': 'http://globoplay.globo.com/v/4581987/',
        'md5': 'f36a1ecd6a50da1577eee6dd17f67eff',
        'info_dict': {
            'id': '4581987',
            'ext': 'mp4',
            'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP',
            'uploader': 'Rede Globo',
        },
    }, {
        'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html',
        'only_matching': True,
    }, {
        'url': 'http://globosatplay.globo.com/globonews/v/4472924/',
        'only_matching': True,
    }, {
        'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/',
        'only_matching': True,
    }, {
        'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/',
        'only_matching': True,
    }, {
        'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html',
        'only_matching': True,
    }, {
        'url': 'globo:3607726',
        'only_matching': True,
    }]

    def _real_initialize(self):
        """Log in with the configured credentials and store the GLBID cookie.

        Nothing is requested when no credentials are configured.

        Raises:
            ExtractorError: carrying the service's ``userMessage`` (or ``id``)
                when the login endpoint answers HTTP 401. Any other
                ExtractorError is re-raised unchanged.
        """
        email, password = self._get_login_info()
        if email is None:
            return

        # NOTE(review): the payload lines are missing from the reviewed
        # listing; this shape (including the serviceId value) is a
        # reconstruction and must be confirmed against the upstream file.
        login_payload = json.dumps({
            'payload': {
                'email': email,
                'password': password,
                'serviceId': 4654,
            },
        }).encode()

        try:
            glb_id = (self._download_json(
                'https://login.globo.com/api/authentication', None,
                data=login_payload, headers={
                    'Content-Type': 'application/json; charset=utf-8',
                }) or {}).get('glbId')
            # Only set the cookie when the response actually carried an id.
            if glb_id:
                self._set_cookie('.globo.com', 'GLBID', glb_id)
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                resp = self._parse_json(e.cause.read(), None)
                raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True)
            raise

    def _real_extract(self, url):
        """Build the info dict for the video identified by *url*.

        Downloads the playlist metadata, collects subtitle resources, and for
        every media resource fetches a security hash, derives a signed URL
        from it and extracts HLS / DASH / Smooth Streaming / plain HTTP
        formats.

        Raises:
            ExtractorError: when the video is flagged as encrypted, or when
                the security endpoint returns a message instead of a hash.
        """
        video_id = self._match_id(url)

        video = self._download_json(
            'http://api.globovideos.com/videos/%s/playlist' % video_id,
            video_id)['videos'][0]
        if video.get('encrypted') is True:
            raise ExtractorError('This video is DRM protected.', expected=True)

        title = video['title']

        formats = []
        subtitles = {}
        for resource in video['resources']:
            resource_id = resource.get('_id')
            resource_url = resource.get('url')
            resource_type = resource.get('type')
            # Skip anything without a URL, media without an id, and unknown
            # resource types.
            if not resource_url or (resource_type == 'media' and not resource_id) or resource_type not in ('subtitle', 'media'):
                continue

            if resource_type == 'subtitle':
                subtitles.setdefault(resource.get('language') or 'por', []).append({
                    'url': resource_url,
                })
                continue

            # NOTE(review): the listing skips two query entries before
            # 'resource_id'; 'player' and 'version' below are reconstructed
            # and must be confirmed against the upstream file.
            security = self._download_json(
                'http://security.video.globo.com/videos/%s/hash' % video_id,
                video_id, 'Downloading security hash for %s' % resource_id, query={
                    'player': 'desktop',
                    'version': '5.19.1',
                    'resource_id': resource_id,
                })

            security_hash = security.get('hash')
            if not security_hash:
                message = security.get('message')
                if message:
                    raise ExtractorError(
                        '%s returned error: %s' % (self.IE_NAME, message), expected=True)
                continue

            hash_code = security_hash[:2]
            # randint() includes both endpoints, so the upper bound has to be
            # the largest 10-digit number to keep the padding 10 chars wide.
            padding = '%010d' % random.randint(1, 9999999999)
            if hash_code in ('04', '14'):
                received_time = security_hash[3:13]
                received_md5 = security_hash[24:]
                hash_prefix = security_hash[:23]
            elif hash_code in ('02', '12', '03', '13'):
                received_time = security_hash[2:12]
                received_md5 = security_hash[22:]
                # NOTE(review): one statement is missing from the listing
                # here; upstream is believed to extend the padding - confirm.
                padding += '1'
                hash_prefix = '05' + security_hash[:22]
            else:
                # Unrecognised hash layout: skip the resource rather than
                # sign with undefined values or ones left over from a
                # previous iteration.
                continue

            padded_sign_time = compat_str(int(received_time) + 86400) + padding
            md5_data = (received_md5 + padded_sign_time + '0xAC10FD').encode()
            signed_md5 = base64.urlsafe_b64encode(hashlib.md5(md5_data).digest()).decode().strip('=')
            signed_hash = hash_prefix + padded_sign_time + signed_md5
            signed_url = '%s?h=%s&k=html5&a=%s&u=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A', security.get('user') or '')

            # Pick the manifest parser from the resource id / URL suffix.
            if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(
                    signed_url, resource_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            elif resource_id.endswith('mpd') or resource_url.endswith('.mpd'):
                formats.extend(self._extract_mpd_formats(
                    signed_url, resource_id, mpd_id='dash', fatal=False))
            elif resource_id.endswith('manifest') or resource_url.endswith('/manifest'):
                formats.extend(self._extract_ism_formats(
                    signed_url, resource_id, ism_id='mss', fatal=False))
            else:
                formats.append({
                    'url': signed_url,
                    'format_id': 'http-%s' % resource_id,
                    'height': int_or_none(resource.get('height')),
                })

        self._sort_formats(formats)

        duration = float_or_none(video.get('duration'), 1000)
        uploader = video.get('channel')
        uploader_id = str_or_none(video.get('channel_id'))

        return {
            'id': video_id,
            'title': title,
            'duration': duration,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'formats': formats,
            'subtitles': subtitles,
        }
class GloboArticleIE(InfoExtractor):
    """Extractor for ``*.globo.com`` article pages that embed Globo videos.

    The page is scanned for embedded video ids and the result is returned as
    a playlist of ``globo:<id>`` entries handled by GloboIE.
    """

    _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?'

    # Patterns that capture an embedded video id (seven or more digits).
    _VIDEOID_REGEXES = [
        r'\bdata-video-id=["\'](\d{7,})',
        r'\bdata-player-videosids=["\'](\d{7,})',
        r'\bvideosIDs\s*:\s*["\']?(\d{7,})',
        r'\bdata-id=["\'](\d{7,})',
        r'<div[^>]+\bid=["\'](\d{7,})',
    ]

    # NOTE(review): the reviewed listing drops a few lines around the two
    # ``info_dict`` tests (e.g. any expected playlist size); restore them
    # from the upstream file if present.
    _TESTS = [{
        'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html',
        'info_dict': {
            'id': 'novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes',
            'title': 'Novidade na fiscalização de bagagem pela Receita provoca discussões',
            'description': 'md5:c3c4b4d4c30c32fce460040b1ac46b12',
        },
    }, {
        'url': 'http://g1.globo.com/pr/parana/noticia/2016/09/mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato.html',
        'info_dict': {
            'id': 'mpf-denuncia-lula-marisa-e-mais-seis-na-operacao-lava-jato',
            'title': "Lula era o 'comandante máximo' do esquema da Lava Jato, diz MPF",
            'description': 'md5:8aa7cc8beda4dc71cc8553e00b77c54c',
        },
    }, {
        'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html',
        'only_matching': True,
    }, {
        'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html',
        'only_matching': True,
    }, {
        'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return False for URLs GloboIE already handles, else defer to the base check."""
        return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one ``globo:<id>`` entry per video id found on the page."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        video_ids = []
        for video_regex in self._VIDEOID_REGEXES:
            video_ids.extend(re.findall(video_regex, webpage))

        # orderedSet drops ids matched by more than one pattern while
        # keeping first-seen order.
        entries = [
            self.url_result('globo:%s' % video_id, GloboIE.ie_key())
            for video_id in orderedSet(video_ids)]

        title = self._og_search_title(webpage, fatal=False)
        description = self._html_search_meta('description', webpage)
        return self.playlist_result(entries, display_id, title, description)