]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ctsnews.py
   1 # -*- coding: utf-8 -*- 
   2 from __future__ 
import unicode_literals
 
   4 from .common 
import InfoExtractor
 
   5 from ..utils 
import parse_iso8601
, ExtractorError
 
   8 class CtsNewsIE(InfoExtractor
): 
  10     # https connection failed (Connection reset) 
  11     _VALID_URL 
= r
'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' 
  13         'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 
  14         'md5': 'a9875cb790252b08431186d741beaabe', 
  16             'id': '201501291578109', 
  18             'title': '以色列.真主黨交火 3人死亡', 
  19             'description': 'md5:95e9b295c898b7ff294f09d450178d7d', 
  20             'timestamp': 1422528540, 
  21             'upload_date': '20150129', 
  24         # News count does not appear on the page but is still available in the database
  25         'url': 'http://news.cts.com.tw/cts/international/201309/201309031304098.html', 
  26         'md5': '3aee7e0df7cdff94e43581f54c22619e', 
  28             'id': '201309031304098', 
  30             'title': '韓國31歲童顏男 貌如十多歲小孩', 
  31             'description': 'md5:f183feeba3752b683827aab71adad584', 
  32             'thumbnail': 're:^https?://.*\.jpg$', 
  33             'timestamp': 1378205880, 
  34             'upload_date': '20130903', 
  37         # With Youtube embedded video 
  38         'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html', 
  39         'md5': '1d842c771dc94c8c3bca5af2cc1db9c5', 
  40         'add_ie': ['Youtube'], 
  44             'title': 'iPhone6熱銷 蘋果財報亮眼', 
  45             'description': 'md5:f395d4f485487bb0f992ed2c4b07aa7d', 
  46             'thumbnail': 're:^https?://.*\.jpg$', 
  47             'upload_date': '20150128', 
  48             'uploader_id': 'TBSCTS', 
  53     def _real_extract(self
, url
): 
  54         news_id 
= self
._match
_id
(url
) 
  55         page 
= self
._download
_webpage
(url
, news_id
) 
  57         if self
._search
_regex
(r
'(CTSPlayer2)', page
, 'CTSPlayer2 identifier', default
=None): 
  58             feed_url 
= self
._html
_search
_regex
( 
  59                 r
'(http://news\.cts\.com\.tw/action/mp4feed\.php\?news_id=\d+)', 
  61             video_url 
= self
._download
_webpage
( 
  62                 feed_url
, news_id
, note
='Fetching feed') 
  64             self
.to_screen('Not CTSPlayer video, trying Youtube...') 
  65             youtube_url 
= self
._search
_regex
( 
  66                 r
'src="(//www\.youtube\.com/embed/[^"]+)"', page
, 'youtube url', 
  69                 raise ExtractorError('The news includes no videos!', expected
=True) 
  77         description 
= self
._html
_search
_meta
('description', page
) 
  78         title 
= self
._html
_search
_meta
('title', page
) 
  79         thumbnail 
= self
._html
_search
_meta
('image', page
) 
  81         datetime_str 
= self
._html
_search
_regex
( 
  82             r
'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page
, 'date and time') 
  83         # Transform into ISO 8601 format with timezone info 
  84         datetime_str 
= datetime_str
.replace('/', '-') + ':00+0800' 
  85         timestamp 
= parse_iso8601(datetime_str
, delimiter
=' ') 
  91             'description': description
, 
  92             'thumbnail': thumbnail
, 
  93             'timestamp': timestamp
,