X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/ee121ffd5cd3fe6fd1daa5699cae3becf3483dd3..78a7877538f28a5237c4ef3b5b3c62db7a38a4e6:/youtube_dl/InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
old mode 100644
new mode 100755
index ddb4aa1..672ef9e
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -1,2957 +1,4 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# Legacy file for backwards compatibility, use youtube_dl.extractor instead!
-import datetime
-import HTMLParser
-import httplib
-import netrc
-import os
-import re
-import socket
-import time
-import urllib
-import urllib2
-import email.utils
-import xml.etree.ElementTree
-from urlparse import parse_qs
-
-try:
- import cStringIO as StringIO
-except ImportError:
- import StringIO
-
-from utils import *
-
-
-class InfoExtractor(object):
- """Information Extractor class.
-
- Information extractors are the classes that, given a URL, extract
- information from the video (or videos) the URL refers to. This
- information includes the real video URL, the video title and simplified
- title, author and others. The information is stored in a dictionary
- which is then passed to the FileDownloader. The FileDownloader
- processes this information possibly downloading the video to the file
- system, among other possible outcomes. The dictionaries must include
- the following fields:
-
- id: Video identifier.
- url: Final video URL.
- uploader: Nickname of the video uploader.
- title: Literal title.
- ext: Video filename extension.
- format: Video format.
- player_url: SWF Player URL (may be None).
-
- The following fields are optional. Their primary purpose is to allow
- youtube-dl to serve as the backend for a video search function, such
- as the one in youtube2mp3. They are only used when their respective
- forced printing functions are called:
-
- thumbnail: Full URL to a video thumbnail image.
- description: One-line video description.
-
- Subclasses of this one should re-define the _real_initialize() and
- _real_extract() methods and define a _VALID_URL regexp.
- Probably, they should also be added to the list of extractors.
- """
-
- _ready = False
- _downloader = None
-
- def __init__(self, downloader=None):
- """Constructor. Receives an optional downloader."""
- self._ready = False
- self.set_downloader(downloader)
-
- def suitable(self, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(self._VALID_URL, url) is not None
-
- def initialize(self):
- """Initializes an instance (authentication, etc)."""
- if not self._ready:
- self._real_initialize()
- self._ready = True
-
- def extract(self, url):
- """Extracts URL information and returns it in list of dicts."""
- self.initialize()
- return self._real_extract(url)
-
- def set_downloader(self, downloader):
- """Sets the downloader for this IE."""
- self._downloader = downloader
-
- def _real_initialize(self):
- """Real initialization process. Redefine in subclasses."""
- pass
-
- def _real_extract(self, url):
- """Real extraction process. Redefine in subclasses."""
- pass
-
-
-class YoutubeIE(InfoExtractor):
- """Information extractor for youtube.com."""
-
- _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
- _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
- _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
- _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
- _NETRC_MACHINE = 'youtube'
- # Listed in order of quality
- _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
- _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
- _video_extensions = {
- '13': '3gp',
- '17': 'mp4',
- '18': 'mp4',
- '22': 'mp4',
- '37': 'mp4',
- '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
- '43': 'webm',
- '44': 'webm',
- '45': 'webm',
- '46': 'webm',
- }
- _video_dimensions = {
- '5': '240x400',
- '6': '???',
- '13': '???',
- '17': '144x176',
- '18': '360x640',
- '22': '720x1280',
- '34': '360x640',
- '35': '480x854',
- '37': '1080x1920',
- '38': '3072x4096',
- '43': '360x640',
- '44': '480x854',
- '45': '720x1280',
- '46': '1080x1920',
- }
- IE_NAME = u'youtube'
-
- def report_lang(self):
- """Report attempt to set language."""
- self._downloader.to_screen(u'[youtube] Setting language')
-
- def report_login(self):
- """Report attempt to log in."""
- self._downloader.to_screen(u'[youtube] Logging in')
-
- def report_age_confirmation(self):
- """Report attempt to confirm age."""
- self._downloader.to_screen(u'[youtube] Confirming age')
-
- def report_video_webpage_download(self, video_id):
- """Report attempt to download video webpage."""
- self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
-
- def report_video_info_webpage_download(self, video_id):
- """Report attempt to download video info webpage."""
- self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
-
- def report_video_subtitles_download(self, video_id):
- """Report attempt to download video info webpage."""
- self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
-
- def report_information_extraction(self, video_id):
- """Report attempt to extract video information."""
- self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
-
- def report_unavailable_format(self, video_id, format):
- """Report extracted video URL."""
- self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
-
- def report_rtmp_download(self):
- """Indicate the download will use the RTMP protocol."""
- self._downloader.to_screen(u'[youtube] RTMP download detected')
-
- def _closed_captions_xml_to_srt(self, xml_string):
- srt = ''
- texts = re.findall(r'