From dd867805969126ed0bd2ab8fe69eaf61fbf44ab7 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 11 Feb 2016 10:55:50 +0100 Subject: [PATCH 001/116] [extractor/common] fix dash formats sorting --- youtube_dl/extractor/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 00645feed..cd7087bec 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -851,6 +851,7 @@ class InfoExtractor(object): proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 if f.get('vcodec') == 'none': # audio only + preference -= 50 if self._downloader.params.get('prefer_free_formats'): ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] else: @@ -861,6 +862,8 @@ class InfoExtractor(object): except ValueError: audio_ext_preference = -1 else: + if f.get('acodec') == 'none': # video only + preference -= 40 if self._downloader.params.get('prefer_free_formats'): ORDER = ['flv', 'mp4', 'webm'] else: From 8bb56eeeea8154f811076c0a9093203fab224003 Mon Sep 17 00:00:00 2001 From: Brian Foley Date: Sat, 2 Jan 2016 19:49:59 +0000 Subject: [PATCH 002/116] [utils] Add extract_attributes for extracting html tag attributes This is much more robust than just using regexps, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed case names, and the different possible value quoting schemes. --- test/test_utils.py | 40 ++++++++++++++++++++++++++++++++++++++++ youtube_dl/compat.py | 6 ++++++ youtube_dl/utils.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 97587ad2f..cb85e18f0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + extract_attributes, ExtractorError, find_xpath_attr, fix_xml_ampersands, @@ -75,6 +76,7 @@ from youtube_dl.utils import ( cli_bool_option, ) from youtube_dl.compat import ( + compat_chr, compat_etree_fromstring, ) @@ -591,6 +593,44 @@ class TestUtil(unittest.TestCase): on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_extract_attributes(self): + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': "'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + compat_chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b497da696..7b9afc36d 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try: except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + try: from subprocess import DEVNULL @@ -540,6 +545,7 @@ else: from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ + 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', 'compat_chr', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 210c47fce..a0234a3a8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = { } + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. + """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs def clean_html(html): """Clean an HTML snippet into a readable string""" From dec2cae0a768d2eb1f7d28cfd267d0bf5383bcdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 15 Mar 2016 21:45:43 +0600 Subject: [PATCH 003/116] [twitch:playlistbase] Clarify pagination bug Pagination bug has been fixed by twitch on 15.03.2016. --- youtube_dl/extractor/twitch.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 958bf8fff..d4169ec6d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -299,9 +299,10 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): # is completely broken on the twitch side. It simply ignores # a limit and returns the whole offset number of videos. # Working around by just requesting all videos at once. + # Upd: pagination bug was fixed by twitch on 15.03.2016. if not broken_paging_detected and total and len(page_entries) > limit: self.report_warning( - 'Twitch paging is broken on twitch side, requesting all videos at once', + 'Twitch pagination is broken on twitch side, requesting all videos at once', channel_id) broken_paging_detected = True offset = total From 89807d6a8293d7b028a204628aec32b6f39148dd Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 15 Mar 2016 18:48:21 +0100 Subject: [PATCH 004/116] [brightcove] extract dash formats and detect audio formats --- youtube_dl/extractor/brightcove.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f8413d5f2..86de5b9ee 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -484,6 +484,10 @@ class BrightcoveNewIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif source_type == 'application/dash+xml': + if not src: + continue + formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) else: streaming_src = source.get('streaming_src') stream_name, app_name = source.get('stream_name'), source.get('app_name') @@ -491,15 +495,23 @@ class BrightcoveNewIE(InfoExtractor): continue tbr = float_or_none(source.get('avg_bitrate'), 1000) height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) f = { 'tbr': tbr, - 'width': int_or_none(source.get('width')), - 'height': height, 'filesize': int_or_none(source.get('size')), 'container': container, - 'vcodec': source.get('codec'), - 'ext': source.get('container').lower(), + 'ext': container.lower(), } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) def build_format_id(kind): format_id = kind From edfd93518ea0a929dbdd51cc3472bddceaf96514 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 15 Mar 2016 19:33:09 +0100 Subject: [PATCH 005/116] [svt] extract dashhbbtv formats(#8867) --- youtube_dl/extractor/svt.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 399c3b8ee..2ab30e45f 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -19,20 +19,25 @@ class SVTBaseIE(InfoExtractor): video_info = info['video'] formats = [] for vr in video_info['videoReferences']: + player_type = vr.get('playerType') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( vurl, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=vr.get('playerType'))) + m3u8_id=player_type, fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, - f4m_id=vr.get('playerType'))) + f4m_id=player_type, fatal=False)) + elif ext == 'mpd': + if player_type == 'dashhbbtv': + formats.extend(self._extract_mpd_formats( + vurl, video_id, mpd_id=player_type, fatal=False)) else: formats.append({ - 'format_id': vr.get('playerType'), + 'format_id': player_type, 'url': vurl, }) self._sort_formats(formats) From cb6e477dfe09b68a810e587269958b1e56077b00 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 15 Mar 2016 19:38:10 +0100 Subject: [PATCH 006/116] [aljazeera] update the extractor to use BrightcoveNewIE --- youtube_dl/extractor/aljazeera.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 5b2c0dc9a..cddcaa489 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -13,24 +13,18 @@ class AlJazeeraIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Slum - Episode 1: Deliverance', 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader': 'Al Jazeera English', + 'uploader_id': '665003303001', + 'timestamp': 1411116829, + 'upload_date': '20140919', }, - 'add_ie': ['BrightcoveLegacy'], + 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): program_name = self._match_id(url) webpage = self._download_webpage(url, program_name) brightcove_id = self._search_regex( r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - - return { - '_type': 'url', - 'url': ( - 'brightcove:' - 'playerKey=AQ~~%2CAAAAmtVJIFk~%2CTVGOQ5ZTwJbeMWnq5d_H4MOM57xfzApc' - '&%40videoPlayer={0}'.format(brightcove_id) - ), - 'ie_key': 'BrightcoveLegacy', - } + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) From 3ff8279e80b4c057f12998dcca5164209bb71078 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 15 Mar 2016 02:20:37 +0800 Subject: [PATCH 007/116] [kuwo:mv] Fix the test and extraction of georestricted MVs --- youtube_dl/extractor/kuwo.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 700e44b63..f94804d06 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -23,7 +23,7 @@ class KuwoBaseIE(InfoExtractor): {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} ] - def _get_formats(self, song_id): + def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: song_url = self._download_webpage( @@ -32,7 +32,7 @@ class KuwoBaseIE(InfoExtractor): song_id, note='Download %s url info' % file_format['format'], ) - if song_url == 'IPDeny': + if song_url == 'IPDeny' and not tolerate_ip_deny: raise ExtractorError('This song is blocked in this region', expected=True) if song_url.startswith('http://') or song_url.startswith('https://'): @@ -43,7 +43,12 @@ class KuwoBaseIE(InfoExtractor): 'preference': file_format['preference'], 'abr': file_format.get('abr'), }) - self._sort_formats(formats) + + # XXX _sort_formats fails if there are not formats, while it's not the + # desired behavior if 'IPDeny' is ignored + # This check can be removed if https://github.com/rg3/youtube-dl/pull/8051 is merged + if not tolerate_ip_deny: + self._sort_formats(formats) return formats @@ -288,10 +293,16 @@ class KuwoMvIE(KuwoBaseIE): 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', - 'ext': 'mkv', - 'title': '我们家MV', + 'ext': 'mp4', + 'title': 'My HouseMV', 'creator': '2PM', }, + # In this video, music URLs (anti.s) are blocked outside China and + # USA, while the MV URL (mvurl) is available globally, so force the MV + # URL for consistent results in different countries + 'params': { + 'format': 'mv', + }, } _FORMATS = KuwoBaseIE._FORMATS + [ {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, @@ -313,7 +324,17 @@ class KuwoMvIE(KuwoBaseIE): else: raise ExtractorError('Unable to find song or singer names') - formats = self._get_formats(song_id) + formats = self._get_formats(song_id, tolerate_ip_deny=True) + + mv_url = self._download_webpage( + 'http://www.kuwo.cn/yy/st/mvurl?rid=MUSIC_%s' % song_id, + song_id, note='Download %s MV URL' % song_id) + formats.append({ + 'url': mv_url, + 'format_id': 'mv', + }) + + self._sort_formats(formats) return { 'id': song_id, From 0c9ff2404119bc93fe9755fa5ecdf8e8dd53b146 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 15 Mar 2016 20:54:55 +0100 Subject: [PATCH 008/116] [noz] fix extraction in python 2.6 --- youtube_dl/extractor/noz.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index 4e60b13a5..656443c49 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, + find_xpath_attr, xpath_text, update_url_query, ) @@ -47,8 +48,9 @@ class NozIE(InfoExtractor): doc, './/article/movie/file/duration')) formats = [] for qnode in doc.findall('.//article/movie/file/qualities/qual'): - http_url = xpath_text( - qnode, './html_urls/video_url[@format="video/mp4"]') + http_url_ele = find_xpath_attr( + qnode, './html_urls/video_url', 'format', 'video/mp4') + http_url = http_url_ele.text if http_url_ele is not None else None if http_url: formats.append({ 'url': http_url, @@ -64,8 +66,10 @@ class NozIE(InfoExtractor): formats.extend(self._extract_f4m_formats( update_url_query(f4m_url, {'hdcore': '3.4.0'}), video_id, f4m_id='hds', fatal=False)) - m3u8_url = xpath_text( - qnode, './html_urls/video_url[@format="application/vnd.apple.mpegurl"]') + m3u8_url_ele = find_xpath_attr( + qnode, './html_urls/video_url', + 'format', 'application/vnd.apple.mpegurl') + m3u8_url = m3u8_url_ele.text if m3u8_url_ele is not None else None if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', From 64d4f31d78d9555d7f79b8bebeabb7535a842090 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 15 Mar 2016 22:50:43 +0100 Subject: [PATCH 009/116] [brightcove:new] update embed_in_page embeds regex to match non numeric ref id --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 86de5b9ee..b182edb26 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -425,7 +425,7 @@ class BrightcoveNewIE(InfoExtractor): # According to [4] data-video-id may be prefixed with ref: r'''(?sx) ]+ - data-video-id=["\']((?:ref:)?\d+)["\'][^>]*>.*? + data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? .*? ]+ src=["\'](?:https?:)?//players\.brightcove\.net/ From 2cab48704ced0d703f2b6de8575dc06e83207462 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 15 Mar 2016 22:57:59 +0100 Subject: [PATCH 010/116] [thestar] Add new extractor(closes #5955) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/brightcove.py | 2 +- youtube_dl/extractor/thestar.py | 31 ++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 youtube_dl/extractor/thestar.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 179c11ffa..bf9fa17c9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -737,6 +737,7 @@ from .theplatform import ( ThePlatformFeedIE, ) from .thesixtyone import TheSixtyOneIE +from .thestar import TheStarIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .tinypic import TinyPicIE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b182edb26..f56b642ab 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -157,7 +157,7 @@ class BrightcoveLegacyIE(InfoExtractor): if playerKey is not None: params['playerKey'] = playerKey # The three fields hold the id of the video - videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py new file mode 100644 index 000000000..b7e9af2af --- /dev/null +++ b/youtube_dl/extractor/thestar.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import compat_parse_qs + + +class TheStarIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thestar\.com/(?:[^/]+/)*(?P.+)\.html' + _TEST = { + 'url': 'http://www.thestar.com/life/2016/02/01/mankind-why-this-woman-started-a-men-s-skincare-line.html', + 'md5': '2c62dd4db2027e35579fefb97a8b6554', + 'info_dict': { + 'id': '4732393888001', + 'ext': 'mp4', + 'title': 'Mankind: Why this woman started a men\'s skin care line', + 'description': 'Robert Cribb talks to Young Lee, the founder of Uncle Peter\'s MAN.', + 'uploader_id': '794267642001', + 'timestamp': 1454353482, + 'upload_date': '20160201', + } + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/794267642001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) From 48254c3f2cb315c4b9d2b679a6126f1e1208fbf8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 16 Mar 2016 09:14:37 +0100 Subject: [PATCH 011/116] [brightcove] some improvements and fixes - use FFmpeg downloader to download m3u8 formats extracted from BrightcoveNew(some of the m3u8 media playlists use AES-128) - update comment and update_url_query to handle url query --- youtube_dl/extractor/brightcove.py | 32 +++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f56b642ab..304fb89e3 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -24,16 +24,16 @@ from ..utils import ( js_to_json, int_or_none, parse_iso8601, - sanitized_Request, unescapeHTML, unsmuggle_url, + update_url_query, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' - _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -156,7 +156,7 @@ class BrightcoveLegacyIE(InfoExtractor): # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey - # The three fields hold the id of the video + # These fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer @@ -185,8 +185,7 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - data = compat_urllib_parse.urlencode(params) - return cls._FEDERATED_URL_TEMPLATE % data + return update_url_query(cls._FEDERATED_URL, params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -240,7 +239,7 @@ class BrightcoveLegacyIE(InfoExtractor): # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) return self._get_video_info( - videoPlayer[0], query_str, query, referer=referer) + videoPlayer[0], query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -249,15 +248,14 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', expected=True) - def _get_video_info(self, video_id, query_str, query, referer=None): - request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = sanitized_Request(request_url) + def _get_video_info(self, video_id, query, referer=None): + headers = {} linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: - req.add_header('Referer', referer) - webpage = self._download_webpage(req, video_id) + headers['Referer'] = referer + webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) error_msg = self._html_search_regex( r"

We're sorry.

([\s\n]*

.*?

)+", webpage, @@ -459,12 +457,11 @@ class BrightcoveNewIE(InfoExtractor): r'policyKey\s*:\s*(["\'])(?P.+?)\1', webpage, 'policy key', group='pk') - req = sanitized_Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' - % (account_id, video_id), - headers={'Accept': 'application/json;pk=%s' % policy_key}) + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) try: - json_data = self._download_json(req, video_id) + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: json_data = self._parse_json(e.cause.read().decode(), video_id) @@ -482,8 +479,7 @@ class BrightcoveNewIE(InfoExtractor): if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', m3u8_id='hls', fatal=False)) elif source_type == 'application/dash+xml': if not src: continue From 23edc49509052e06afe7032802a0f4deb6710b47 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 16 Mar 2016 10:47:39 +0100 Subject: [PATCH 012/116] [tv3] Add new extractor(closes #8059) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tv3.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/tv3.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bf9fa17c9..9502d07a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -784,6 +784,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv3 import TV3IE from .tv4 import TV4IE from .tvc import ( TVCIE, diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py new file mode 100644 index 000000000..d3f690dc7 --- /dev/null +++ b/youtube_dl/extractor/tv3.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TV3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' + _TEST = { + 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', + 'info_dict': { + 'id': '4659127992001', + 'ext': 'mp4', + 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', + 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', + 'uploader_id': '3812193411001', + 'upload_date': '20151213', + 'timestamp': 1449975272, + }, + 'expected_warnings': [ + 'Failed to download MPD manifest' + ], + 'params': { + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex(r' Date: Wed, 16 Mar 2016 11:46:53 +0100 Subject: [PATCH 013/116] [brightcove:new] extract protocol-less embed URLs(closes #2914) --- youtube_dl/extractor/brightcove.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 304fb89e3..3ab383461 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,8 +413,8 @@ class BrightcoveNewIE(InfoExtractor): # Look for iframe embeds [1] for _, url in re.findall( - r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url) + r']+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) # Look for embed_in_page embeds [2] for video_id, account_id, player_id, embed in re.findall( From a7ba57dc176efaa50b5121a1f63963f4fc0111e7 Mon Sep 17 00:00:00 2001 From: Quan Hua Date: Wed, 16 Mar 2016 10:15:39 +0700 Subject: [PATCH 014/116] [udemy] Update course id regex to cover v4 layout (Closes #8753, closes #8868, closes #8870) --- youtube_dl/extractor/udemy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index f5b5e7fd6..0fd2a0a0a 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -144,7 +144,8 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + (r'data-course-id=["\'](\d+)', r'"id": (\d+)'), + webpage, 'course id') try: lecture = self._download_lecture(course_id, lecture_id) From 70cab344c48598904fde657620156be62b70ee0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Mar 2016 21:46:09 +0600 Subject: [PATCH 015/116] [udemy] Improve course id v4 regex --- youtube_dl/extractor/udemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 0fd2a0a0a..74cc36ece 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -144,7 +144,7 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( - (r'data-course-id=["\'](\d+)', r'"id": (\d+)'), + (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), webpage, 'course id') try: From 96f4f796fb02e3ef13fa6584b8f77ebafaabf59f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Mar 2016 21:47:51 +0600 Subject: [PATCH 016/116] [brightcover] Remove unused import --- youtube_dl/extractor/brightcove.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 3ab383461..59e8008f9 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,7 +9,6 @@ from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, - compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, From c5229f3926d64bce101d328fc5acf25bda83e0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Mar 2016 21:50:04 +0600 Subject: [PATCH 017/116] [utils] PEP 8 --- test/test_utils.py | 6 +++--- youtube_dl/utils.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5a0109977..9a3a8ddff 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -641,8 +641,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(''), {'x': '&'}) # XML self.assertEqual(extract_attributes(''), {'x': '"'}) - self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 - self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 self.assertEqual(extract_attributes(''), {'x': '&foo'}) self.assertEqual(extract_attributes(''), {'x': "'"}) self.assertEqual(extract_attributes(''), {'x': '"'}) @@ -654,7 +654,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) - self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased self.assertEqual(extract_attributes(''), {'x': '2'}) self.assertEqual(extract_attributes(''), {'x': '2'}) self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ec186918c..8ec1bd469 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -273,15 +273,17 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) + class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" def __init__(self): - self.attrs = { } + self.attrs = {} compat_HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): self.attrs = dict(attrs) + def extract_attributes(html_element): """Given a string for an HTML element such as Date: Wed, 16 Mar 2016 21:26:25 +0100 Subject: [PATCH 018/116] [bravotv] Add new extractor(#4657) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bravotv.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 youtube_dl/extractor/bravotv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9502d07a4..725ebec04 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -81,6 +81,7 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE +from .bravotv import BravoTVIE from .breakcom import BreakIE from .brightcove import ( BrightcoveLegacyIE, diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py new file mode 100644 index 000000000..69d00b466 --- /dev/null +++ b/youtube_dl/extractor/bravotv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class BravoTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P[^/?]+)' + _TEST = { + 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', + 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'info_dict': { + 'id': 'LitrBdX64qLn', + 'ext': 'mp4', + 'title': 'Last Chance Kitchen Returns', + 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') + release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') + return self.url_result(smuggle_url( + 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true&switch=progressive' % (account_pid, release_pid), + {'force_smil_url': True}), 'ThePlatform', release_pid) From a646a8cf980a946cfc15d2286fcec6ee3987886f Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 02:02:18 +0100 Subject: [PATCH 019/116] [sbs] improve extraction(fixes #3811) - extract error messages - force the platform smil url(previously the manifest param in the query is not respected which make theplatform return non working mp4 files for some videos) --- youtube_dl/extractor/sbs.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index d6ee2d9e2..2f96477ca 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + smuggle_url, + ExtractorError, +) class SBSIE(InfoExtractor): @@ -31,21 +35,28 @@ class SBSIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + player_params = self._download_json( + 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) - webpage = self._download_webpage( - 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) - - player_params = self._parse_json( - self._search_regex( - r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), - video_id) + error = player_params.get('error') + if error: + error_message = 'Sorry, The video you are looking for does not exist.' + video_data = error.get('results') or {} + error_code = error.get('errorCode') + if error_code == 'ComingSoon': + error_message = '%s is not yet available.' % video_data.get('title', '') + elif error_code in ('Forbidden', 'intranetAccessOnly'): + error_message = 'Sorry, This video cannot be accessed via this website' + elif error_code == 'Expired': + error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) urls = player_params['releaseUrls'] - theplatform_url = (urls.get('progressive') or urls.get('standard') or - urls.get('html') or player_params['relatedItemsURL']) + theplatform_url = (urls.get('progressive') or urls.get('html') or + urls.get('standard') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'id': video_id, - 'url': theplatform_url, + 'url': smuggle_url(theplatform_url, {'force_smil_url': True}), } From 11f12195af73a2b0a09de928247cb87aed6dd693 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 17 Mar 2016 19:25:37 +0800 Subject: [PATCH 020/116] [youtube] Added itag 91 Seen in https://www.youtube.com/watch?v=jMN4cxyhJjk --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27e67feb4..466f5da2e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -309,6 +309,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, From 0436ec0e7a4683539bc7844511ba76fbcab03f7b Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:05:31 +0100 Subject: [PATCH 021/116] [once] Add new format extractor --- youtube_dl/extractor/once.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/once.py diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py new file mode 100644 index 000000000..403f8c0af --- /dev/null +++ b/youtube_dl/extractor/once.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class OnceIE(InfoExtractor): + _VALID_URL = r'https?://once\.unicornmedia\.com/now/[^/]+/[^/]+/(?P[^/]+)/(?P[^/]+)/(?:[^/]+/)?(?P[^/]+)/content\.(?:once|m3u8|mp4)' + ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' + PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' + + def _extract_once_formats(self, url): + domain_id, application_id, media_item_id = re.match( + OnceIE._VALID_URL, url).groups() + adaptive_formats = self._extract_m3u8_formats( + self.ADAPTIVE_URL_TEMPLATE % ( + domain_id, application_id, media_item_id), + media_item_id, 'mp4', m3u8_id='hls', fatal=False) + formats = [] + formats.extend(adaptive_formats) + for adaptive_format in adaptive_formats: + rendition_id = self._search_regex( + r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', + adaptive_format['url'], 'redition id', default=None) + if rendition_id: + progressive_format = adaptive_format.copy() + progressive_format.update({ + 'url': self.PROGRESSIVE_URL_TEMPLATE % ( + domain_id, application_id, rendition_id, media_item_id), + 'format_id': adaptive_format['format_id'].replace( + 'hls', 'http'), + 'protocol': 'http', + }) + formats.append(progressive_format) + return formats From 9f02ff537c6ddfd3f1ea3586f3e44f0ec07a2aea Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:06:25 +0100 Subject: [PATCH 022/116] [theplatform] extract brightcove once formats --- youtube_dl/extractor/theplatform.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a57b49df..ffe7c57ad 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,13 +8,12 @@ import binascii import hashlib -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, ExtractorError, float_or_none, int_or_none, @@ -29,7 +28,7 @@ default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(InfoExtractor): +class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note) error_element = find_xpath_attr( @@ -38,17 +37,19 @@ class ThePlatformBaseIE(InfoExtractor): if error_element is not None: raise ExtractorError(error_element.attrib['abstract'], expected=True) - formats = self._parse_smil_formats( + smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - for _format in formats: - ext = determine_ext(_format['url']) - if ext == 'once': - _format['ext'] = 'mp4' + formats = [] + for _format in smil_formats: + if OnceIE.suitable(_format['url']): + formats.extend(self._extract_once_formats(_format['url'])) + else: + formats.append(_format) self._sort_formats(formats) @@ -125,7 +126,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'only_matching': True, }, { 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', - 'md5': '734f3790fb5fc4903da391beeebc4836', + 'md5': 'fb96bb3d85118930a5b055783a3bd992', 'info_dict': { 'id': 'tdy_or_siri_150701', 'ext': 'mp4', @@ -135,7 +136,6 @@ class ThePlatformIE(ThePlatformBaseIE): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', - 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 @@ -250,7 +250,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): _TEST = { # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', - 'md5': '22d2b84f058d3586efcd99e57d59d314', + 'md5': '6e32495b5073ab414471b615c5ded394', 'info_dict': { 'id': 'n_hardball_5biden_140207', 'ext': 'mp4', From 574b2a7393ef389792d5010704e505ef0eaaa5e8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:07:36 +0100 Subject: [PATCH 023/116] [nbc:nbcnews] improve extraction(fixes #6922) - extract more metadata and formats - relax regex --- youtube_dl/extractor/nbc.py | 137 ++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 2202cfa33..bb0817e34 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from .theplatform import ThePlatformIE from ..utils import ( - ExtractorError, find_xpath_attr, lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, + int_or_none, + HEADRequest, + parse_iso8601, ) @@ -131,10 +134,10 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') -class NBCNewsIE(InfoExtractor): +class NBCNewsIE(ThePlatformIE): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P\d+)| - (?:watch|feature|nightly-news)/[^/]+/(?P.+)) + ([^/]+/)*(?P<display_id>[^/?]+)) ''' _TESTS = [ @@ -149,15 +152,14 @@ class NBCNewsIE(InfoExtractor): }, }, { - 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', - 'md5': 'b2421750c9f260783721d898f4c42063', + 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', + 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'I1wpAI_zmhsQ', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', }, - 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', @@ -168,17 +170,29 @@ class NBCNewsIE(InfoExtractor): 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', }, + 'skip': 'This page is unavailable.', }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', + 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'sekXqyTVnmN3', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', + 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'info_dict': { + 'id': '529953347624', + 'ext': 'mp4', + 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + }, + 'expected_warnings': ['http-6000 is not available'] + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, @@ -202,49 +216,80 @@ class NBCNewsIE(InfoExtractor): } else: # "feature" and "nightly-news" pages use theplatform.com - title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + info = None bootstrap_json = self._search_regex( - r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', - webpage, 'bootstrap json', flags=re.MULTILINE) - bootstrap = self._parse_json(bootstrap_json, video_id) - info = bootstrap['results'][0]['video'] - mpxid = info['mpxId'] + r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + webpage, 'bootstrap json', default=None) + if bootstrap_json: + bootstrap = self._parse_json(bootstrap_json, display_id) + info = bootstrap['results'][0]['video'] + else: + player_instance_json = self._search_regex( + r'videoObj\s*:\s*({.+})', webpage, 'player instance') + info = self._parse_json(player_instance_json, display_id) + video_id = info['mpxId'] + title = info['title'] - base_urls = [ - info['fallbackPlaylistUrl'], - info['associatedPlaylistUrl'], - ] + subtitles = {} + caption_links = info.get('captionLinks') + if caption_links: + for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): + sub_url = caption_links.get(sub_key) + if sub_url: + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': sub_ext, + }) - for base_url in base_urls: - if not base_url: + formats = [] + for video_asset in info['videoAssets']: + video_url = video_asset.get('publicUrl') + if not video_url: continue - playlist_url = base_url + '?form=MPXNBCNewsAPI' - - try: - all_videos = self._download_json(playlist_url, title) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - continue - raise - - if not all_videos or 'videos' not in all_videos: + container = video_asset.get('format') + asset_type = video_asset.get('assetType') or '' + if container == 'ISM' or asset_type == 'FireTV-Once': continue - - try: - info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid) - break - except StopIteration: - continue - - if info is None: - raise ExtractorError('Could not find video in playlists') + elif asset_type == 'OnceURL': + tp_formats, tp_subtitles = self._extract_theplatform_smil( + video_url, video_id) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + else: + tbr = int_or_none(video_asset.get('bitRate'), 1000) + format_id = 'http%s' % ('-%d' % tbr if tbr else '') + video_url = update_url_query( + video_url, {'format': 'redirect'}) + # resolve the url so that we can check availability and detect the correct extension + head = self._request_webpage( + HEADRequest(video_url), video_id, + 'Checking %s url' % format_id, + '%s is not available' % format_id, + fatal=False) + if head: + video_url = head.geturl() + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(video_asset.get('width')), + 'height': int_or_none(video_asset.get('height')), + 'tbr': tbr, + 'container': video_asset.get('format'), + }) + self._sort_formats(formats) return { - '_type': 'url', - # We get the best quality video - 'url': info['videoAssets'][-1]['publicUrl'], - 'ie_key': 'ThePlatform', + 'id': video_id, + 'title': title, + 'description': info.get('description'), + 'thumbnail': info.get('description'), + 'thumbnail': info.get('thumbnail'), + 'duration': int_or_none(info.get('duration')), + 'timestamp': parse_iso8601(info.get('pubDate')), + 'formats': formats, + 'subtitles': subtitles, } From cf45ed786e580999afe864724c3b7d16abadb4e1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 17 Mar 2016 17:48:17 +0100 Subject: [PATCH 024/116] [wistia] extract more metadata --- youtube_dl/extractor/wistia.py | 42 ++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 41061dd31..8b14840a2 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, sanitized_Request, + int_or_none, ) @@ -18,6 +19,9 @@ class WistiaIE(InfoExtractor): 'id': 'sh7fpupwlt', 'ext': 'mov', 'title': 'Being Resourceful', + 'description': 'a Clients From Hell Video Series video from worldwidewebhosting', + 'upload_date': '20131204', + 'timestamp': 1386185018, 'duration': 117, }, } @@ -32,35 +36,43 @@ class WistiaIE(InfoExtractor): raise ExtractorError('Error while getting the playlist', expected=True) data = data_json['media'] + title = data['name'] formats = [] thumbnails = [] for a in data['assets']: + astatus = a.get('status') atype = a.get('type') - if atype == 'still': + if (astatus is not None and astatus != 2) or atype == 'preview': + continue + elif atype in ('still', 'still_image'): thumbnails.append({ 'url': a['url'], 'resolution': '%dx%d' % (a['width'], a['height']), }) - continue - if atype == 'preview': - continue - formats.append({ - 'format_id': atype, - 'url': a['url'], - 'width': a['width'], - 'height': a['height'], - 'filesize': a['size'], - 'ext': a['ext'], - 'preference': 1 if atype == 'original' else None, - }) + else: + formats.append({ + 'format_id': atype, + 'url': a['url'], + 'tbr': int_or_none(a.get('bitrate')), + 'vbr': int_or_none(a.get('opt_vbitrate')), + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), + 'vcodec': a.get('codec'), + 'container': a.get('container'), + 'ext': a.get('ext'), + 'preference': 1 if atype == 'original' else None, + }) self._sort_formats(formats) return { 'id': video_id, - 'title': data['name'], + 'title': title, + 'description': data.get('seoDescription'), 'formats': formats, 'thumbnails': thumbnails, - 'duration': data.get('duration'), + 'duration': int_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('createdAt')), } From cc162f6a0aa63a3e050c55cec9da728aa2cb9100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Mar 2016 22:55:04 +0600 Subject: [PATCH 025/116] [crunchyroll] Fix custom _download_webpage (Closes #8883) --- youtube_dl/extractor/crunchyroll.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index c7032ffa2..85fa7a725 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor): def _real_initialize(self): self._login() - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, *args, **kwargs): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues @@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor): # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage( - request, video_id, note, errnote, fatal, tries, timeout, encoding) + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) @staticmethod def _add_skip_wall(url): From 8c97e7efb6ab273f0b7c91f0aa9ac6869c911bf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Mar 2016 23:43:14 +0600 Subject: [PATCH 026/116] [animeondemand] Expand episode title regex (Closes #8875) --- youtube_dl/extractor/animeondemand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index a7d8daf7b..3dbbe2a62 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -93,7 +93,7 @@ class AnimeOnDemandIE(InfoExtractor): for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage): m = re.search( - r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html) + r'class="episodebox-title"[^>]+title="(?:Episode|Film)\s*(?P<number>\d+)\s*-\s*(?P<title>.+?)"', episode_html) if not m: continue From b57fecfdddc78b5ef5cfd1c3302f7b79ab1bf64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Mar 2016 23:50:10 +0600 Subject: [PATCH 027/116] [animeondemand] Add test --- youtube_dl/extractor/animeondemand.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 3dbbe2a62..6cb3a84f9 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -18,7 +18,7 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' - _TEST = { + _TESTS = [{ 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -26,7 +26,11 @@ class AnimeOnDemandIE(InfoExtractor): 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', }, 'playlist_mincount': 4, - } + }, { + # Film wording is used instead of Episode + 'url': 'https://www.anime-on-demand.de/anime/39', + 'only_matching': True, + }] def _login(self): (username, password) = self._get_login_info() From 85e8f26b827e77cbed6a83268787d450ab2bea3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 00:02:34 +0600 Subject: [PATCH 028/116] [animeondemand] Improve extraction --- youtube_dl/extractor/animeondemand.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 6cb3a84f9..3631c2451 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -30,6 +30,10 @@ class AnimeOnDemandIE(InfoExtractor): # Film wording is used instead of Episode 'url': 'https://www.anime-on-demand.de/anime/39', 'only_matching': True, + }, { + # Episodes without titles + 'url': 'https://www.anime-on-demand.de/anime/162', + 'only_matching': True, }] def _login(self): @@ -95,14 +99,22 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage): - m = re.search( - r'class="episodebox-title"[^>]+title="(?:Episode|Film)\s*(?P<number>\d+)\s*-\s*(?P<title>.+?)"', episode_html) - if not m: + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage)): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title="(.+?)"', + r'class="episodebox-title"[^>]+>(.+?)<'), + webpage, 'episodebox title', default=None) + if not episodebox_title: continue - episode_number = int(m.group('number')) - episode_title = m.group('title') + episode_number = int(self._search_regex( + r'^(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(?P<title>.+?)', + episodebox_title, 'episode title', default=None) + video_id = 'episode-%d' % episode_number common_info = { From 0d0e282912a7ade43a148518c742557c310a41a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 00:12:34 +0600 Subject: [PATCH 029/116] [animeondemand] Fix typo and improve --- youtube_dl/extractor/animeondemand.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 3631c2451..0158407f6 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -100,19 +100,19 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage)): + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title="(.+?)"', - r'class="episodebox-title"[^>]+>(.+?)<'), - webpage, 'episodebox title', default=None) + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') if not episodebox_title: continue episode_number = int(self._search_regex( - r'^(?:Episode|Film)\s*(\d+)', + r'(?:Episode|Film)\s*(\d+)', episodebox_title, 'episode number', default=num)) episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(?P<title>.+?)', + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', episodebox_title, 'episode title', default=None) video_id = 'episode-%d' % episode_number From 57f7e3c62df187457a057be88fca43136f4c507f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 02:51:38 +0600 Subject: [PATCH 030/116] [compat] Add compat_xpath --- youtube_dl/compat.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 74702786a..dbb91a6ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -256,6 +256,16 @@ else: el.text = el.text.decode('utf-8') return doc +if sys.version_info < (2, 7): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + def compat_xpath(xpath): + if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + return xpath +else: + compat_xpath = lambda xpath: xpath + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -585,6 +595,7 @@ __all__ = [ 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', + 'compat_xpath', 'shlex_quote', 'subprocess_check_output', 'workaround_optparse_bug9161', From 810c10baa1e0177a6a0ef39496f7e972db02d806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 02:52:23 +0600 Subject: [PATCH 031/116] [utils] Use compat_xpath --- youtube_dl/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8ec1bd469..ef6e7c7cb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -50,6 +50,7 @@ from .compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -165,12 +166,7 @@ if sys.version_info >= (2, 7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! - if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -195,9 +191,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) From e3d17b3c07c6d8bc7fd45af1e45523e8fde5fb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 02:54:27 +0600 Subject: [PATCH 032/116] [noz] Fix extraction on python 2.6 by means of using compat_xpath --- youtube_dl/extractor/noz.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index 656443c49..c47a33d15 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_xpath, +) from ..utils import ( int_or_none, find_xpath_attr, @@ -47,7 +50,7 @@ class NozIE(InfoExtractor): duration = int_or_none(xpath_text( doc, './/article/movie/file/duration')) formats = [] - for qnode in doc.findall('.//article/movie/file/qualities/qual'): + for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')): http_url_ele = find_xpath_attr( qnode, './html_urls/video_url', 'format', 'video/mp4') http_url = http_url_ele.text if http_url_ele is not None else None From 4c92fd2e835cde89866d3dfb1fc05d23196b19db Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 09:21:21 +0100 Subject: [PATCH 033/116] [theplatform] always force theplatform to return a smil for _extract_theplatform_smil --- youtube_dl/extractor/bravotv.py | 2 +- youtube_dl/extractor/cbsnews.py | 2 +- youtube_dl/extractor/cnet.py | 2 +- youtube_dl/extractor/nationalgeographic.py | 2 +- youtube_dl/extractor/theplatform.py | 10 ++++++---- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 69d00b466..34d451f38 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -24,5 +24,5 @@ class BravoTVIE(InfoExtractor): account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true&switch=progressive' % (account_pid, release_pid), + 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), {'force_smil_url': True}), 'ThePlatform', release_pid) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7319ee1b7..8ddcc5097 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -78,7 +78,7 @@ class CBSNewsIE(ThePlatformIE): pid = item.get('media' + format_id) if not pid: continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3cf0bf95b..c154b3e19 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -60,7 +60,7 @@ class CNETIE(ThePlatformIE): for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid + release_url = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 6fc9e7b05..7ce8d9b18 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -48,7 +48,7 @@ class NationalGeographicIE(InfoExtractor): theplatform_id = url_basename(content.attrib.get('url')) return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id, + 'http://link.theplatform.com/s/ngs/%s?formats=MPEG4&manifest=f4m' % theplatform_id, # For some reason, the normal links don't work and we must force # the use of f4m {'force_smil_url': True})) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index ffe7c57ad..a148f78ce 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -22,6 +22,7 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, + update_url_query, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -30,6 +31,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): + smil_url = update_url_query(smil_url, {'format': 'SMIL'}) meta = self._download_xml(smil_url, video_id, note=note) error_element = find_xpath_attr( meta, _x('.//smil:ref'), 'src', @@ -213,7 +215,7 @@ class ThePlatformIE(ThePlatformBaseIE): webpage, 'smil url', group='url') path = self._search_regex( r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') - smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL' + smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4' elif mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') @@ -223,9 +225,9 @@ class ThePlatformIE(ThePlatformBaseIE): release_url = config['releaseUrl'] else: release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m' + smil_url = release_url + '&formats=MPEG4&manifest=f4m' else: - smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path sig = smuggled_data.get('sig') if sig: @@ -280,7 +282,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): first_video_id = None duration = None for item in entry['media$content']: - smil_url = item['plfile$url'] + '&format=SMIL&mbr=true' + smil_url = item['plfile$url'] + '&mbr=true' cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id From 87c03c6bd22e99d6410c907128ab872e79df1560 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 09:43:28 +0100 Subject: [PATCH 034/116] [theplatform] remove unnecessary import --- youtube_dl/extractor/theplatform.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index a148f78ce..2230dfe02 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -22,7 +22,6 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, - update_url_query, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -31,8 +30,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - smil_url = update_url_query(smil_url, {'format': 'SMIL'}) - meta = self._download_xml(smil_url, video_id, note=note) + meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) error_element = find_xpath_attr( meta, _x('.//smil:ref'), 'src', 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') From 0d33166ec586b9c75e20835adca927e923cb36e3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 18 Mar 2016 11:43:48 +0100 Subject: [PATCH 035/116] release 2016.03.18 --- docs/supportedsites.md | 4 ++++ youtube_dl/version.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a6dcc2576..3415efc45 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -81,6 +81,7 @@ - **BokeCC** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek + - **BravoTV** - **Break** - **brightcove:legacy** - **brightcove:new** @@ -499,6 +500,7 @@ - **Restudy** - **ReverbNation** - **Revision3** + - **RICE** - **RingTV** - **RottenTomatoes** - **Roxwel** @@ -617,6 +619,7 @@ - **ThePlatform** - **ThePlatformFeed** - **TheSixtyOne** + - **TheStar** - **ThisAmericanLife** - **ThisAV** - **THVideo** @@ -650,6 +653,7 @@ - **tv.dfb.de** - **TV2** - **TV2Article** + - **TV3** - **TV4**: tv4.se and tv4play.se - **TVC** - **TVCArticle** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9216fa547..6b2c5fac9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.03.14' +__version__ = '2016.03.18' From 61870915323abd126f5440282b1fd5734ee1ce6f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 11:50:04 +0100 Subject: [PATCH 036/116] [once] check http formats availability --- youtube_dl/extractor/once.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py index 403f8c0af..080045d4c 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -14,13 +14,12 @@ class OnceIE(InfoExtractor): def _extract_once_formats(self, url): domain_id, application_id, media_item_id = re.match( OnceIE._VALID_URL, url).groups() - adaptive_formats = self._extract_m3u8_formats( + formats = self._extract_m3u8_formats( self.ADAPTIVE_URL_TEMPLATE % ( domain_id, application_id, media_item_id), media_item_id, 'mp4', m3u8_id='hls', fatal=False) - formats = [] - formats.extend(adaptive_formats) - for adaptive_format in adaptive_formats: + progressive_formats = [] + for adaptive_format in formats: rendition_id = self._search_regex( r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', adaptive_format['url'], 'redition id', default=None) @@ -33,5 +32,7 @@ class OnceIE(InfoExtractor): 'hls', 'http'), 'protocol': 'http', }) - formats.append(progressive_format) + progressive_formats.append(progressive_format) + self._check_formats(progressive_formats, media_item_id) + formats.extend(progressive_formats) return formats From cae21032ab38f404a9959e6b28984b960e579fb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:08:25 +0600 Subject: [PATCH 037/116] [theplatform] Improve geo restriction detection --- youtube_dl/extractor/theplatform.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 2230dfe02..863914299 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -31,10 +31,9 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) - error_element = find_xpath_attr( - meta, _x('.//smil:ref'), 'src', - 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') - if error_element is not None: + error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') + if error_element is not None and error_element.attrib['src'].startswith( + 'http://link.theplatform.com/s/errorFiles/Unavailable.'): raise ExtractorError(error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( From 263eff9537c73caa9bff42b1e675043eaa124f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:50:10 +0600 Subject: [PATCH 038/116] [extractor/generic] Properly extract format id from Content-Type Fixes extraction for cases like: audio/x-mpegURL; charset=utf-8 --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8121f04a5..b75db1252 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1249,7 +1249,7 @@ class GenericIE(InfoExtractor): # Check for direct link to a video content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type) + m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: upload_date = unified_strdate( head_response.headers.get('Last-Modified')) From 955737b2d40c0ce947c13659a27aae0c41077c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:50:44 +0600 Subject: [PATCH 039/116] [extractor/generic] Force Content-Type to lowecase --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b75db1252..cce7799e2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1248,7 +1248,7 @@ class GenericIE(InfoExtractor): } # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') + content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: upload_date = unified_strdate( From 20938f768b16c945c6041ba3c0a7ae1a4e790881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:54:33 +0600 Subject: [PATCH 040/116] [extractor/generic] Add another test for generic m3u8 --- youtube_dl/extractor/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cce7799e2..62b51e84e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -239,6 +239,20 @@ class GenericIE(InfoExtractor): 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 303dcdb99505b29ef4c499cc395ab9ec90c07ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:41:16 +0600 Subject: [PATCH 041/116] [extractor/generic] Simplify upload_date extraction --- youtube_dl/extractor/generic.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 62b51e84e..a2e7ba5ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1259,14 +1259,13 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) } # Check for direct link to a video content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) format_id = m.group('format_id') if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') @@ -1281,7 +1280,6 @@ class GenericIE(InfoExtractor): info_dict.update({ 'direct': True, 'formats': formats, - 'upload_date': upload_date, }) return info_dict @@ -1309,12 +1307,9 @@ class GenericIE(InfoExtractor): if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, }) return info_dict From de6c51e88eb61b49a95ccfcfa82547c2172eb52b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:43:07 +0600 Subject: [PATCH 042/116] [extractor/generic] Fix direct link semantics --- youtube_dl/extractor/generic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a2e7ba5ad..5649e26da 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1277,10 +1277,8 @@ class GenericIE(InfoExtractor): 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - info_dict.update({ - 'direct': True, - 'formats': formats, - }) + info_dict['direct'] = True + info_dict['formats'] = formats return info_dict if not self._downloader.params.get('test', False) and not is_intentional: From 5940862d5a75ae45a640e0ce3104dd18c9864e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:45:28 +0600 Subject: [PATCH 043/116] [extractor/generic] Detect m3u playlists served without proper Content-Type --- youtube_dl/extractor/generic.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5649e26da..24d43a247 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1299,9 +1299,15 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith('#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! - first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') From edd9b71c2cca7e5a0df8799710d9ad410ec77d29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:49:11 +0600 Subject: [PATCH 044/116] [extractor/generic] Add a test for m3u playlist served without proper Content-Type --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 24d43a247..f28a65d9b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -253,6 +253,21 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 19e2617a6fb614a84340757dacb2ea918c097a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 23:42:15 +0600 Subject: [PATCH 045/116] [commonprotocols] Add generic support for rtmp URLs (Closes #8488) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/commonprotocols.py | 36 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/commonprotocols.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 725ebec04..acc0b03bd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -136,6 +136,7 @@ from .collegerama import CollegeRamaIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import RtmpIE from .condenast import CondeNastIE from .cracked import CrackedIE from .crackle import CrackleIE diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py new file mode 100644 index 000000000..5d130a170 --- /dev/null +++ b/youtube_dl/extractor/commonprotocols.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import url_basename + + +class RtmpIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)rtmp[est]?://.+' + + _TESTS = [{ + 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', + 'only_matching': True, + }, { + 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'ext': 'flv', + 'format_id': compat_urlparse.urlparse(url).scheme, + }], + } From d5aacf9a90e0855976401b6085ac56b66ca09d12 Mon Sep 17 00:00:00 2001 From: John Peel <john@dgby.org> Date: Fri, 18 Mar 2016 00:33:03 -0700 Subject: [PATCH 046/116] Added format_id to the filers on -f. --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8c651cd52..93b6ca54d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -905,7 +905,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol) + \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ From 4c3b16d5d1bf4806693d2895928ac1b03585b2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 00:04:26 +0600 Subject: [PATCH 047/116] [test_YoutubeDL] Add test for format_id format selection --- test/test_YoutubeDL.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index efbee3b71..ca25025e2 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -222,6 +222,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-low') + ydl = YDL({'format': 'bestvideo[format_id^=dash][format_id$=low]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'dash-video-low') + formats = [ {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL}, ] From 09fc33198a4cfc93a98ce1ba7d51d41c487e5f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 18 Mar 2016 19:18:55 +0100 Subject: [PATCH 048/116] utils: lookup_unit_table: Use a stricter regex In parse_count multiple units start with the same letter, so it would match different units depending on the order they were sorted when iterating over them. --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ef6e7c7cb..bad1c4ea8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1346,7 +1346,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)$' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.') From 4cd70099ea79a4a82b26694937ca46d31f7436ca Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 21:17:45 +0100 Subject: [PATCH 049/116] [hbo] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/hbo.py | 122 +++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 youtube_dl/extractor/hbo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index acc0b03bd..529051a93 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -284,6 +284,7 @@ from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE +from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py new file mode 100644 index 000000000..dad0f3994 --- /dev/null +++ b/youtube_dl/extractor/hbo.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_duration, +) + + +class HBOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', + 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'info_dict': { + 'id': '1437839', + 'ext': 'mp4', + 'title': 'Ep. 64 Clip: Encryption', + } + } + _FORMATS_INFO = { + '1920': { + 'width': 1280, + 'height': 720, + }, + '640': { + 'width': 768, + 'height': 432, + }, + 'highwifi': { + 'width': 640, + 'height': 360, + }, + 'high3g': { + 'width': 640, + 'height': 360, + }, + 'medwifi': { + 'width': 400, + 'height': 224, + }, + 'med3g': { + 'width': 400, + 'height': 224, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_xml( + 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) + title = xpath_text(video_data, 'title', 'title', True) + + formats = [] + for source in xpath_element(video_data, 'videos', 'sources', True): + if source.tag == 'size': + path = xpath_text(source, './/path') + if not path: + continue + width = source.attrib.get('width') + format_info = self._FORMATS_INFO.get(width, {}) + height = format_info.get('height') + fmt = { + 'url': path, + 'format_id': 'http%s' % ('-%dp' % height if height else ''), + 'width': format_info.get('width'), + 'height': height, + } + rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': fmt['format_id'].replace('http', 'rtmp'), + }) + formats.append(fmt) + else: + video_url = source.text + if not video_url: + continue + if source.tag == 'tarball': + formats.extend(self._extract_m3u8_formats( + video_url.replace('.tar', '/base_index_w8.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + format_info = self._FORMATS_INFO.get(source.tag, {}) + formats.append({ + 'format_id': 'http-%s' % source.tag, + 'url': video_url, + 'width': format_info.get('width'), + 'height': format_info.get('height'), + }) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + thumbnails = [] + card_sizes = xpath_element(video_data, 'titleCardSizes') + if card_sizes is not None: + for size in card_sizes: + path = xpath_text(size, 'path') + if not path: + continue + width = int_or_none(size.get('width')) + thumbnails.append({ + 'id': width, + 'url': path, + 'width': width, + }) + + return { + 'id': video_id, + 'title': title, + 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')), + 'formats': formats, + 'thumbnails': thumbnails, + } From 0d769bcb781b46a00ddf958d6ea945560f2d6cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 05:43:43 +0600 Subject: [PATCH 050/116] [extractor/generic] Fix missing byte literal prefix --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f28a65d9b..26de27a7e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1317,7 +1317,7 @@ class GenericIE(InfoExtractor): first_bytes = full_response.read(512) # Is it an M3U playlist? - if first_bytes.startswith('#EXTM3U'): + if first_bytes.startswith(b'#EXTM3U'): info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') return info_dict From 782b1b5bd1cdaaead6865dee5d300486e7dd8348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 19 Mar 2016 11:42:35 +0100 Subject: [PATCH 051/116] [utils] lookup_unit_table: Match word boundary instead of end of string --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9a3a8ddff..325b870cc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -702,6 +702,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_count('1.000'), 1000) self.assertEqual(parse_count('1.1k'), 1100) self.assertEqual(parse_count('1.1kk'), 1100000) + self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1.1kk views'), 1100000) def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bad1c4ea8..067b8a184 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1346,7 +1346,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)$' % units_re, s) + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.') From 52bb437e412726a37d585cf782c88bc8c8a042a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 20:40:36 +0600 Subject: [PATCH 052/116] [options] Add --fragment-retries option --- youtube_dl/__init__.py | 17 ++++++++++++----- youtube_dl/options.py | 4 ++++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 79b389840..737f6545d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -144,14 +144,20 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit - if opts.retries is not None: - if opts.retries in ('inf', 'infinite'): - opts_retries = float('inf') + + def parse_retries(retries): + if retries in ('inf', 'infinite'): + parsed_retries = float('inf') else: try: - opts_retries = int(opts.retries) + parsed_retries = int(retries) except (TypeError, ValueError): parser.error('invalid retry count specified') + return parsed_retries + if opts.retries is not None: + opts.retries = parse_retries(opts.retries) + if opts.fragment_retries is not None: + opts.fragment_retries = parse_retries(opts.fragment_retries) if opts.buffersize is not None: numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize) if numeric_buffersize is None: @@ -299,7 +305,8 @@ def _real_main(argv=None): 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, - 'retries': opts_retries, + 'retries': opts.retries, + 'fragment_retries': opts.fragment_retries, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9dd7a8034..822728afc 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -399,6 +399,10 @@ def parseOpts(overrideArguments=None): '-R', '--retries', dest='retries', metavar='RETRIES', default=10, help='Number of retries (default is %default), or "infinite".') + downloader.add_option( + '--fragment-retries', + dest='fragment_retries', metavar='RETRIES', default=10, + help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', From 721f26b8211a22648523f51c80f9b81d1eaa3b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 20:41:24 +0600 Subject: [PATCH 053/116] [downloader/fragment] Add report_retry_fragment --- youtube_dl/downloader/fragment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index a5bae9669..df66c35f0 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -21,6 +21,11 @@ class FragmentFD(FileDownloader): A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). """ + def report_retry_fragment(self, fragment_name, count, retries): + self.to_screen( + '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %.0f)...' + % (fragment_name, count, retries)) + def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) self._start_frag_download(ctx) From e33baba0dd6584475f75badec2186a7d86b88a5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 20:42:23 +0600 Subject: [PATCH 054/116] [downloader/dash] Add fragment retry capability YouTube may often return 404 HTTP error for a fragment causing the whole download to fail. However if the same fragment is immediately retried with the same request data this usually succeeds (1-2 attemps is usually enough) thus allowing to download the whole file successfully. So, we will retry all fragments that fail with 404 HTTP error for now. --- youtube_dl/downloader/dash.py | 42 ++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b1b17c6e..8bbab9dbc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -4,6 +4,7 @@ import os import re from .fragment import FragmentFD +from ..compat import compat_urllib_error from ..utils import ( sanitize_open, encodeFilename, @@ -36,20 +37,41 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] - def append_url_to_file(target_url, target_filename): - success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) - if not success: + fragment_retries = self.params.get('fragment_retries', 0) + + def append_url_to_file(target_url, tmp_filename, segment_name): + target_filename = '%s-%s' % (tmp_filename, segment_name) + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(target_filename, {'url': combine_url(base_url, target_url)}) + if not success: + return False + down, target_sanitized = sanitize_open(target_filename, 'rb') + ctx['dest_stream'].write(down.read()) + down.close() + segments_filenames.append(target_sanitized) + break + except (compat_urllib_error.HTTPError, ) as err: + # YouTube may often return 404 HTTP error for a fragment causing the + # whole download to fail. However if the same fragment is immediately + # retried with the same request data this usually succeeds (1-2 attemps + # is usually enough) thus allowing to download the whole file successfully. + # So, we will retry all fragments that fail with 404 HTTP error for now. + if err.code != 404: + raise + # Retry fragment + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(segment_name, count, fragment_retries) + if count > fragment_retries: + self.report_error('giving up after %s fragment retries' % fragment_retries) return False - down, target_sanitized = sanitize_open(target_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - segments_filenames.append(target_sanitized) if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'] + '-Init') + append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') for i, segment_url in enumerate(segment_urls): - segment_filename = '%s-Seg%d' % (ctx['tmpfilename'], i) - append_url_to_file(segment_url, segment_filename) + append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) self._finish_frag_download(ctx) From 617e58d85063b68fb9736355e8354b05e82b1147 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 20:51:30 +0600 Subject: [PATCH 055/116] [downloader/{common,fragment}] Fix total retries reporting on python 2.6 --- youtube_dl/downloader/common.py | 8 +++++++- youtube_dl/downloader/fragment.py | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index f39db58f6..1dba9f49a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -115,6 +115,10 @@ class FileDownloader(object): return '%10s' % '---b/s' return '%10s' % ('%s/s' % format_bytes(speed)) + @staticmethod + def format_retries(retries): + return 'inf' if retries == float('inf') else '%.0f' % retries + @staticmethod def best_block_size(elapsed_time, bytes): new_min = max(bytes / 2.0, 1.0) @@ -297,7 +301,9 @@ class FileDownloader(object): def report_retry(self, count, retries): """Report retry in case of HTTP error 5xx""" - self.to_screen('[download] Got server HTTP error. Retrying (attempt %d of %.0f)...' % (count, retries)) + self.to_screen( + '[download] Got server HTTP error. Retrying (attempt %d of %s)...' + % (count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index df66c35f0..c2671e6d2 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -23,8 +23,8 @@ class FragmentFD(FileDownloader): def report_retry_fragment(self, fragment_name, count, retries): self.to_screen( - '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %.0f)...' - % (fragment_name, count, retries)) + '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' + % (fragment_name, count, self.format_retries(retries))) def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) From 16a8b7986b88572aea12c0f80c499e6e8085f1cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 20:54:21 +0600 Subject: [PATCH 056/116] [downloader/fragment] Document fragment_retries --- youtube_dl/downloader/fragment.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index c2671e6d2..ba903ae10 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -19,6 +19,10 @@ class HttpQuietDownloader(HttpFD): class FragmentFD(FileDownloader): """ A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). + + Available options: + + fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) """ def report_retry_fragment(self, fragment_name, count, retries): From 3aec71766da38478740437c901514e666a39dbb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 22:30:48 +0600 Subject: [PATCH 057/116] [safari:api] Separate extractor (Closes #8871) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/safari.py | 52 +++++++++++++++++++------------- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 529051a93..b3bc38916 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -628,6 +628,7 @@ from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, + SafariApiIE, SafariCourseIE, ) from .sapo import SapoIE diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 256396bb8..6ba91f202 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -75,16 +75,7 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'''(?x)https?:// - (?:www\.)?safaribooksonline\.com/ - (?: - library/view/[^/]+| - api/v1/book - )/ - (?P<course_id>[^/]+)/ - (?:chapter(?:-content)?/)? - (?P<part>part\d+)\.html - ''' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -97,9 +88,6 @@ class SafariIE(SafariBaseIE): 'upload_date': '20150724', 'uploader_id': 'stork', }, - }, { - 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', - 'only_matching': True, }, { # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', @@ -108,13 +96,18 @@ class SafariIE(SafariBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - course_id = mobj.group('course_id') - part = mobj.group('part') + video_id = '%s/%s' % (mobj.group('course_id'), mobj.group('part')) - webpage = self._download_webpage(url, '%s/%s' % (course_id, part)) - reference_id = self._search_regex(r'data-reference-id="([^"]+)"', webpage, 'kaltura reference id') - partner_id = self._search_regex(r'data-partner-id="([^"]+)"', webpage, 'kaltura widget id') - ui_id = self._search_regex(r'data-ui-id="([^"]+)"', webpage, 'kaltura uiconf id') + webpage = self._download_webpage(url, video_id) + reference_id = self._search_regex( + r'data-reference-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura reference id', group='id') + partner_id = self._search_regex( + r'data-partner-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura widget id', group='id') + ui_id = self._search_regex( + r'data-ui-id=(["\'])(?P<id>.+?)\1', + webpage, 'kaltura uiconf id', group='id') query = { 'wid': '_%s' % partner_id, @@ -125,7 +118,7 @@ class SafariIE(SafariBaseIE): if self.LOGGED_IN: kaltura_session = self._download_json( '%s/player/kaltura_session/?reference_id=%s' % (self._API_BASE, reference_id), - course_id, 'Downloading kaltura session JSON', + video_id, 'Downloading kaltura session JSON', 'Unable to download kaltura session JSON', fatal=False) if kaltura_session: session = kaltura_session.get('session') @@ -137,6 +130,23 @@ class SafariIE(SafariBaseIE): 'Kaltura') +class SafariApiIE(SafariBaseIE): + IE_NAME = 'safari:api' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html' + + _TEST = { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', + 'only_matching': True, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + part = self._download_json( + url, '%s/%s' % (mobj.group('course_id'), mobj.group('part')), + 'Downloading part JSON') + return self.url_result(part['web_url'], SafariIE.ie_key()) + + class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' @@ -168,7 +178,7 @@ class SafariCourseIE(SafariBaseIE): 'No chapters found for course %s' % course_id, expected=True) entries = [ - self.url_result(chapter, 'Safari') + self.url_result(chapter, SafariApiIE.ie_key()) for chapter in course_json['chapters']] course_title = course_json['title'] From 8b0d7a66ef5451556bb8ae5b085c7bef4c992f8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Michaj=C5=82ow?= <kasper93@gmail.com> Date: Wed, 9 Mar 2016 20:55:27 +0100 Subject: [PATCH 058/116] [cda] Add new extractor for cda.pl Fixes #8760 --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/cda.py | 96 ++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100755 youtube_dl/extractor/cda.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b3bc38916..5f5eca42b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -108,6 +108,7 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from .chaturbate import ChaturbateIE diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py new file mode 100755 index 000000000..4c53b8dda --- /dev/null +++ b/youtube_dl/extractor/cda.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + decode_packed_codes, + ExtractorError, + parse_duration +) + + +class CDAIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|ebd)\.)?cda\.pl/(?:video|[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' + _TESTS = [ + { + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 + } + }, + { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id) + + if 'Ten film jest dostępny dla użytkowników premium' in webpage: + raise ExtractorError('This video is only available for premium users.', expected=True) + + title = self._html_search_regex(r'<title>(.+?)', webpage, 'title', fatal=False) + + def _get_format(page, version=''): + unpacked = decode_packed_codes(page) + duration = self._search_regex(r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False) + format_id = None + height = None + + m = re.search(r'(?P[0-9]+)p<\/a>', page) + if m: + format_id = m.group('format_id') + height = int(m.group('height')) + + url = self._search_regex(r"url:\\'(.+?)\\'", unpacked, version + ' url', fatal=False) + if url is None: + return None + + return { + 'format_id': format_id, + 'height': height, + 'url': url + }, parse_duration(duration) + + formats = [] + + format_desc, duration = _get_format(webpage) or (None, None) + if format_desc is not None: + formats.append(format_desc) + + pattern = re.compile(r'([0-9]+p)<\/a>') + for version in re.findall(pattern, webpage): + webpage = self._download_webpage(version[0], video_id, 'Downloading %s version information' % version[1], fatal=False) + if not webpage: + # Manually report warning because empty page is returned when invalid version is requested. + self.report_warning('Unable to download %s version information' % version[1]) + continue + + format_desc, duration_ = _get_format(webpage, version[1]) or (None, None) + duration = duration or duration_ + if format_desc is not None: + formats.append(format_desc) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': duration + } From f1ced6df51e4d81523e9051cadb6e4f5ceac19f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Mar 2016 23:17:14 +0600 Subject: [PATCH 059/116] [cda] Improve and simplify (Closes #8805) --- youtube_dl/extractor/cda.py | 132 ++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 4c53b8dda..498d2c0d8 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -12,30 +12,30 @@ from ..utils import ( class CDAIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|ebd)\.)?cda\.pl/(?:video|[0-9]+x[0-9]+)/(?P[0-9a-z]+)' - _TESTS = [ - { - 'url': 'http://www.cda.pl/video/5749950c', - 'md5': '6f844bf51b15f31fae165365707ae970', - 'info_dict': { - 'id': '5749950c', - 'ext': 'mp4', - 'height': 720, - 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', - 'duration': 39 - } - }, - { - 'url': 'http://www.cda.pl/video/57413289', - 'md5': 'a88828770a8310fc00be6c95faf7f4d5', - 'info_dict': { - 'id': '57413289', - 'ext': 'mp4', - 'title': 'Lądowanie na lotnisku na Maderze', - 'duration': 137 - } + _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'http://www.cda.pl/video/5749950c', + 'md5': '6f844bf51b15f31fae165365707ae970', + 'info_dict': { + 'id': '5749950c', + 'ext': 'mp4', + 'height': 720, + 'title': 'Oto dlaczego przed zakrętem należy zwolnić.', + 'duration': 39 } - ] + }, { + 'url': 'http://www.cda.pl/video/57413289', + 'md5': 'a88828770a8310fc00be6c95faf7f4d5', + 'info_dict': { + 'id': '57413289', + 'ext': 'mp4', + 'title': 'Lądowanie na lotnisku na Maderze', + 'duration': 137 + } + }, { + 'url': 'http://ebd.cda.pl/0x0/5749950c', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -44,53 +44,53 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) - title = self._html_search_regex(r'(.+?)', webpage, 'title', fatal=False) - - def _get_format(page, version=''): - unpacked = decode_packed_codes(page) - duration = self._search_regex(r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False) - format_id = None - height = None - - m = re.search(r'(?P[0-9]+)p<\/a>', page) - if m: - format_id = m.group('format_id') - height = int(m.group('height')) - - url = self._search_regex(r"url:\\'(.+?)\\'", unpacked, version + ' url', fatal=False) - if url is None: - return None - - return { - 'format_id': format_id, - 'height': height, - 'url': url - }, parse_duration(duration) + title = self._html_search_regex(r'(.+?)', webpage, 'title') formats = [] - format_desc, duration = _get_format(webpage) or (None, None) - if format_desc is not None: - formats.append(format_desc) - - pattern = re.compile(r'([0-9]+p)<\/a>') - for version in re.findall(pattern, webpage): - webpage = self._download_webpage(version[0], video_id, 'Downloading %s version information' % version[1], fatal=False) - if not webpage: - # Manually report warning because empty page is returned when invalid version is requested. - self.report_warning('Unable to download %s version information' % version[1]) - continue - - format_desc, duration_ = _get_format(webpage, version[1]) or (None, None) - duration = duration or duration_ - if format_desc is not None: - formats.append(format_desc) - - self._sort_formats(formats) - - return { + info_dict = { 'id': video_id, 'title': title, 'formats': formats, - 'duration': duration + 'duration': None, } + + def extract_format(page, version): + unpacked = decode_packed_codes(page) + format_url = self._search_regex( + r"url:\\'(.+?)\\'", unpacked, '%s url' % version, fatal=False) + if not format_url: + return + f = { + 'url': format_url, + } + m = re.search( + r']+data-quality="(?P[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P[0-9]+)p', + page) + if m: + f.update({ + 'format_id': m.group('format_id'), + 'height': int(m.group('height')), + }) + info_dict['formats'].append(f) + if not info_dict['duration']: + info_dict['duration'] = parse_duration(self._search_regex( + r"duration:\\'(.+?)\\'", unpacked, 'duration', fatal=False)) + + extract_format(webpage, 'default') + + for href, resolution in re.findall( + r']+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', + webpage): + webpage = self._download_webpage( + href, video_id, 'Downloading %s version information' % resolution, fatal=False) + if not webpage: + # Manually report warning because empty page is returned when + # invalid version is requested. + self.report_warning('Unable to download %s version information' % resolution) + continue + extract_format(webpage, resolution) + + self._sort_formats(formats) + + return info_dict From 9261e347ccf63b31bd2035996279b0ad1a45247a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 19 Mar 2016 23:18:04 +0600 Subject: [PATCH 060/116] Credit @kasper93 for cda (#8805) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index aa48cd5a6..6c9747913 100644 --- a/AUTHORS +++ b/AUTHORS @@ -163,3 +163,4 @@ Patrick Griffis Aidan Rowe mutantmonkey Ben Congdon +Kacper Michajłow From e36f4aa72b01b3f6a322edc094cdf1c20b071367 Mon Sep 17 00:00:00 2001 From: jjatria Date: Wed, 28 Oct 2015 18:31:52 +0000 Subject: [PATCH 061/116] [biobiotv] Add extractor --- youtube_dl/extractor/biobiotv.py | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 youtube_dl/extractor/biobiotv.py diff --git a/youtube_dl/extractor/biobiotv.py b/youtube_dl/extractor/biobiotv.py new file mode 100644 index 000000000..aae0588ef --- /dev/null +++ b/youtube_dl/extractor/biobiotv.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class BioBioTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?P\d{4})/\d{2}/\d{2}/(?P[\w-]+)(?:\.shtml)?' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'col_c266', + 'display_id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias - BioBioChile TV', + 'thumbnail': 'http://media.biobiochile.cl/wp-content/uploads/2015/10/atria-2010-730x350.jpg', + 'url': 'http://unlimited2-cl.digitalproserver.com/bbtv/2015/col_c266.mp4', + 'uploader': 'Fernando Atria', + } + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'md5': 'a8c868e6b5f6c17d56873d5633204f84', + 'info_dict': { + 'id': 'col_c270', + 'display_id': 'ninos-transexuales-de-quien-es-la-decision', + 'ext': 'mp4', + 'title': 'Niños transexuales: ¿De quién es la decisión? - BioBioChile TV', + 'thumbnail': 'http://media.biobiochile.cl/wp-content/uploads/2015/10/samantha-2210-730x350.jpg', + 'url': 'http://unlimited2-cl.digitalproserver.com/bbtv/2015/col_c270.mp4', + 'uploader': 'Samantha Morán', + } + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'md5': 'c8369b50d42ff0a4f6b969fbd1a7c32d', + 'info_dict': { + 'id': 'Keno_Pinto', + 'display_id': 'exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo', + 'ext': 'mp4', + 'title': 'Exclusivo: Héctor Pinto, formador de “Chupete”, revela versión del ex delantero albo - BioBioChile TV', + 'thumbnail': 'http://media.biobiochile.cl/wp-content/uploads/2015/10/pinto-730x350.jpg', + 'url': 'http://unlimited2-cl.digitalproserver.com/bbtv/2015/Keno_Pinto.mp4', + 'uploader': 'Juan Pablo Echenique', + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + year = mobj.group('year') + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_meta( + 'og:title', webpage, 'title', fatal=True) + + thumbnail = self._html_search_meta( + 'og:image', webpage, 'thumbnail', fatal=True) + + video_id = self._html_search_regex( + r'loadFWPlayerVideo\(\"player_0\", \"\d{4}/(.+)\.mp4\"\)', webpage, 'title') + + url = 'http://unlimited2-cl.digitalproserver.com/bbtv/' + year + '/' + video_id + '.mp4' + + return { + 'id': video_id, + 'title': title, + 'url': url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': self._search_regex(r'biobiochile\.cl/author[^"]+"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + } From fa023ccb2c00f393c78ae4cbbabec7a8ec7b3ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 01:31:55 +0600 Subject: [PATCH 062/116] [biobiochiletv] Fix extraction, extract m3u8 formats and overall improve (Closes #7314) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/biobiochiletv.py | 86 +++++++++++++++++++++++++++ youtube_dl/extractor/biobiotv.py | 75 ----------------------- 3 files changed, 87 insertions(+), 75 deletions(-) create mode 100644 youtube_dl/extractor/biobiochiletv.py delete mode 100644 youtube_dl/extractor/biobiotv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5f5eca42b..b773edb3d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -72,6 +72,7 @@ from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE +from .biobiochiletv import BioBioChileTVIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py new file mode 100644 index 000000000..133228133 --- /dev/null +++ b/youtube_dl/extractor/biobiochiletv.py @@ -0,0 +1,86 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class BioBioChileTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P[^/]+)\.shtml' + + _TESTS = [{ + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', + 'md5': '26f51f03cf580265defefb4518faec09', + 'info_dict': { + 'id': 'sobre-camaras-y-camarillas-parlamentarias', + 'ext': 'mp4', + 'title': 'Sobre Cámaras y camarillas parlamentarias', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Fernando Atria', + }, + }, { + # different uploader layout + 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', + 'md5': 'edc2e6b58974c46d5b047dea3c539ff3', + 'info_dict': { + 'id': 'natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades', + 'ext': 'mp4', + 'title': 'Natalia Valdebenito repasa a diputado Hasbún: Pasó a la categoría de hablar brutalidades', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Piangella Obrador', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + + file_url = self._search_regex( + r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P.+?)\1', + webpage, 'file url', group='url') + + base_url = self._search_regex( + r'file\s*:\s*(["\'])(?P.+?)\1\s*\+\s*fileURL', webpage, + 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', + group='url') + + formats = self._extract_m3u8_formats( + '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + f = { + 'url': '%s%s' % (base_url, file_url), + 'format_id': 'http', + 'protocol': 'http', + 'preference': 1, + } + if formats: + f_copy = formats[-1].copy() + f_copy.update(f) + f = f_copy + formats.append(f) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + uploader = self._html_search_regex( + r']+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)', + webpage, 'uploader', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'formats': formats, + } diff --git a/youtube_dl/extractor/biobiotv.py b/youtube_dl/extractor/biobiotv.py deleted file mode 100644 index aae0588ef..000000000 --- a/youtube_dl/extractor/biobiotv.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class BioBioTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?P\d{4})/\d{2}/\d{2}/(?P[\w-]+)(?:\.shtml)?' - - _TESTS = [{ - 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', - 'md5': '26f51f03cf580265defefb4518faec09', - 'info_dict': { - 'id': 'col_c266', - 'display_id': 'sobre-camaras-y-camarillas-parlamentarias', - 'ext': 'mp4', - 'title': 'Sobre Cámaras y camarillas parlamentarias - BioBioChile TV', - 'thumbnail': 'http://media.biobiochile.cl/wp-content/uploads/2015/10/atria-2010-730x350.jpg', - 'url': 'http://unlimited2-cl.digitalproserver.com/bbtv/2015/col_c266.mp4', - 'uploader': 'Fernando Atria', - } - }, { - 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', - 'md5': 'a8c868e6b5f6c17d56873d5633204f84', - 'info_dict': { - 'id': 'col_c270', - 'display_id': 'ninos-transexuales-de-quien-es-la-decision', - 'ext': 'mp4', - 'title': 'Niños transexuales: ¿De quién es la decisión? - BioBioChile TV', - 'thumbnail': 'http://media.biobiochile.cl/wp-content/uploads/2015/10/samantha-2210-730x350.jpg', - 'url': 'http://unlimited2-cl.digitalproserver.com/bbtv/2015/col_c270.mp4', - 'uploader': 'Samantha Morán', - } - }, { - 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo.shtml', - 'md5': 'c8369b50d42ff0a4f6b969fbd1a7c32d', - 'info_dict': { - 'id': 'Keno_Pinto', - 'display_id': 'exclusivo-hector-pinto-formador-de-chupete-revela-version-del-ex-delantero-albo', - 'ext': 'mp4', - 'title': 'Exclusivo: Héctor Pinto, formador de “Chupete”, revela versión del ex delantero albo - BioBioChile TV', - 'thumbnail': 'http://media.biobiochile.cl/wp-content/uploads/2015/10/pinto-730x350.jpg', - 'url': 'http://unlimited2-cl.digitalproserver.com/bbtv/2015/Keno_Pinto.mp4', - 'uploader': 'Juan Pablo Echenique', - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - year = mobj.group('year') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - 'og:title', webpage, 'title', fatal=True) - - thumbnail = self._html_search_meta( - 'og:image', webpage, 'thumbnail', fatal=True) - - video_id = self._html_search_regex( - r'loadFWPlayerVideo\(\"player_0\", \"\d{4}/(.+)\.mp4\"\)', webpage, 'title') - - url = 'http://unlimited2-cl.digitalproserver.com/bbtv/' + year + '/' + video_id + '.mp4' - - return { - 'id': video_id, - 'title': title, - 'url': url, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'uploader': self._search_regex(r'biobiochile\.cl/author[^"]+"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - } From 94dcade8f892f27f1cdbee29be2e06c08442976e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 01:36:20 +0600 Subject: [PATCH 063/116] Credit @jjatria for biobiochiletv (#7314) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 6c9747913..e507686f2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -164,3 +164,4 @@ Aidan Rowe mutantmonkey Ben Congdon Kacper Michajłow +José Joaquín Atria From d95114dd8359d10c6a0ca5eaddbbd94806173957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 02:34:02 +0600 Subject: [PATCH 064/116] [91porn] Unquote final URL (Closes #8881) --- youtube_dl/extractor/porn91.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 3e15533e9..a6dd2694c 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,7 +1,10 @@ # encoding: utf-8 from __future__ import unicode_literals -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_urllib_parse_unquote, +) from .common import InfoExtractor from ..utils import ( parse_duration, @@ -55,7 +58,8 @@ class Porn91IE(InfoExtractor): info_cn = self._download_webpage( 'http://91porn.com/getfile.php?' + url_params, video_id, 'get real video url') - video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + video_url = compat_urllib_parse_unquote(self._search_regex( + r'file=([^&]+)&', info_cn, 'url')) duration = parse_duration(self._search_regex( r'时长:\s*\s*(\d+:\d+)', webpage, 'duration', fatal=False)) From 298c04b46497b924b1cbb2f031c5d73d09d2933d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 02:35:48 +0600 Subject: [PATCH 065/116] [91porn] Use common messages' wording --- youtube_dl/extractor/porn91.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index a6dd2694c..63ce87ee3 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -31,9 +31,10 @@ class Porn91IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') - webpage = self._download_webpage(url, video_id, 'get HTML content') + + webpage = self._download_webpage( + 'http://91porn.com/view_video.php?viewkey=%s' % video_id, video_id) if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) @@ -57,7 +58,7 @@ class Porn91IE(InfoExtractor): }) info_cn = self._download_webpage( 'http://91porn.com/getfile.php?' + url_params, video_id, - 'get real video url') + 'Downloading real video url') video_url = compat_urllib_parse_unquote(self._search_regex( r'file=([^&]+)&', info_cn, 'url')) From 5c7cd37ebd6dfb7d5809d2798d0188decce42914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 19 Mar 2016 21:50:16 +0100 Subject: [PATCH 066/116] tox.ini: Exclude test_iqiyi_sdk_interpreter.py --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 48504329f..2d7134005 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,6 @@ deps = passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py - --exclude test_youtube_lists.py + --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo From 3ff63fb3657fee9e0c2df9d5bb96ae5827f257cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 19 Mar 2016 21:51:13 +0100 Subject: [PATCH 067/116] Makefile: make it compatible with bmake It's the portable version of BSD make: http://crufty.net/help/sjg/bmake.html The syntax for conditionals is different in GNU make and BSD make, so we use the shell --- Makefile | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Makefile b/Makefile index e98806791..6689ec06f 100644 --- a/Makefile +++ b/Makefile @@ -12,15 +12,7 @@ SHAREDIR ?= $(PREFIX)/share PYTHON ?= /usr/bin/env python # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local -ifeq ($(PREFIX),/usr) - SYSCONFDIR=/etc -else - ifeq ($(PREFIX),/usr/local) - SYSCONFDIR=/etc - else - SYSCONFDIR=$(PREFIX)/etc - endif -endif +SYSCONFDIR != if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi install: youtube-dl youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish install -d $(DESTDIR)$(BINDIR) From 40025ee2a339d1a357869fd8d8718a737d250d9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 04:12:34 +0600 Subject: [PATCH 068/116] [postprocessort/ffmpeg] Allow embedding webvtt into webm (Closes #8874) --- youtube_dl/options.py | 2 +- youtube_dl/postprocessor/ffmpeg.py | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9dd7a8034..755ed6540 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -720,7 +720,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--embed-subs', action='store_true', dest='embedsubtitles', default=False, - help='Embed subtitles in the video (only for mkv and mp4 videos)') + help='Embed subtitles in the video (only for mp4, webm and mkv videos)') postproc.add_option( '--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index a8819f258..06b8c0548 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -331,17 +331,34 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): def run(self, information): - if information['ext'] not in ['mp4', 'mkv']: - self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') + if information['ext'] not in ('mp4', 'webm', 'mkv'): + self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4, webm or mkv files') return [], information subtitles = information.get('requested_subtitles') if not subtitles: self._downloader.to_screen('[ffmpeg] There aren\'t any subtitles to embed') return [], information - sub_langs = list(subtitles.keys()) filename = information['filepath'] - sub_filenames = [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in subtitles.items()] + + ext = information['ext'] + sub_langs = [] + sub_filenames = [] + webm_vtt_warn = False + + for lang, sub_info in subtitles.items(): + sub_ext = sub_info['ext'] + if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + sub_langs.append(lang) + sub_filenames.append(subtitles_filename(filename, lang, sub_ext)) + else: + if not webm_vtt_warn and ext == 'webm' and sub_ext != 'vtt': + webm_vtt_warn = True + self._downloader.to_screen('[ffmpeg] Only WebVTT subtitles can be embedded in webm files') + + if not sub_langs: + return [], information + input_files = [filename] + sub_filenames opts = [ From 96a9f22d983e414583f52eafece7902b1248377b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 10:26:58 +0600 Subject: [PATCH 069/116] [discovery] Relax _VALID_URL (Closes #8903) --- youtube_dl/extractor/discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index ce680a9f3..fdce1429a 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -9,7 +9,7 @@ from ..compat import compat_str class DiscoveryIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: discovery| investigationdiscovery| discoverylife| From db264e3cc3bbab191972bbe1c4efb526ff8bfc26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Mar 2016 12:44:04 +0600 Subject: [PATCH 070/116] [francetvinfo] Add support for france3-regions and strip title (Closes #7673) --- youtube_dl/extractor/francetv.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 3f4ac3093..7db5fb418 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -82,6 +82,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): subtitle = info.get('sous_titre') if subtitle: title += ' - %s' % subtitle + title = title.strip() subtitles = {} subtitles_list = [{ @@ -125,7 +126,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -160,6 +161,21 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'title': 'Les entreprises familiales : le secret de la réussite', 'thumbnail': 're:^https?://.*\.jpe?g$', } + }, { + 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': 'NI_657393', + 'ext': 'flv', + 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', + 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', + 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1458300695, + 'upload_date': '20160318', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -172,7 +188,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self.url_result(dmcloud_url, 'DailymotionCloud') video_id, catalogue = self._search_regex( - r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') + (r'id-video=([^@]+@[^"]+)', + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) From 3c20208eff29fcd3e5d4b13f3d4ffa1be7c56309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Mar 2016 13:00:46 +0600 Subject: [PATCH 071/116] [francetv] Improve formats extraction --- youtube_dl/extractor/francetv.py | 36 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7db5fb418..ad94e31f3 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,22 +60,24 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id)) + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, 'format_id': 'rtmp-%s' % format_id, 'ext': 'flv', - 'preference': 1, }) else: - formats.append({ - 'url': video_url, - 'format_id': format_id, - 'preference': -1, - }) + if self._is_valid_url(video_url, video_id, format_id): + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) self._sort_formats(formats) title = info['titre'] @@ -132,7 +134,7 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', 'info_dict': { 'id': '84981923', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Soir 3', 'upload_date': '20130826', 'timestamp': 1377548400, @@ -140,6 +142,10 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'fr': 'mincount:2', }, }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', 'info_dict': { @@ -156,17 +162,23 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', 'md5': 'f485bda6e185e7d15dbc69b72bae993e', 'info_dict': { - 'id': '556e03339473995ee145930c', + 'id': 'NI_173343', 'ext': 'mp4', 'title': 'Les entreprises familiales : le secret de la réussite', 'thumbnail': 're:^https?://.*\.jpe?g$', - } + 'timestamp': 1433273139, + 'upload_date': '20150602', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, }, { 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', 'md5': 'f485bda6e185e7d15dbc69b72bae993e', 'info_dict': { 'id': 'NI_657393', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', 'thumbnail': 're:^https?://.*\.jpe?g$', From 664bcd80b99ac84c3cc7a08e8284abc024a1e58c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 20 Mar 2016 15:45:31 +0800 Subject: [PATCH 072/116] [tudou] Use InAdvancePagedList (closes #8884) --- youtube_dl/extractor/tudou.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index f56b66d06..9892e8a62 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + InAdvancePagedList, float_or_none, unescapeHTML, ) @@ -75,15 +76,16 @@ class TudouIE(InfoExtractor): quality = sorted(filter(lambda k: k.isdigit(), segments.keys()), key=lambda k: int(k))[-1] parts = segments[quality] - result = [] len_parts = len(parts) if len_parts > 1: self.to_screen('%s: found %s parts' % (video_id, len_parts)) - for part in parts: + + def part_func(partnum): + part = parts[partnum] part_id = part['k'] final_url = self._url_for_id(part_id, quality) ext = (final_url.split('?')[0]).split('.')[-1] - part_info = { + return [{ 'id': '%s' % part_id, 'url': final_url, 'ext': ext, @@ -97,12 +99,13 @@ class TudouIE(InfoExtractor): 'http_headers': { 'Referer': self._PLAYER_URL, }, - } - result.append(part_info) + }] + + entries = InAdvancePagedList(part_func, len_parts, 1) return { '_type': 'multi_video', - 'entries': result, + 'entries': entries, 'id': video_id, 'title': title, } From 2bfeee69b976fe049761dd3012e30b637ee05a58 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 20 Mar 2016 15:54:58 +0800 Subject: [PATCH 073/116] [openload] Add new extractor (closes #8489) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/openload.py | 92 ++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) create mode 100644 youtube_dl/extractor/openload.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b773edb3d..ee792bbe0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -535,6 +535,7 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) +from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py new file mode 100644 index 000000000..71021d573 --- /dev/null +++ b/youtube_dl/extractor/openload.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_chr +from ..utils import encode_base_n + + +class OpenloadIE(InfoExtractor): + _VALID_URL = r'https://openload.co/f/(?P<id>[a-zA-Z0-9]+)' + + _TEST = { + 'url': 'https://openload.co/f/kUEfGclsU9o', + 'md5': 'bf1c059b004ebc7a256f89408e65c36e', + 'info_dict': { + 'id': 'kUEfGclsU9o', + 'ext': 'mp4', + 'title': 'skyrim_no-audio_1080.mp4', + }, + } + + @staticmethod + def openload_level2_debase(m): + radix, num = int(m.group(1)) + 27, int(m.group(2)) + return '"' + encode_base_n(num, radix) + '"' + + @classmethod + def openload_level2(cls, txt): + # The function name is ǃ \u01c3 + # Using escaped unicode literals does not work in Python 3.2 + return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') + + # Openload uses a variant of aadecode + # openload_decode and related functions are originally written by + # vitas@matfyz.cz and released with public domain + # See https://github.com/rg3/youtube-dl/issues/8489 + @classmethod + def openload_decode(cls, txt): + symbol_table = [ + ('_', '(゚Д゚) [゚Θ゚]'), + ('a', '(゚Д゚) [゚ω゚ノ]'), + ('b', '(゚Д゚) [゚Θ゚ノ]'), + ('c', '(゚Д゚) [\'c\']'), + ('d', '(゚Д゚) [゚ー゚ノ]'), + ('e', '(゚Д゚) [゚Д゚ノ]'), + ('f', '(゚Д゚) [1]'), + + ('o', '(゚Д゚) [\'o\']'), + ('u', '(o゚ー゚o)'), + ('c', '(゚Д゚) [\'c\']'), + + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('4', '(-~3)'), + ('3', '(-~-~1)'), + ('2', '(-~1)'), + ('1', '(-~0)'), + ('0', '((c^_^o)-(c^_^o))'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aachar in txt.split(delim): + for val, pat in symbol_table: + aachar = aachar.replace(pat, val) + aachar = aachar.replace('+ ', '') + m = re.match(r'^\d+', aachar) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aachar) + if m: + ret += compat_chr(int(m.group(1), 16)) + return cls.openload_level2(ret) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + code = self._search_regex( + r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', + webpage, 'JS code') + + video_url = self._search_regex( + r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'url': video_url, + } From 9e3c2f1d741acc4dd576f77c185e99cfd6bb2ea4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 20 Mar 2016 16:49:44 +0800 Subject: [PATCH 074/116] [openload] Misc improvements * Add thumbnail * Detect errors (#6469) * Match more (#6469, #8489) --- youtube_dl/extractor/openload.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 71021d573..4468f31fc 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -5,21 +5,31 @@ import re from .common import InfoExtractor from ..compat import compat_chr -from ..utils import encode_base_n +from ..utils import ( + encode_base_n, + ExtractorError, +) class OpenloadIE(InfoExtractor): - _VALID_URL = r'https://openload.co/f/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-]+)' - _TEST = { + _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', 'md5': 'bf1c059b004ebc7a256f89408e65c36e', 'info_dict': { 'id': 'kUEfGclsU9o', 'ext': 'mp4', 'title': 'skyrim_no-audio_1080.mp4', + 'thumbnail': 're:^https?://.*\.jpg$', }, - } + }, { + 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4', + 'only_matching': True, + }, { + 'url': 'https://openload.io/f/ZAn6oz-VZGE/', + 'only_matching': True, + }] @staticmethod def openload_level2_debase(m): @@ -78,6 +88,10 @@ class OpenloadIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + + if 'File not found' in webpage: + raise ExtractorError('File not found', expected=True) + code = self._search_regex( r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', webpage, 'JS code') @@ -88,5 +102,6 @@ class OpenloadIE(InfoExtractor): return { 'id': video_id, 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'url': video_url, } From 920d318d3cf70ee0b80cc67e7d3f85b5d45a20a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 20 Mar 2016 10:55:14 +0100 Subject: [PATCH 075/116] README: document that BSD make is also supported (#8902) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 68db546ef..fcc12d2b3 100644 --- a/README.md +++ b/README.md @@ -831,7 +831,7 @@ To run the test, simply invoke your favorite test runner, or execute a test file If you want to create a build of youtube-dl yourself, you'll need * python -* make +* make (both GNU make and BSD make are supported) * pandoc * zip * nosetests From 2648918c814773e746c6d26da834d32eac952ffa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 20 Mar 2016 18:14:02 +0800 Subject: [PATCH 076/116] [vlive] Fix creator extraction (closes #8814) --- youtube_dl/extractor/vlive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 9e2aa58bd..bd5545173 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -64,7 +64,7 @@ class VLiveIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) creator = self._html_search_regex( - r'<div[^>]+class="info_area"[^>]*>\s*<strong[^>]+class="name"[^>]*>([^<]+)</strong>', + r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)', webpage, 'creator', fatal=False) view_count = int_or_none(playinfo.get('meta', {}).get('count')) From 7caae128a72596e22cdfb538272c38a37d095db2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 20 Mar 2016 19:11:02 +0800 Subject: [PATCH 077/116] Credit @vitstradal for the key algorithm in OpenloadIE (#8489) [ci skip] --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index e507686f2..51dfc8ddd 100644 --- a/AUTHORS +++ b/AUTHORS @@ -165,3 +165,4 @@ mutantmonkey Ben Congdon Kacper Michajłow José Joaquín Atria +Viťas Strádal From daef04a4e75ccd2ff5e2d2495baa0ac9bcf75724 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 20 Mar 2016 20:17:56 +0800 Subject: [PATCH 078/116] [kwuo] Fix KuwoChartIE and KuwoSingerIE and accept new URL forms --- youtube_dl/extractor/kuwo.py | 52 ++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index f94804d06..45d65e61f 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals import re -import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, clean_html, ExtractorError, + InAdvancePagedList, remove_start, ) @@ -55,7 +55,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -80,6 +80,9 @@ class KuwoIE(KuwoBaseIE): 'params': { 'format': 'mp3-320' }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', + 'only_matching': True, }] def _real_extract(self, url): @@ -172,8 +175,6 @@ class KuwoChartIE(InfoExtractor): 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { 'id': '香港中文龙虎榜', - 'title': '香港中文龙虎榜', - 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -184,17 +185,11 @@ class KuwoChartIE(InfoExtractor): url, chart_id, note='Download chart info', errnote='Unable to get chart info') - chart_name = self._html_search_regex( - r'<h1[^>]+class="unDis">([^<]+)</h1>', webpage, 'chart name') - - chart_desc = self._html_search_regex( - r'<p[^>]+class="tabDef">(\d{4}第\d{2}期)</p>', webpage, 'chart desc') - entries = [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) + r'<a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) ] - return self.playlist_result(entries, chart_id, chart_name, chart_desc) + return self.playlist_result(entries, chart_id) class KuwoSingerIE(InfoExtractor): @@ -207,7 +202,7 @@ class KuwoSingerIE(InfoExtractor): 'id': 'bruno+mars', 'title': 'Bruno Mars', }, - 'playlist_count': 10, + 'playlist_mincount': 329, }, { 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', 'info_dict': { @@ -218,6 +213,8 @@ class KuwoSingerIE(InfoExtractor): 'skip': 'Regularly stalls travis build', # See https://travis-ci.org/rg3/youtube-dl/jobs/78878540 }] + PAGE_SIZE = 15 + def _real_extract(self, url): singer_id = self._match_id(url) webpage = self._download_webpage( @@ -225,25 +222,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'<div class="title clearfix">\s*<h1>([^<]+)<span', webpage, 'singer name' - ) + r'<h1>([^<]+)</h1>', webpage, 'singer name') - entries = [] - first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True - for page_num in itertools.count(1): + artist_id = self._html_search_regex( + r'data-artistid="(\d+)"', webpage, 'artist id') + + page_count = int(self._html_search_regex( + r'data-page="(\d+)"', webpage, 'page count')) + + def page_func(page_num): webpage = self._download_webpage( - 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), - singer_id, note='Download song list page #%d' % page_num, - errnote='Unable to get song list page #%d' % page_num) + 'http://www.kuwo.cn/artist/contentMusicsAjax', + singer_id, note='Download song list page #%d' % (page_num + 1), + errnote='Unable to get song list page #%d' % (page_num + 1), + query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) - entries.extend([ + return [ self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<p[^>]+class="m_name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', + r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', webpage) - ][:10 if first_page_only else None]) + ] - if first_page_only or not re.search(r'<a[^>]+href="[^"]+">下一页</a>', webpage): - break + entries = InAdvancePagedList(page_func, page_count, self.PAGE_SIZE) return self.playlist_result(entries, singer_id, singer_name) From 3e8bb9a972a377442f5f433123ea70b332248f70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Mar 2016 20:39:00 +0600 Subject: [PATCH 079/116] [animeondemand] Detect geo restriction --- youtube_dl/extractor/animeondemand.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 0158407f6..85306a69c 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -44,6 +44,10 @@ class AnimeOnDemandIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') + if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: + self.raise_geo_restricted( + '%s is only available in German-speaking countries of Europe' % self.IE_NAME) + login_form = self._form_hidden_inputs('new_user', login_page) login_form.update({ From 3c5d183c19f29c1f52fe913ce7e7d47f6eebff2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Mar 2016 21:51:22 +0600 Subject: [PATCH 080/116] [animeondemand] Extract all formats (Closes #8906) --- youtube_dl/extractor/animeondemand.py | 85 ++++++++++++++++++++------- 1 file changed, 65 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 85306a69c..4352525e2 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,10 +3,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( determine_ext, encode_dict, + extract_attributes, ExtractorError, sanitized_Request, urlencode_postdata, @@ -34,6 +38,10 @@ class AnimeOnDemandIE(InfoExtractor): # Episodes without titles 'url': 'https://www.anime-on-demand.de/anime/162', 'only_matching': True, + }, { + # ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/169', + 'only_matching': True, }] def _login(self): @@ -130,33 +138,70 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] - playlist_url = self._search_regex( - r'data-playlist=(["\'])(?P<url>.+?)\1', - episode_html, 'data playlist', default=None, group='url') - if playlist_url: - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) + for input_ in re.findall( + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + attributes = extract_attributes(input_) + playlist_urls = [] + for playlist_key in ('data-playlist', 'data-otherplaylist'): + playlist_url = attributes.get(playlist_key) + if isinstance(playlist_url, compat_str) and re.match( + r'/?[\da-zA-Z]+', playlist_url): + playlist_urls.append(attributes[playlist_key]) + if not playlist_urls: + continue - playlist = self._download_json( - request, video_id, 'Downloading playlist JSON', fatal=False) - if playlist: - playlist = playlist['playlist'][0] - title = playlist['title'] + lang = attributes.get('data-lang') + lang_note = attributes.get('value') + + for playlist_url in playlist_urls: + kind = self._search_regex( + r'videomaterialurl/\d+/([^/]+)/', + playlist_url, 'media kind', default=None) + format_id_list = [] + if lang: + format_id_list.append(lang) + if kind: + format_id_list.append(kind) + if not format_id_list: + format_id_list.append('hls') + format_id = '-'.join(format_id_list) + format_note = ', '.join(filter(None, (kind, lang_note))) + request = sanitized_Request( + compat_urlparse.urljoin(url, playlist_url), + headers={ + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRF-Token': csrf_token, + 'Referer': url, + 'Accept': 'application/json, text/javascript, */*; q=0.01', + }) + playlist = self._download_json( + request, video_id, 'Downloading %s playlist JSON' % format_id, + fatal=False) + if not playlist: + continue + playlist = playlist.get('playlist') + if not playlist or not isinstance(playlist, list): + continue + playlist = playlist[0] + title = playlist.get('title') + if not title: + continue description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') if file_ and determine_ext(file_) == 'm3u8': - formats = self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') + entry_protocol='m3u8_native', m3u8_id=format_id) + for f in m3u8_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(m3u8_formats) if formats: + self._sort_formats(formats) f = common_info.copy() f.update({ 'title': title, From 9016d76f71b30bd61d69f80dc88fa53f978cf99c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 20 Mar 2016 22:01:45 +0600 Subject: [PATCH 081/116] [YoutubeDL] Improve _format_note --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 93b6ca54d..29d7a3106 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1836,7 +1836,7 @@ class YoutubeDL(object): if fdict.get('language'): if res: res += ' ' - res += '[%s]' % fdict['language'] + res += '[%s] ' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: From 12af4beb3e28f986170ed00488b48e2e8bcd4e13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 21:17:29 +0600 Subject: [PATCH 082/116] [mailru] Add support for https (Closes #8920) --- youtube_dl/extractor/mailru.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 71085f279..46eb00492 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -13,7 +13,7 @@ from ..utils import ( class MailRuIE(InfoExtractor): IE_NAME = 'mailru' IE_DESC = 'Видео@Mail.Ru' - _VALID_URL = r'http://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' + _VALID_URL = r'https?://(?:www\.)?my\.mail\.ru/(?:video/.*#video=/?(?P<idv1>(?:[^/]+/){3}\d+)|(?:(?P<idv2prefix>(?:[^/]+/){2})video/(?P<idv2suffix>[^/]+/\d+))\.html)' _TESTS = [ { From 0cef27ad255b5cb994b1fa0e80a04bd09514925a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 21:22:37 +0600 Subject: [PATCH 083/116] Add missing r prefix for _VALID_URLs --- youtube_dl/extractor/bbc.py | 2 +- youtube_dl/extractor/nova.py | 2 +- youtube_dl/extractor/tv2.py | 4 ++-- youtube_dl/extractor/vgtv.py | 4 ++-- youtube_dl/extractor/wdr.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index e62b3860e..c3176700a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -942,7 +942,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = 'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 3f9c776ef..72f72b803 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -12,7 +12,7 @@ from ..utils import ( class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' + _VALID_URL = r'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 1457e524e..535d0d361 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -14,7 +14,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -100,7 +100,7 @@ class TV2IE(InfoExtractor): class TV2ArticleIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index e148b1ef5..77d8978d4 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -214,7 +214,7 @@ class VGTVIE(XstreamIE): class BTArticleIE(InfoExtractor): IE_NAME = 'bt:article' IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' + _VALID_URL = r'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', 'md5': '2acbe8ad129b3469d5ae51b1158878df', @@ -241,7 +241,7 @@ class BTArticleIE(InfoExtractor): class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' + _VALID_URL = r'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index a851578e0..65cab4069 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -244,7 +244,7 @@ class WDRMobileIE(InfoExtractor): class WDRMausIE(InfoExtractor): - _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' + _VALID_URL = r'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' IE_DESC = 'Sendung mit der Maus' _TESTS = [{ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', From 5886b38d73c54239c85c3e0d8e7c1585d1bbb7da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 21:36:32 +0600 Subject: [PATCH 084/116] Add support for https for all extractors as preventive and future-proof measure --- youtube_dl/extractor/abc.py | 2 +- youtube_dl/extractor/addanime.py | 2 +- youtube_dl/extractor/aftonbladet.py | 2 +- youtube_dl/extractor/aljazeera.py | 2 +- youtube_dl/extractor/aol.py | 4 ++-- youtube_dl/extractor/arte.py | 2 +- youtube_dl/extractor/azubu.py | 2 +- youtube_dl/extractor/baidu.py | 2 +- youtube_dl/extractor/bbc.py | 2 +- youtube_dl/extractor/behindkink.py | 2 +- youtube_dl/extractor/bilibili.py | 2 +- youtube_dl/extractor/bokecc.py | 2 +- youtube_dl/extractor/bpb.py | 2 +- youtube_dl/extractor/breakcom.py | 2 +- youtube_dl/extractor/camdemy.py | 4 ++-- youtube_dl/extractor/cbsnews.py | 4 ++-- youtube_dl/extractor/cbssports.py | 2 +- youtube_dl/extractor/cliphunter.py | 2 +- youtube_dl/extractor/clipsyndicate.py | 2 +- youtube_dl/extractor/clubic.py | 2 +- youtube_dl/extractor/comcarcoff.py | 2 +- youtube_dl/extractor/condenast.py | 2 +- youtube_dl/extractor/cspan.py | 2 +- youtube_dl/extractor/ctsnews.py | 2 +- youtube_dl/extractor/dctp.py | 2 +- youtube_dl/extractor/defense.py | 2 +- youtube_dl/extractor/douyutv.py | 2 +- youtube_dl/extractor/dplay.py | 2 +- youtube_dl/extractor/dreisat.py | 2 +- youtube_dl/extractor/dvtv.py | 2 +- youtube_dl/extractor/echomsk.py | 2 +- youtube_dl/extractor/exfm.py | 2 +- youtube_dl/extractor/fc2.py | 2 +- youtube_dl/extractor/firstpost.py | 2 +- youtube_dl/extractor/firsttv.py | 2 +- youtube_dl/extractor/fktv.py | 2 +- youtube_dl/extractor/footyroom.py | 2 +- youtube_dl/extractor/foxgay.py | 2 +- youtube_dl/extractor/franceinter.py | 2 +- youtube_dl/extractor/freevideo.py | 2 +- youtube_dl/extractor/gamekings.py | 2 +- youtube_dl/extractor/gamespot.py | 2 +- youtube_dl/extractor/gamestar.py | 2 +- youtube_dl/extractor/gametrailers.py | 2 +- youtube_dl/extractor/hotnewhiphop.py | 2 +- youtube_dl/extractor/hypem.py | 2 +- youtube_dl/extractor/imdb.py | 4 ++-- youtube_dl/extractor/iqiyi.py | 2 +- youtube_dl/extractor/jadorecettepub.py | 2 +- youtube_dl/extractor/jeuxvideo.py | 2 +- youtube_dl/extractor/karaoketv.py | 2 +- youtube_dl/extractor/karrierevideos.py | 2 +- youtube_dl/extractor/kontrtube.py | 2 +- youtube_dl/extractor/ku6.py | 2 +- youtube_dl/extractor/kusi.py | 2 +- youtube_dl/extractor/kuwo.py | 12 ++++++------ youtube_dl/extractor/leeco.py | 4 ++-- youtube_dl/extractor/lifenews.py | 4 ++-- youtube_dl/extractor/limelight.py | 6 +++--- youtube_dl/extractor/m6.py | 2 +- youtube_dl/extractor/metacafe.py | 2 +- youtube_dl/extractor/mit.py | 2 +- youtube_dl/extractor/mitele.py | 2 +- youtube_dl/extractor/mooshare.py | 2 +- youtube_dl/extractor/motherless.py | 2 +- youtube_dl/extractor/motorsport.py | 2 +- youtube_dl/extractor/myspass.py | 2 +- youtube_dl/extractor/myvideo.py | 2 +- youtube_dl/extractor/myvidster.py | 2 +- youtube_dl/extractor/nationalgeographic.py | 2 +- youtube_dl/extractor/nbc.py | 4 ++-- youtube_dl/extractor/nextmedia.py | 6 +++--- youtube_dl/extractor/noco.py | 2 +- youtube_dl/extractor/normalboots.py | 2 +- youtube_dl/extractor/nova.py | 2 +- youtube_dl/extractor/npr.py | 2 +- youtube_dl/extractor/ntvru.py | 2 +- youtube_dl/extractor/orf.py | 6 +++--- youtube_dl/extractor/philharmoniedeparis.py | 2 +- youtube_dl/extractor/photobucket.py | 2 +- youtube_dl/extractor/pornhd.py | 2 +- youtube_dl/extractor/pornovoisines.py | 2 +- youtube_dl/extractor/pyvideo.py | 2 +- youtube_dl/extractor/qqmusic.py | 10 +++++----- youtube_dl/extractor/rai.py | 4 ++-- youtube_dl/extractor/redtube.py | 2 +- youtube_dl/extractor/ringtv.py | 2 +- youtube_dl/extractor/rtve.py | 4 ++-- youtube_dl/extractor/ruhd.py | 2 +- youtube_dl/extractor/rutube.py | 6 +++--- youtube_dl/extractor/screenjunkies.py | 2 +- youtube_dl/extractor/senateisvp.py | 2 +- youtube_dl/extractor/shared.py | 2 +- youtube_dl/extractor/sport5.py | 2 +- youtube_dl/extractor/ssa.py | 2 +- youtube_dl/extractor/sztvhu.py | 2 +- youtube_dl/extractor/teamcoco.py | 2 +- youtube_dl/extractor/tele13.py | 2 +- youtube_dl/extractor/tf1.py | 2 +- youtube_dl/extractor/thvideo.py | 2 +- youtube_dl/extractor/tinypic.py | 2 +- youtube_dl/extractor/tlc.py | 2 +- youtube_dl/extractor/toypics.py | 2 +- youtube_dl/extractor/traileraddict.py | 2 +- youtube_dl/extractor/trollvids.py | 2 +- youtube_dl/extractor/tumblr.py | 2 +- youtube_dl/extractor/tv2.py | 4 ++-- youtube_dl/extractor/tvc.py | 4 ++-- youtube_dl/extractor/tvplay.py | 2 +- youtube_dl/extractor/ubu.py | 2 +- youtube_dl/extractor/unistra.py | 2 +- youtube_dl/extractor/vbox7.py | 2 +- youtube_dl/extractor/veoh.py | 2 +- youtube_dl/extractor/vesti.py | 2 +- youtube_dl/extractor/vgtv.py | 4 ++-- youtube_dl/extractor/videott.py | 2 +- youtube_dl/extractor/viidea.py | 2 +- youtube_dl/extractor/vube.py | 2 +- youtube_dl/extractor/vuclip.py | 2 +- youtube_dl/extractor/walla.py | 2 +- youtube_dl/extractor/wat.py | 2 +- youtube_dl/extractor/wdr.py | 2 +- youtube_dl/extractor/weiqitv.py | 2 +- youtube_dl/extractor/wimp.py | 2 +- youtube_dl/extractor/xbef.py | 2 +- youtube_dl/extractor/yam.py | 2 +- youtube_dl/extractor/ynet.py | 2 +- 127 files changed, 156 insertions(+), 156 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 6a29e587f..b584277be 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)' + _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index e3e6d2113..fb1cc02e1 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -16,7 +16,7 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): - _VALID_URL = r'http://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)' + _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)' _TESTS = [{ 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', 'md5': '72954ea10bc979ab5e2eb288b21425a0', diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index e0518cf26..d548592fe 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)' + _VALID_URL = r'https?://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)' _TEST = { 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index cddcaa489..b081695d8 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'http://www\.aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://www\.aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b761b2cc4..95a99c6b0 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|http://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[0-9]+)(?:$|\?)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -25,7 +25,7 @@ class AolIE(InfoExtractor): class AolFeaturesIE(InfoExtractor): IE_NAME = 'features.aol.com' - _VALID_URL = r'http://features\.aol\.com/video/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://features\.aol\.com/video/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://features.aol.com/video/behind-secret-second-careers-late-night-talk-show-hosts', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 3e119e21b..ae0f27dcb 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -23,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' + _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index 011edf128..1805b7312 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -98,7 +98,7 @@ class AzubuIE(InfoExtractor): class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'http://www.azubu.tv/(?P<id>[^/]+)$' + _VALID_URL = r'https?://www.azubu.tv/(?P<id>[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index 76b21e596..234a661d3 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -9,7 +9,7 @@ from ..utils import unescapeHTML class BaiduVideoIE(InfoExtractor): IE_DESC = '百度视频' - _VALID_URL = r'http://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' + _VALID_URL = r'https?://v\.baidu\.com/(?P<type>[a-z]+)/(?P<id>\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c3176700a..2dfcee98d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -942,7 +942,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = r'http://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' + _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P<id>[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/behindkink.py b/youtube_dl/extractor/behindkink.py index 1bdc25812..9bca853b3 100644 --- a/youtube_dl/extractor/behindkink.py +++ b/youtube_dl/extractor/behindkink.py @@ -8,7 +8,7 @@ from ..utils import url_basename class BehindKinkIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' + _VALID_URL = r'https?://(?:www\.)?behindkink\.com/(?P<year>[0-9]{4})/(?P<month>[0-9]{2})/(?P<day>[0-9]{2})/(?P<id>[^/#?_]+)' _TEST = { 'url': 'http://www.behindkink.com/2014/12/05/what-are-you-passionate-about-marley-blaze/', 'md5': '507b57d8fdcd75a41a9a7bdb7989c762', diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 59beb11bc..8baff2041 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -14,7 +14,7 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', diff --git a/youtube_dl/extractor/bokecc.py b/youtube_dl/extractor/bokecc.py index 122a1cbb6..86a7f4d7d 100644 --- a/youtube_dl/extractor/bokecc.py +++ b/youtube_dl/extractor/bokecc.py @@ -33,7 +33,7 @@ class BokeCCBaseIE(InfoExtractor): class BokeCCIE(BokeCCBaseIE): _IE_DESC = 'CC视频' - _VALID_URL = r'http://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' + _VALID_URL = r'https?://union\.bokecc\.com/playvideo\.bo\?(?P<query>.*)' _TESTS = [{ 'url': 'http://union.bokecc.com/playvideo.bo?vid=E44D40C15E65EA30&uid=CD0C5D3C8614B28B', diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index c28e72927..6ad45a1e6 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -12,7 +12,7 @@ from ..utils import ( class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'http://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P<id>[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index aa08051b1..725859b4d 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -11,7 +11,7 @@ from ..utils import ( class BreakIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?break\.com/video/(?:[^/]+/)*.+-(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.break.com/video/when-girls-act-like-guys-2468056', 'info_dict': { diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 897f3a104..dd4d96cec 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -16,7 +16,7 @@ from ..utils import ( class CamdemyIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/media/(?P<id>\d+)' _TESTS = [{ # single file 'url': 'http://www.camdemy.com/media/5181/', @@ -104,7 +104,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'http://www.camdemy.com/folder/(?P<id>\d+)' + _VALID_URL = r'https?://www.camdemy.com/folder/(?P<id>\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 8ddcc5097..f23bac9a1 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -11,7 +11,7 @@ from ..utils import ( class CBSNewsIE(ThePlatformIE): IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' _TESTS = [ { @@ -96,7 +96,7 @@ class CBSNewsIE(ThePlatformIE): class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' + _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index ae47e74cc..549ae32f3 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class CBSSportsIE(InfoExtractor): - _VALID_URL = r'http://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P<section>[^/]+)/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 2996b6b09..19f8b397e 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -19,7 +19,7 @@ def _decode(s): class CliphunterIE(InfoExtractor): IE_NAME = 'cliphunter' - _VALID_URL = r'''(?x)http://(?:www\.)?cliphunter\.com/w/ + _VALID_URL = r'''(?x)https?://(?:www\.)?cliphunter\.com/w/ (?P<id>[0-9]+)/ (?P<seo>.+?)(?:$|[#\?]) ''' diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 8306d6fb7..0b6ad895f 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -8,7 +8,7 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' + _VALID_URL = r'https?://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py index 1dfa7c12e..2fba93543 100644 --- a/youtube_dl/extractor/clubic.py +++ b/youtube_dl/extractor/clubic.py @@ -12,7 +12,7 @@ from ..utils import ( class ClubicIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?clubic\.com/video/(?:[^/]+/)*video.*-(?P<id>[0-9]+)\.html' _TESTS = [{ 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 7dff68492..e697d1410 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -11,7 +11,7 @@ from ..utils import ( class ComCarCoffIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)' + _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)' _TESTS = [{ 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/', 'info_dict': { diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 6f92ae2ed..054978ff2 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -45,7 +45,7 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'http://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys()) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index b8b9d058d..84b36f44c 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -15,7 +15,7 @@ from .senateisvp import SenateISVPIE class CSpanIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' + _VALID_URL = r'https?://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 45049bf37..1622fc844 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -8,7 +8,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' # https connection failed (Connection reset) - _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' + _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', 'md5': 'a9875cb790252b08431186d741beaabe', diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index aa2c09eb6..9099f5046 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -6,7 +6,7 @@ from ..compat import compat_str class DctpTvIE(InfoExtractor): - _VALID_URL = r'http://www.dctp.tv/(#/)?filme/(?P<id>.+?)/$' + _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P<id>.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { diff --git a/youtube_dl/extractor/defense.py b/youtube_dl/extractor/defense.py index 98e3aedfd..9fe144e14 100644 --- a/youtube_dl/extractor/defense.py +++ b/youtube_dl/extractor/defense.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class DefenseGouvFrIE(InfoExtractor): IE_NAME = 'defense.gouv.fr' - _VALID_URL = r'http://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)' + _VALID_URL = r'https?://.*?\.defense\.gouv\.fr/layout/set/ligthboxvideo/base-de-medias/webtv/(?P<id>[^/?#]*)' _TEST = { 'url': 'http://www.defense.gouv.fr/layout/set/ligthboxvideo/base-de-medias/webtv/attaque-chimique-syrienne-du-21-aout-2013-1', diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index bdc768c78..bcb670945 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -10,7 +10,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a638c827c..1e7dcada6 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -10,7 +10,7 @@ from ..utils import int_or_none class DPlayIE(InfoExtractor): - _VALID_URL = r'http://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?P<domain>it\.dplay\.com|www\.dplay\.(?:dk|se|no))/[^/]+/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://it.dplay.com/take-me-out/stagione-1-episodio-25/', diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 028144f20..0040e70d4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -7,7 +7,7 @@ from .zdf import ZDFIE class DreiSatIE(ZDFIE): IE_NAME = '3sat' - _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' + _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TESTS = [ { 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index c1a4bc757..974c69dbc 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -15,7 +15,7 @@ class DVTVIE(InfoExtractor): IE_NAME = 'dvtv' IE_DESC = 'http://video.aktualne.cz/' - _VALID_URL = r'http://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})' + _VALID_URL = r'https?://video\.aktualne\.cz/(?:[^/]+/)+r~(?P<id>[0-9a-f]{32})' _TESTS = [{ 'url': 'http://video.aktualne.cz/dvtv/vondra-o-ceskem-stoleti-pri-pohledu-na-havla-mi-bylo-trapne/r~e5efe9ca855511e4833a0025900fea04/', diff --git a/youtube_dl/extractor/echomsk.py b/youtube_dl/extractor/echomsk.py index d2d94049d..6b7cc652f 100644 --- a/youtube_dl/extractor/echomsk.py +++ b/youtube_dl/extractor/echomsk.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class EchoMskIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?echo\.msk\.ru/sounds/(?P<id>\d+)' _TEST = { 'url': 'http://www.echo.msk.ru/sounds/1464134.html', 'md5': '2e44b3b78daff5b458e4dbc37f191f7c', diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py index 0c0fe6d65..09ed4f2b5 100644 --- a/youtube_dl/extractor/exfm.py +++ b/youtube_dl/extractor/exfm.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class ExfmIE(InfoExtractor): IE_NAME = 'exfm' IE_DESC = 'ex.fm' - _VALID_URL = r'http://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P<id>[^/]+)' _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' _TESTS = [ { diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 9580f5c0c..508684d2e 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -17,7 +17,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^http://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)' + _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P<id>[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 298227d57..e8936cb24 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FirstpostIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.firstpost.com/india/india-to-launch-indigenous-aircraft-carrier-monday-1025403.html', diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 510d4b108..98b165143 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'http://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>.+)' _TESTS = [{ 'url': 'http://www.1tv.ru/videoarchive/73390', diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py index 5f6e65dae..a3a291599 100644 --- a/youtube_dl/extractor/fktv.py +++ b/youtube_dl/extractor/fktv.py @@ -10,7 +10,7 @@ from ..utils import ( class FKTVIE(InfoExtractor): IE_NAME = 'fernsehkritik.tv' - _VALID_URL = r'http://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?' + _VALID_URL = r'https?://(?:www\.)?fernsehkritik\.tv/folge-(?P<id>[0-9]+)(?:/.*)?' _TEST = { 'url': 'http://fernsehkritik.tv/folge-1', diff --git a/youtube_dl/extractor/footyroom.py b/youtube_dl/extractor/footyroom.py index 370fd006f..d2503ae2e 100644 --- a/youtube_dl/extractor/footyroom.py +++ b/youtube_dl/extractor/footyroom.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class FootyRoomIE(InfoExtractor): - _VALID_URL = r'http://footyroom\.com/(?P<id>[^/]+)' + _VALID_URL = r'https?://footyroom\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://footyroom.com/schalke-04-0-2-real-madrid-2015-02/', 'info_dict': { diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 08b8ea362..70c1a815d 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class FoxgayIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml' + _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P<id>\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', 'md5': '80d72beab5d04e1655a56ad37afe6841', diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 0388ba00c..2369f868d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class FranceInterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', 'md5': '4764932e466e6f6c79c317d2e74f6884', diff --git a/youtube_dl/extractor/freevideo.py b/youtube_dl/extractor/freevideo.py index c7bec027b..cd8423a6f 100644 --- a/youtube_dl/extractor/freevideo.py +++ b/youtube_dl/extractor/freevideo.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class FreeVideoIE(InfoExtractor): - _VALID_URL = r'^http://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' + _VALID_URL = r'^https?://www.freevideo.cz/vase-videa/(?P<id>[^.]+)\.html(?:$|[?#])' _TEST = { 'url': 'http://www.freevideo.cz/vase-videa/vysukany-zadecek-22033.html', diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py index f6b9046f9..cbcddcb7c 100644 --- a/youtube_dl/extractor/gamekings.py +++ b/youtube_dl/extractor/gamekings.py @@ -10,7 +10,7 @@ from .youtube import YoutubeIE class GamekingsIE(InfoExtractor): - _VALID_URL = r'http://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' _TESTS = [{ # YouTube embed video 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index b3f1bafcc..4ffdd7515 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 590ccf526..69058a583 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -13,7 +13,7 @@ from ..utils import ( class GameStarIE(InfoExtractor): - _VALID_URL = r'http://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index c3f031d9c..1e7948ab8 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -9,7 +9,7 @@ from ..utils import ( class GametrailersIE(InfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' _TEST = { 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 31e219945..efc3e8429 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -12,7 +12,7 @@ from ..utils import ( class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'http://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' + _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index b3706fe6d..e0ab31802 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -12,7 +12,7 @@ from ..utils import ( class HypemIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?hypem\.com/track/(?P<id>[^/]+)/' _TEST = { 'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index b61b2dc4e..8bed8ccd0 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -12,7 +12,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' _TEST = { 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -70,7 +70,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index e7c0cb3f6..1a4c64713 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -165,7 +165,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' + _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html' _NETRC_MACHINE = 'iqiyi' diff --git a/youtube_dl/extractor/jadorecettepub.py b/youtube_dl/extractor/jadorecettepub.py index 063e86de4..158c09a33 100644 --- a/youtube_dl/extractor/jadorecettepub.py +++ b/youtube_dl/extractor/jadorecettepub.py @@ -9,7 +9,7 @@ from .youtube import YoutubeIE class JadoreCettePubIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' + _VALID_URL = r'https?://(?:www\.)?jadorecettepub\.com/[0-9]{4}/[0-9]{2}/(?P<id>.*?)\.html' _TEST = { 'url': 'http://www.jadorecettepub.com/2010/12/star-wars-massacre-par-les-japonais.html', diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 137db873c..1a4227f6b 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): - _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' + _VALID_URL = r'https?://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' _TESTS = [{ 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index 06daf5a89..b4c30b7f3 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -9,7 +9,7 @@ from ..utils import ( class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://karaoketv\.co\.il/\?container=songs&id=(?P<id>[0-9]+)' _TEST = { 'url': 'http://karaoketv.co.il/?container=songs&id=171568', 'info_dict': { diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index bed94bc93..2cb04e533 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -12,7 +12,7 @@ from ..utils import ( class KarriereVideosIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', 'info_dict': { diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index a59c529f4..704bd7b34 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -13,7 +13,7 @@ from ..utils import ( class KontrTubeIE(InfoExtractor): IE_NAME = 'kontrtube' IE_DESC = 'KontrTube.ru - Труба зовёт' - _VALID_URL = r'http://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/' _TEST = { 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/', diff --git a/youtube_dl/extractor/ku6.py b/youtube_dl/extractor/ku6.py index a602980a1..a574408e5 100644 --- a/youtube_dl/extractor/ku6.py +++ b/youtube_dl/extractor/ku6.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class Ku6IE(InfoExtractor): - _VALID_URL = r'http://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' + _VALID_URL = r'https?://v\.ku6\.com/show/(?P<id>[a-zA-Z0-9\-\_]+)(?:\.)*html' _TEST = { 'url': 'http://v.ku6.com/show/JG-8yS14xzBr4bCn1pu0xw...html', 'md5': '01203549b9efbb45f4b87d55bdea1ed1', diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 931f34c9b..12cc56e44 100644 --- a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -16,7 +16,7 @@ from ..utils import ( class KUSIIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' + _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' _TESTS = [{ 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 45d65e61f..a586308b2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -55,7 +55,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P<id>\d+?)' + _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P<id>\d+?)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -134,7 +134,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -170,7 +170,7 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' IE_DESC = '酷我音乐 - 排行榜' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/billboard_(?P<id>[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { @@ -195,7 +195,7 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { @@ -251,7 +251,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' IE_DESC = '酷我音乐 - 分类' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' + _VALID_URL = r'https?://yinyue\.kuwo\.cn/yy/cinfo_(?P<id>\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -288,7 +288,7 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P<id>\d+?)/' + _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P<id>\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index df47e88ba..462b752dd 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -28,7 +28,7 @@ from ..utils import ( class LeIE(InfoExtractor): IE_DESC = '乐视网' - _VALID_URL = r'http://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' + _VALID_URL = r'https?://www\.le\.com/ptv/vplay/(?P<id>\d+)\.html' _URL_TEMPLATE = 'http://www.le.com/ptv/vplay/%s.html' @@ -196,7 +196,7 @@ class LeIE(InfoExtractor): class LePlaylistIE(InfoExtractor): - _VALID_URL = r'http://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://[a-z]+\.le\.com/[a-z]+/(?P<id>[a-z0-9_]+)' _TESTS = [{ 'url': 'http://www.le.com/tv/46177.html', diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index a8fd639cc..ba2f80a75 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -17,7 +17,7 @@ from ..utils import ( class LifeNewsIE(InfoExtractor): IE_NAME = 'lifenews' IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' + _VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' _TESTS = [{ # single video embedded via video/source @@ -159,7 +159,7 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'http://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' _TEST = { 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 1a0625ac3..2599d45c3 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -123,7 +123,7 @@ class LimelightBaseIE(InfoExtractor): class LimelightMediaIE(LimelightBaseIE): IE_NAME = 'limelight' - _VALID_URL = r'(?:limelight:media:|http://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:media:|https?://link\.videoplatform\.limelight\.com/media/\??\bmediaId=)(?P<id>[a-z0-9]{32})' _TESTS = [{ 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { @@ -176,7 +176,7 @@ class LimelightMediaIE(LimelightBaseIE): class LimelightChannelIE(LimelightBaseIE): IE_NAME = 'limelight:channel' - _VALID_URL = r'(?:limelight:channel:|http://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel:|https?://link\.videoplatform\.limelight\.com/media/\??\bchannelId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelId=ab6a524c379342f9b23642917020c082', 'info_dict': { @@ -207,7 +207,7 @@ class LimelightChannelIE(LimelightBaseIE): class LimelightChannelListIE(LimelightBaseIE): IE_NAME = 'limelight:channel_list' - _VALID_URL = r'(?:limelight:channel_list:|http://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' + _VALID_URL = r'(?:limelight:channel_list:|https?://link\.videoplatform\.limelight\.com/media/\?.*?\bchannelListId=)(?P<id>[a-z0-9]{32})' _TEST = { 'url': 'http://link.videoplatform.limelight.com/media/?channelListId=301b117890c4465c8179ede21fd92e2b', 'info_dict': { diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index 7e025831b..d5945ad66 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class M6IE(InfoExtractor): IE_NAME = 'm6' - _VALID_URL = r'http://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' + _VALID_URL = r'https?://(?:www\.)?m6\.fr/[^/]+/videos/(?P<id>\d+)-[^\.]+\.html' _TEST = { 'url': 'http://www.m6.fr/emission-les_reines_du_shopping/videos/11323908-emeline_est_la_reine_du_shopping_sur_le_theme_ma_fete_d_8217_anniversaire.html', diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 67d6271e1..c31e8798a 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -17,7 +17,7 @@ from ..utils import ( class MetacafeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index 819c1b90b..1aea78d11 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -91,7 +91,7 @@ class MITIE(TechTVMITIE): class OCWMITIE(InfoExtractor): IE_NAME = 'ocw.mit.edu' - _VALID_URL = r'^http://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' + _VALID_URL = r'^https?://ocw\.mit\.edu/courses/(?P<topic>[a-z0-9\-]+)' _BASE_URL = 'http://ocw.mit.edu/' _TESTS = [ diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index c595f2077..9e584860a 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -14,7 +14,7 @@ from ..utils import ( class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' - _VALID_URL = r'http://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' + _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', diff --git a/youtube_dl/extractor/mooshare.py b/youtube_dl/extractor/mooshare.py index 7cc7f054f..f010f52d5 100644 --- a/youtube_dl/extractor/mooshare.py +++ b/youtube_dl/extractor/mooshare.py @@ -13,7 +13,7 @@ from ..utils import ( class MooshareIE(InfoExtractor): IE_NAME = 'mooshare' IE_DESC = 'Mooshare.biz' - _VALID_URL = r'http://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' + _VALID_URL = r'https?://(?:www\.)?mooshare\.biz/(?P<id>[\da-z]{12})' _TESTS = [ { diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index 0b4787c1d..ad04b12cd 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -12,7 +12,7 @@ from ..utils import ( class MotherlessIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?motherless\.com/(?:g/[a-z0-9_]+/)?(?P<id>[A-Z0-9]+)' _TESTS = [{ 'url': 'http://motherless.com/AC3FFE1', 'md5': '310f62e325a9fafe64f68c0bccb6e75f', diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index c1a482dba..370328b36 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -9,7 +9,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'http://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' + _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P<id>[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index f936b92bb..1ca7b1a9e 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -11,7 +11,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www\.myspass\.de/.*' + _VALID_URL = r'https?://www\.myspass\.de/.*' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': '0b49f4844a068f8b33f4b7c88405862b', diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 1e21cf98a..c83a1eab5 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -20,7 +20,7 @@ from ..utils import ( class MyVideoIE(InfoExtractor): _WORKING = False - _VALID_URL = r'http://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' + _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' IE_NAME = 'myvideo' _TEST = { 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index a94ab8358..731c24542 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class MyVidsterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' + _VALID_URL = r'https?://(?:www\.)?myvidster\.com/video/(?P<id>\d+)/' _TEST = { 'url': 'http://www.myvidster.com/video/32059805/Hot_chemistry_with_raw_love_making', diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 7ce8d9b18..d5e53365c 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -8,7 +8,7 @@ from ..utils import ( class NationalGeographicIE(InfoExtractor): - _VALID_URL = r'http://video\.nationalgeographic\.com/.*?' + _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' _TESTS = [ { diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index bb0817e34..a622f2212 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -115,7 +115,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): # Does not include https because its certificate is invalid - _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' + _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)' _TEST = { 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', @@ -295,7 +295,7 @@ class NBCNewsIE(ThePlatformIE): class MSNBCIE(InfoExtractor): # https URLs redirect to corresponding http ones - _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' + _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index d1688457f..aae7aeeeb 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -7,7 +7,7 @@ from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' - _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' + _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', @@ -68,7 +68,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' - _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' + _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', @@ -93,7 +93,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index d440313d5..ec7317a2f 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -22,7 +22,7 @@ from ..utils import ( class NocoIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' _LOGIN_URL = 'http://noco.tv/do.php' _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _SUB_LANG_TEMPLATE = '&sub_lang=%s' diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 5952d136f..77e091072 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -9,7 +9,7 @@ from ..utils import ( class NormalbootsIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' + _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', 'md5': '8bf6de238915dd501105b44ef5f1e0f6', diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 72f72b803..17671ad39 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -12,7 +12,7 @@ from ..utils import ( class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = r'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' + _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { diff --git a/youtube_dl/extractor/npr.py b/youtube_dl/extractor/npr.py index 125c7010b..a3f0abb4e 100644 --- a/youtube_dl/extractor/npr.py +++ b/youtube_dl/extractor/npr.py @@ -9,7 +9,7 @@ from ..utils import ( class NprIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?npr\.org/player/v2/mediaPlayer\.html\?.*\bid=(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.npr.org/player/v2/mediaPlayer.html?id=449974205', 'info_dict': { diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 2cd924d05..0895d7ea4 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -11,7 +11,7 @@ from ..utils import ( class NTVRuIE(InfoExtractor): IE_NAME = 'ntv.ru' - _VALID_URL = r'http://(?:www\.)?ntv\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 958eb398b..66c75f8b3 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -137,7 +137,7 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'http://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. _TEST = { @@ -171,7 +171,7 @@ class ORFOE1IE(InfoExtractor): class ORFFM4IE(InfoExtractor): IE_NAME = 'orf:fm4' IE_DESC = 'radio FM4' - _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' + _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' _TEST = { 'url': 'http://fm4.orf.at/player/20160110/IS/', @@ -222,7 +222,7 @@ class ORFFM4IE(InfoExtractor): class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' - _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' + _VALID_URL = r'https?://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)' _TEST = { 'url': 'http://iptv.orf.at/stories/2275236/', diff --git a/youtube_dl/extractor/philharmoniedeparis.py b/youtube_dl/extractor/philharmoniedeparis.py index 6e60e5fe9..f1008ae51 100644 --- a/youtube_dl/extractor/philharmoniedeparis.py +++ b/youtube_dl/extractor/philharmoniedeparis.py @@ -12,7 +12,7 @@ from ..utils import ( class PhilharmonieDeParisIE(InfoExtractor): IE_DESC = 'Philharmonie de Paris' - _VALID_URL = r'http://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|misc/Playlist\.ashx\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', 'info_dict': { diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py index 788411ccc..6c8bbe1d9 100644 --- a/youtube_dl/extractor/photobucket.py +++ b/youtube_dl/extractor/photobucket.py @@ -8,7 +8,7 @@ from ..compat import compat_urllib_parse_unquote class PhotobucketIE(InfoExtractor): - _VALID_URL = r'http://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' + _VALID_URL = r'https?://(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' _TEST = { 'url': 'http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0', 'md5': '7dabfb92b0a31f6c16cebc0f8e60ff99', diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py index 57c78ba52..39b53ecf6 100644 --- a/youtube_dl/extractor/pornhd.py +++ b/youtube_dl/extractor/pornhd.py @@ -12,7 +12,7 @@ from ..utils import ( class PornHdIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' + _VALID_URL = r'https?://(?:www\.)?pornhd\.com/(?:[a-z]{2,4}/)?videos/(?P<id>\d+)(?:/(?P<display_id>.+))?' _TEST = { 'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video', 'md5': '956b8ca569f7f4d8ec563e2c41598441', diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 1a53fd71c..6b51e5c54 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -13,7 +13,7 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P<id>\d+)/(?P<display_id>[^/]+)' _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 30a5f2de4..cc0416cb8 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class PyvideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' _TESTS = [ { diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 45a3c41c5..ff0af9543 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -18,7 +18,7 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', @@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', 'info_dict': { @@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' + _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', @@ -260,7 +260,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' IE_DESC = 'QQ音乐 - 排行榜' - _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=toplist&p=global_123', @@ -314,7 +314,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' IE_DESC = 'QQ音乐 - 歌单' - _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=taoge&id=3462654915', diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index a4dc5c335..e36ce1aa1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -18,7 +18,7 @@ from ..utils import ( class RaiTVIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', @@ -175,7 +175,7 @@ class RaiTVIE(InfoExtractor): class RaiIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index d6054d717..7ba41ba59 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -5,7 +5,7 @@ from ..utils import ExtractorError class RedTubeIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?redtube\.com/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.redtube.com/66418', 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', diff --git a/youtube_dl/extractor/ringtv.py b/youtube_dl/extractor/ringtv.py index 508758075..2c2c707bd 100644 --- a/youtube_dl/extractor/ringtv.py +++ b/youtube_dl/extractor/ringtv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class RingTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?ringtv\.craveonline\.com/(?P<type>news|videos/video)/(?P<id>[^/?#]+)' _TEST = { 'url': 'http://ringtv.craveonline.com/news/310833-luis-collazo-says-victor-ortiz-better-not-quit-on-jan-30', 'md5': 'd25945f5df41cdca2d2587165ac28720', diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 8a8c5d2a0..08cd1ae6c 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -62,7 +62,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'http://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://www\.rtve\.es/(m/)?alacarta/videos/[^/]+/[^/]+/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -179,7 +179,7 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'http://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://www\.rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', diff --git a/youtube_dl/extractor/ruhd.py b/youtube_dl/extractor/ruhd.py index 0e470e73f..1f7c26299 100644 --- a/youtube_dl/extractor/ruhd.py +++ b/youtube_dl/extractor/ruhd.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class RUHDIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ruhd\.ru/play\.php\?vid=(?P<id>\d+)' _TEST = { 'url': 'http://www.ruhd.ru/play.php?vid=207', 'md5': 'd1a9ec4edf8598e3fbd92bb16072ba83', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index c5c47d01e..9ca4ae147 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -122,7 +122,7 @@ class RutubeEmbedIE(InfoExtractor): class RutubeChannelIE(InfoExtractor): IE_NAME = 'rutube:channel' IE_DESC = 'Rutube channels' - _VALID_URL = r'http://rutube\.ru/tags/video/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/tags/video/1800/', 'info_dict': { @@ -156,7 +156,7 @@ class RutubeChannelIE(InfoExtractor): class RutubeMovieIE(RutubeChannelIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' - _VALID_URL = r'http://rutube\.ru/metainfo/tv/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' _TESTS = [] _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' @@ -174,7 +174,7 @@ class RutubeMovieIE(RutubeChannelIE): class RutubePersonIE(RutubeChannelIE): IE_NAME = 'rutube:person' IE_DESC = 'Rutube person videos' - _VALID_URL = r'http://rutube\.ru/video/person/(?P<id>\d+)' + _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)' _TESTS = [{ 'url': 'http://rutube.ru/video/person/313878/', 'info_dict': { diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py index f2af15f6b..dd0a6ba19 100644 --- a/youtube_dl/extractor/screenjunkies.py +++ b/youtube_dl/extractor/screenjunkies.py @@ -11,7 +11,7 @@ from ..utils import ( class ScreenJunkiesIE(InfoExtractor): - _VALID_URL = r'http://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://www.screenjunkies.com/video/(?P<display_id>[^/]+?)(?:-(?P<id>\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', 'md5': '5c2b686bec3d43de42bde9ec047536b0', diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index 4d3b58522..c5f474dd1 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor): ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)' + _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P<qs>.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 8eda3c864..96fe0b90d 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -13,7 +13,7 @@ from ..utils import ( class SharedIE(InfoExtractor): IE_DESC = 'shared.sx and vivo.sx' - _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' + _VALID_URL = r'https?://(?:shared|vivo)\.sx/(?P<id>[\da-z]{10})' _TESTS = [{ 'url': 'http://shared.sx/0060718775', diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index dfe50ed45..7e6783306 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class Sport5IE(InfoExtractor): - _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P<id>\d+)' _TESTS = [ { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/ssa.py index 13101c714..54d1843f2 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/ssa.py @@ -8,7 +8,7 @@ from ..utils import ( class SSAIE(InfoExtractor): - _VALID_URL = r'http://ssa\.nls\.uk/film/(?P<id>\d+)' + _VALID_URL = r'https?://ssa\.nls\.uk/film/(?P<id>\d+)' _TEST = { 'url': 'http://ssa.nls.uk/film/3561', 'info_dict': { diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py index aa5964acb..f562aa6d3 100644 --- a/youtube_dl/extractor/sztvhu.py +++ b/youtube_dl/extractor/sztvhu.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class SztvHuIE(InfoExtractor): - _VALID_URL = r'http://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?sztv\.hu|www\.tvszombathely\.hu)/(?:[^/]+)/.+-(?P<id>[0-9]+)' _TEST = { 'url': 'http://sztv.hu/hirek/cserkeszek-nepszerusitettek-a-kornyezettudatos-eletmodot-a-savaria-teren-20130909', 'md5': 'a6df607b11fb07d0e9f2ad94613375cb', diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index d1b7264b4..b49ab5f5b 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -16,7 +16,7 @@ from ..compat import compat_ord class TeamcocoIE(InfoExtractor): - _VALID_URL = r'http://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' + _VALID_URL = r'https?://teamcoco\.com/video/(?P<video_id>[0-9]+)?/?(?P<display_id>.*)' _TESTS = [ { 'url': 'http://teamcoco.com/video/80187/conan-becomes-a-mary-kay-beauty-consultant', diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 4e860db0a..a29a64b6d 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -11,7 +11,7 @@ from ..utils import ( class Tele13IE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _VALID_URL = r'^https?://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' _TESTS = [ { 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 9ee844684..3f54b2744 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 496f15d80..406f4a826 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -10,7 +10,7 @@ from ..utils import ( class THVideoIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' _TEST = { 'url': 'http://thvideo.tv/v/th1987/', 'md5': 'fa107b1f73817e325e9433505a70db50', diff --git a/youtube_dl/extractor/tinypic.py b/youtube_dl/extractor/tinypic.py index e036b8cdf..c43cace24 100644 --- a/youtube_dl/extractor/tinypic.py +++ b/youtube_dl/extractor/tinypic.py @@ -9,7 +9,7 @@ from ..utils import ExtractorError class TinyPicIE(InfoExtractor): IE_NAME = 'tinypic' IE_DESC = 'tinypic.com videos' - _VALID_URL = r'http://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' + _VALID_URL = r'https?://(?:.+?\.)?tinypic\.com/player\.php\?v=(?P<id>[^&]+)&s=\d+' _TESTS = [ { diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 17add9543..abad3ff64 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -9,7 +9,7 @@ from ..compat import compat_parse_qs class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' - _VALID_URL = r'http://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' + _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' _TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', diff --git a/youtube_dl/extractor/toypics.py b/youtube_dl/extractor/toypics.py index 2756f56d3..2579ba8c6 100644 --- a/youtube_dl/extractor/toypics.py +++ b/youtube_dl/extractor/toypics.py @@ -41,7 +41,7 @@ class ToypicsIE(InfoExtractor): class ToypicsUserIE(InfoExtractor): IE_DESC = 'Toypics user profile' - _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' + _VALID_URL = r'https?://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])' _TEST = { 'url': 'http://videos.toypics.net/Mikey', 'info_dict': { diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 0e01b15fc..747370d12 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): _WORKING = False - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' + _VALID_URL = r'(?:https?://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)' _TEST = { 'url': 'http://www.traileraddict.com/trailer/prince-avalanche/trailer', 'md5': '41365557f3c8c397d091da510e73ceb4', diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py index d239949a6..657705623 100644 --- a/youtube_dl/extractor/trollvids.py +++ b/youtube_dl/extractor/trollvids.py @@ -7,7 +7,7 @@ from .nuevo import NuevoBaseIE class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'http://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' IE_NAME = 'trollvids' _TEST = { 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 4f844706d..cea117c79 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -8,7 +8,7 @@ from ..utils import int_or_none class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' + _VALID_URL = r'https?://(?P<blog_name>.*?)\.tumblr\.com/(?:post|video)/(?P<id>[0-9]+)(?:$|[/?#])' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 535d0d361..86bb7915d 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -14,7 +14,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' _TEST = { 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -100,7 +100,7 @@ class TV2IE(InfoExtractor): class TV2ArticleIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 3a4f393fc..4065354dd 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -11,7 +11,7 @@ from ..utils import ( class TVCIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', @@ -64,7 +64,7 @@ class TVCIE(InfoExtractor): class TVCArticleIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', 'info_dict': { diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index b4683de54..df70a6b23 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -13,7 +13,7 @@ from ..utils import ( class TVPlayIE(InfoExtractor): IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)http://(?:www\.)? + _VALID_URL = r'''(?x)https?://(?:www\.)? (?:tvplay\.lv/parraides| tv3play\.lt/programos| play\.tv3\.lt/programos| diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py index d50237758..1d52cbc98 100644 --- a/youtube_dl/extractor/ubu.py +++ b/youtube_dl/extractor/ubu.py @@ -10,7 +10,7 @@ from ..utils import ( class UbuIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' + _VALID_URL = r'https?://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' _TEST = { 'url': 'http://ubu.com/film/her_noise.html', 'md5': '138d5652618bf0f03878978db9bef1ee', diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 594bee4f9..66d9f1bf3 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -7,7 +7,7 @@ from ..utils import qualities class UnistraIE(InfoExtractor): - _VALID_URL = r'http://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' + _VALID_URL = r'https?://utv\.unistra\.fr/(?:index|video)\.php\?id_video\=(?P<id>\d+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 3794bcded..b755dda90 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -13,7 +13,7 @@ from ..utils import ( class Vbox7IE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' _TEST = { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index 9633f7ffe..23ce0a0d1 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,7 +12,7 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vesti.py b/youtube_dl/extractor/vesti.py index a0c59a2e0..cb64ae0bd 100644 --- a/youtube_dl/extractor/vesti.py +++ b/youtube_dl/extractor/vesti.py @@ -10,7 +10,7 @@ from .rutv import RUTVIE class VestiIE(InfoExtractor): IE_DESC = 'Вести.Ru' - _VALID_URL = r'http://(?:.+?\.)?vesti\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:.+?\.)?vesti\.ru/(?P<id>.+)' _TESTS = [ { diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 77d8978d4..b11cd254c 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -214,7 +214,7 @@ class VGTVIE(XstreamIE): class BTArticleIE(InfoExtractor): IE_NAME = 'bt:article' IE_DESC = 'Bergens Tidende Articles' - _VALID_URL = r'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' + _VALID_URL = r'https?://(?:www\.)?bt\.no/(?:[^/]+/)+(?P<id>[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', 'md5': '2acbe8ad129b3469d5ae51b1158878df', @@ -241,7 +241,7 @@ class BTArticleIE(InfoExtractor): class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' - _VALID_URL = r'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index 2cd36508a..0f798711b 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -14,7 +14,7 @@ class VideoTtIE(InfoExtractor): _WORKING = False ID_NAME = 'video.tt' IE_DESC = 'video.tt - Your True Tube' - _VALID_URL = r'http://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' + _VALID_URL = r'https?://(?:www\.)?video\.tt/(?:(?:video|embed)/|watch_video\.php\?v=)(?P<id>[\da-zA-Z]{9})' _TESTS = [{ 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 315984bf9..03b9f1353 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -15,7 +15,7 @@ from ..utils import ( class ViideaIE(InfoExtractor): - _VALID_URL = r'''(?x)http://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: videolectures\.net| flexilearn\.viidea\.net| presentations\.ocwconsortium\.org| diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 149e36467..10ca6acb1 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -15,7 +15,7 @@ from ..utils import ( class VubeIE(InfoExtractor): IE_NAME = 'vube' IE_DESC = 'Vube.com' - _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' + _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' _TESTS = [ { diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index a6d9b5fee..eaa888f00 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -14,7 +14,7 @@ from ..utils import ( class VuClipIE(InfoExtractor): - _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P<id>[0-9]+)' _TEST = { 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 24efbd6e6..8b9488340 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -11,7 +11,7 @@ from ..utils import ( class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' + _VALID_URL = r'https?://vod\.walla\.co\.il/[^/]+/(?P<id>\d+)/(?P<display_id>.+)' _TEST = { 'url': 'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 37cf3d309..5227bb5ad 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,7 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): - _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|http://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' + _VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' IE_NAME = 'wat.tv' _TESTS = [ { diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 65cab4069..31c904303 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -244,7 +244,7 @@ class WDRMobileIE(InfoExtractor): class WDRMausIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' + _VALID_URL = r'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P<id>[^/?#]+)(?:/index\.php5|(?<!index)\.php5|/(?:$|[?#]))' IE_DESC = 'Sendung mit der Maus' _TESTS = [{ 'url': 'http://www.wdrmaus.de/aktuelle-sendung/index.php5', diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py index e333ae345..3dafbeec2 100644 --- a/youtube_dl/extractor/weiqitv.py +++ b/youtube_dl/extractor/weiqitv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class WeiqiTVIE(InfoExtractor): IE_DESC = 'WQTV' - _VALID_URL = r'http://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index fb0accac7..828c03dc3 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?wimp\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py index 4ff99e5ca..e4a2baad2 100644 --- a/youtube_dl/extractor/xbef.py +++ b/youtube_dl/extractor/xbef.py @@ -5,7 +5,7 @@ from ..compat import compat_urllib_parse_unquote class XBefIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xbef\.com/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://xbef.com/video/5119-glamourous-lesbians-smoking-drinking-and-fucking', 'md5': 'a478b565baff61634a98f5e5338be995', diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 001ee17b6..63bbc0634 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -15,7 +15,7 @@ from ..utils import ( class YamIE(InfoExtractor): IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'http://mymedia.yam.com/m/(?P<id>\d+)' + _VALID_URL = r'https?://mymedia.yam.com/m/(?P<id>\d+)' _TESTS = [{ # An audio hosted on Yam diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 869f3e819..2522551dc 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -9,7 +9,7 @@ from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): - _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' + _VALID_URL = r'https?://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', From 1600ed1ff9edb33ef901bda2c4b2732df7e0e4e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 21:46:49 +0600 Subject: [PATCH 085/116] [rutv] Improve flash version pattern (Closes #8911) --- youtube_dl/extractor/rutv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index f7fe1fece..a2379eb04 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -14,7 +14,7 @@ class RUTVIE(InfoExtractor): IE_DESC = 'RUTV.RU' _VALID_URL = r'''(?x) https?://player\.(?:rutv\.ru|vgtrk\.com)/ - (?P<path>flash2v/container\.swf\?id= + (?P<path>flash\d+v/container\.swf\?id= |iframe/(?P<type>swf|video|live)/id/ |index/iframe/cast_id/) (?P<id>\d+)''' @@ -109,7 +109,7 @@ class RUTVIE(InfoExtractor): return mobj.group('url') mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash2v/container\.swf\?id=.+?\2)', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', webpage) if mobj: return mobj.group('url') @@ -119,7 +119,7 @@ class RUTVIE(InfoExtractor): video_id = mobj.group('id') video_path = mobj.group('path') - if video_path.startswith('flash2v'): + if re.match(r'flash\d+v', video_path): video_type = 'video' elif video_path.startswith('iframe'): video_type = mobj.group('type') @@ -168,7 +168,7 @@ class RUTVIE(InfoExtractor): 'play_path': mobj.group('playpath'), 'app': mobj.group('app'), 'page_url': 'http://player.rutv.ru', - 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22', + 'player_url': 'http://player.rutv.ru/flash3v/osmf.swf?i=22', 'rtmp_live': True, 'ext': 'flv', 'vbr': int(quality), From 065c4b27bfc4488758b357e023e9b6b1679c9641 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 22:07:34 +0600 Subject: [PATCH 086/116] [xhamster:embed] Extract vars (Closes #8912) --- youtube_dl/extractor/xhamster.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index fd43e8854..b3547174d 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + dict_get, float_or_none, int_or_none, unified_strdate, @@ -170,6 +171,12 @@ class XHamsterEmbedIE(InfoExtractor): video_url = self._search_regex( r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, - webpage, 'xhamster url') + webpage, 'xhamster url', default=None) + + if not video_url: + vars = self._parse_json( + self._search_regex(r'vars\s*:\s*({.+?})\s*,\s*\n', webpage, 'vars'), + video_id) + video_url = dict_get(vars, ('downloadLink', 'homepageLink', 'commentsLink', 'shareUrl')) return self.url_result(video_url, 'XHamster') From ff5873b72de16854ae8d506d5648148a54828243 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 22:24:42 +0600 Subject: [PATCH 087/116] [motherless] Detect friends only videos --- youtube_dl/extractor/motherless.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/motherless.py b/youtube_dl/extractor/motherless.py index ad04b12cd..5e1a8a71a 100644 --- a/youtube_dl/extractor/motherless.py +++ b/youtube_dl/extractor/motherless.py @@ -69,6 +69,9 @@ class MotherlessIE(InfoExtractor): ">The page you're looking for cannot be found.<")): raise ExtractorError('Video %s does not exist' % video_id, expected=True) + if '>The content you are trying to view is for friends only.' in webpage: + raise ExtractorError('Video %s is for friends only' % video_id, expected=True) + title = self._html_search_regex( r'id="view-upload-title">\s+([^<]+)<', webpage, 'title') video_url = self._html_search_regex( From 5c69f7a479936a8fc429228c8259dc5fcbf428b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 23:31:40 +0600 Subject: [PATCH 088/116] [animeondemand] Respect startvideo (Closes #8923) --- youtube_dl/extractor/animeondemand.py | 37 +++++++++++++++++++-------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 4352525e2..81a843035 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -163,7 +163,7 @@ class AnimeOnDemandIE(InfoExtractor): if kind: format_id_list.append(kind) if not format_id_list: - format_id_list.append('hls') + format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) request = sanitized_Request( @@ -179,26 +179,41 @@ class AnimeOnDemandIE(InfoExtractor): fatal=False) if not playlist: continue + start_video = playlist.get('startvideo', 0) playlist = playlist.get('playlist') if not playlist or not isinstance(playlist, list): continue - playlist = playlist[0] + playlist = playlist[start_video] title = playlist.get('title') if not title: continue description = playlist.get('description') for source in playlist.get('sources', []): file_ = source.get('file') - if file_ and determine_ext(file_) == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( + if not file_: + continue + ext = determine_ext(file_) + format_id_list = [lang, kind] + if ext == 'm3u8': + format_id_list.append('hls') + elif source.get('type') == 'video/dash' or ext == 'mpd': + format_id_list.append('dash') + format_id = '-'.join(filter(None, format_id_list)) + if ext == 'm3u8': + file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id) - for f in m3u8_formats: - f.update({ - 'language': lang, - 'format_note': format_note, - }) - formats.extend(m3u8_formats) + entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + elif source.get('type') == 'video/dash' or ext == 'mpd': + file_formats = self._extract_mpd_formats( + file_, video_id, mpd_id=format_id, fatal=False) + else: + continue + for f in file_formats: + f.update({ + 'language': lang, + 'format_note': format_note, + }) + formats.extend(file_formats) if formats: self._sort_formats(formats) From 85c637b7376f0426e5e0a6812da2a72b2ca28680 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 23:35:50 +0600 Subject: [PATCH 089/116] [animeondemand] Extract teaser when no full episode available (#8923) --- youtube_dl/extractor/animeondemand.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 81a843035..a47697738 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -225,16 +225,18 @@ class AnimeOnDemandIE(InfoExtractor): }) entries.append(f) - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-teaser' % f['id'], - 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), - }) - entries.append(f) + # Extract teaser only when full episode is not available + if not formats: + m = re.search( + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', + episode_html) + if m: + f = common_info.copy() + f.update({ + 'id': '%s-teaser' % f['id'], + 'title': m.group('title'), + 'url': compat_urlparse.urljoin(url, m.group('href')), + }) + entries.append(f) return self.playlist_result(entries, anime_id, anime_title, anime_description) From bc5d16b3023ab29216d868bbfe8838b5e92e72f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 21 Mar 2016 23:37:39 +0600 Subject: [PATCH 090/116] [animeondemand] Skip dash for now --- youtube_dl/extractor/animeondemand.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index a47697738..2cede55a7 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -204,6 +204,7 @@ class AnimeOnDemandIE(InfoExtractor): file_, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) elif source.get('type') == 'video/dash' or ext == 'mpd': + continue file_formats = self._extract_mpd_formats( file_, video_id, mpd_id=format_id, fatal=False) else: From cc7397b04d4a21b5ac680858ee2600e3b3bfb569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Mar 2016 21:12:29 +0600 Subject: [PATCH 091/116] [ceskatelevize] Make m3u8 formats extraction non fatal (Closes #8933) --- youtube_dl/extractor/ceskatelevize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b27b4e670..b355111cb 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -129,7 +129,8 @@ class CeskaTelevizeIE(InfoExtractor): formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( - stream_url, playlist_id, 'mp4', entry_protocol='m3u8_native')) + stream_url, playlist_id, 'mp4', + entry_protocol='m3u8_native', fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] From 2beeb286e179a00bc0c76fc55ca5c8d19e74ca41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Mar 2016 22:32:59 +0600 Subject: [PATCH 092/116] [laola1tv] Add support for livestreams (Closes #8934) --- youtube_dl/extractor/laola1tv.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 5d8ebbeb3..41d80bc12 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -19,7 +19,7 @@ from ..utils import ( class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/[^/]+/(?P<slug>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P<lang>[a-z]+)-(?P<portal>[a-z]+)/(?P<kind>[^/]+)/(?P<slug>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -33,7 +33,7 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, }, { 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie', 'info_dict': { @@ -47,12 +47,28 @@ class Laola1TvIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + }, { + 'url': 'http://www.laola1.tv/de-de/livestream/2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'info_dict': { + 'id': '487850', + 'display_id': '2016-03-22-belogorie-belgorod-trentino-diatec-lde', + 'ext': 'flv', + 'title': 'Belogorie BELGOROD - TRENTINO Diatec', + 'upload_date': '20160322', + 'uploader': 'CEV - Europäischer Volleyball Verband', + 'is_live': True, + 'categories': ['Volleyball'], + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('slug') + kind = mobj.group('kind') lang = mobj.group('lang') portal = mobj.group('portal') @@ -85,12 +101,17 @@ class Laola1TvIE(InfoExtractor): _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) title = _v('title', fatal=True) + VS_TARGETS = { + 'video': '2', + 'livestream': '17', + } + req = sanitized_Request( 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % compat_urllib_parse.urlencode({ 'videoId': video_id, - 'target': '2', - 'label': 'laola1tv', + 'target': VS_TARGETS.get(kind, '2'), + 'label': _v('label'), 'area': _v('area'), }), urlencode_postdata( From c6ca11f1b31a292413cab03012654fd0021814a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 22 Mar 2016 23:48:05 +0600 Subject: [PATCH 093/116] [once] Prevent ads from embedding into m3u8 playlists (Closes #8893) --- youtube_dl/extractor/once.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py index 080045d4c..5db949b17 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -20,6 +20,10 @@ class OnceIE(InfoExtractor): media_item_id, 'mp4', m3u8_id='hls', fatal=False) progressive_formats = [] for adaptive_format in formats: + # Prevent advertisement from embedding into m3u8 playlist (see + # https://github.com/rg3/youtube-dl/issues/8893#issuecomment-199912684) + adaptive_format['url'] = re.sub( + r'\badsegmentlength=\d+', r'adsegmentlength=0', adaptive_format['url']) rendition_id = self._search_regex( r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', adaptive_format['url'], 'redition id', default=None) From 7da2c87119db8beda1bdc979fad38c08fc1252e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 22 Mar 2016 22:17:59 +0100 Subject: [PATCH 094/116] Add extractor for thescene.com (closes #8929) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/thescene.py | 48 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 youtube_dl/extractor/thescene.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index ee792bbe0..8f7df4d12 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -743,6 +743,7 @@ from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, ) +from .thescene import TheSceneIE from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE from .thisamericanlife import ThisAmericanLifeIE diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py new file mode 100644 index 000000000..08d666eaf --- /dev/null +++ b/youtube_dl/extractor/thescene.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import compat_urllib_parse +from ..utils import qualities + + +class TheSceneIE(InfoExtractor): + _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' + + _TEST = { + 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', + 'info_dict': { + 'id': '520e8faac2b4c00e3c6e5f43', + 'ext': 'mp4', + 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', + 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = compat_urllib_parse.urljoin( + url, + self._html_search_regex( + r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) + + self.to_screen(player_url) + player = self._download_webpage(player_url, player_url) + info = self._parse_json(self._search_regex(r'(?m)var\s+video\s+=\s+({.+?});$', player, 'info json'), display_id) + + qualities_order = qualities(['low', 'high']) + formats = [{ + 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), + 'url': f['src'], + 'quality': qualities_order(f['quality']), + } for f in info['sources'][0]] + self._sort_formats(formats) + + return { + 'id': info['id'], + 'title': info['title'], + 'formats': formats, + 'thumbnail': info.get('poster_frame'), + 'display_id': display_id, + } From efbed08dc20c530fe428256e4dcbea4dc4423d0d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 23 Mar 2016 22:24:52 +0800 Subject: [PATCH 095/116] [utils] Encode hostnames before passing to urllib With IDN (Internationalized Domain Name) and a proxy, non-ascii URLs are passed down to urllib/urllib2, causing UnicodeEncodeError Fixes #8890 --- test/test_http.py | 10 ++++++++++ youtube_dl/utils.py | 1 + 2 files changed, 11 insertions(+) diff --git a/test/test_http.py b/test/test_http.py index fc59b1aed..15e0ad369 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf-8 from __future__ import unicode_literals # Allow direct execution @@ -120,5 +121,14 @@ class TestProxy(unittest.TestCase): response = ydl.urlopen(req).read().decode('utf-8') self.assertEqual(response, 'cn: {0}'.format(url)) + def test_proxy_with_idn(self): + ydl = YoutubeDL({ + 'proxy': 'localhost:{0}'.format(self.port), + }) + url = 'http://中文.tw/' + response = ydl.urlopen(url).read().decode('utf-8') + # b'xn--fiq228c' is '中文'.encode('idna') + self.assertEqual(response, 'normal: http://xn--fiq228c.tw/') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 067b8a184..03bb7782f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1746,6 +1746,7 @@ def escape_url(url): """Escape URL as suggested by RFC 3986""" url_parsed = compat_urllib_parse_urlparse(url) return url_parsed._replace( + netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), params=escape_rfc3986(url_parsed.params), query=escape_rfc3986(url_parsed.query), From 882c6992967914c245e086ddaacde9d595cd6ed9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 23 Mar 2016 20:45:39 +0600 Subject: [PATCH 096/116] [tunein] Fix stream data extraction (Closes #8899, closes #8924) --- youtube_dl/extractor/tunein.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tunein.py b/youtube_dl/extractor/tunein.py index 8322cc14d..ae4cfaec2 100644 --- a/youtube_dl/extractor/tunein.py +++ b/youtube_dl/extractor/tunein.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -import json +import re from .common import InfoExtractor from ..utils import ExtractorError @@ -27,10 +27,9 @@ class TuneInBaseIE(InfoExtractor): if not streams_url.startswith('http://'): streams_url = compat_urlparse.urljoin(url, streams_url) - stream_data = self._download_webpage( - streams_url, content_id, note='Downloading stream data') - streams = json.loads(self._search_regex( - r'\((.*)\);', stream_data, 'stream info'))['Streams'] + streams = self._download_json( + streams_url, content_id, note='Downloading stream data', + transform_source=lambda s: re.sub(r'^\s*\((.*)\);\s*$', r'\1', s))['Streams'] is_live = None formats = [] From 2d60465e44c3290fa1ee4239fe18eb9d0c69d9e9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 23 Mar 2016 23:20:28 +0800 Subject: [PATCH 097/116] [test/test_utils] Update for escape_url change --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 325b870cc..8ba531542 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -577,7 +577,7 @@ class TestUtil(unittest.TestCase): ) self.assertEqual( escape_url('http://тест.рф/фрагмент'), - 'http://тест.рф/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' + 'http://xn--e1aybc.xn--p1ai/%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82' ) self.assertEqual( escape_url('http://тест.рф/абв?абв=абв#абв'), From 81f36eba886349475235dbacef29d6b23b40538f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 23 Mar 2016 23:23:26 +0800 Subject: [PATCH 098/116] [test/test_utils] Update for escape_url change (again) --- test/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 8ba531542..a35debfe1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -581,7 +581,7 @@ class TestUtil(unittest.TestCase): ) self.assertEqual( escape_url('http://тест.рф/абв?абв=абв#абв'), - 'http://тест.рф/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' + 'http://xn--e1aybc.xn--p1ai/%D0%B0%D0%B1%D0%B2?%D0%B0%D0%B1%D0%B2=%D0%B0%D0%B1%D0%B2#%D0%B0%D0%B1%D0%B2' ) self.assertEqual(escape_url('http://vimeo.com/56015672#at=0'), 'http://vimeo.com/56015672#at=0') From 088e1aac5970ea2c24fa902873a5e0b984b37595 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 23 Mar 2016 23:55:08 +0800 Subject: [PATCH 099/116] [generic] Support Vine embeds (#8817) --- youtube_dl/extractor/generic.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 26de27a7e..93e0563b5 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1909,6 +1909,14 @@ class GenericIE(InfoExtractor): self._proto_relative_url(unescapeHTML(mobj.group(1))), 'AdobeTVVideo') + # Look for Vine embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From fc27ea94642a8e2e9b0fcfdcc0c370ec7484c971 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 23 Mar 2016 23:55:52 +0800 Subject: [PATCH 100/116] [tumblr] Support Vine embeds (#8817) --- youtube_dl/extractor/tumblr.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index cea117c79..584716986 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -67,6 +67,22 @@ class TumblrIE(InfoExtractor): 'uploader_id': 'user32021558', }, 'add_ie': ['Vimeo'], + }, { + 'url': 'http://sutiblr.tumblr.com/post/139638707273', + 'md5': '2dd184b3669e049ba40563a7d423f95c', + 'info_dict': { + 'id': 'ir7qBEIKqvq', + 'ext': 'mp4', + 'title': 'Vine by sutiblr', + 'alt_title': 'Vine by sutiblr', + 'uploader': 'sutiblr', + 'uploader_id': '1198993975374495744', + 'upload_date': '20160220', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'add_ie': ['Vine'], }] def _real_extract(self, url): From c4096e8aeaa373159e350a3674b0ce18b6c519e2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 24 Mar 2016 16:29:33 +0800 Subject: [PATCH 101/116] [instagram] Extract embed videos (#8817) --- youtube_dl/extractor/instagram.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ed3e07118..e8b27b379 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + get_element_by_attribute, int_or_none, limit_length, ) @@ -38,6 +39,18 @@ class InstagramIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_embed_url(webpage): + blockquote_el = get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( + r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') + def _real_extract(self, url): video_id = self._match_id(url) From 87696e78d7203cf47bdd27773d06ba15af7c819d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 24 Mar 2016 16:30:01 +0800 Subject: [PATCH 102/116] [instagram] Unescape description (#8817) --- youtube_dl/extractor/instagram.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index e8b27b379..4e62098b0 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -7,6 +7,7 @@ from ..utils import ( get_element_by_attribute, int_or_none, limit_length, + lowercase_escape, ) @@ -59,6 +60,8 @@ class InstagramIE(InfoExtractor): webpage, 'uploader id', fatal=False) desc = self._search_regex( r'"caption":"(.+?)"', webpage, 'description', default=None) + if desc is not None: + desc = lowercase_escape(desc) return { 'id': video_id, From 5a51775a58b901c63bb0b59e44a113ad16ceb236 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 24 Mar 2016 16:32:27 +0800 Subject: [PATCH 103/116] [generic] Extract Instagram embeds (#8817) --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 93e0563b5..12f2309fc 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,6 +59,7 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE class GenericIE(InfoExtractor): @@ -1917,6 +1918,11 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + # Look for Instagram embeds + instagram_embed_url = InstagramIE._extract_embed_url(webpage) + if instagram_embed_url is not None: + return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From 32d88410eb2ef0ca25ab770af8f2ca2326c0aca7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 24 Mar 2016 16:32:53 +0800 Subject: [PATCH 104/116] [tumblr] Add a test with Instagram embed Closes #8817 --- youtube_dl/extractor/tumblr.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 584716986..e5bcf7798 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -83,6 +83,18 @@ class TumblrIE(InfoExtractor): 'repost_count': int, }, 'add_ie': ['Vine'], + }, { + 'url': 'http://vitasidorkina.tumblr.com/post/134652425014/joskriver-victoriassecret-invisibility-or', + 'md5': '01c12ceb82cbf6b2fe0703aa56b3ad72', + 'info_dict': { + 'id': '-7LnUPGlSo', + 'ext': 'mp4', + 'title': 'Video by victoriassecret', + 'description': 'Invisibility or flight…which superpower would YOU choose? #VSFashionShow #ThisOrThat', + 'uploader_id': 'victoriassecret', + 'thumbnail': 're:^https?://.*\.jpg' + }, + 'add_ie': ['Instagram'], }] def _real_extract(self, url): From 622d19160bd81161e18c6ce868c359549a4d0413 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 24 Mar 2016 18:06:15 +0800 Subject: [PATCH 105/116] [utils] Clarify Python versions affected by buggy struct module --- youtube_dl/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 03bb7782f..b6e1dc809 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1756,7 +1756,8 @@ def escape_url(url): try: struct.pack('!I', 0) except TypeError: - # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 def struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') From 5767b4eeae997a4ef75e348b46489cbb55126414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Mar 2016 22:23:31 +0600 Subject: [PATCH 106/116] [mtv] Fix description extraction (Closes #8962) --- youtube_dl/extractor/mtv.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index ed068365d..824bbcb4e 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -17,6 +17,7 @@ from ..utils import ( unescapeHTML, url_basename, RegexNotFoundError, + xpath_text, ) @@ -130,11 +131,7 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description_node = itemdoc.find('description') - if description_node is not None: - description = description_node.text.strip() - else: - description = None + description = xpath_text(itemdoc, 'description') title_el = None if title_el is None: From 369e7e3ff02201210864b4e20af2893c40894ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Mar 2016 22:54:26 +0600 Subject: [PATCH 107/116] [iprima] Fix extraction (Closes #8953) --- youtube_dl/extractor/iprima.py | 44 ++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 61a0de472..788bbe0d5 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,6 +6,8 @@ import time from .common import InfoExtractor from ..utils import ( + determine_ext, + js_to_json, sanitized_Request, ) @@ -30,8 +32,7 @@ class IPrimaIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -43,9 +44,42 @@ class IPrimaIE(InfoExtractor): req.add_header('Referer', url) playerpage = self._download_webpage(req, video_id, note='Downloading player') - m3u8_url = self._search_regex(r"'src': '([^']+\.m3u8)'", playerpage, 'm3u8 url') + formats = [] - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + def extract_formats(format_url, format_key=None, lang=None): + ext = determine_ext(format_url) + new_formats = [] + if format_key == 'hls' or ext == 'm3u8': + new_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif format_key == 'dash' or ext == 'mpd': + return + new_formats = self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False) + if lang: + for f in new_formats: + if not f.get('language'): + f['language'] = lang + formats.extend(new_formats) + + options = self._parse_json( + self._search_regex( + r'(?s)var\s+playerOptions\s*=\s*({.+?});', + playerpage, 'player options', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if options: + for key, tracks in options.get('tracks', {}).items(): + if not isinstance(tracks, list): + continue + for track in tracks: + src = track.get('src') + if src: + extract_formats(src, key.lower(), track.get('lang')) + + if not formats: + for _, src in re.findall(r'src["\']\s*:\s*(["\'])(.+?)\1', playerpage): + extract_formats(src) self._sort_formats(formats) From 993271da0a70d6d5c194a10e48d43f3aa2abc956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 24 Mar 2016 23:28:24 +0600 Subject: [PATCH 108/116] [nytimes] Tolerate missing metadata (Closes #8952) --- youtube_dl/extractor/nytimes.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 7f254b867..681683e86 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -18,8 +18,9 @@ class NYTimesBaseIE(InfoExtractor): description = video_data.get('summary') duration = float_or_none(video_data.get('duration'), 1000) - uploader = video_data['byline'] - timestamp = parse_iso8601(video_data['publication_date'][:-8]) + uploader = video_data.get('byline') + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None def get_file_size(file_size): if isinstance(file_size, int): @@ -37,7 +38,7 @@ class NYTimesBaseIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), 'filesize': get_file_size(video.get('fileSize')), - } for video in video_data['renditions'] + } for video in video_data['renditions'] if video.get('url') ] self._sort_formats(formats) @@ -46,7 +47,7 @@ class NYTimesBaseIE(InfoExtractor): 'url': 'http://www.nytimes.com/%s' % image['url'], 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data['images'] + } for image in video_data.get('images', []) if image.get('url') ] return { From f07e276a04292c3fa87f703931bad9b716e7ccdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 01:18:14 +0600 Subject: [PATCH 109/116] [youtube:live] Add extractor (Closes #8959) --- youtube_dl/extractor/youtube.py | 48 ++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 466f5da2e..96fa3b5aa 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1911,7 +1911,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @classmethod def suitable(cls, url): - return False if YoutubePlaylistsIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url) + return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) + else super(YoutubeChannelIE, cls).suitable(url)) def _real_extract(self, url): channel_id = self._match_id(url) @@ -1986,6 +1987,51 @@ class YoutubeUserIE(YoutubeChannelIE): return super(YoutubeUserIE, cls).suitable(url) +class YoutubeLiveIE(YoutubeBaseInfoExtractor): + IE_DESC = 'YouTube.com live streams' + _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+))/live' + IE_NAME = 'youtube:live' + + _TESTS = [{ + 'url': 'http://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + base_url = mobj.group('base_url') + webpage = self._download_webpage(url, channel_id, fatal=False) + if webpage: + page_type = self._og_search_property( + 'type', webpage, 'page type', default=None) + video_id = self._html_search_meta( + 'videoId', webpage, 'video id', default=None) + if page_type == 'video' and video_id and re.match(r'^[0-9A-Za-z_-]{11}$', video_id): + return self.url_result(video_id, YoutubeIE.ie_key()) + return self.url_result(base_url) + + class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com user/channel playlists' _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists' From d041a736741e37e75e94cddf9c8258de8f08b501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 01:39:25 +0600 Subject: [PATCH 110/116] [extractor/__init__] Add youtube:live and sort youtube extractors alphabetically --- youtube_dl/extractor/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8f7df4d12..d99873419 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -960,7 +960,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeFavouritesIE, YoutubeHistoryIE, + YoutubeLiveIE, YoutubePlaylistIE, + YoutubePlaylistsIE, YoutubeRecommendedIE, YoutubeSearchDateIE, YoutubeSearchIE, @@ -970,7 +972,6 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, - YoutubePlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE From ff9d5d093854a974afdd0191d331d817e7e4c2a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 02:26:46 +0600 Subject: [PATCH 111/116] [udemy] Improve course enrolling --- youtube_dl/extractor/udemy.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 74cc36ece..a5634ece9 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,6 +5,7 @@ from ..compat import ( compat_HTTPError, compat_urllib_parse, compat_urllib_request, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -35,7 +36,7 @@ class UdemyIE(InfoExtractor): 'skip': 'Requires udemy account credentials', }] - def _enroll_course(self, webpage, course_id): + def _enroll_course(self, base_url, webpage, course_id): checkout_url = unescapeHTML(self._search_regex( r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/payment/checkout/.+?)\1', webpage, 'checkout url', group='url', default=None)) @@ -45,9 +46,11 @@ class UdemyIE(InfoExtractor): 'Use this URL to confirm purchase: %s' % (course_id, checkout_url), expected=True) enroll_url = unescapeHTML(self._search_regex( - r'href=(["\'])(?P<url>https?://(?:www\.)?udemy\.com/course/subscribe/.+?)\1', + r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/course/subscribe/.+?)\1', webpage, 'enroll url', group='url', default=None)) if enroll_url: + if not enroll_url.startswith('http'): + enroll_url = compat_urlparse.urljoin(base_url, enroll_url) webpage = self._download_webpage(enroll_url, course_id, 'Enrolling in the course') if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) @@ -152,7 +155,7 @@ class UdemyIE(InfoExtractor): except ExtractorError as e: # Error could possibly mean we are not enrolled in the course if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) lecture = self._download_lecture(course_id, lecture_id) else: raise @@ -244,7 +247,7 @@ class UdemyCourseIE(UdemyIE): course_id = response['id'] course_title = response.get('title') - self._enroll_course(webpage, course_id) + self._enroll_course(url, webpage, course_id) response = self._download_json( 'https://www.udemy.com/api-1.1/courses/%s/curriculum' % course_id, From f0e83681d97db52af9dc73d1c8e51d671503a222 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 02:27:13 +0600 Subject: [PATCH 112/116] [udemy] Extract formats from outputs --- youtube_dl/extractor/udemy.py | 78 +++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index a5634ece9..2b886d6c8 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -180,39 +180,57 @@ class UdemyIE(InfoExtractor): video_id = asset['id'] thumbnail = asset.get('thumbnailUrl') or asset.get('thumbnail_url') duration = float_or_none(asset.get('data', {}).get('duration')) - outputs = asset.get('data', {}).get('outputs', {}) formats = [] - for format_ in asset.get('download_urls', {}).get('Video', []): - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'height': int_or_none(format_id), + + def extract_output_format(src): + return { + 'url': src['url'], + 'format_id': '%sp' % (src.get('label') or format_id), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), + 'vcodec': src.get('video_codec'), + 'fps': int_or_none(src.get('frame_rate')), + 'abr': int_or_none(src.get('audio_bitrate_in_kbps')), + 'acodec': src.get('audio_codec'), + 'asr': int_or_none(src.get('audio_sample_rate')), + 'tbr': int_or_none(src.get('total_bitrate_in_kbps')), + 'filesize': int_or_none(src.get('file_size_in_bytes')), } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - output = outputs.get(format_id) - if isinstance(output, dict): - f.update({ - 'format_id': '%sp' % (output.get('label') or format_id), - 'width': int_or_none(output.get('width')), - 'height': int_or_none(output.get('height')), - 'vbr': int_or_none(output.get('video_bitrate_in_kbps')), - 'vcodec': output.get('video_codec'), - 'fps': int_or_none(output.get('frame_rate')), - 'abr': int_or_none(output.get('audio_bitrate_in_kbps')), - 'acodec': output.get('audio_codec'), - 'asr': int_or_none(output.get('audio_sample_rate')), - 'tbr': int_or_none(output.get('total_bitrate_in_kbps')), - 'filesize': int_or_none(output.get('file_size_in_bytes')), - }) - else: - f['format_id'] = '%sp' % format_id - formats.append(f) + + outputs = asset.get('data', {}).get('outputs') + if not isinstance(outputs, dict): + outputs = {} + + for format_id, output in outputs.items(): + if isinstance(output, dict) and output.get('url'): + formats.append(extract_output_format(output)) + + download_urls = asset.get('download_urls') + if isinstance(download_urls, dict): + video = download_urls.get('Video') + if isinstance(video, list): + for format_ in video: + video_url = format_.get('file') + if not video_url: + continue + format_id = format_.get('label') + f = { + 'url': format_['file'], + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + output = outputs.get(format_id) + if isinstance(output, dict): + output_format = extract_output_format(output) + output_format.update(f) + f = output_format + else: + f['format_id'] = '%sp' % format_id + formats.append(f) self._sort_formats(formats) From 5eb7db4ee96c97d98a87d8d7df46e8c6a607b682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 02:28:39 +0600 Subject: [PATCH 113/116] [udemy] Add support for new URL schema --- youtube_dl/extractor/udemy.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 2b886d6c8..a9046b865 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -18,7 +18,16 @@ from ..utils import ( class UdemyIE(InfoExtractor): IE_NAME = 'udemy' - _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + www\.udemy\.com/ + (?: + [^#]+\#/lecture/| + lecture/view/?\?lectureId=| + [^/]+/learn/v4/t/lecture/ + ) + (?P<id>\d+) + ''' _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' @@ -34,6 +43,10 @@ class UdemyIE(InfoExtractor): 'duration': 579.29, }, 'skip': 'Requires udemy account credentials', + }, { + # new URL schema + 'url': 'https://www.udemy.com/electric-bass-right-from-the-start/learn/v4/t/lecture/4580906', + 'only_matching': True, }] def _enroll_course(self, base_url, webpage, course_id): From e0317686666f9de4a6eca3fc26ede32e664f2bec Mon Sep 17 00:00:00 2001 From: Kagami Hiiragi <kagami@genshiken.org> Date: Thu, 24 Mar 2016 13:55:02 +0300 Subject: [PATCH 114/116] [mnet] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mnet.py | 76 ++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 youtube_dl/extractor/mnet.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d99873419..1e4b078a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -410,6 +410,7 @@ from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE from .mixcloud import MixcloudIE from .mlb import MLBIE +from .mnet import MnetIE from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py new file mode 100644 index 000000000..8e83b1fc3 --- /dev/null +++ b/youtube_dl/extractor/mnet.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) + + +class MnetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'http://www.mnet.com/tv/vod/171008', + 'md5': '6abd7a837fa9fe56d22709a60b19bffb', + 'info_dict': { + 'id': '171008', + 'title': 'SS_이해인@히든박스', + 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', + 'duration': 88, + 'upload_date': '20151231', + 'timestamp': 1451564040, + 'age_limit': 0, + 'thumbnails': 'mincount:5', + 'ext': 'flv', + }, + }, + { + 'url': 'http://mnet.interest.me/tv/vod/172790', + 'only_matching': True, + }, + { + 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + info_url = 'http://content.api.mnet.com/player/vodConfig?id=%s' % video_id + info = self._download_json(info_url, video_id) + info = info['data']['info'] + + title = info['title'] + rtmp_info_url = info['cdn'] + 'CLIP' + rtmp_info = self._download_json(rtmp_info_url, video_id) + file_url = rtmp_info['serverurl'] + rtmp_info['fileurl'] + description = info.get('ment') + duration = parse_duration(info.get('time')) + timestamp = parse_iso8601(info.get('date'), delimiter=' ') + age_limit = info.get('adult') + if age_limit is not None: + age_limit = 0 if age_limit == 'N' else 18 + thumbnails = [ + { + 'id': thumb_format, + 'url': thumb['url'], + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + } + for (thumb_format, thumb) in info.get('cover', {}).items() + ] + + return { + 'id': video_id, + 'title': title, + 'url': file_url, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'thumbnails': thumbnails, + 'ext': 'flv', + } From 98e68806fb8cfe2a81ee8a6ac6705bb3d61ed2d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 03:26:29 +0600 Subject: [PATCH 115/116] [mnet] Improve (Closes #8958) --- youtube_dl/extractor/mnet.py | 85 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/mnet.py b/youtube_dl/extractor/mnet.py index 8e83b1fc3..e3f42e7bd 100644 --- a/youtube_dl/extractor/mnet.py +++ b/youtube_dl/extractor/mnet.py @@ -11,66 +11,71 @@ from ..utils import ( class MnetIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mnet\.(?:com|interest\.me)/tv/vod/(?:.*?\bclip_id=)?(?P<id>[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.mnet.com/tv/vod/171008', - 'md5': '6abd7a837fa9fe56d22709a60b19bffb', - 'info_dict': { - 'id': '171008', - 'title': 'SS_이해인@히든박스', - 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', - 'duration': 88, - 'upload_date': '20151231', - 'timestamp': 1451564040, - 'age_limit': 0, - 'thumbnails': 'mincount:5', - 'ext': 'flv', - }, + _TESTS = [{ + 'url': 'http://www.mnet.com/tv/vod/171008', + 'info_dict': { + 'id': '171008', + 'title': 'SS_이해인@히든박스', + 'description': 'md5:b9efa592c3918b615ba69fe9f8a05c55', + 'duration': 88, + 'upload_date': '20151231', + 'timestamp': 1451564040, + 'age_limit': 0, + 'thumbnails': 'mincount:5', + 'thumbnail': 're:^https?://.*\.jpg$', + 'ext': 'flv', }, - { - 'url': 'http://mnet.interest.me/tv/vod/172790', - 'only_matching': True, + 'params': { + # rtmp download + 'skip_download': True, }, - { - 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', - 'only_matching': True, - }, - ] + }, { + 'url': 'http://mnet.interest.me/tv/vod/172790', + 'only_matching': True, + }, { + 'url': 'http://www.mnet.com/tv/vod/vod_view.asp?clip_id=172790&tabMenu=', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - info_url = 'http://content.api.mnet.com/player/vodConfig?id=%s' % video_id - info = self._download_json(info_url, video_id) - info = info['data']['info'] + + info = self._download_json( + 'http://content.api.mnet.com/player/vodConfig?id=%s&ctype=CLIP' % video_id, + video_id, 'Downloading vod config JSON')['data']['info'] title = info['title'] - rtmp_info_url = info['cdn'] + 'CLIP' - rtmp_info = self._download_json(rtmp_info_url, video_id) - file_url = rtmp_info['serverurl'] + rtmp_info['fileurl'] + + rtmp_info = self._download_json( + info['cdn'], video_id, 'Downloading vod cdn JSON') + + formats = [{ + 'url': rtmp_info['serverurl'] + rtmp_info['fileurl'], + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://flvfile.mnet.com/service/player/201602/cjem_player_tv.swf?v=201602191318', + }] + description = info.get('ment') duration = parse_duration(info.get('time')) timestamp = parse_iso8601(info.get('date'), delimiter=' ') age_limit = info.get('adult') if age_limit is not None: age_limit = 0 if age_limit == 'N' else 18 - thumbnails = [ - { - 'id': thumb_format, - 'url': thumb['url'], - 'width': int_or_none(thumb.get('width')), - 'height': int_or_none(thumb.get('height')), - } - for (thumb_format, thumb) in info.get('cover', {}).items() - ] + thumbnails = [{ + 'id': thumb_format, + 'url': thumb['url'], + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), + } for thumb_format, thumb in info.get('cover', {}).items() if thumb.get('url')] return { 'id': video_id, 'title': title, - 'url': file_url, 'description': description, 'duration': duration, 'timestamp': timestamp, 'age_limit': age_limit, 'thumbnails': thumbnails, - 'ext': 'flv', + 'formats': formats, } From 3f15fec1d13cf4b18c093271bee7ad7586c97f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 25 Mar 2016 03:56:27 +0600 Subject: [PATCH 116/116] Credit @Kagami for mnet (#8958) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 51dfc8ddd..ea8d39978 100644 --- a/AUTHORS +++ b/AUTHORS @@ -166,3 +166,4 @@ Ben Congdon Kacper Michajłow José Joaquín Atria Viťas Strádal +Kagami Hiiragi