From 00d24327efcac74b11dbc4d813aed74da9a501e0 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 30 Oct 2015 09:48:56 +0100 Subject: [PATCH 01/32] [vgtv] extract videos from FTV, Aftenposten, Aftonbladet using VGTVIE --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/aftenposten.py | 23 ----------- youtube_dl/extractor/vgtv.py | 60 ++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 38 deletions(-) delete mode 100644 youtube_dl/extractor/aftenposten.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f98e6487e..f7dcabcf7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,7 +9,6 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P\d+)' - _TEST = { - 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': 'fd828cd29774a729bf4d4425fe192972', - 'info_dict': { - 'id': '21039', - 'ext': 'mov', - 'title': 'TRAILER: "Sweatshop" - I can´t take any more', - 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', - 'timestamp': 1416927969, - 'upload_date': '20141125', - } - } - - def _real_extract(self, url): - return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..17213d9b6 100644 --- a/youtube_dl/extractor/vgtv.py +++ 
b/youtube_dl/extractor/vgtv.py @@ -11,16 +11,17 @@ from ..utils import ( class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV and BTTV' + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten, Aftonbladet' _VALID_URL = r'''(?x) (?: vgtv:| http://(?:www\.)? ) - (?Pvgtv|bt) + (?Pvgtv.no|(?:bt.no|aftenbladet.no)/tv|fvn.no/fvntv|aftenposten.no/webtv) (?: :| - \.no/(?:tv/)?\#!/(?:video|live)/ + /\#!/(?:video|live)/| + /embed?id= ) (?P[0-9]+) ''' @@ -59,17 +60,18 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Video is no longer available', }, { - # streamType: live + # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', 'info_dict': { 'id': '113063', - 'ext': 'flv', - 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'ext': 'mp4', + 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', 'view_count': int, @@ -78,26 +80,56 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + },{ + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', + 'md5': '7fbc265a3ca4933a423c7a66aa879a67', + 'info_dict': { + 'id': '21039', + 'ext': 'mp4', + 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', + 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', + 'duration': 66, + 'timestamp': 1417002452, + 'upload_date': '20141126', + 'view_count': int, + } }, { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, }, ] + _HOST_WEBSITES = { + 'vgtv.no': { + 'vendor': 'vgtv', + 'appname': 'vgtv', + }, + 'bt.no/tv': { + 'vendor': 'bt', + 'appname': 'bttv', + }, + 'aftenbladet.no/tv': { + 'vendor': 'sa', + 'appname': 'satv', + }, + 'fvn.no/fvntv': { + 'vendor': 'fvn', + 'appname': 'fvntv', + }, + 
'aftenposten.no/webtv': { + 'vendor': 'ap', + 'appname': 'aptv', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') - HOST_WEBSITES = { - 'vgtv': 'vgtv', - 'bt': 'bttv', - } - data = self._download_json( 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (host, video_id, HOST_WEBSITES[host]), + % (self._HOST_WEBSITES[host]['vendor'], video_id, self._HOST_WEBSITES[host]['appname']), video_id, 'Downloading media JSON') if data.get('status') == 'inactive': @@ -144,7 +176,7 @@ class VGTVIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(data['title']), + 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], From 804afc5871a88eaa32a6c161df67e6b37383d7d1 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 30 Oct 2015 10:20:38 +0100 Subject: [PATCH 02/32] [vgtv] improve _VALID_URL regex --- youtube_dl/extractor/vgtv.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 17213d9b6..e8039ec7f 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -11,17 +11,19 @@ from ..utils import ( class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV, BTTV, FTV, Aftenposten, Aftonbladet' + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' _VALID_URL = r'''(?x) - (?: - vgtv:| - http://(?:www\.)? + http://(?:www\.)? 
+ (?P + vgtv.no| + (?:bt|aftenbladet).no/tv| + fvn.no/fvntv| + aftenposten.no/webtv ) - (?Pvgtv.no|(?:bt.no|aftenbladet.no)/tv|fvn.no/fvntv|aftenposten.no/webtv) + / (?: - :| - /\#!/(?:video|live)/| - /embed?id= + \#!/(?:video|live)/| + embed?.*id= ) (?P[0-9]+) ''' @@ -211,7 +213,7 @@ class BTArticleIE(InfoExtractor): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') - return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') + return self.url_result('http://bt.no/tv/embed?id=%s' % video_id, 'VGTV') class BTVestlendingenIE(InfoExtractor): From 9f6b517671a7a9b16ccf2bc3f5153673a73895ff Mon Sep 17 00:00:00 2001 From: remitamine Date: Sun, 6 Dec 2015 07:43:46 +0100 Subject: [PATCH 03/32] [vgtv] extract all formats and improve extraction --- youtube_dl/extractor/vgtv.py | 51 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index e8039ec7f..8ce3d8b8f 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -13,7 +13,7 @@ from ..utils import ( class VGTVIE(InfoExtractor): IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' _VALID_URL = r'''(?x) - http://(?:www\.)? + https?://(?:www\.)? 
(?P vgtv.no| (?:bt|aftenbladet).no/tv| @@ -25,7 +25,7 @@ class VGTVIE(InfoExtractor): \#!/(?:video|live)/| embed?.*id= ) - (?P[0-9]+) + (?P\d+) ''' _TESTS = [ { @@ -82,7 +82,8 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - },{ + }, + { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', 'md5': '7fbc265a3ca4933a423c7a66aa879a67', 'info_dict': { @@ -145,35 +146,37 @@ class VGTVIE(InfoExtractor): hls_url = streams.get('hls') if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) hds_url = streams.get('hds') # wasLive hds are always 404 if hds_url and stream_type != 'wasLive': - formats.extend(self._extract_f4m_formats( - hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + mp4_urls = streams.get('pseudostreaming') or [] mp4_url = streams.get('mp4') if mp4_url: - _url = hls_url or hds_url - MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1]) - for mp4_format in _url.split(','): - m = re.search('(?P\d+)_(?P\d+)_(?P\d+)', mp4_format) - if not m: - continue - width = int(m.group('width')) - height = int(m.group('height')) - vbr = int(m.group('vbr')) - formats.append({ - 'url': MP4_URL_TEMPLATE % mp4_format, - 'format_id': 'mp4-%s' % vbr, - 'width': width, - 'height': height, + mp4_urls.append(mp4_url) + for mp4_url in mp4_urls: + format_info = { + 'url': mp4_url, + 'preference': 1, + } + mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) + if mobj: + vbr = int(mobj.group(3)) + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), 'vbr': vbr, - 
'preference': 1, + 'format_id': 'mp4-%s' % vbr, }) + formats.append(format_info) + self._sort_formats(formats) return { @@ -234,4 +237,4 @@ class BTVestlendingenIE(InfoExtractor): } def _real_extract(self, url): - return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') + return self.url_result('http://bt.no/tv/embed?id=%s' % self._match_id(url), 'VGTV') From 41c3b34b1f1f94c723f3af760ccddc4d7119464f Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 9 Dec 2015 10:50:11 +0100 Subject: [PATCH 04/32] [vgtv] add sortcut expressions to use the extractor --- youtube_dl/extractor/vgtv.py | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 8ce3d8b8f..1fba37578 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -12,21 +12,39 @@ from ..utils import ( class VGTVIE(InfoExtractor): IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + + _HOST_TO_APPNAME = { + 'vgtv.no': 'vgtv', + 'bt.no/tv': 'bttv', + 'aftenbladet.no/tv': 'satv', + 'fvn.no/fvntv': 'fvntv', + 'aftenposten.no/webtv': 'aptv', + } + + _APP_NAME_TO_VENDOR = { + 'vgtv': 'vgtv', + 'bttv': 'bt', + 'satv': 'sa', + 'fvntv': 'fvn', + 'aptv': 'ap', + } + _VALID_URL = r'''(?x) - https?://(?:www\.)? + (?:https?://(?:www\.)? 
(?P - vgtv.no| - (?:bt|aftenbladet).no/tv| - fvn.no/fvntv| - aftenposten.no/webtv + %s ) / (?: \#!/(?:video|live)/| embed?.*id= - ) + )| + (?P + %s + ):) (?P\d+) - ''' + ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) + _TESTS = [ { # streamType: vod @@ -102,37 +120,17 @@ class VGTVIE(InfoExtractor): 'only_matching': True, }, ] - _HOST_WEBSITES = { - 'vgtv.no': { - 'vendor': 'vgtv', - 'appname': 'vgtv', - }, - 'bt.no/tv': { - 'vendor': 'bt', - 'appname': 'bttv', - }, - 'aftenbladet.no/tv': { - 'vendor': 'sa', - 'appname': 'satv', - }, - 'fvn.no/fvntv': { - 'vendor': 'fvn', - 'appname': 'fvntv', - }, - 'aftenposten.no/webtv': { - 'vendor': 'ap', - 'appname': 'aptv', - }, - } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') + appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') + vendor = self._APP_NAME_TO_VENDOR[appname] data = self._download_json( 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (self._HOST_WEBSITES[host]['vendor'], video_id, self._HOST_WEBSITES[host]['appname']), + % (vendor, video_id, appname), video_id, 'Downloading media JSON') if data.get('status') == 'inactive': @@ -146,14 +144,16 @@ class VGTVIE(InfoExtractor): hls_url = streams.get('hls') if hls_url: - m3u8_formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + m3u8_formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) hds_url = streams.get('hds') # wasLive hds are always 404 if hds_url and stream_type != 'wasLive': - f4m_formats = self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) + f4m_formats = self._extract_f4m_formats( + hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) if f4m_formats: formats.extend(f4m_formats) @@ -216,7 
+216,7 @@ class BTArticleIE(InfoExtractor): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') - return self.url_result('http://bt.no/tv/embed?id=%s' % video_id, 'VGTV') + return self.url_result('bttv:%s' % video_id, 'VGTV') class BTVestlendingenIE(InfoExtractor): @@ -237,4 +237,4 @@ class BTVestlendingenIE(InfoExtractor): } def _real_extract(self, url): - return self.url_result('http://bt.no/tv/embed?id=%s' % self._match_id(url), 'VGTV') + return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') From d50116b8ac2484ce9e8bc6a3d885c5a4a09b4e47 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 10 Dec 2015 22:18:42 +0100 Subject: [PATCH 05/32] [vgtv] extract 5 digit length video ids using both xstream and vgtv --- youtube_dl/extractor/vgtv.py | 22 +++++++++++++++++----- youtube_dl/extractor/xstream.py | 14 +++++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 1fba37578..347410a78 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,13 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, ) -class VGTVIE(InfoExtractor): +class VGTVIE(XstreamIE): IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' _HOST_TO_APPNAME = { @@ -137,6 +138,15 @@ class VGTVIE(InfoExtractor): raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) + info = { + 'formats': [], + } + if len(video_id) == 5: + if appname == 'bttv': + info = self._extract_video_info('btno', video_id) + elif appname == 'aptv': + info = self._extract_video_info('ap', video_id) + streams = data['streamUrls'] stream_type = data.get('streamType') @@ -177,9 +187,11 @@ class VGTVIE(InfoExtractor): }) formats.append(format_info) - 
self._sort_formats(formats) + info['formats'].extend(formats) - return { + self._sort_formats(info['formats']) + + info.update({ 'id': video_id, 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], 'description': data['description'], @@ -187,9 +199,9 @@ class VGTVIE(InfoExtractor): 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'formats': formats, 'is_live': True if stream_type == 'live' else False, - } + }) + return info class BTArticleIE(InfoExtractor): diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 71584c291..436f8978b 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -42,11 +42,7 @@ class XstreamIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - + def _extract_video_info(self, partner_id, video_id): data = self._download_xml( 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' % (partner_id, video_id), @@ -97,6 +93,7 @@ class XstreamIE(InfoExtractor): formats.append({ 'url': link.get('href'), 'format_id': link.get('rel'), + 'preference': 2, }) thumbnails = [{ @@ -113,3 +110,10 @@ class XstreamIE(InfoExtractor): 'formats': formats, 'thumbnails': thumbnails, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + partner_id = mobj.group('partner_id') + video_id = mobj.group('id') + + return self._extract_video_info(partner_id, video_id) From ed63cbd6ee392a01d10f0ca3f36ac9442622f0a9 Mon Sep 17 00:00:00 2001 From: Austin Adams Date: Mon, 21 Dec 2015 20:26:15 -0500 Subject: [PATCH 06/32] [comcarcoff] adjust for json updates --- youtube_dl/extractor/comcarcoff.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 
81f3d7697..4391b7ce4 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -32,8 +32,8 @@ class ComCarCoffIE(InfoExtractor): webpage = self._download_webpage(url, display_id) full_data = json.loads(self._search_regex( - r'', - webpage, 'full data json')) + r'\nwindow.app = (?P.+?);\n', + webpage, 'full data json'))['videoData'] video_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] From ff43d2365ff5569c98df8e01250d34706e266c44 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 22 Dec 2015 07:58:33 +0100 Subject: [PATCH 07/32] [soompi] remove extractor http://tv.soompi.com now redirect to viki.com because Viki has acquired Soompi http://www.soompi.com/2015/08/19/we-got-married-soompi-joins-viki/ --- youtube_dl/extractor/__init__.py | 4 - youtube_dl/extractor/soompi.py | 146 ------------------------------- 2 files changed, 150 deletions(-) delete mode 100644 youtube_dl/extractor/soompi.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 760b65441..702cbc6e2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -591,10 +591,6 @@ from .snagfilms import ( ) from .snotr import SnotrIE from .sohu import SohuIE -from .soompi import ( - SoompiIE, - SoompiShowIE, -) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py deleted file mode 100644 index 5da66ca9e..000000000 --- a/youtube_dl/extractor/soompi.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .crunchyroll import CrunchyrollIE - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - remove_start, - xpath_text, -) - - -class SoompiBaseIE(InfoExtractor): - def _get_episodes(self, webpage, 
episode_filter=None): - episodes = self._parse_json( - self._search_regex( - r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), - None) - return list(filter(episode_filter, episodes)) - - -class SoompiIE(SoompiBaseIE, CrunchyrollIE): - IE_NAME = 'soompi' - _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P[0-9]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/watch/29235', - 'info_dict': { - 'id': '29235', - 'ext': 'mp4', - 'title': 'Episode 1096', - 'description': '2015-05-20' - }, - 'params': { - 'skip_download': True, - }, - }] - - def _get_episode(self, webpage, video_id): - return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] - - def _get_subtitles(self, config, video_id): - sub_langs = {} - for subtitle in config.findall('./{default}preload/subtitles/subtitle'): - sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - - subtitles = {} - for s in config.findall('./{default}preload/subtitle'): - lang_code = sub_langs.get(s.attrib['id']) - if not lang_code: - continue - sub_id = s.get('id') - data = xpath_text(s, './data', 'data') - iv = xpath_text(s, './iv', 'iv') - if not id or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage( - url, video_id, 'Downloading episode page') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - webpage = ee.cause.read() - block_message = self._html_search_regex( - r'(?s)
(.+?)
', webpage, - 'block message', default=None) - if block_message: - raise ExtractorError(block_message, expected=True) - raise - - formats = [] - config = None - for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): - config = self._download_xml( - 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), - video_id, 'Downloading %s XML' % format_id) - m3u8_url = xpath_text( - config, './{default}preload/stream_info/file', - '%s m3u8 URL' % format_id) - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id=format_id)) - self._sort_formats(formats) - - episode = self._get_episode(webpage, video_id) - - title = episode['name'] - description = episode.get('description') - duration = int_or_none(episode.get('duration')) - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] - - subtitles = self.extract_subtitles(config, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles - } - - -class SoompiShowIE(SoompiBaseIE): - IE_NAME = 'soompi:show' - _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P[0-9a-zA-Z\-_]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/shows/liar-game', - 'info_dict': { - 'id': 'liar-game', - 'title': 'Liar Game', - 'description': 'md5:52c02bce0c1a622a95823591d0589b66', - }, - 'playlist_count': 14, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - url, show_id, 'Downloading show page') - - title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') - description = self._og_search_description(webpage) - - entries = [ - self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') - for episode in self._get_episodes(webpage)] - - return 
self.playlist_result(entries, show_id, title, description) From dc016bf5216d4c0d5b5fb2cd707e1d08fa4b0517 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 22 Dec 2015 09:55:25 +0100 Subject: [PATCH 08/32] [viki] detect errors and fix formats extraction --- youtube_dl/extractor/viki.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a63c23617..ca3f20a3d 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -30,6 +30,12 @@ class VikiBaseIE(InfoExtractor): _token = None + _ERRORS = { + 'geo': 'Sorry, this content is not available in your region.', + 'upcoming': 'Sorry, this content is not yet available.', + # 'paywall': 'paywall', + } + def _prepare_call(self, path, timestamp=None, post_data=None): path += '?' if '?' not in path else '&' if not timestamp: @@ -67,6 +73,12 @@ class VikiBaseIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error), expected=True) + def _check_errors(self, data): + for reason, status in data.get('blocking', {}).items(): + if status and reason in self._ERRORS: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, self._ERRORS[reason]), expected=True) + def _real_initialize(self): self._login() @@ -193,6 +205,7 @@ class VikiIE(VikiBaseIE): 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', 'title': 'Love In Magic', + 'age_limit': 13, }, }] @@ -202,6 +215,8 @@ class VikiIE(VikiBaseIE): video = self._call_api( 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + self._check_errors(video) + title = self.dict_selection(video.get('titles', {}), 'en') if not title: title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id @@ -262,8 +277,11 @@ class VikiIE(VikiBaseIE): r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': - formats = 
self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + m3u8_formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', 'm3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': format_dict['url'], @@ -315,6 +333,8 @@ class VikiChannelIE(VikiBaseIE): 'containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') + self._check_errors(channel) + title = self.dict_selection(channel['titles'], 'en') description = self.dict_selection(channel['descriptions'], 'en') From 48a6c984b806141dc6d3da0a96df2e553bb815e0 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 22 Dec 2015 10:14:57 +0100 Subject: [PATCH 09/32] [bleacherreport] update test --- youtube_dl/extractor/bleacherreport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index bd2a6340b..38bda3af5 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -90,7 +90,7 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'md5': 'f0ca220af012d4df857b54f792c586bb', + 'md5': '8c2c12e3af7805152675446c905d159b', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'ext': 'flv', From 220bc3f0e3777b89de335cdbc58a7d105584f06b Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 22 Dec 2015 11:27:18 +0100 Subject: [PATCH 10/32] [franceinter] fix title extraction --- youtube_dl/extractor/franceinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 6613ee17a..90a17815d 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ 
-33,7 +33,7 @@ class FranceInterIE(InfoExtractor): video_url = 'http://www.franceinter.fr/' + path title = self._html_search_regex( - r'(.+?)', webpage, 'title') + r'(.+?)', webpage, 'title') description = self._html_search_regex( r'(.*?)', webpage, 'description', fatal=False) From 2db5806991145ee293c964ecd85623c093d2e429 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 22 Dec 2015 11:30:35 +0100 Subject: [PATCH 11/32] [franceinter] use _match_id --- youtube_dl/extractor/franceinter.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 90a17815d..fdc51f44f 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -23,8 +21,7 @@ class FranceInterIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) From 2be689b7e27df89648d1d98fa74c297f0e06cbc1 Mon Sep 17 00:00:00 2001 From: j Date: Mon, 21 Dec 2015 02:26:37 +0100 Subject: [PATCH 12/32] [theintercept] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/theintercept.py | 68 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/theintercept.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eac50eda5..042b1e921 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -657,6 +657,7 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE +from .theintercept import TheInterceptIE from .theonion import TheOnionIE from .theplatform import ( ThePlatformIE, diff --git 
a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py new file mode 100644 index 000000000..b096a28de --- /dev/null +++ b/youtube_dl/extractor/theintercept.py @@ -0,0 +1,68 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, +) + +class TheInterceptIE(InfoExtractor): + _VALID_URL = r'https://theintercept.com/fieldofvision/(?P.+?)/' + _TESTS = [{ + 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', + 'info_dict': { + 'id': 'thisisacoup-episode-four-surrender-or-die', + 'ext': 'mp4', + 'title': '#ThisIsACoup – Episode Four: Surrender or Die', + 'upload_date': '20151218', + 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + mobj = re.search(r'initialStoreTree =(?P.+})', webpage) + if mobj is None: + raise ExtractorError('Unable to extract initialStoreTree') + json_data = self._parse_json(mobj.group('json_data'), display_id) + + info = None + for post in json_data['resources']['posts'].values(): + if post['slug'] == display_id: + info = post + break + if info is None: + raise ExtractorError('Unable to find info for %s'%display_id) + + title = info['title'] + description = info['excerpt'] + upload_date = info['date'][:10].replace('-', '') + video_id = info['fov_videoid'] + creator = ','.join([a['display_name'] for a in info['authors']]) + thumbnail = self._og_search_property('image', webpage) + content_id = thumbnail.split('/')[-1].split('.')[0] + content_url = 'https://content.jwplatform.com/jw6/{content_id}.xml'.format(content_id=content_id) + content = self._download_xml(content_url, video_id) + + formats = [] + for source in content.findall('.//{http://rss.jwpcdn.com/}source'): + if source.attrib['file'].endswith('.m3u8'): + 
formats.extend(self._extract_m3u8_formats( + source.attrib['file'], video_id, 'mp4', preference=1, m3u8_id='hls')) + + return { + 'creator': creator, + 'description': description, + 'display_id': display_id, + 'formats': formats, + 'id': video_id, + 'id': video_id, + 'thumbnail': thumbnail, + 'title': title, + 'upload_date': upload_date, + } From 3b68efdc6ae109a840ff5f15f0e28910c2463b3f Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 22 Dec 2015 15:54:51 +0100 Subject: [PATCH 13/32] [vgtv] update tests and correct format sorting --- youtube_dl/extractor/vgtv.py | 31 +++++++++++++++++++++---------- youtube_dl/extractor/xstream.py | 2 +- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 347410a78..811ee197d 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -104,10 +104,10 @@ class VGTVIE(XstreamIE): }, { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': '7fbc265a3ca4933a423c7a66aa879a67', + 'md5': 'fd828cd29774a729bf4d4425fe192972', 'info_dict': { 'id': '21039', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', 'duration': 66, @@ -174,16 +174,15 @@ class VGTVIE(XstreamIE): for mp4_url in mp4_urls: format_info = { 'url': mp4_url, - 'preference': 1, } mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) if mobj: - vbr = int(mobj.group(3)) + tbr = int(mobj.group(3)) format_info.update({ 'width': int(mobj.group(1)), 'height': int(mobj.group(2)), - 'vbr': vbr, - 'format_id': 'mp4-%s' % vbr, + 'tbr': tbr, + 'format_id': 'mp4-%s' % tbr, }) formats.append(format_info) @@ -210,7 +209,7 @@ class BTArticleIE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', - 'md5': 'd055e8ee918ef2844745fcfd1a4175fb', 
+ 'md5': '2acbe8ad129b3469d5ae51b1158878df', 'info_dict': { 'id': '23199', 'ext': 'mp4', @@ -227,7 +226,7 @@ class BTArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( - r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') + r']+data-id="(\d+)"', webpage, 'video id') return self.url_result('bttv:%s' % video_id, 'VGTV') @@ -235,7 +234,7 @@ class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', 'info_dict': { @@ -246,7 +245,19 @@ class BTVestlendingenIE(InfoExtractor): 'timestamp': 1430473209, 'upload_date': '20150501', }, - } + 'skip': '404 Error', + }, { + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', + 'md5': 'a2893f8632e96389f4bdf36aa9463ceb', + 'info_dict': { + 'id': '86255', + 'ext': 'mov', + 'title': 'Du må tåle å fryse og være sulten', + 'description': 'md5:b8046f4d022d5830ddab04865791d063', + 'upload_date': '20150321', + 'timestamp': 1426942023, + }, + }] def _real_extract(self, url): return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 436f8978b..76c91bd92 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -93,7 +93,7 @@ class XstreamIE(InfoExtractor): formats.append({ 'url': link.get('href'), 'format_id': link.get('rel'), - 'preference': 2, + 'preference': 1, }) thumbnails = [{ From dbee18b5521edbfa1642c683ad2d317ba06e9d5b Mon Sep 17 00:00:00 2001 From: Abhishek Kedia Date: Mon, 21 Dec 2015 01:50:07 +0100 Subject: [PATCH 14/32] Improve extraction (Closes #7918) remove outer parentheses in if Conflicts: youtube_dl/extractor/imgur.py checked code with flake8 not returning 
list in case of single images. using the fact that id with length 5 are albums and more are single videos. Also for single videos ie ImgurIE both urls - http://imgur.com/gallery/oWeAMW2 and http://imgur.com/oWeAMW2 are equally fine. Change regex to allow this. For albums urls - http://imgur.com/gallery/Q95ko and http://imgur.com/Q95ko are ok. Change regex to allow this also. update description in ImgurIE Tests. Also move single video test 'https://imgur.com/gallery/YcAQlkx' from ImgurAlbumIE to ImgurIE. --- youtube_dl/extractor/imgur.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 70c8ca64e..88423f179 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(gallery/)?(?P[a-zA-Z0-9]{6,})' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +21,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,8 +29,17 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. 
Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, + }, { + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'description': 'Imgur: The most awesome images on the Internet.' + + } }] def _real_extract(self, url): @@ -100,7 +109,7 @@ class ImgurIE(InfoExtractor): class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(gallery/)?(?P[a-zA-Z0-9]{5})(?![a-zA-Z0-9])' _TEST = { 'url': 'http://imgur.com/gallery/Q95ko', @@ -113,12 +122,15 @@ class ImgurAlbumIE(InfoExtractor): def _real_extract(self, url): album_id = self._match_id(url) - album_images = self._download_json( - 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id)['data']['images'] + album_img_data = self._download_json( + 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, album_id)['data'] - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in album_images if image.get('hash')] + if len(album_img_data) == 0: + return self.url_result('http://imgur.com/%s' % album_id) + else: + album_images = album_img_data['images'] + entries = [ + self.url_result('http://imgur.com/%s' % image['hash']) + for image in album_images if image.get('hash')] return self.playlist_result(entries, album_id) From 774ce35571c08a1532fe4079224239adfdb80e43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 22 Dec 2015 21:48:48 +0600 Subject: [PATCH 15/32] [imgur] Improve (Closes #7928) --- youtube_dl/extractor/imgur.py | 41 +++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 
88423f179..85e9344aa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(gallery/)?(?P[a-zA-Z0-9]{6,})' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -40,6 +40,9 @@ class ImgurIE(InfoExtractor): 'description': 'Imgur: The most awesome images on the Internet.' } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, }] def _real_extract(self, url): @@ -109,28 +112,38 @@ class ImgurIE(InfoExtractor): class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(gallery/)?(?P[a-zA-Z0-9]{5})(?![a-zA-Z0-9])' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{5})(?:[/?#&]+)?$' - _TEST = { + _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', }, 'playlist_count': 25, - } + }, { + 'url': 'http://imgur.com/a/j6Orj', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'only_matching': True, + }] def _real_extract(self, url): album_id = self._match_id(url) - album_img_data = self._download_json( - 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, album_id)['data'] + album_images = self._download_json( + 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, + album_id, fatal=False) - if len(album_img_data) == 0: - return self.url_result('http://imgur.com/%s' % album_id) - else: - album_images = album_img_data['images'] - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in album_images if image.get('hash')] + if album_images: + data = album_images.get('data') + if data and isinstance(data, dict): + images = data.get('images') + if images and isinstance(images, list): + entries = [ + 
self.url_result('http://imgur.com/%s' % image['hash']) + for image in images if image.get('hash')] + return self.playlist_result(entries, album_id) - return self.playlist_result(entries, album_id) + # Fallback to single video + return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) From 4c24ed94640b148882f1ceb400127b3b3afcafd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 01:10:31 +0600 Subject: [PATCH 16/32] [comcarcoff] Improve json data regex and modernize --- youtube_dl/extractor/comcarcoff.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 4391b7ce4..edf5b29a0 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -31,9 +31,10 @@ class ComCarCoffIE(InfoExtractor): display_id = 'comediansincarsgettingcoffee.com' webpage = self._download_webpage(url, display_id) - full_data = json.loads(self._search_regex( - r'\nwindow.app = (?P.+?);\n', - webpage, 'full data json'))['videoData'] + full_data = self._parse_json( + self._search_regex( + r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), + display_id)['videoData'] video_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] From cfe9e5aa6c5b14016ae454649b1a9df9c7c18b3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 01:18:14 +0600 Subject: [PATCH 17/32] [comcarcoff] Extract duration --- youtube_dl/extractor/comcarcoff.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index edf5b29a0..2efa200b5 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,10 +1,12 @@ # encoding: utf-8 from __future__ import unicode_literals -import json - from .common 
import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) class ComCarCoffIE(InfoExtractor): @@ -16,6 +18,7 @@ class ComCarCoffIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20141127', 'timestamp': 1417107600, + 'duration': 1232, 'title': 'Happy Thanksgiving Miranda', 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', @@ -46,12 +49,18 @@ class ComCarCoffIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['mediaUrl'], video_id, ext='mp4') + timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( + video_data.get('pubDate')) + duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( + video_data.get('duration')) + return { 'id': video_id, 'display_id': display_id, 'title': video_data['title'], 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('pubDate')), + 'timestamp': timestamp, + 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), From 89abf7bf4d5dfc8c161924067f4430b7d81a8b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 02:09:50 +0600 Subject: [PATCH 18/32] [periscope] Fix token based extraction (Closes #7943) --- youtube_dl/extractor/periscope.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 63cc764bb..514e9b433 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -31,9 +31,8 @@ class PeriscopeIE(InfoExtractor): }] def _call_api(self, method, value): - attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( 
- 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) + 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) def _real_extract(self, url): token = self._match_id(url) From 3a70ed9ebeac782b922cc3cb3b74cd999e60845a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 02:54:32 +0600 Subject: [PATCH 19/32] [daum] Fix extraction (Closes #7949) --- youtube_dl/extractor/daum.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..e3fc639b0 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -37,9 +37,11 @@ class DaumIE(InfoExtractor): video_id = mobj.group('id') canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) + og_url = self._og_search_url(webpage, default=None) or self._search_regex( + r']+rel=(["\'])canonical\1[^>]+href=(["\'])(?P.+?)\2', + webpage, 'canonical url', group='url') full_id = self._search_regex( - r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', - webpage, 'full id') + r'tvpot\.daum\.net/v/([^/]+)', og_url, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' 
+ query, video_id, From 178b47e6af73521da50e9eec04af7fdceb236e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 02:59:49 +0600 Subject: [PATCH 20/32] [daum] Add test for #7949 --- youtube_dl/extractor/daum.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index e3fc639b0..9a94cf361 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -24,6 +24,18 @@ class DaumIE(InfoExtractor): 'upload_date': '20130831', 'duration': 3868, }, + }, { + # Test for https://github.com/rg3/youtube-dl/issues/7949 + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=M1O35s8HPOo0&clipid=73147290', + 'md5': 'c92d78bcee4424451f1667f275c1dc97', + 'info_dict': { + 'id': '73147290', + 'ext': 'mp4', + 'title': '싸이 - 나팔바지 [유희열의 스케치북] 299회 20151218', + 'description': '싸이 - 나팔바지', + 'upload_date': '20151219', + 'duration': 232, + }, }, { 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', 'only_matching': True, From 60427f63d129919d5c98e1176ea8136d0eedd0f4 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 23 Dec 2015 10:40:45 +0100 Subject: [PATCH 21/32] [appletrailers] Add support for AppleTrailers Section --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/appletrailers.py | 74 +++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fede5ff0d..3b541a538 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -25,7 +25,10 @@ from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE -from .appletrailers import AppleTrailersIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) from .archiveorg import ArchiveOrgIE from .ard import ( ARDIE, diff --git a/youtube_dl/extractor/appletrailers.py 
b/youtube_dl/extractor/appletrailers.py index f68dc3236..ca9a70924 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,6 +11,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): + IE_NAME = 'appletrailers' _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', @@ -144,3 +145,76 @@ class AppleTrailersIE(InfoExtractor): 'id': movie, 'entries': playlist, } + + +class AppleTrailersSectionIE(InfoExtractor): + IE_NAME = 'appletrailers:section' + _SECTIONS = { + 'justadded': { + 'feed_path': 'just_added', + 'title': 'Just Added', + }, + 'exclusive': { + 'feed_path': 'exclusive', + 'title': 'Exclusive', + }, + 'justhd': { + 'feed_path': 'just_hd', + 'title': 'Just HD', + }, + 'mostpopular': { + 'feed_path': 'most_pop', + 'title': 'Most Popular', + }, + 'moviestudios': { + 'feed_path': 'studios', + 'title': 'Movie Studios', + }, + } + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P%s)' % '|'.join(_SECTIONS) + _TESTS = [{ + 'url': 'http://trailers.apple.com/#section=justadded', + 'info_dict': { + 'title': 'Just Added', + 'id': 'justadded', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=exclusive', + 'info_dict': { + 'title': 'Exclusive', + 'id': 'exclusive', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=justhd', + 'info_dict': { + 'title': 'Just HD', + 'id': 'justhd', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=mostpopular', + 'info_dict': { + 'title': 'Most Popular', + 'id': 'mostpopular', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=moviestudios', + 'info_dict': { + 'title': 'Movie Studios', + 'id': 'moviestudios', + }, + 'playlist_mincount': 80, + }] + + def _real_extract(self, url): + section = self._match_id(url) + section_data = 
self._download_json( + 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], + section) + entries = [ + self.url_result('http://trailers.apple.com' + e['location']) + for e in section_data] + return self.playlist_result(entries, section, self._SECTIONS[section]['title']) From f10c27b8cb35c72b5f9633956f71f96da72ada31 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 23 Dec 2015 14:05:06 +0100 Subject: [PATCH 22/32] release 2015.12.23 --- docs/supportedsites.md | 8 +++----- youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 299bc5e72..1a5c7cde9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -23,7 +23,6 @@ - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **Aftenposten** - **Aftonbladet** - **AirMozilla** - **AlJazeera** @@ -34,7 +33,8 @@ - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 - - **AppleTrailers** + - **appletrailers** + - **appletrailers:section** - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** @@ -502,8 +502,6 @@ - **SnagFilmsEmbed** - **Snotr** - **Sohu** - - **soompi** - - **soompi:show** - **soundcloud** - **soundcloud:playlist** - **soundcloud:search**: Soundcloud search @@ -627,7 +625,7 @@ - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - - **VGTV**: VGTV and BTTV + - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **Vice** - **Viddler** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7095033c5..255d64269 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.12.21' +__version__ = '2015.12.23' From 7fe37d8a05609229332d5a156cb9b7cf4bba2790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Dec 2015 14:48:40 +0100 Subject: [PATCH 23/32] [appletrailers] Improve regex for fixing '' tags 
(#7953) --- youtube_dl/extractor/appletrailers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index ca9a70924..82beed2ce 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -80,7 +80,7 @@ class AppleTrailersIE(InfoExtractor): def fix_html(s): s = re.sub(r'(?s).*?', '', s) - s = re.sub(r'', r'', s) + s = re.sub(r'', r'', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ From 747b028412828c66080c7f165b461a7ea490fead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 20:42:36 +0600 Subject: [PATCH 24/32] [24video] Fix extraction (Closes #7956) --- youtube_dl/extractor/twentyfourvideo.py | 35 +++++++++---------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index c1ee1decc..cb9e5f1b5 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -64,33 +64,22 @@ class TwentyFourVideoIE(InfoExtractor): r'
(\d+) комментари', webpage, 'comment count', fatal=False)) - formats = [] + # Sets some cookies + self._download_xml( + r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + video_id, 'Downloading init XML') - pc_video = self._download_xml( + video = self._download_xml( 'http://www.24video.net/video/xml/%s?mode=play' % video_id, - video_id, 'Downloading PC video URL').find('.//video') + video_id, 'Downloading video XML').find('.//video') - formats.append({ - 'url': pc_video.attrib['url'], - 'format_id': 'pc', - 'quality': 1, - }) + formats = [{ + 'url': video.attrib['url'], + }] - like_count = int_or_none(pc_video.get('ratingPlus')) - dislike_count = int_or_none(pc_video.get('ratingMinus')) - age_limit = 18 if pc_video.get('adult') == 'true' else 0 - - mobile_video = self._download_xml( - 'http://www.24video.net/video/xml/%s' % video_id, - video_id, 'Downloading mobile video URL').find('.//video') - - formats.append({ - 'url': mobile_video.attrib['url'], - 'format_id': 'mobile', - 'quality': 0, - }) - - self._sort_formats(formats) + like_count = int_or_none(video.get('ratingPlus')) + dislike_count = int_or_none(video.get('ratingMinus')) + age_limit = 18 if video.get('adult') == 'true' else 0 return { 'id': video_id, From 128eb31d90583113083ba1fe329eb4cf42c2989f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 20:49:41 +0600 Subject: [PATCH 25/32] [24video] Fix extraction on python 2.6 --- youtube_dl/extractor/twentyfourvideo.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index cb9e5f1b5..68e2277a4 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( parse_iso8601, int_or_none, + xpath_attr, + xpath_element, ) @@ -69,12 +71,14 @@ class TwentyFourVideoIE(InfoExtractor): 
r'http://www.24video.net/video/xml/%s?mode=init' % video_id, video_id, 'Downloading init XML') - video = self._download_xml( + video_xml = self._download_xml( 'http://www.24video.net/video/xml/%s?mode=play' % video_id, - video_id, 'Downloading video XML').find('.//video') + video_id, 'Downloading video XML') + + video = xpath_element(video_xml, './/video', 'video', fatal=True) formats = [{ - 'url': video.attrib['url'], + 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), }] like_count = int_or_none(video.get('ratingPlus')) From be514c856cf9f95fe3e0d45b1df0319a0872b911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 20:49:52 +0600 Subject: [PATCH 26/32] [24video] Fix test --- youtube_dl/extractor/twentyfourvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 68e2277a4..e03e2dbaa 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -17,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'd041af8b5b4246ea466226a0d6693345', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', 'info_dict': { 'id': '1044982', 'ext': 'mp4', From dcdc352371115007028632da6ae377d0ad39b62a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 21:13:31 +0600 Subject: [PATCH 27/32] [instagram:user] Improve _VALID_URL (Closes #7955) --- youtube_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c158f2064..e5e16ca3b 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -47,7 +47,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https://instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + _VALID_URL = 
r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { From 261b4c23c70c7c5dc4fe9fd22cf9e867b7456c40 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 23 Dec 2015 17:48:37 +0100 Subject: [PATCH 28/32] [appletrailers] skip clips with empty url --- youtube_dl/extractor/appletrailers.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 82beed2ce..62ed0c918 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -64,6 +64,12 @@ class AppleTrailersIE(InfoExtractor): }, }, ] + }, { + 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', + 'info_dict': { + 'id': 'blackthorn', + }, + 'playlist_mincount': 2, }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, @@ -97,6 +103,9 @@ class AppleTrailersIE(InfoExtractor): trailer_info_json = self._search_regex(self._JSON_RE, on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] @@ -108,7 +117,6 @@ class AppleTrailersIE(InfoExtractor): if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') From a8f1d167f6741485b7cbec2cb355315c3774d5bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 23 Dec 2015 17:55:58 +0100 Subject: [PATCH 29/32] [arte] Prefer json URLs that contain the video id from the 'vid' 
parameter in the URL (fixes #7920) --- youtube_dl/extractor/arte.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2a00da3ee..10301a8ea 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -68,9 +68,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if 'vid' in query: + video_id = query['vid'][0] + else: + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') return video_id, lang def _real_extract(self, url): @@ -79,9 +83,15 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): + patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') + ids = (video_id, '') + # some pages contain multiple videos (like + # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), + # so we first try to look for json URLs that contain the video id from + # the 'vid' parameter. 
+ patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] json_url = self._html_search_regex( - [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url', default=None) + patterns, webpage, 'json vp url', default=None) if not json_url: iframe_url = self._html_search_regex( r']+src=(["\'])(?P.+\bjson_url=.+?)\1', From 2c566d02febb0cf137a8dce8646957beb1415770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Dec 2015 23:22:47 +0600 Subject: [PATCH 30/32] [pbs] Extend PBS station regex (Closes #7964) --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 744e4a09a..97e8ffc97 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -16,7 +16,7 @@ from ..utils import ( class PBSIE(InfoExtractor): _STATIONS = ( - (r'(?:video|www)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ + (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org From fc383f199e73358f88ecf24b7e804dda7400afae Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Wed, 23 Dec 2015 17:35:10 +0100 Subject: [PATCH 31/32] Fix typos --- CONTRIBUTING.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f3fe0d432..d15267d7e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,7 +28,7 @@ So please elaborate on what feature you are requesting, or what bug you want to - How it could be fixed - How your proposed solution would look like -If your report is shorter than two lines, it is almost certainly missing some 
of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. diff --git a/README.md b/README.md index 7002f45e0..3a4707227 100644 --- a/README.md +++ b/README.md @@ -830,7 +830,7 @@ So please elaborate on what feature you are requesting, or what bug you want to - How it could be fixed - How your proposed solution would look like -If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. 
As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. From 96db61ffb83de9d912003a4778e9ce7c4d46e848 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 23 Dec 2015 22:36:53 +0100 Subject: [PATCH 32/32] [theintercept] improve extraction --- youtube_dl/extractor/theintercept.py | 67 ++++++++++------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py index b096a28de..8cb3c3669 100644 --- a/youtube_dl/extractor/theintercept.py +++ b/youtube_dl/extractor/theintercept.py @@ -1,24 +1,28 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor - +from ..compat import compat_str from ..utils import ( + parse_iso8601, + int_or_none, ExtractorError, ) + class TheInterceptIE(InfoExtractor): - _VALID_URL = r'https://theintercept.com/fieldofvision/(?P.+?)/' + _VALID_URL = r'https://theintercept.com/fieldofvision/(?P[^/?#]+)' _TESTS = [{ 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', + 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', 'info_dict': { - 'id': 'thisisacoup-episode-four-surrender-or-die', + 'id': '46214', 'ext': 'mp4', 'title': '#ThisIsACoup – Episode Four: Surrender or Die', - 'upload_date': '20151218', 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', + 'timestamp': 1450429239, + 'upload_date': '20151218', + 'comment_count': int, } }] @@ -26,43 +30,20 @@ class TheInterceptIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - mobj = 
re.search(r'initialStoreTree =(?P.+})', webpage) - if mobj is None: - raise ExtractorError('Unable to extract initialStoreTree') - json_data = self._parse_json(mobj.group('json_data'), display_id) + json_data = self._parse_json(self._search_regex( + r'initialStoreTree\s*=\s*(?P{.+})', webpage, + 'initialStoreTree'), display_id) - info = None for post in json_data['resources']['posts'].values(): if post['slug'] == display_id: - info = post - break - if info is None: - raise ExtractorError('Unable to find info for %s'%display_id) - - title = info['title'] - description = info['excerpt'] - upload_date = info['date'][:10].replace('-', '') - video_id = info['fov_videoid'] - creator = ','.join([a['display_name'] for a in info['authors']]) - thumbnail = self._og_search_property('image', webpage) - content_id = thumbnail.split('/')[-1].split('.')[0] - content_url = 'https://content.jwplatform.com/jw6/{content_id}.xml'.format(content_id=content_id) - content = self._download_xml(content_url, video_id) - - formats = [] - for source in content.findall('.//{http://rss.jwpcdn.com/}source'): - if source.attrib['file'].endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats( - source.attrib['file'], video_id, 'mp4', preference=1, m3u8_id='hls')) - - return { - 'creator': creator, - 'description': description, - 'display_id': display_id, - 'formats': formats, - 'id': video_id, - 'id': video_id, - 'thumbnail': thumbnail, - 'title': title, - 'upload_date': upload_date, - } + return { + '_type': 'url_transparent', + 'url': 'jwplatform:%s' % post['fov_videoid'], + 'id': compat_str(post['ID']), + 'display_id': display_id, + 'title': post['title'], + 'description': post.get('excerpt'), + 'timestamp': parse_iso8601(post.get('date')), + 'comment_count': int_or_none(post.get('comments_number')), + } + raise ExtractorError('Unable to find the current post')