From a745475808e125a590afb14df48c565309d3f75c Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 02:50:46 +0200 Subject: [PATCH 01/30] Ir90Tv Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ir90tv.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/ir90tv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5dfa781f8..ee05a6958 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -229,6 +229,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .ir90tv import Ir90TvIE from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py new file mode 100644 index 000000000..5aa9d6ff4 --- /dev/null +++ b/youtube_dl/extractor/ir90tv.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Ir90TvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P[0-9]+)/.*' + _TEST = { + 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'md5': '411dbd94891381960cb9e13daa47a869', + 'info_dict': { + 'id': '95719', + 'ext': 'mp4', + 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... + title = self._html_search_regex( + r'\n90tv.ir :: (.*?)', webpage, 'title') + + video_url = self._search_regex( + r']+src="([^"]+)"', webpage, 'video url') + + thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') + print thumbnail + + + return { + 'url': video_url, + 'id': video_id, + 'title': title, + 'video_url' : video_url, + 'thumbnail' : thumbnail, + } \ No newline at end of file From 54b31d149e7be08eb7be9981a9eec398d11f17ef Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 02:55:01 +0200 Subject: [PATCH 02/30] Ir90Tv Add new extractor --- youtube_dl/extractor/ir90tv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 5aa9d6ff4..3a3cb4887 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -38,4 +38,4 @@ class Ir90TvIE(InfoExtractor): 'title': title, 'video_url' : video_url, 'thumbnail' : thumbnail, - } \ No newline at end of file + } From a650110ba762b2658c64392317c1afd2a284dd3d Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 04:32:08 +0200 Subject: [PATCH 03/30] remove print --- youtube_dl/extractor/ir90tv.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 3a3cb4887..b79529b1b 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -21,7 +21,6 @@ class Ir90TvIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # TODO more code goes here, for example ... 
title = self._html_search_regex( r'\n90tv.ir :: (.*?)', webpage, 'title') @@ -29,8 +28,6 @@ class Ir90TvIE(InfoExtractor): r']+src="([^"]+)"', webpage, 'video url') thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') - print thumbnail - return { 'url': video_url, From 9e96dc8b3561c1e6e62ce6a34efba485e5e49054 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:36:59 -0500 Subject: [PATCH 04/30] Support BBC News (bbc.com/news) --- docs/supportedsites.md | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbcnews.py | 162 +++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220e52b98..d4ccbbd3a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,6 +50,7 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer + - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..51d2d20e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bbcnews import BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py new file mode 100644 index 000000000..b10e30a81 --- /dev/null +++ b/youtube_dl/extractor/bbcnews.py @@ -0,0 +1,162 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) +from ..compat import compat_HTTPError +import re +from .bbccouk import BBCCoUkIE + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _duration_str2int(self, str): + if not str: + return None + ret = re.match(r'^\d+$', str) + if ret: + return int(ret.group(0)) + ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) + if ret: + total=int(ret.group('s')) + if ret.group('m'): + total+=(int(ret.group('m'))*60) + if ret.group('h'): + total+=(int(ret.group('h'))*3600) + return total + return None + + def _download_media_selector(self, programme_id): + # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not + # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ + # Could add third urlspec arg 
to BBCCoUkIE._download_media_selector instead of duplicating it + + try: + media_selection = self._download_xml( + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, + programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + else: + raise + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + return formats, subtitles + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = self._duration_str2int(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) From a8b081a0523c412fd4e01d5cddec7ae382c4793e Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:52:25 -0500 Subject: [PATCH 05/30] BBCNewsIE: eliminate redundant 
function. BBCCoUkIE._download_media_selector: use class variable instead of hardcoded string for mediaselector_url template. --- youtube_dl/extractor/bbccouk.py | 4 +++- youtube_dl/extractor/bbcnews.py | 42 ++------------------------------- 2 files changed, 5 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 0305f88b5..dcc5fc2fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -15,6 +15,8 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -277,7 +279,7 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): try: media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, + self.mediaselector_url % programme_id, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index b10e30a81..9bb8d42e6 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -14,6 +14,8 @@ class BBCNewsIE(BBCCoUkIE): IE_DESC = 'BBC news' _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _TESTS = [{ 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { @@ -59,46 +61,6 @@ class BBCNewsIE(BBCCoUkIE): return total return None - def _download_media_selector(self, programme_id): - # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not - # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ - # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it - - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) - else: - raise - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) From d5552a3477a0970f4aaaa746ce07c816267bb9cf Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 06:25:50 
-0500 Subject: [PATCH 06/30] bbcnews: Switch to parse_duration, revert change to docs/supportedsites.md --- docs/supportedsites.md | 1 - youtube_dl/extractor/bbcnews.py | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4ccbbd3a..220e52b98 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,7 +50,6 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer - - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index 9bb8d42e6..fd4a5e38f 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError @@ -45,22 +46,6 @@ class BBCNewsIE(BBCCoUkIE): } }] - def _duration_str2int(self, str): - if not str: - return None - ret = re.match(r'^\d+$', str) - if ret: - return int(ret.group(0)) - ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) - if ret: - total=int(ret.group('s')) - if ret.group('m'): - total+=(int(ret.group('m'))*60) - if ret.group('h'): - total+=(int(ret.group('h'))*3600) - return total - return None - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) @@ -88,7 +73,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href', None) title = jent['caption'] - duration = self._duration_str2int(jent.get('duration',None)) + duration = parse_duration(jent.get('duration',None)) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From 10273d6e0846cd8f3762e3777712d5cd2a0cafcd Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:22:13 -0500 Subject: [PATCH 07/30] toss new stuff into old file --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bbccouk.py | 101 ++++++++++++++++++++++++++++ youtube_dl/extractor/bbcnews.py | 109 ------------------------------- 3 files changed, 102 insertions(+), 111 deletions(-) delete mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 51d2d20e9..f9f7bdfaf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,8 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE -from .bbcnews import BBCNewsIE +from .bbccouk import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index dcc5fc2fa..ea682fb6f 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -5,9 +5,11 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError +import re class BBCCoUkIE(InfoExtractor): @@ -394,3 +396,102 @@ class BBCCoUkIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + mediaselector_url = 
'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = parse_duration(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py deleted file mode 100644 index fd4a5e38f..000000000 --- a/youtube_dl/extractor/bbcnews.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - int_or_none, -) -from ..compat import compat_HTTPError -import re -from .bbccouk import BBCCoUkIE - -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - 
IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - - _TESTS = [{ - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', - }, - 'playlist_count': 2, - },{ - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - }, - 'playlist_count': 9, - },{ - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') - - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-','') - - ret = [] - # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: - raise ExtractorError('No video found', expected=True) - - for ent in matches: - jent = self._parse_json(ent,list_id) - - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) - - title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) - description = list_title + ' - ' + jent.get('caption','') - thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') - - self._sort_formats(formats) - - ret.append( { - 'id': programme_id, - 'uploader': 'BBC News', - 'upload_date': pubdate, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } ) - - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) From 75ab0ebcf593ec91a46d83e69854ffa313d33309 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:24:02 -0500 Subject: [PATCH 08/30] no .get('..',None) --- youtube_dl/extractor/bbccouk.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index ea682fb6f..de4d7f9c0 100644 
--- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -457,15 +457,15 @@ class BBCNewsIE(BBCCoUkIE): for ent in matches: jent = self._parse_json(ent,list_id) - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) + programme_id = jent.get('externalId') + xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) + duration = parse_duration(jent.get('duration') description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) + thumbnail=jent['image'].get('href') if programme_id: formats, subtitles = self._download_media_selector(programme_id) From 77c975f536befbe89bf718e86282958d391d9ffe Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:28:14 -0500 Subject: [PATCH 09/30] typofix --- youtube_dl/extractor/bbccouk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index de4d7f9c0..f9404f3fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -461,7 +461,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration') + duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From de939d89eb83c851c6db66933e5fc0c401a1a679 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:04:46 -0500 Subject: [PATCH 10/30] Support BBC news in other languages, non-mediaselector videos --- youtube_dl/extractor/bbccouk.py | 87 +++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index f9404f3fa..72e20857b 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -401,7 +401,7 @@ class BBCCoUkIE(InfoExtractor): class BBCNewsIE(BBCCoUkIE): IE_NAME = 'bbc.com' IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' @@ -432,56 +432,115 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } + },{ + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'note': 'Video', + 'info_dict': { + 'id': 'NA', + 'ext': 'mp4', + 'title': 'YPG - Tel Abyad..n tamam. 
kontrol.m.zde', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'note': 'Video', + 'info_dict': { + 'id': '39275083', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'duration': 87, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: pubdate = pubdate.replace('-','') ret = [] + jsent = [] + # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) + ) + + if len(jsent) == 0: + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset,list_id) + for key, val in jmasset.get('videos',{}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) + + if len(jsent) == 0: # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) + + if len(jsent) == 0: raise ExtractorError('No video found', expected=True) - for ent in matches: - jent = self._parse_json(ent,list_id) - + for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('href') + xml_url = jent.get('hxref') + + title = jent.get('caption',list_title) - title = jent['caption'] duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') + formats = [] + subtitles = [] + if programme_id: formats, subtitles = self._download_media_selector(programme_id) + elif jent.has_key('sourceFiles'): + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append( { + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + } ) elif xml_url: # Cheap fallback # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') + + if len(formats) == 0: + raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') self._sort_formats(formats) ret.append( { - 'id': 
programme_id, + 'id': jent.get('programme_id',jent.get('id')), 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 7bb23aeca4e9076528e3d31d501a9a288dcd444c Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:08:13 -0500 Subject: [PATCH 11/30] rename bbccouk.py -> bbc.py --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{bbccouk.py => bbc.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename youtube_dl/extractor/{bbccouk.py => bbc.py} (100%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9f7bdfaf..a48346e60 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,7 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE, BBCNewsIE +from .bbc import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbc.py similarity index 100% rename from youtube_dl/extractor/bbccouk.py rename to youtube_dl/extractor/bbc.py From 2a282a3b5f366ba0569bae477d5060329ba254fb Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:11:41 -0500 Subject: [PATCH 12/30] Unbreak breakage that was broken to test breakage --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 72e20857b..310db9d1d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -502,7 +502,7 @@ class BBCNewsIE(BBCCoUkIE): for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('hxref') + xml_url = jent.get('href') title = jent.get('caption',list_title) From a9dcf4a860214e37971ab05f27f74bbae65ff8ae Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 23 Jun 2015 01:08:07 -0500 Subject: [PATCH 13/30] Prefer externalId over non-mediaserver-specific hashkey for video id. 
--- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 310db9d1d..fed344ea0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -540,7 +540,7 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) ret.append( { - 'id': jent.get('programme_id',jent.get('id')), + 'id': jent.get('id') if programme_id == None else programme_id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From da92eeae42f556926cb676b3c14e270603b7e38e Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 25 Jun 2015 00:31:32 -0500 Subject: [PATCH 14/30] Fix tests, description formatting --- youtube_dl/extractor/bbc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index fed344ea0..bb671d473 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -428,6 +428,8 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'upload_date': '20150324', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -438,8 +440,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': 'NA', 'ext': 'mp4', - 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', 'duration': 47, + 'upload_date': '20150615', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -450,8 +455,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': '39275083', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', 'duration': 87, + 'upload_date': '20150619', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -507,7 +515,9 @@ class BBCNewsIE(BBCCoUkIE): title = jent.get('caption',list_title) duration = parse_duration(jent.get('duration')) - description = list_title + ' - ' + jent.get('caption','') + description = list_title + if jent.get('caption'): + description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') @@ -539,8 +549,12 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) + id = jent.get('id') if programme_id == None else programme_id + if id == None: + id = 'NA' + ret.append( { - 'id': jent.get('id') if programme_id == None else programme_id, + 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 36da48798a28b8261d2f39f73f2522651d58a364 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:27:50 -0500 Subject: [PATCH 15/30] handle titles and captions set to '' --- youtube_dl/extractor/bbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 471d865d2..c910eb55a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -497,11 +497,13 @@ class BBCNewsIE(BBCCoUkIE): programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption',list_title) + title = 
jent.get('caption','') + if title == '': + title = list_title duration = parse_duration(jent.get('duration')) description = list_title - if jent.get('caption'): + if jent.get('caption', '') != '': description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): From a3bfddfa5ee33cf085b959536f1025c0aa53cc77 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:47:02 -0500 Subject: [PATCH 16/30] bbc.py: correct syntax --- youtube_dl/extractor/bbc.py | 106 ++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c910eb55a..c8f285165 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -397,14 +397,14 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Russia stages massive WW2 parade despite Western boycott', }, 'playlist_count': 2, - },{ + }, { 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', }, 'playlist_count': 9, - },{ + }, { 'url': 'http://www.bbc.com/news/world-europe-32041533', 'note': 'Video', 'info_dict': { @@ -419,7 +419,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'note': 'Video', 'info_dict': { @@ -434,7 +434,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'note': 'Video', 'info_dict': { @@ -459,88 +459,88 @@ class BBCNewsIE(BBCCoUkIE): pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: - pubdate = pubdate.replace('-','') + pubdate = pubdate.replace('-', '') ret = [] jsent = [] # works with bbc.com/news/something-something-123456 articles jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) + lambda m: self._parse_json(m, list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) ) if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset,list_id) - for key, val in jmasset.get('videos',{}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset, list_id) + for key, val in jmasset.get('videos', {}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m, list_id), + 
re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) + raise ExtractorError('No video found', expected=True) for jent in jsent: programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption','') + title = jent.get('caption', '') if title == '': - title = list_title + title = list_title duration = parse_duration(jent.get('duration')) description = list_title if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') + description += ' - ' + jent.get('caption') thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href') + if jent.get('image') is not None: + thumbnail = jent['image'].get('href') formats = [] subtitles = [] if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.has_key('sourceFiles'): - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append( { - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - } ) + formats, subtitles = self._download_media_selector(programme_id) + elif jent.get('sourceFiles') is not None: + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append({ + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + }) elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') - + raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + self._sort_formats(formats) - id = jent.get('id') if programme_id == None else programme_id - if id == None: - id = 'NA' + id = jent.get('id') if programme_id is None else programme_id + if id is None: + id = 'NA' - ret.append( { + ret.append({ 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, @@ -550,8 +550,8 @@ class BBCNewsIE(BBCCoUkIE): 'duration': duration, 'formats': formats, 'subtitles': subtitles, - } ) + }) if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) + return self.playlist_result(ret, list_id, list_title) raise ExtractorError('No video found', expected=True) From 678e436f2e77f1ae3a57c4b5d1fc3d74342ab412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 02:09:34 +0600 Subject: [PATCH 17/30] [youtube] Handle empty allowed regions (Closes #6351) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 229fc3a0f..4023a6e50 
100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -993,7 +993,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'reason' in video_info: if 'The uploader has not made this video available in your country.' in video_info['reason']: regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) - if regions_allowed is not None: + if regions_allowed: raise ExtractorError('YouTube said: This video is available in %s only' % ( ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), expected=True) From b14fa8e6874818a3f210b2a67cf53345000defdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 15:47:53 +0600 Subject: [PATCH 18/30] [soundcloud:set] Defer download link resolve (Closes #6354) --- youtube_dl/extractor/soundcloud.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 118ca4832..cdee8e2a3 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -282,9 +282,11 @@ class SoundcloudSetIE(SoundcloudIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']] + return { '_type': 'playlist', - 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']], + 'entries': entries, 'id': '%s' % info['id'], 'title': info['title'], } From 40a2d17052e9b542eb3c360a0ce067d244e07fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 15:48:44 +0600 Subject: [PATCH 19/30] [soundcloud:playlist] Defer download link resolve --- youtube_dl/extractor/soundcloud.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cdee8e2a3..0a6c9fe72 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -381,9 +381,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') - entries = [ - self._extract_info_dict(t, quiet=True, secret_token=token) - for t in data['tracks']] + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']] return { '_type': 'playlist', From eab7faa0c1e8511bc91c64347d0dffc28c94f101 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:39:01 +0800 Subject: [PATCH 20/30] [ir90tv] Test (?:www\.)? 
part in _VALID_URL --- youtube_dl/extractor/ir90tv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index b79529b1b..880a6e32f 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class Ir90TvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P[0-9]+)/.*' - _TEST = { + _TESTS = [{ 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', 'md5': '411dbd94891381960cb9e13daa47a869', 'info_dict': { @@ -15,7 +15,10 @@ class Ir90TvIE(InfoExtractor): 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', 'thumbnail': 're:^https?://.*\.jpg$', } - } + }, { + 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 9700cd9097445d598515dc41fb3cb9421403b9b9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:42:40 +0800 Subject: [PATCH 21/30] [ir90tv] Improve title extraction --- youtube_dl/extractor/ir90tv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 880a6e32f..92333c3ea 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import remove_start class Ir90TvIE(InfoExtractor): @@ -24,8 +25,8 @@ class Ir90TvIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'\n90tv.ir :: (.*?)', webpage, 'title') + title = remove_start(self._html_search_regex( + r'([^<]+)', webpage, 'title'), '90tv.ir :: ') video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') From 7523647391969f8d747ba0fc178592f7f3d5e453 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:43:07 +0800 Subject: [PATCH 22/30] [ir90tv] PEP8 --- youtube_dl/extractor/ir90tv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 92333c3ea..6418d6178 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -37,6 +37,6 @@ class Ir90TvIE(InfoExtractor): 'url': video_url, 'id': video_id, 'title': title, - 'video_url' : video_url, - 'thumbnail' : thumbnail, + 'video_url': video_url, + 'thumbnail': thumbnail, } From 2c7c721933e53ece49bee0140d2dad9a8219d6e4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:48:00 +0800 Subject: [PATCH 23/30] [ir90tv] Optional fields should be non-fatal --- youtube_dl/extractor/ir90tv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 6418d6178..214bcd5b5 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -31,7 +31,7 @@ class Ir90TvIE(InfoExtractor): video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') - thumbnail = 
self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') + thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) return { 'url': video_url, From 9afa1770d1a6835bc8fee48dc86cd1a702d1f67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 20:21:42 +0600 Subject: [PATCH 24/30] [bbc] Improve playlist extraction, refactor, expand support and document --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/bbc.py | 375 +++++++++++++++++++++---------- 2 files changed, 259 insertions(+), 121 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bc61cbdc5..d77ed3ba2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -43,7 +43,10 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbc import BBCCoUkIE, BBCNewsIE +from .bbc import ( + BBCCoUkIE, + BBCIE, +) from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 86327d8ed..2a0901ee4 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,15 +1,18 @@ +# coding: utf-8 from __future__ import unicode_literals +import re import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_duration, + float_or_none, int_or_none, + parse_duration, + parse_iso8601, ) from ..compat import compat_HTTPError -import re class BBCCoUkIE(InfoExtractor): @@ -17,7 +20,7 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' _TESTS = [ { @@ -264,16 +267,21 @@ class BBCCoUkIE(InfoExtractor): return subtitles def _download_media_selector(self, programme_id): + return self._download_media_selector_url( + self._MEDIASELECTOR_URL % programme_id, programme_id) + + def _download_media_selector_url(self, url, programme_id=None): try: media_selection = self._download_xml( - self.mediaselector_url % programme_id, - programme_id, 'Downloading media selection XML') + url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) else: raise + return self._process_media_selector(media_selection, programme_id) + def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None @@ -312,10 +320,21 @@ class BBCCoUkIE(InfoExtractor): raise # fallback to legacy playlist - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') + return self._process_legacy_playlist(playlist_id) + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 
'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') if no_items is not None: reason = no_items.get('reason') @@ -335,8 +354,23 @@ class BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') + + def get_programme_id(item): + def get_from_attributes(item): + for p in('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) duration = int_or_none(item.get('duration')) + # TODO: programme_id can be None and media items can be incorporated right inside + # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # as f4m and m3u8 formats, subtitles = self._download_media_selector(programme_id) return programme_id, title, description, duration, formats, subtitles @@ -383,175 +417,276 @@ class BBCCoUkIE(InfoExtractor): } -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + # fails with notukerror for some videos + #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' _TESTS = [{ + # article with multiple videos embedded with data-media-meta containing + # playlist.sxml, externalId and no direct video links 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', 'title': 'Russia stages massive WW2 parade despite Western boycott', + 'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, }, { + # article with multiple videos embedded with data-media-meta (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', }, 'playlist_count': 9, + 'skip': 'Save time', }, { + # single video embedded with mediaAssetPage.init() 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'timestamp': 1427219242, 'upload_date': '20150324', - 'uploader': 'BBC News', }, 'params': { + # rtmp download 'skip_download': True, } }, { + # article with single video embedded with data-media-meta containing + # direct video links (for 
now these are extracted) and playlist.xml (with + # media items as f4m and m3u8 - currently unsupported) 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', - 'note': 'Video', 'info_dict': { - 'id': 'NA', + 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', - 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', - 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", 'duration': 47, + 'timestamp': 1434397334, 'upload_date': '20150615', - 'uploader': 'BBC News', }, 'params': { 'skip_download': True, } }, { + # single video embedded with mediaAssetPage.init() (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', - 'note': 'Video', 'info_dict': { - 'id': '39275083', + 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', - 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'duration': 87, + 'timestamp': 1434713142, 'upload_date': '20150619', - 'uploader': 'BBC News', }, 'params': { 'skip_download': True, } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'flv', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1368473503, + 'upload_date': '20130513', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist.sxml URL + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'flv', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. 
+ # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) + playlist_id = self._match_id(url) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') + webpage = self._download_webpage(url, playlist_id) - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-', '') - - ret = [] - jsent = [] - - # works with bbc.com/news/something-something-123456 articles - jsent = map( - lambda m: self._parse_json(m, list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) - ) - - if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset, list_id) - for key, val in jmasset.get('videos', {}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) - - if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m, list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) - - if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) - - for jent in jsent: - programme_id = jent.get('externalId') - xml_url = jent.get('href') - - title = jent.get('caption', '') - if title == '': - title = list_title - - duration = parse_duration(jent.get('duration')) - description = list_title - if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') - thumbnail = None - if jent.get('image') is not None: - thumbnail = jent['image'].get('href') - - formats = [] - subtitles = [] - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.get('sourceFiles') is not None: - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append({ - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - }) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - 
formats, subtitles = self._download_media_selector(programme_id) - - if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + timestamp = parse_iso8601(self._search_regex( + [r'"datePublished":\s*"([^"]+)', + r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], + webpage, 'date', default=None)) + # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/3365340ng) + playlist = self._search_regex( + r']+name="playlist"[^>]+value="([^"]+)"', + webpage, 'playlist', default=None) + if playlist: + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(playlist, playlist_id) self._sort_formats(formats) - - id = jent.get('id') if programme_id is None else programme_id - if id is None: - id = 'NA' - - ret.append({ - 'id': id, - 'uploader': 'BBC News', - 'upload_date': pubdate, + return { + 'id': programme_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + programme_id = self._search_regex( + [r'data-video-player-vpid="([\da-z]{8})"', + r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + webpage, 'vpid', default=None) + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + # digitalData may be missing (e.g. http://www.bbc.com/autos/story/20130513-hyundais-rock-star) + digital_data = self._parse_json( + self._search_regex( + r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), + programme_id, fatal=False) + page_info = digital_data.get('page', {}).get('pageInfo', {}) + title = page_info.get('pageName') or self._og_search_title(webpage) + description = page_info.get('description') or self._og_search_description(webpage) + timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + playlist_title = self._html_search_regex( + r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') + playlist_description = self._og_search_description(webpage) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(r"data-media-meta='({[^']+})'", webpage)))) + + if not medias: + # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) + media_asset_page = self._parse_json( + self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), + playlist_id) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats: + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, 'formats': formats, 'subtitles': subtitles, }) - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) From 51da40e6218f1dda2fc61650c308194e9b4acbc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 22:19:54 +0600 Subject: [PATCH 25/30] [bbc] PEP8 --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2a0901ee4..4b23f82ca 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -423,7 +423,7 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' # fails with notukerror for some videos - #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' _TESTS = [{ From 7a896817226405a772baa3808d63062d4ad11c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 22:32:54 +0600 Subject: [PATCH 26/30] [bbc] Skip DASH until supported --- youtube_dl/extractor/bbc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 4b23f82ca..66e52641b 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -172,6 +172,7 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') if protocol == 'http': href = connection.get('href') + transfer_format = connection.get('transferFormat') # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -179,6 +180,9 @@ class BBCCoUkIE(InfoExtractor): 'url': ref, 'format_id': 'ref%s_%s' % (i, supplier), }) + # Skip DASH until supported + elif transfer_format == 'dash': + pass # Direct link else: formats.append({ From 
5bdec59de15b9bde73a3077a6b9ce517c10b9906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Jul 2015 09:51:54 +0600 Subject: [PATCH 27/30] [comcarcoff] Add support for singleshots (Closes #6366) --- youtube_dl/extractor/comcarcoff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 9c25b2223..81f3d7697 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -36,7 +36,7 @@ class ComCarCoffIE(InfoExtractor): webpage, 'full data json')) video_id = full_data['activeVideo']['video'] - video_data = full_data['videos'][video_id] + video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] thumbnails = [{ 'url': video_data['images']['thumb'], }, { From aeb7b41d44313b6bb007b2f1cd0fc8cda84e59d5 Mon Sep 17 00:00:00 2001 From: tippfeler Date: Sun, 26 Jul 2015 12:57:06 +0200 Subject: [PATCH 28/30] [spiegel] Accept iframe urls Closes #6370. --- youtube_dl/extractor/spiegel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index b868241d5..5bd3c0087 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '2c2754212136f35fb4b19767d242f66e', @@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor): 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', } + }, { + 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', + 'only_matching': True, }] def _real_extract(self, url): From 4c6bd5b5b61adfd912e14f8d704fde47628d164e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 26 Jul 2015 14:14:28 +0200 Subject: [PATCH 29/30] [youtube] Use 'vp8' and 'vp9' in lowercase (fixes #6358) That's how YouTube reports them in their DASH manifest. 
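A minimal illustration of the rationale (the snippet below is not part of the patch, and the dictionaries are invented stand-ins for entries in the hard-coded format table): a plain string comparison treats 'VP9' and 'vp9' as two different codecs, so keeping the table in the lowercase spelling the DASH manifest uses keeps both sources of format data consistent.

    # Illustrative sketch only; the table entries are made-up stand-ins.
    reported = 'vp9'                      # how YouTube's DASH manifest spells it
    before = {'313': {'vcodec': 'VP9'}}   # table entry before this patch
    after = {'313': {'vcodec': 'vp9'}}    # table entry after this patch

    print(before['313']['vcodec'] == reported)  # False - the mismatch this patch removes
    print(after['313']['vcodec'] == reported)   # True
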
--- youtube_dl/extractor/youtube.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4023a6e50..0e411bfb6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -281,13 +281,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -297,11 +297,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 
'VP9'}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, From 710b1744af102343219d0bc18bdca7fdf363a99f Mon Sep 17 00:00:00 2001 From: Marco 'don' Kaulea Date: Fri, 24 Jul 2015 21:45:00 +0200 Subject: [PATCH 30/30] Add --socks-proxy parameter to use SOCKS proxy This allows the user to specify a socks proxy to tunnel the connection through. This feature requires that PySocks is available. If the user tries to use a SOCKS proxy without PySocks available an error is printed and the process is aborted. --- youtube_dl/YoutubeDL.py | 33 +++++++++++++++++++++++++++++++-- youtube_dl/__init__.py | 4 ++++ youtube_dl/options.py | 5 +++++ youtube_dl/utils.py | 17 +++++++++++++++++ 4 files changed, 57 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 702a6ad50..759b87893 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -53,6 +53,7 @@ from .utils import ( locked_file, make_HTTPS_handler, MaxDownloadsReached, + OptionalDependencyNotFound, PagedList, parse_filesize, PerRequestProxyHandler, @@ -1825,11 +1826,39 @@ class YoutubeDL(object): proxies['https'] = proxies['http'] proxy_handler = PerRequestProxyHandler(proxies) + socks_handler = None + opts_socks = self.params.get('socksproxy') + if opts_socks is not None and opts_socks: + # Try to import the dependencies for this feature + try: + import socks + except ImportError: + raise OptionalDependencyNotFound(module_name='socks', + feature_name='"SocksProxy"') + try: + from sockshandler import SocksiPyHandler + except ImportError: + raise OptionalDependencyNotFound(module_name='sockshandler', + feature_name='"SocksProxy"') + + pair = opts_socks.split(':') + if len(pair) == 2: + socks_handler = SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, + pair[0], + int(pair[1])) + else: + socks_handler = SocksiPyHandler(socks.PROXY_TYPE_SOCKS5, + 'localhost', + int(pair[0])) + debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) - opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh) + proxy_list = [] + if socks_handler: + proxy_list.append(socks_handler) + proxy_list += 
[proxy_handler, https_handler, cookie_processor, ydlh] + opener = compat_urllib_request.build_opener(*proxy_list) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 55b22c889..e12d4197c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -29,6 +29,7 @@ from .utils import ( DownloadError, match_filter_func, MaxDownloadsReached, + OptionalDependencyNotFound, preferredencoding, read_batch_urls, SameFileError, @@ -346,6 +347,7 @@ def _real_main(argv=None): 'nocheckcertificate': opts.no_check_certificate, 'prefer_insecure': opts.prefer_insecure, 'proxy': opts.proxy, + 'socksproxy': opts.socksproxy, 'socket_timeout': opts.socket_timeout, 'bidi_workaround': opts.bidi_workaround, 'debug_printtraffic': opts.debug_printtraffic, @@ -414,5 +416,7 @@ def main(argv=None): sys.exit('ERROR: fixed output name but more than one file to download') except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') + except OptionalDependencyNotFound: + sys.exit('ERROR: Dependency not found') __all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 9016e3498..26072d888 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -181,6 +181,11 @@ def parseOpts(overrideArguments=None): '--proxy', dest='proxy', default=None, metavar='URL', help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') + network.add_option( + '--socks-proxy', dest='socksproxy', default=None, metavar='URL', + help=('Use the specified socks proxy. Pass in an empty string ' + '(--socks-proxy "") for direct connection. This feature requires' + 'the pysocks library.')) network.add_option( '--socket-timeout', dest='socket_timeout', type=float, default=None, metavar='SECONDS', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ae813099d..f148def10 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -585,6 +585,23 @@ class ContentTooShortError(Exception): self.expected = expected +class OptionalDependencyNotFound(Exception): + """Optional dependency not found + + This exception may be raised by YoutubeDL, when the user tries to use a + feature that requires an optional dependency which could not be found. + """ + + def __init__(self, module_name, feature_name): + self.module_name = module_name + self.feature_name = feature_name + + def __str__(self): + return ("Unable to use {feature}, because it depends on {module} " + "which was not found.").format( + feature=self.feature_name, module=self.module_name) + + def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address')
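
A hedged usage sketch for the --socks-proxy option added in PATCH 30/30. It assumes the patch series is applied and that the optional PySocks and sockshandler modules are installed; the proxy address and video URL below are placeholders rather than values taken from the patch.

    import youtube_dl

    ydl_opts = {
        # Parameter wired through __init__.py by this patch: 'host:port'
        # selects a SOCKS5 proxy, a bare 'port' means a proxy on localhost.
        'socksproxy': '127.0.0.1:1080',
    }

    # Per the patch, building the opener raises OptionalDependencyNotFound
    # if PySocks or sockshandler cannot be imported.
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])

The command-line equivalent is youtube-dl --socks-proxy 127.0.0.1:1080 <URL>, or just --socks-proxy 1080 for a proxy on localhost; note that this version of the patch hard-wires the handler to SOCKS5.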