From 48dde7589175d688ce7661459ca32c535d6500e5 Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Mon, 10 Jun 2019 12:32:24 +0200 Subject: [PATCH 1/7] [3sat] extractor now included in zdf3sat.py (closes #21185) --- youtube_dl/extractor/dreisat.py | 193 -------------------- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/{zdf.py => zdf3sat.py} | 5 +- 3 files changed, 5 insertions(+), 196 deletions(-) delete mode 100644 youtube_dl/extractor/dreisat.py rename youtube_dl/extractor/{zdf.py => zdf3sat.py} (98%) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py deleted file mode 100644 index 848d387d1..000000000 --- a/youtube_dl/extractor/dreisat.py +++ /dev/null @@ -1,193 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) - - -class DreiSatIE(InfoExtractor): - IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } - }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, - }, - ] - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params - - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1ed8a4b2..56f797ab6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,7 +287,6 @@ from .dplay import ( DPlayIE, DPlayItIE, ) -from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import ( @@ -1522,6 +1521,6 @@ from .zattoo import ( ZattooIE, ZattooLiveIE, ) -from .zdf import ZDFIE, ZDFChannelIE +from .zdf3sat import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf3sat.py similarity index 98% rename from youtube_dl/extractor/zdf.py rename to youtube_dl/extractor/zdf3sat.py index afa3f6c47..6c22d24c4 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf3sat.py @@ -39,7 +39,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + _VALID_URL = r'https?://www\.(?:zdf|3sat)\.de/(?:[^/]+/)*(?P[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ @@ -53,6 +53,9 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1465021200, 'upload_date': '20160604', }, + }, { + 'url': 'https://www.3sat.de/kultur/kulturdoku/der-gugelhupf-koenig-der-kuchen-100.html', + 'only_matching': True, }, { 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', 'only_matching': True, From 3aba798551083f49dfdc436ab8926cffc570db1b Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Mon, 10 Jun 2019 13:19:50 +0200 Subject: [PATCH 2/7] Revert "[3sat] extractor now included in zdf3sat.py (closes #21185)" This reverts commit 48dde7589175d688ce7661459ca32c535d6500e5. --- youtube_dl/extractor/dreisat.py | 193 ++++++++++++++++++++ youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/{zdf3sat.py => zdf.py} | 5 +- 3 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 youtube_dl/extractor/dreisat.py rename youtube_dl/extractor/{zdf3sat.py => zdf.py} (98%) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py new file mode 100644 index 000000000..848d387d1 --- /dev/null +++ b/youtube_dl/extractor/dreisat.py @@ -0,0 +1,193 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + xpath_text, + determine_ext, + float_or_none, + ExtractorError, +) + + +class DreiSatIE(InfoExtractor): + IE_NAME = '3sat' + _GEO_COUNTRIES = ['DE'] + _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' + _TESTS = [ + { + 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', + 'md5': 'be37228896d30a88f315b638900a026e', + 'info_dict': { + 'id': '45918', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'uploader': 'SCHWEIZWEIT', + 'uploader_id': '100000210', + 'upload_date': '20140913' + }, + 'params': { + 'skip_download': True, # m3u8 downloads + } + }, + { + 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', + 'only_matching': True, + }, + ] + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + param_groups = {} + for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): + group_id = param_group.get(self._xpath_ns( + 'id', 'http://www.w3.org/XML/1998/namespace')) + params = {} + for param in param_group: + params[param.get('name')] = param.get('value') + param_groups[group_id] = params + + formats = [] + for video in smil.findall(self._xpath_ns('.//video', namespace)): + src = video.get('src') + if not src: + continue + bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + group_id = video.get('paramGroup') + param_group = param_groups[group_id] + for proto in param_group['protocols'].split(','): + formats.append({ + 'url': '%s://%s' % (proto, param_group['host']), + 'app': param_group['app'], + 'play_path': src, + 'ext': 'flv', + 'format_id': '%s-%d' % (proto, bitrate), + 'tbr': bitrate, + }) + self._sort_formats(formats) + return formats + + def extract_from_xml_url(self, video_id, xml_url): + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': + message = 'Video %s is not available' % video_id + else: + message = '%s returned error: %s' % (self.IE_NAME, status_code) + raise ExtractorError(message, expected=True) + + title = xpath_text(doc, './/information/title', 'title', True) + + urls = [] + formats = [] + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: + continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ + (?P[^_]+)_(?P[^_]+)_(?P[^_]+) + ''', format_id) + + ext = determine_ext(video_url, None) or format_m.group('container') + + if ext == 'meta': + continue + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + # the certificates are misconfigured (see + # https://github.com/ytdl-org/youtube-dl/issues/8665) + if video_url.startswith('https://'): + continue + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + else: + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality + + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) + + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + abr = tbr - vbr + + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), + }) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + self._sort_formats(formats) + + thumbnails = [] + for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + + return { + 'id': video_id, + 'title': title, + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), + 'thumbnails': thumbnails, + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), + 'upload_date': upload_date, + 'formats': formats, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id + return self.extract_from_xml_url(video_id, details_url) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 56f797ab6..b1ed8a4b2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,6 +287,7 @@ from .dplay import ( DPlayIE, DPlayItIE, ) +from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import ( @@ -1521,6 +1522,6 @@ from .zattoo import ( ZattooIE, ZattooLiveIE, ) -from .zdf3sat import ZDFIE, ZDFChannelIE +from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zdf3sat.py b/youtube_dl/extractor/zdf.py similarity index 98% rename from youtube_dl/extractor/zdf3sat.py rename to youtube_dl/extractor/zdf.py index 6c22d24c4..afa3f6c47 100644 --- a/youtube_dl/extractor/zdf3sat.py +++ b/youtube_dl/extractor/zdf.py @@ -39,7 +39,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.(?:zdf|3sat)\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ @@ -53,9 +53,6 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1465021200, 'upload_date': '20160604', }, - }, { - 'url': 'https://www.3sat.de/kultur/kulturdoku/der-gugelhupf-koenig-der-kuchen-100.html', - 'only_matching': True, }, { 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', 'only_matching': True, From 2161db38787072dcb2187d45951fe16221fa1021 Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Mon, 10 Jun 2019 13:25:05 +0200 Subject: [PATCH 3/7] [3sat] now added to zdf extractor --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/{zdf.py => zdf3sat.py} | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) rename youtube_dl/extractor/{zdf.py => zdf3sat.py} (98%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b1ed8a4b2..e1b6927e8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1522,6 +1522,6 @@ from .zattoo import ( ZattooIE, ZattooLiveIE, ) -from .zdf import ZDFIE, ZDFChannelIE +from .zdf3sat import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf3sat.py similarity index 98% rename from youtube_dl/extractor/zdf.py rename to youtube_dl/extractor/zdf3sat.py index afa3f6c47..6c22d24c4 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf3sat.py @@ -39,7 +39,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + _VALID_URL = r'https?://www\.(?:zdf|3sat)\.de/(?:[^/]+/)*(?P[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ @@ -53,6 +53,9 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1465021200, 'upload_date': '20160604', }, + }, { + 'url': 'https://www.3sat.de/kultur/kulturdoku/der-gugelhupf-koenig-der-kuchen-100.html', + 'only_matching': True, }, { 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', 'only_matching': True, From 14d7dccccd2d5736448bc97ac47504164d79b420 Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Fri, 5 Jul 2019 21:03:34 +0200 Subject: [PATCH 4/7] Revert "[3sat] now added to zdf extractor" This reverts commit 2161db38787072dcb2187d45951fe16221fa1021. --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/{zdf3sat.py => zdf.py} | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) rename youtube_dl/extractor/{zdf3sat.py => zdf.py} (98%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e1b6927e8..b1ed8a4b2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1522,6 +1522,6 @@ from .zattoo import ( ZattooIE, ZattooLiveIE, ) -from .zdf3sat import ZDFIE, ZDFChannelIE +from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/youtube_dl/extractor/zdf3sat.py b/youtube_dl/extractor/zdf.py similarity index 98% rename from youtube_dl/extractor/zdf3sat.py rename to youtube_dl/extractor/zdf.py index 6c22d24c4..afa3f6c47 100644 --- a/youtube_dl/extractor/zdf3sat.py +++ b/youtube_dl/extractor/zdf.py @@ -39,7 +39,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.(?:zdf|3sat)\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ @@ -53,9 +53,6 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1465021200, 'upload_date': '20160604', }, - }, { - 'url': 'https://www.3sat.de/kultur/kulturdoku/der-gugelhupf-koenig-der-kuchen-100.html', - 'only_matching': True, }, { 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', 'only_matching': True, From 3108be67eb1de8bf6be385469dd534ff9e2a17ea Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Fri, 5 Jul 2019 21:51:20 +0200 Subject: [PATCH 5/7] [3sat] new extractor based on zdf extractor --- youtube_dl/extractor/dreisat.py | 361 ++++++++++++++++++-------------- 1 file changed, 201 insertions(+), 160 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 848d387d1..20fc1b5d4 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,193 +1,234 @@ +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, determine_ext, - float_or_none, - ExtractorError, + int_or_none, + NO_DEFAULT, + orderedSet, + parse_codecs, + qualities, + try_get, + unified_timestamp, + update_url_query, + url_or_none, + urljoin, ) -class DreiSatIE(InfoExtractor): - IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } - }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, - }, - ] +class DreiSatBaseIE(InfoExtractor): + def _call_api(self, url, player, referrer, video_id, item): + return self._download_json( + url, video_id, 'Downloading JSON %s' % item, + headers={ + 'Referer': referrer, + 'Api-Auth': 'Bearer %s' % player['apiToken'], + }) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params + def _extract_player(self, webpage, video_id, fatal=True): + return self._parse_json( + self._search_regex( + r'(?s)data-zdfplayer-jsb=(["\'])(?P{.+?})\1', webpage, + 'player JSON', default='{}' if not fatal else NO_DEFAULT, + group='json'), + video_id) - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, + +class DreiSatIE(DreiSatBaseIE): + _VALID_URL = r'https?://www\.3sat\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + + _TESTS = [{ + 'url': 'https://www.3sat.de/dokumentation/natur/dolomiten-sagenhaftes-juwel-der-alpen-100.html', + 'info_dict': { + 'id': 'dolomiten-sagenhaftes-juwel-der-alpen-100', + 'ext': 'mp4', + 'title': 'Dolomiten - Sagenhaftes Juwel der Alpen', + 'description': 'md5:a4fa13cae91b8044353c1d56f3a8fc77', + 'duration': 2618, + 'timestamp': 1561397400, + 'upload_date': '20190624', + }, + }, { + 'url': 'https://www.3sat.de/kultur/kulturdoku/der-gugelhupf-koenig-der-kuchen-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.3sat.de/dokumentation/natur/karnische-alpen-100.html', + 'only_matching': True, + }] + + @staticmethod + def _extract_subtitles(src): + subtitles = {} + for caption in try_get(src, lambda x: x['captions'], list) or []: + subtitle_url = url_or_none(caption.get('uri')) + if subtitle_url: + lang = caption.get('language', 'deu') + subtitles.setdefault(lang, []).append({ + 'url': subtitle_url, }) - self._sort_formats(formats) - return formats + return subtitles - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') + def _extract_format(self, video_id, formats, format_urls, meta): + format_url = url_or_none(meta.get('url')) + if not format_url: + return + if format_url in format_urls: + return + format_urls.add(format_url) + mime_type = meta.get('mimeType') + ext = determine_ext(format_url) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', m3u8_id='hls', + entry_protocol='m3u8_native', fatal=False)) + elif mime_type == 'application/f4m+xml' or ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + else: + f = parse_codecs(meta.get('mimeCodec')) + format_id = ['http'] + for p in (meta.get('type'), meta.get('quality')): + if p and isinstance(p, compat_str): + format_id.append(p) + f.update({ + 'url': format_url, + 'format_id': '-'.join(format_id), + 'format_note': meta.get('quality'), + 'language': meta.get('language'), + 'quality': qualities(self._QUALITIES)(meta.get('quality')), + 'preference': -10, + }) + formats.append(f) - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) + def _extract_entry(self, url, player, content, video_id): + title = content.get('title') or content['teaserHeadline'] - title = xpath_text(doc, './/information/title', 'title', True) + t = content['mainVideoContent']['http://zdf.de/rels/target'] + + ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') + + if not ptmd_path: + ptmd_path = t[ + 'http://zdf.de/rels/streams/ptmd-template'].replace( + '{playerId}', 'portal') + + ptmd = self._call_api( + urljoin(url, ptmd_path), player, url, video_id, 'metadata') - urls = [] formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: + track_uris = set() + for p in ptmd['priorityList']: + formitaeten = p.get('formitaeten') + if not isinstance(formitaeten, list): continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): + for f in formitaeten: + f_qualities = f.get('qualities') + if not isinstance(f_qualities, list): continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - + for quality in f_qualities: + tracks = try_get(quality, lambda x: x['audio']['tracks'], list) + if not tracks: + continue + for track in tracks: + self._extract_format( + video_id, formats, track_uris, { + 'url': track.get('uri'), + 'type': f.get('type'), + 'mimeType': f.get('mimeType'), + 'quality': quality.get('quality'), + 'language': track.get('language'), + }) self._sort_formats(formats) thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + layouts = try_get( + content, lambda x: x['teaserImageRef']['layouts'], dict) + if layouts: + for layout_key, layout_url in layouts.items(): + layout_url = url_or_none(layout_url) + if not layout_url: + continue + thumbnail = { + 'url': layout_url, + 'format_id': layout_key, + } + mobj = re.search(r'(?P\d+)x(?P\d+)', layout_key) + if mobj: + thumbnail.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + }) + thumbnails.append(thumbnail) return { 'id': video_id, 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), + 'description': content.get('leadParagraph') or content.get('teasertext'), + 'duration': int_or_none(t.get('duration')), + 'timestamp': unified_timestamp(content.get('editorialDate')), 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, + 'subtitles': self._extract_subtitles(ptmd), + 'formats': formats, + } + + def _extract_regular(self, url, player, video_id): + content = self._call_api( + player['content'], player, url, video_id, 'content') + return self._extract_entry(player['content'], player, content, video_id) + + def _extract_mobile(self, video_id): + document = self._download_json( + 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, + video_id)['document'] + + title = document['titel'] + + formats = [] + format_urls = set() + for f in document['formitaeten']: + self._extract_format(video_id, formats, format_urls, f) + self._sort_formats(formats) + + thumbnails = [] + teaser_bild = document.get('teaserBild') + if isinstance(teaser_bild, dict): + for thumbnail_key, thumbnail in teaser_bild.items(): + thumbnail_url = try_get( + thumbnail, lambda x: x['url'], compat_str) + if thumbnail_url: + thumbnails.append({ + 'url': thumbnail_url, + 'id': thumbnail_key, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': document.get('beschreibung'), + 'duration': int_or_none(document.get('length')), + 'timestamp': unified_timestamp(try_get( + document, lambda x: x['meta']['editorialDate'], compat_str)), + 'thumbnails': thumbnails, + 'subtitles': self._extract_subtitles(document), 'formats': formats, } def _real_extract(self, url): video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + player = self._extract_player(webpage, url, fatal=False) + if player: + return self._extract_regular(url, player, video_id) + + return self._extract_mobile(video_id) + + From a23c8fa1a7aad6ecbd07f208255b0082b2ac7548 Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Fri, 19 Jul 2019 20:47:26 +0200 Subject: [PATCH 6/7] Revert "[3sat] new extractor based on zdf extractor" This reverts commit 3108be67eb1de8bf6be385469dd534ff9e2a17ea. --- youtube_dl/extractor/dreisat.py | 359 ++++++++++++++------------------ 1 file changed, 159 insertions(+), 200 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 20fc1b5d4..848d387d1 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,234 +1,193 @@ -# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( - determine_ext, int_or_none, - NO_DEFAULT, - orderedSet, - parse_codecs, - qualities, - try_get, - unified_timestamp, - update_url_query, - url_or_none, - urljoin, + unified_strdate, + xpath_text, + determine_ext, + float_or_none, + ExtractorError, ) -class DreiSatBaseIE(InfoExtractor): - def _call_api(self, url, player, referrer, video_id, item): - return self._download_json( - url, video_id, 'Downloading JSON %s' % item, - headers={ - 'Referer': referrer, - 'Api-Auth': 'Bearer %s' % player['apiToken'], - }) - - def _extract_player(self, webpage, video_id, fatal=True): - return self._parse_json( - self._search_regex( - r'(?s)data-zdfplayer-jsb=(["\'])(?P{.+?})\1', webpage, - 'player JSON', default='{}' if not fatal else NO_DEFAULT, - group='json'), - video_id) - - -class DreiSatIE(DreiSatBaseIE): - _VALID_URL = r'https?://www\.3sat\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') - - _TESTS = [{ - 'url': 'https://www.3sat.de/dokumentation/natur/dolomiten-sagenhaftes-juwel-der-alpen-100.html', - 'info_dict': { - 'id': 'dolomiten-sagenhaftes-juwel-der-alpen-100', - 'ext': 'mp4', - 'title': 'Dolomiten - Sagenhaftes Juwel der Alpen', - 'description': 'md5:a4fa13cae91b8044353c1d56f3a8fc77', - 'duration': 2618, - 'timestamp': 1561397400, - 'upload_date': '20190624', +class DreiSatIE(InfoExtractor): + IE_NAME = '3sat' + _GEO_COUNTRIES = ['DE'] + _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' + _TESTS = [ + { + 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', + 'md5': 'be37228896d30a88f315b638900a026e', + 'info_dict': { + 'id': '45918', + 'ext': 'mp4', + 'title': 'Waidmannsheil', + 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', + 'uploader': 'SCHWEIZWEIT', + 'uploader_id': '100000210', + 'upload_date': '20140913' + }, + 'params': { + 'skip_download': True, # m3u8 downloads + } }, - }, { - 'url': 'https://www.3sat.de/kultur/kulturdoku/der-gugelhupf-koenig-der-kuchen-100.html', - 'only_matching': True, - }, { - 'url': 'https://www.3sat.de/dokumentation/natur/karnische-alpen-100.html', - 'only_matching': True, - }] + { + 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', + 'only_matching': True, + }, + ] - @staticmethod - def _extract_subtitles(src): - subtitles = {} - for caption in try_get(src, lambda x: x['captions'], list) or []: - subtitle_url = url_or_none(caption.get('uri')) - if subtitle_url: - lang = caption.get('language', 'deu') - subtitles.setdefault(lang, []).append({ - 'url': subtitle_url, - }) - return subtitles - - def _extract_format(self, video_id, formats, format_urls, meta): - format_url = url_or_none(meta.get('url')) - if not format_url: - return - if format_url in format_urls: - return - format_urls.add(format_url) - mime_type = meta.get('mimeType') - ext = determine_ext(format_url) - if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) - elif mime_type == 'application/f4m+xml' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) - else: - f = parse_codecs(meta.get('mimeCodec')) - format_id = ['http'] - for p in (meta.get('type'), meta.get('quality')): - if p and isinstance(p, compat_str): - format_id.append(p) - f.update({ - 'url': format_url, - 'format_id': '-'.join(format_id), - 'format_note': meta.get('quality'), - 'language': meta.get('language'), - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - 'preference': -10, - }) - formats.append(f) - - def _extract_entry(self, url, player, content, video_id): - title = content.get('title') or content['teaserHeadline'] - - t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - - if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'portal') - - ptmd = self._call_api( - urljoin(url, ptmd_path), player, url, video_id, 'metadata') + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + param_groups = {} + for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): + group_id = param_group.get(self._xpath_ns( + 'id', 'http://www.w3.org/XML/1998/namespace')) + params = {} + for param in param_group: + params[param.get('name')] = param.get('value') + param_groups[group_id] = params formats = [] - track_uris = set() - for p in ptmd['priorityList']: - formitaeten = p.get('formitaeten') - if not isinstance(formitaeten, list): + for video in smil.findall(self._xpath_ns('.//video', namespace)): + src = video.get('src') + if not src: continue - for f in formitaeten: - f_qualities = f.get('qualities') - if not isinstance(f_qualities, list): - continue - for quality in f_qualities: - tracks = try_get(quality, lambda x: x['audio']['tracks'], list) - if not tracks: - continue - for track in tracks: - self._extract_format( - video_id, formats, track_uris, { - 'url': track.get('uri'), - 'type': f.get('type'), - 'mimeType': f.get('mimeType'), - 'quality': quality.get('quality'), - 'language': track.get('language'), - }) + bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + group_id = video.get('paramGroup') + param_group = param_groups[group_id] + for proto in param_group['protocols'].split(','): + formats.append({ + 'url': '%s://%s' % (proto, param_group['host']), + 'app': param_group['app'], + 'play_path': src, + 'ext': 'flv', + 'format_id': '%s-%d' % (proto, bitrate), + 'tbr': bitrate, + }) self._sort_formats(formats) + return formats - thumbnails = [] - layouts = try_get( - content, lambda x: x['teaserImageRef']['layouts'], dict) - if layouts: - for layout_key, layout_url in layouts.items(): - layout_url = url_or_none(layout_url) - if not layout_url: - continue - thumbnail = { - 'url': layout_url, - 'format_id': layout_key, - } - mobj = re.search(r'(?P\d+)x(?P\d+)', layout_key) - if mobj: - thumbnail.update({ - 'width': int(mobj.group('width')), - 'height': int(mobj.group('height')), - }) - thumbnails.append(thumbnail) + def extract_from_xml_url(self, video_id, xml_url): + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') - return { - 'id': video_id, - 'title': title, - 'description': content.get('leadParagraph') or content.get('teasertext'), - 'duration': int_or_none(t.get('duration')), - 'timestamp': unified_timestamp(content.get('editorialDate')), - 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(ptmd), - 'formats': formats, - } + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': + message = 'Video %s is not available' % video_id + else: + message = '%s returned error: %s' % (self.IE_NAME, status_code) + raise ExtractorError(message, expected=True) - def _extract_regular(self, url, player, video_id): - content = self._call_api( - player['content'], player, url, video_id, 'content') - return self._extract_entry(player['content'], player, content, video_id) - - def _extract_mobile(self, video_id): - document = self._download_json( - 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, - video_id)['document'] - - title = document['titel'] + title = xpath_text(doc, './/information/title', 'title', True) + urls = [] formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(video_id, formats, format_urls, f) + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: + continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ + (?P[^_]+)_(?P[^_]+)_(?P[^_]+) + ''', format_id) + + ext = determine_ext(video_url, None) or format_m.group('container') + + if ext == 'meta': + continue + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + # the certificates are misconfigured (see + # https://github.com/ytdl-org/youtube-dl/issues/8665) + if video_url.startswith('https://'): + continue + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + else: + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality + + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) + + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + abr = tbr - vbr + + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), + }) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + self._sort_formats(formats) thumbnails = [] - teaser_bild = document.get('teaserBild') - if isinstance(teaser_bild, dict): - for thumbnail_key, thumbnail in teaser_bild.items(): - thumbnail_url = try_get( - thumbnail, lambda x: x['url'], compat_str) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'id': thumbnail_key, - 'width': int_or_none(thumbnail.get('width')), - 'height': int_or_none(thumbnail.get('height')), - }) + for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) return { 'id': video_id, 'title': title, - 'description': document.get('beschreibung'), - 'duration': int_or_none(document.get('length')), - 'timestamp': unified_timestamp(try_get( - document, lambda x: x['meta']['editorialDate'], compat_str)), + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), 'thumbnails': thumbnails, - 'subtitles': self._extract_subtitles(document), + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), + 'upload_date': upload_date, 'formats': formats, } def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id, fatal=False) - if webpage: - player = self._extract_player(webpage, url, fatal=False) - if player: - return self._extract_regular(url, player, video_id) - - return self._extract_mobile(video_id) - - + details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id + return self.extract_from_xml_url(video_id, details_url) From 152d2524ac2948c35cab66aa8d122f24a4921e68 Mon Sep 17 00:00:00 2001 From: Matthias Roos Date: Fri, 19 Jul 2019 22:54:37 +0200 Subject: [PATCH 7/7] [3sat] new extractor based on ZDFIE --- youtube_dl/extractor/dreisat.py | 210 ++++---------------------------- 1 file changed, 23 insertions(+), 187 deletions(-) diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 848d387d1..fdd8359d6 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,193 +1,29 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) +from .zdf import ZDFIE -class DreiSatIE(InfoExtractor): - IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } - }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, - }, - ] +class DreiSatIE(ZDFIE): + _VALID_URL = r'https?://www\.3sat\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + _TESTS = [{ + 'url': 'https://www.3sat.de/dokumentation/natur/dolomiten-sagenhaftes-juwel-der-alpen-100.html', + 'info_dict': { + 'id': 'dolomiten-sagenhaftes-juwel-der-alpen-100', + 'ext': 'mp4', + 'title': 'Dolomiten - Sagenhaftes Juwel der Alpen', + 'description': 'md5:a4fa13cae91b8044353c1d56f3a8fc77', + 'duration': 2618, + 'timestamp': 1561397400, + 'upload_date': '20190624', + }, + }, { + 'url': 'https://www.3sat.de/kultur/festspielsommer/anna-netrebko-arena-di-verona-il-trovatore-musik-100.html', + 'only_matching': True, + }, { + 'url': 'https://www.3sat.de/kultur/theater-und-tanz/nibelungen-ueberwaeltigung-100.html', + 'only_matching': True, + }] - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params + pass - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url)