From 33a1ec950c97b03e742926f3d37e9aa4ce642633 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 20:38:45 +0100 Subject: [PATCH 01/17] [discovery] extract http formats --- youtube_dl/extractor/discovery.py | 50 +++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 5f1275b39..6d1f8e670 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,11 +1,16 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, ) -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) class DiscoveryIE(InfoExtractor): @@ -66,9 +71,48 @@ class DiscoveryIE(InfoExtractor): entries = [] for idx, video_info in enumerate(info['playlist']): - formats = self._extract_m3u8_formats( - video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls', + m3u8_url = video_info['src'] + formats = m3u8_formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', note='Download m3u8 information for video %d' % (idx + 1)) + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', m3u8_url, 'qualities basename', default=None) + if qualities_basename: + m3u8_path = compat_urlparse.urlparse(m3u8_url).path + QUALITIES_RE = r'((,\d+k)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if qualities: + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_path = m3u8_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%dk', http_path) + http_template = http_template.replace('.csmil/master.m3u8', '') + http_template = compat_urlparse.urljoin( + 'http://discsmil.edgesuite.net/', http_template) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': http_template % q, + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': http_template % q, + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) self._sort_formats(formats) entries.append({ 'id': compat_str(video_info['id']), From 93f7a31bf3b572a98982a380ae3167ce954adc04 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 20:49:09 +0100 Subject: [PATCH 02/17] [discovery] extract subtitle --- youtube_dl/extractor/discovery.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 6d1f8e670..87fb29b02 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -114,6 +114,16 @@ class DiscoveryIE(InfoExtractor): 'tbr': q, }) self._sort_formats(formats) + + subtitles = [] + caption_url = video_info.get('captionsUrl') + if caption_url: + subtitles = { + 'en': [{ + 'url': caption_url, + }] + } + entries.append({ 'id': compat_str(video_info['id']), 'formats': formats, @@ -124,6 +134,7 @@ class DiscoveryIE(InfoExtractor): 'thumbnail': video_info.get('thumbnailURL'), 'alt_title': video_info.get('secondary_title'), 'timestamp': parse_iso8601(video_info.get('publishedDate')), + 'subtitles': subtitles, }) return self.playlist_result(entries, display_id, video_title) From d00b93d58c2b78ce02fbf6c8e7ea556cd06ac3b6 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 21:49:32 +0100 Subject: [PATCH 03/17] [discovery] extract more info using BrightcoveNewIE --- youtube_dl/extractor/discovery.py | 52 +++++-------------------------- 1 file changed, 7 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 87fb29b02..9dd631752 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -38,6 +38,7 @@ class DiscoveryIE(InfoExtractor): 'duration': 156, 'timestamp': 1302032462, 'upload_date': '20110405', + 'uploader_id': '103207', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -59,7 +60,11 @@ class DiscoveryIE(InfoExtractor): 'upload_date': '20140725', 'timestamp': 1406246400, 'duration': 116, + 'uploader_id': '103207', }, + 'params': { + 'skip_download': True, # requires ffmpeg + } }] def _real_extract(self, url): @@ -71,50 +76,6 @@ class DiscoveryIE(InfoExtractor): entries = [] for idx, video_info in enumerate(info['playlist']): - m3u8_url = video_info['src'] - formats = m3u8_formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', - note='Download m3u8 information for video %d' % (idx + 1)) - qualities_basename = self._search_regex( - '/([^/]+)\.csmil/', m3u8_url, 'qualities basename', default=None) - if qualities_basename: - m3u8_path = compat_urlparse.urlparse(m3u8_url).path - QUALITIES_RE = r'((,\d+k)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if qualities: - qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) - qualities.sort() - http_path = m3u8_path[1:].split('/', 1)[1] - http_template = re.sub(QUALITIES_RE, r'%dk', http_path) - http_template = http_template.replace('.csmil/master.m3u8', '') - http_template = compat_urlparse.urljoin( - 'http://discsmil.edgesuite.net/', http_template) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': http_template % q, - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': http_template % q, - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) - self._sort_formats(formats) - subtitles = [] caption_url = video_info.get('captionsUrl') if caption_url: @@ -125,8 +86,9 @@ class DiscoveryIE(InfoExtractor): } entries.append({ + '_type': 'url_transparent', + 'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'], 'id': compat_str(video_info['id']), - 'formats': formats, 'title': video_info['title'], 'description': video_info.get('description'), 'duration': parse_duration(video_info.get('video_length')), From ea7e7fecbd5da6866be003ea1ce5072dbe0118ae Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 21:55:28 +0100 Subject: [PATCH 04/17] [discovery] remove unused imports --- youtube_dl/extractor/discovery.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 9dd631752..7c554ec14 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -1,16 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, ) -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str class DiscoveryIE(InfoExtractor): From 339fe7228ae149db9fc163c94bef168f65a0a775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 02:53:42 +0600 Subject: [PATCH 05/17] [tagesschau] Update _FORMATS map --- youtube_dl/extractor/tagesschau.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 73e7657d4..ebd81eadc 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -64,9 +64,12 @@ class TagesschauIE(InfoExtractor): }] _FORMATS = { - 's': {'width': 256, 'height': 144, 'quality': 1}, + 'xs': {'quality': 0}, + 's': {'width': 320, 'height': 180, 'quality': 1}, 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 544, 'quality': 3}, + 'l': {'width': 960, 'height': 540, 'quality': 3}, + 'xl': {'width': 1280, 'height': 720, 'quality': 4}, + 'xxl': {'quality': 5}, } def _real_extract(self, url): From fc35cd9e0c7ec88fedd90880dea23d593fed85ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 02:54:39 +0600 Subject: [PATCH 06/17] [tagesschau] Relax _VALID_URL --- youtube_dl/extractor/tagesschau.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index ebd81eadc..fcccb230c 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -8,7 +8,7 @@ from ..utils import parse_filesize class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', @@ -61,6 +61,9 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'only_matching': True, }] _FORMATS = { From 9e1b96ae400c70b1ecfc2d8917510def2ed23a6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 03:20:36 +0600 Subject: [PATCH 07/17] [rtlnl] Match formats only by height --- youtube_dl/extractor/rtlnl.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 8598b5840..4d612b5e3 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -125,10 +125,12 @@ class RtlNlIE(InfoExtractor): try: # Find hls format with the same width and height corresponding # to progressive format and copy metadata from it. - f = next(f for f in formats - if f.get('width') == width and f.get('height') == height).copy() - f.update(pg_format(format_id, width, height)) - pg_formats.append(f) + f = next(f for f in formats if f.get('height') == height) + # hls formats may have invalid width + f['width'] = width + f_copy = f.copy() + f_copy.update(pg_format(format_id, width, height)) + pg_formats.append(f_copy) except StopIteration: # Missing hls format does mean that no progressive format with # such width and height exists either. From 4c1b2e5c0ea6a041bfd773efd7c4ac78ac8f3b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 04:18:56 +0600 Subject: [PATCH 08/17] [tagesschau] Add support for playlists --- youtube_dl/extractor/tagesschau.py | 110 ++++++++++++++++++----------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index fcccb230c..e58385c57 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -8,11 +8,11 @@ from ..utils import parse_filesize class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '917a228bc7df7850783bc47979673a09', + 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 'info_dict': { 'id': '102143', 'ext': 'mp4', @@ -40,6 +40,13 @@ class TagesschauIE(InfoExtractor): 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', 'thumbnail': 're:^https?:.*\.jpg$', }, + }, { + 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', + 'info_dict': { + 'id': '135', + 'title': 'Möchtegern-Underdog mit Machtanspruch', + }, + 'playlist_count': 2, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -75,6 +82,41 @@ class TagesschauIE(InfoExtractor): 'xxl': {'quality': 5}, } + def _extract_formats(self, download_text): + links = re.finditer( + r'', + download_text) + formats = [] + for l in links: + format_id = self._search_regex( + r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + format = { + 'format_id': format_id, + 'url': l.group('url'), + 'format_name': l.group('name'), + } + m = re.match( + r'''(?x) + Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; + (?P[0-9]+)x(?P[0-9]+)px&\#10; + (?P[0-9]+)kbps&\#10; + Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', + l.group('title')) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + formats.append(format) + self._sort_formats(formats) + return formats + def _real_extract(self, url): video_id = self._match_id(url) display_id = video_id.lstrip('-') @@ -94,14 +136,14 @@ class TagesschauIE(InfoExtractor): (?:,\s*quality:(?P["\'])(?P.+?)(?P=q_quality))? ''', playerpage): url = media.group('url') - type_ = media.group('type') + webpage_type = media.group('type') ext = media.group('ext') res = media.group('quality') f = { 'format_id': '%s_%s' % (res, ext) if res else ext, 'url': url, 'ext': ext, - 'vcodec': 'none' if type_ == 'audio' else None, + 'vcodec': 'none' if webpage_type == 'audio' else None, } f.update(self._FORMATS.get(res, {})) formats.append(f) @@ -109,47 +151,31 @@ class TagesschauIE(InfoExtractor): title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() else: - download_text = self._search_regex( - r'(?s)

Wir bieten dieses Video in folgenden Formaten zum Download an:

\s*
(.*?)
\s*

', - webpage, 'download links') - links = re.finditer( - r'

', - download_text) - formats = [] - for l in links: - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - m = re.match( - r'''(?x) - Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; - (?P[0-9]+)x(?P[0-9]+)px&\#10; - (?P[0-9]+)kbps&\#10; - Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - formats.append(format) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)

(.*?)

', - webpage, 'description', default=None) title = self._html_search_regex( r'(.*?)', webpage, 'title') + DOWNLOAD_REGEX = r'(?s)

Wir bieten dieses Video in folgenden Formaten zum Download an:

\s*
(.*?)
\s*

' + + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, download_text) in enumerate(re.findall( + r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, + webpage)): + entries.append({ + 'id': display_id, + 'title': '%s-%d' % (entry_title, num), + 'formats': self._extract_formats(download_text), + }) + return self.playlist_result(entries, display_id, title) + else: # Assume single video + download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') + formats = self._extract_formats(download_text) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)

(.*?)

', + webpage, 'description', default=None) + self._sort_formats(formats) return { From 1a2b377cc2fa9546fa08a7777a6fc5fc545cc441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 04:38:46 +0600 Subject: [PATCH 09/17] [tagesschau] Fix audio support --- youtube_dl/extractor/tagesschau.py | 75 +++++++++++++++++++----------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index e58385c57..ccc2d476d 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import ( + determine_ext, + parse_filesize, +) class TagesschauIE(InfoExtractor): @@ -82,37 +85,54 @@ class TagesschauIE(InfoExtractor): 'xxl': {'quality': 5}, } - def _extract_formats(self, download_text): + def _extract_formats(self, download_text, media_kind): links = re.finditer( r'', download_text) formats = [] for l in links: + link_url = l.group('url') + if not link_url: + continue format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) format = { 'format_id': format_id, 'url': l.group('url'), 'format_name': l.group('name'), } - m = re.match( - r'''(?x) - Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; - (?P[0-9]+)x(?P[0-9]+)px&\#10; - (?P[0-9]+)kbps&\#10; - Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) + title = l.group('title') + if title: + if media_kind.lower() == 'video': + m = re.match( + r'''(?x) + Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; + (?P[0-9]+)x(?P[0-9]+)px&\#10; + (?P[0-9]+)kbps&\#10; + Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', + title) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + else: + m = re.match( + r'(?P.+?)-Format\s*:\s*(?P\d+)kbps\s*,\s*(?P.+)', + title) + if m: + format.update({ + 'format_note': '%s, %s' % (m.group('format'), m.group('note')), + 'vcodec': 'none', + 'abr': int(m.group('abr')), + }) formats.append(format) self._sort_formats(formats) return formats @@ -154,23 +174,26 @@ class TagesschauIE(InfoExtractor): title = self._html_search_regex( r'(.*?)', webpage, 'title') - DOWNLOAD_REGEX = r'(?s)

Wir bieten dieses Video in folgenden Formaten zum Download an:

\s*
(.*?)
\s*

' + DOWNLOAD_REGEX = r'(?s)

Wir bieten dieses (?PVideo|Audio) in folgenden Formaten zum Download an:

\s*
(?P.*?)
\s*

' webpage_type = self._og_search_property('type', webpage, default=None) if webpage_type == 'website': # Article entries = [] - for num, (entry_title, download_text) in enumerate(re.findall( + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, webpage)): entries.append({ 'id': display_id, 'title': '%s-%d' % (entry_title, num), - 'formats': self._extract_formats(download_text), + 'formats': self._extract_formats(download_text, media_kind), }) return self.playlist_result(entries, display_id, title) else: # Assume single video - download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') - formats = self._extract_formats(download_text) + download_text = self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='links') + formats = self._extract_formats(download_text, media_kind) thumbnail = self._og_search_thumbnail(webpage) description = self._html_search_regex( r'(?s)

(.*?)

', From 2844b093360cf53829e1c127aba0bbc4a6a279a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 04:42:05 +0600 Subject: [PATCH 10/17] [tagesschau] Fix article media ids --- youtube_dl/extractor/tagesschau.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index ccc2d476d..6b71c8f81 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -181,10 +181,10 @@ class TagesschauIE(InfoExtractor): entries = [] for num, (entry_title, media_kind, download_text) in enumerate(re.findall( r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, - webpage)): + webpage), 1): entries.append({ - 'id': display_id, - 'title': '%s-%d' % (entry_title, num), + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, 'formats': self._extract_formats(download_text, media_kind), }) return self.playlist_result(entries, display_id, title) From 9cf79e8f4bd7e0dfdb2ea9d29cf3ba7d3c6ab647 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sun, 1 May 2016 01:43:58 +0100 Subject: [PATCH 11/17] [ccc] improve extraction --- youtube_dl/extractor/ccc.py | 111 ++++++++++++------------------------ 1 file changed, 38 insertions(+), 73 deletions(-) diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index dda2c0959..8f7f09e22 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -1,13 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, - parse_duration, - qualities, - unified_strdate, + parse_iso8601, ) @@ -19,14 +15,14 @@ class CCCIE(InfoExtractor): 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { - 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor', + 'id': '1839', 'ext': 'mp4', 'title': 'Introduction to Processor Design', - 'description': 'md5:80be298773966f66d56cb11260b879af', + 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', 'thumbnail': 're:^https?://.*\.jpg$', - 'view_count': int, 'upload_date': '20131228', - 'duration': 3660, + 'timestamp': 1388188800, + 'duration': 3710, } }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', @@ -34,79 +30,48 @@ class CCCIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id') + event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) - if self._downloader.params.get('prefer_free_formats'): - preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd']) - else: - preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd']) - - title = self._html_search_regex( - r'(?s)

(.*?)

', webpage, 'title') - description = self._html_search_regex( - r'(?s)

About

(.+?)

', - webpage, 'description', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - r"(?s)]+class='[^']*fa-calendar-o'[^>]*>(.+?)", - webpage, 'upload date', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r"(?s)(.*?)", - webpage, 'view count', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)]+class=(["\']).*?fa-clock-o.*?\1[^>]*>(?P.+?)(?P[^<]*)\s* - <(?:span|div)\s+class='label\s+filetype'>(?P[^<]*)\s* - [^']+)'>\s* - (?: - .*? - [^']+\.torrent)' - )?''', webpage) formats = [] - for m in matches: - format = m.group('format') - format_id = self._search_regex( - r'.*/([a-z0-9_-]+)/[^/]*$', - m.group('http_url'), 'format id', default=None) - if format_id: - format_id = m.group('lang') + '-' + format_id - vcodec = 'h264' if 'h264' in format_id else ( - 'none' if format_id in ('mp3', 'opus') else None + for recording in event_data.get('recordings', []): + recording_url = recording.get('recording_url') + if not recording_url: + continue + language = recording.get('language') + folder = recording.get('folder') + format_id = None + if language: + format_id = language + if folder: + if language: + format_id += '-' + folder + else: + format_id = folder + vcodec = 'h264' if 'h264' in folder else ( + 'none' if folder in ('mp3', 'opus') else None ) formats.append({ 'format_id': format_id, - 'format': format, - 'language': m.group('lang'), - 'url': m.group('http_url'), + 'url': recording_url, + 'width': int_or_none(recording.get('width')), + 'height': int_or_none(recording.get('height')), + 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024), + 'language': language, 'vcodec': vcodec, - 'preference': preference(format_id), }) - - if m.group('torrent_url'): - formats.append({ - 'format_id': 'torrent-%s' % (format if format_id is None else format_id), - 'format': '%s (torrent)' % format, - 'proto': 'torrent', - 'format_note': '(unsupported; will just download the .torrent file)', - 'vcodec': vcodec, - 'preference': -100 + preference(format_id), - 'url': m.group('torrent_url'), - }) self._sort_formats(formats) - thumbnail = self._html_search_regex( - r" Date: Sun, 1 May 2016 06:44:59 +0600 Subject: [PATCH 12/17] [tagesschau] Separate player extractor --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tagesschau.py | 235 ++++++++++++++++++++--------- 2 files changed, 168 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 737960a01..4aee53d6a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -724,7 +724,10 @@ from .svt import ( from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE +from .tagesschau import ( + TagesschauPlayerIE, + TagesschauIE, +) from .tapely import TapelyIE from .tass import TassIE from .tdslifeway import TDSLifewayIE diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 6b71c8f81..a71fbad7d 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -6,10 +6,124 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + js_to_json, + parse_iso8601, parse_filesize, ) +class TagesschauPlayerIE(InfoExtractor): + IE_NAME = 'tagesschau:player' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?Paudio|video)/(?P=kind)-(?P\d+)~player(?:_[^/?#&]+)?\.html' + + _TESTS = [{ + 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'md5': '8d09548d5c15debad38bee3a4d15ca21', + 'info_dict': { + 'id': '179517', + 'ext': 'mp4', + 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', + 'thumbnail': 're:^https?:.*\.jpg$', + 'formats': 'mincount:6', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'info_dict': { + 'id': '29417', + 'ext': 'mp3', + 'title': 'Trabi - Bye, bye Rennpappe', + 'thumbnail': 're:^https?:.*\.jpg$', + 'formats': 'mincount:2', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', + 'only_matching': True, + }] + + _FORMATS = { + 'xs': {'quality': 0}, + 's': {'width': 320, 'height': 180, 'quality': 1}, + 'm': {'width': 512, 'height': 288, 'quality': 2}, + 'l': {'width': 960, 'height': 540, 'quality': 3}, + 'xl': {'width': 1280, 'height': 720, 'quality': 4}, + 'xxl': {'quality': 5}, + } + + def _extract_via_api(self, kind, video_id): + info = self._download_json( + 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), + video_id) + title = info['headline'] + formats = [] + for media in info['mediadata']: + for format_id, format_url in media.items(): + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none' if kind == 'audio' else None, + }) + self._sort_formats(formats) + timestamp = parse_iso8601(info.get('date')) + return { + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'formats': formats, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # kind = mobj.group('kind').lower() + # if kind == 'video': + # return self._extract_via_api(kind, video_id) + + # JSON api does not provide some audio formats (e.g. ogg) thus + # extractiong audio via webpage + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + formats = [] + + for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): + media = self._parse_json(js_to_json(media_json), video_id, fatal=False) + if not media: + continue + src = media.get('src') + if not src: + return + quality = media.get('quality') + kind = media.get('type', '').split('/')[0] + ext = determine_ext(src) + f = { + 'url': src, + 'format_id': '%s_%s' % (quality, ext) if quality else ext, + 'ext': ext, + 'vcodec': 'none' if kind == 'audio' else None, + } + f.update(self._FORMATS.get(quality, {})) + formats.append(f) + + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + + class TagesschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' @@ -20,7 +134,7 @@ class TagesschauIE(InfoExtractor): 'id': '102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', + 'description': '18.07.2015 20:10 Uhr', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { @@ -29,18 +143,30 @@ class TagesschauIE(InfoExtractor): 'info_dict': { 'id': '5727', 'ext': 'mp4', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', + 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { - 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', - 'md5': 'aef45de271c4bf0a5db834aa40bf774c', + # exclusive audio + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 'info_dict': { - 'id': '18407', + 'id': '29417', 'ext': 'mp3', - 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', - 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'title': 'Trabi - Bye, bye Rennpappe', + 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', + 'thumbnail': 're:^https?:.*\.jpg$', + }, + }, { + # audio in article + 'url': 'http://www.tagesschau.de/inland/bnd-303.html', + 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'info_dict': { + 'id': '303', + 'ext': 'mp3', + 'title': 'Viele Baustellen für neuen BND-Chef', + 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { @@ -71,19 +197,11 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', - 'only_matching': True, }] - _FORMATS = { - 'xs': {'quality': 0}, - 's': {'width': 320, 'height': 180, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 540, 'quality': 3}, - 'xl': {'width': 1280, 'height': 720, 'quality': 4}, - 'xxl': {'quality': 5}, - } + @classmethod + def suitable(cls, url): + return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) def _extract_formats(self, download_text, media_kind): links = re.finditer( @@ -140,64 +258,39 @@ class TagesschauIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) display_id = video_id.lstrip('-') + webpage = self._download_webpage(url, display_id) - player_url = self._html_search_meta( - 'twitter:player', webpage, 'player URL', default=None) - if player_url: - playerpage = self._download_webpage( - player_url, display_id, 'Downloading player page') + title = self._html_search_regex( + r']*class="headline"[^>]*>(.+?)', + webpage, 'title', default=None) or self._og_search_title(webpage) - formats = [] - for media in re.finditer( - r'''(?x) - (?P["\'])(?Phttp://media.+?)(?P=q_url) - ,\s*type:(?P["\'])(?Pvideo|audio)/(?P.+?)(?P=q_type) - (?:,\s*quality:(?P["\'])(?P.+?)(?P=q_quality))? - ''', playerpage): - url = media.group('url') - webpage_type = media.group('type') - ext = media.group('ext') - res = media.group('quality') - f = { - 'format_id': '%s_%s' % (res, ext) if res else ext, - 'url': url, - 'ext': ext, - 'vcodec': 'none' if webpage_type == 'audio' else None, - } - f.update(self._FORMATS.get(res, {})) - formats.append(f) - thumbnail = self._og_search_thumbnail(playerpage) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - else: - title = self._html_search_regex( - r'(.*?)', webpage, 'title') + DOWNLOAD_REGEX = r'(?s)

Wir bieten dieses (?PVideo|Audio) in folgenden Formaten zum Download an:

\s*
(?P.*?)
\s*

' - DOWNLOAD_REGEX = r'(?s)

Wir bieten dieses (?PVideo|Audio) in folgenden Formaten zum Download an:

\s*
(?P.*?)
\s*

' - - webpage_type = self._og_search_property('type', webpage, default=None) - if webpage_type == 'website': # Article - entries = [] - for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, - webpage), 1): - entries.append({ - 'id': '%s-%d' % (display_id, num), - 'title': '%s' % entry_title, - 'formats': self._extract_formats(download_text, media_kind), - }) + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( + r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, + webpage), 1): + entries.append({ + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, + 'formats': self._extract_formats(download_text, media_kind), + }) + if len(entries) > 1: return self.playlist_result(entries, display_id, title) - else: # Assume single video - download_text = self._search_regex( - DOWNLOAD_REGEX, webpage, 'download links', group='links') - media_kind = self._search_regex( - DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='links') - formats = self._extract_formats(download_text, media_kind) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)

(.*?)

', - webpage, 'description', default=None) + formats = entries[0]['formats'] + else: # Assume single video + download_text = self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') + formats = self._extract_formats(download_text, media_kind) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)

(.*?)

', + webpage, 'description', default=None) self._sort_formats(formats) From 651ad35ce0f0ee9d04db085c50c29441b47bc825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 06:57:19 +0600 Subject: [PATCH 13/17] [tagesschau] Relax _VALID_URL --- youtube_dl/extractor/tagesschau.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index a71fbad7d..f6102c224 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -125,7 +125,7 @@ class TagesschauPlayerIE(InfoExtractor): class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)?)(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', @@ -197,6 +197,9 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/100sekunden/index.html', + 'only_matching': True, }] @classmethod @@ -256,7 +259,8 @@ class TagesschauIE(InfoExtractor): return formats def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') display_id = video_id.lstrip('-') webpage = self._download_webpage(url, display_id) From 854cc54bc1d0488d8fa88bd5dfed6f7f8981847e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 07:01:55 +0600 Subject: [PATCH 14/17] [tagesschau] Expand video id --- youtube_dl/extractor/tagesschau.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index f6102c224..499bd260b 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -125,13 +125,13 @@ class TagesschauPlayerIE(InfoExtractor): class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)?)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?(?P[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 'info_dict': { - 'id': '102143', + 'id': 'video-102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'description': '18.07.2015 20:10 Uhr', @@ -141,7 +141,7 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': '5727', + 'id': 'ts-5727', 'ext': 'mp4', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'description': 'md5:695c01bfd98b7e313c501386327aea59', @@ -152,7 +152,7 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 'info_dict': { - 'id': '29417', + 'id': 'audio-29417', 'ext': 'mp3', 'title': 'Trabi - Bye, bye Rennpappe', 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', @@ -163,7 +163,7 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/inland/bnd-303.html', 'md5': 'e0916c623e85fc1d2b26b78f299d3958', 'info_dict': { - 'id': '303', + 'id': 'bnd-303', 'ext': 'mp3', 'title': 'Viele Baustellen für neuen BND-Chef', 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', @@ -172,7 +172,7 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', 'info_dict': { - 'id': '135', + 'id': 'afd-parteitag-135', 'title': 'Möchtegern-Underdog mit Machtanspruch', }, 'playlist_count': 2, From 68bb2fef9565159eba4a47f464b6b420cf2d5cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 07:15:23 +0600 Subject: [PATCH 15/17] [tagesschau] Restrict playlist entry regex --- youtube_dl/extractor/tagesschau.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 499bd260b..136e18f96 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -200,6 +200,10 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/100sekunden/index.html', 'only_matching': True, + }, { + # playlist article with collapsing sections + 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', + 'only_matching': True, }] @classmethod @@ -275,7 +279,7 @@ class TagesschauIE(InfoExtractor): if webpage_type == 'website': # Article entries = [] for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, + r'(?s)]+class="infotext"[^>]*>\s*(?:]+>)?\s*(.+?).*?

.*?%s' % DOWNLOAD_REGEX, webpage), 1): entries.append({ 'id': '%s-%d' % (display_id, num), From 6f27bf1c7425d97eb07aee9f7e15d0066b0a74bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 08:08:51 +0600 Subject: [PATCH 16/17] Credit @blahgeek for xiami (#9079) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 07cade723..814fe9ec3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -168,3 +168,4 @@ José Joaquín Atria Viťas Strádal Kagami Hiiragi Philip Huppert +blahgeek From 4bd143a3a06264fcda5fa254709d404ccab6601c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 10:56:54 +0600 Subject: [PATCH 17/17] [postprocessor/ffmpeg] Simplify metadata preparation and add track related metafields (Closes #9357) --- youtube_dl/postprocessor/ffmpeg.py | 41 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1793a878c..fa99b0c2a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -389,23 +389,30 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): class FFmpegMetadataPP(FFmpegPostProcessor): def run(self, info): metadata = {} - if info.get('title') is not None: - metadata['title'] = info['title'] - if info.get('upload_date') is not None: - metadata['date'] = info['upload_date'] - if info.get('artist') is not None: - metadata['artist'] = info['artist'] - elif info.get('uploader') is not None: - metadata['artist'] = info['uploader'] - elif info.get('uploader_id') is not None: - metadata['artist'] = info['uploader_id'] - if info.get('description') is not None: - metadata['description'] = info['description'] - metadata['comment'] = info['description'] - if info.get('webpage_url') is not None: - metadata['purl'] = info['webpage_url'] - if info.get('album') is not None: - metadata['album'] = info['album'] + + def add(meta_list, info_list=None): + if not info_list: + info_list = meta_list + if not isinstance(meta_list, (list, tuple)): + meta_list = (meta_list,) + if not isinstance(info_list, (list, tuple)): + info_list = (info_list,) + for info_f in info_list: + if info.get(info_f) is not None: + for meta_f in meta_list: + metadata[meta_f] = info[info_f] + break + + add('title', ('track', 'title')) + add('date', 'upload_date') + add(('description', 'comment'), 'description') + add('purl', 'webpage_url') + add('track', 'track_number') + add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) + add('genre') + add('album') + add('album_artist') + add('disc', 'disc_number') if not metadata: self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')