From 4c1b2e5c0ea6a041bfd773efd7c4ac78ac8f3b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 04:18:56 +0600 Subject: [PATCH 01/82] [tagesschau] Add support for playlists --- youtube_dl/extractor/tagesschau.py | 110 ++++++++++++++++++----------- 1 file changed, 68 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index fcccb230c..e58385c57 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -8,11 +8,11 @@ from ..utils import parse_filesize class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', - 'md5': '917a228bc7df7850783bc47979673a09', + 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 'info_dict': { 'id': '102143', 'ext': 'mp4', @@ -40,6 +40,13 @@ class TagesschauIE(InfoExtractor): 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', 'thumbnail': 're:^https?:.*\.jpg$', }, + }, { + 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', + 'info_dict': { + 'id': '135', + 'title': 'Möchtegern-Underdog mit Machtanspruch', + }, + 'playlist_count': 2, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -75,6 +82,41 @@ class TagesschauIE(InfoExtractor): 'xxl': {'quality': 5}, } + def _extract_formats(self, download_text): + links = re.finditer( + r'', + download_text) + formats = [] + for l in links: + format_id = self._search_regex( + r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + format = { + 'format_id': format_id, + 'url': l.group('url'), + 'format_name': l.group('name'), + } + m = re.match( + r'''(?x) + Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; + (?P[0-9]+)x(?P[0-9]+)px&\#10; + (?P[0-9]+)kbps&\#10; + Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', + l.group('title')) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + formats.append(format) + self._sort_formats(formats) + return formats + def _real_extract(self, url): video_id = self._match_id(url) display_id = video_id.lstrip('-') @@ -94,14 +136,14 @@ class TagesschauIE(InfoExtractor): (?:,\s*quality:(?P["\'])(?P.+?)(?P=q_quality))? ''', playerpage): url = media.group('url') - type_ = media.group('type') + webpage_type = media.group('type') ext = media.group('ext') res = media.group('quality') f = { 'format_id': '%s_%s' % (res, ext) if res else ext, 'url': url, 'ext': ext, - 'vcodec': 'none' if type_ == 'audio' else None, + 'vcodec': 'none' if webpage_type == 'audio' else None, } f.update(self._FORMATS.get(res, {})) formats.append(f) @@ -109,47 +151,31 @@ class TagesschauIE(InfoExtractor): title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() else: - download_text = self._search_regex( - r'(?s)

<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p', - webpage, 'download links') - links = re.finditer( - r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]*)">(?P<name>.+?)</a></div>
', - download_text) - formats = [] - for l in links: - format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') - format = { - 'format_id': format_id, - 'url': l.group('url'), - 'format_name': l.group('name'), - } - m = re.match( - r'''(?x) - Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; - (?P[0-9]+)x(?P[0-9]+)px&\#10; - (?P[0-9]+)kbps&\#10; - Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) - formats.append(format) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)

<p class="teasertext">(.*?)</p>
', - webpage, 'description', default=None) title = self._html_search_regex( r'<span class="headline".*?>(.*?)</span>', webpage, 'title') + DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p
' + + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, download_text) in enumerate(re.findall( + r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>
.*?%s' % DOWNLOAD_REGEX, + webpage)): + entries.append({ + 'id': display_id, + 'title': '%s-%d' % (entry_title, num), + 'formats': self._extract_formats(download_text), + }) + return self.playlist_result(entries, display_id, title) + else: # Assume single video + download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') + formats = self._extract_formats(download_text) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)

<p class="teasertext">(.*?)</p>
', + webpage, 'description', default=None) + self._sort_formats(formats) return { From 1a2b377cc2fa9546fa08a7777a6fc5fc545cc441 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 04:38:46 +0600 Subject: [PATCH 02/82] [tagesschau] Fix audio support --- youtube_dl/extractor/tagesschau.py | 75 +++++++++++++++++++----------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index e58385c57..ccc2d476d 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import ( + determine_ext, + parse_filesize, +) class TagesschauIE(InfoExtractor): @@ -82,37 +85,54 @@ class TagesschauIE(InfoExtractor): 'xxl': {'quality': 5}, } - def _extract_formats(self, download_text): + def _extract_formats(self, download_text, media_kind): links = re.finditer( r'', download_text) formats = [] for l in links: + link_url = l.group('url') + if not link_url: + continue format_id = self._search_regex( - r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID') + r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID', + default=determine_ext(link_url)) format = { 'format_id': format_id, 'url': l.group('url'), 'format_name': l.group('name'), } - m = re.match( - r'''(?x) - Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; - (?P[0-9]+)x(?P[0-9]+)px&\#10; - (?P[0-9]+)kbps&\#10; - Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; - Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', - l.group('title')) - if m: - format.update({ - 'format_note': m.group('audio_desc'), - 'vcodec': m.group('vcodec'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - 'abr': int(m.group('abr')), - 'vbr': int(m.group('vbr')), - 'filesize_approx': parse_filesize(m.group('filesize_approx')), - }) + title = l.group('title') + if title: + if media_kind.lower() == 'video': + m = re.match( + r'''(?x) + Video:\s*(?P[a-zA-Z0-9/._-]+)\s*&\#10; + (?P[0-9]+)x(?P[0-9]+)px&\#10; + (?P[0-9]+)kbps&\#10; + Audio:\s*(?P[0-9]+)kbps,\s*(?P[A-Za-z\.0-9]+)&\#10; + Größe:\s*(?P[0-9.,]+\s+[a-zA-Z]*B)''', + title) + if m: + format.update({ + 'format_note': m.group('audio_desc'), + 'vcodec': m.group('vcodec'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + 'abr': int(m.group('abr')), + 'vbr': int(m.group('vbr')), + 'filesize_approx': parse_filesize(m.group('filesize_approx')), + }) + else: + m = re.match( + r'(?P.+?)-Format\s*:\s*(?P\d+)kbps\s*,\s*(?P.+)', + title) + if m: + format.update({ + 'format_note': '%s, %s' % (m.group('format'), m.group('note')), + 'vcodec': 'none', + 'abr': int(m.group('abr')), + }) formats.append(format) self._sort_formats(formats) return formats @@ -154,23 +174,26 @@ class TagesschauIE(InfoExtractor): title = self._html_search_regex( r'(.*?)', webpage, 'title') - DOWNLOAD_REGEX = r'(?s)

<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p
' + DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p
' webpage_type = self._og_search_property('type', webpage, default=None) if webpage_type == 'website': # Article entries = [] - for num, (entry_title, download_text) in enumerate(re.findall( + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>
.*?%s' % DOWNLOAD_REGEX, webpage)): entries.append({ 'id': display_id, 'title': '%s-%d' % (entry_title, num), - 'formats': self._extract_formats(download_text), + 'formats': self._extract_formats(download_text, media_kind), }) return self.playlist_result(entries, display_id, title) else: # Assume single video - download_text = self._search_regex(DOWNLOAD_REGEX, webpage, 'download links') - formats = self._extract_formats(download_text) + download_text = self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='links') + formats = self._extract_formats(download_text, media_kind) thumbnail = self._og_search_thumbnail(webpage) description = self._html_search_regex( r'(?s)

<p class="teasertext">(.*?)</p>
', From 2844b093360cf53829e1c127aba0bbc4a6a279a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 04:42:05 +0600 Subject: [PATCH 03/82] [tagesschau] Fix article media ids --- youtube_dl/extractor/tagesschau.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index ccc2d476d..6b71c8f81 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -181,10 +181,10 @@ class TagesschauIE(InfoExtractor): entries = [] for num, (entry_title, media_kind, download_text) in enumerate(re.findall( r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, - webpage)): + webpage), 1): entries.append({ - 'id': display_id, - 'title': '%s-%d' % (entry_title, num), + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, 'formats': self._extract_formats(download_text, media_kind), }) return self.playlist_result(entries, display_id, title) From 9cf79e8f4bd7e0dfdb2ea9d29cf3ba7d3c6ab647 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sun, 1 May 2016 01:43:58 +0100 Subject: [PATCH 04/82] [ccc] improve extraction --- youtube_dl/extractor/ccc.py | 111 ++++++++++++------------------------ 1 file changed, 38 insertions(+), 73 deletions(-) diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index dda2c0959..8f7f09e22 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -1,13 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, - parse_duration, - qualities, - unified_strdate, + parse_iso8601, ) @@ -19,14 +15,14 @@ class CCCIE(InfoExtractor): 'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video', 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { - 'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor', + 'id': '1839', 'ext': 'mp4', 'title': 'Introduction to Processor Design', - 'description': 'md5:80be298773966f66d56cb11260b879af', + 'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac', 'thumbnail': 're:^https?://.*\.jpg$', - 'view_count': int, 'upload_date': '20131228', - 'duration': 3660, + 'timestamp': 1388188800, + 'duration': 3710, } }, { 'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download', @@ -34,79 +30,48 @@ class CCCIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id') + event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id) - if self._downloader.params.get('prefer_free_formats'): - preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd']) - else: - preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd']) - - title = self._html_search_regex( - r'(?s)

<h1>(.*?)</h1>
', webpage, 'title') - description = self._html_search_regex( - r'(?s)

<h3>About</h3>(.+?)<h3>
', - webpage, 'description', fatal=False) - upload_date = unified_strdate(self._html_search_regex( - r"(?s)]+class='[^']*fa-calendar-o'[^>]*>(.+?)", - webpage, 'upload date', fatal=False)) - view_count = int_or_none(self._html_search_regex( - r"(?s)(.*?)", - webpage, 'view count', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'(?s)]+class=(["\']).*?fa-clock-o.*?\1[^>]*>(?P.+?)(?P[^<]*)\s* - <(?:span|div)\s+class='label\s+filetype'>(?P[^<]*)\s* - [^']+)'>\s* - (?: - .*? - [^']+\.torrent)' - )?''', webpage) formats = [] - for m in matches: - format = m.group('format') - format_id = self._search_regex( - r'.*/([a-z0-9_-]+)/[^/]*$', - m.group('http_url'), 'format id', default=None) - if format_id: - format_id = m.group('lang') + '-' + format_id - vcodec = 'h264' if 'h264' in format_id else ( - 'none' if format_id in ('mp3', 'opus') else None + for recording in event_data.get('recordings', []): + recording_url = recording.get('recording_url') + if not recording_url: + continue + language = recording.get('language') + folder = recording.get('folder') + format_id = None + if language: + format_id = language + if folder: + if language: + format_id += '-' + folder + else: + format_id = folder + vcodec = 'h264' if 'h264' in folder else ( + 'none' if folder in ('mp3', 'opus') else None ) formats.append({ 'format_id': format_id, - 'format': format, - 'language': m.group('lang'), - 'url': m.group('http_url'), + 'url': recording_url, + 'width': int_or_none(recording.get('width')), + 'height': int_or_none(recording.get('height')), + 'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024), + 'language': language, 'vcodec': vcodec, - 'preference': preference(format_id), }) - - if m.group('torrent_url'): - formats.append({ - 'format_id': 'torrent-%s' % (format if format_id is None else format_id), - 'format': '%s (torrent)' % format, - 'proto': 'torrent', - 'format_note': '(unsupported; will just download the .torrent file)', - 'vcodec': vcodec, - 'preference': -100 + preference(format_id), - 'url': m.group('torrent_url'), - }) self._sort_formats(formats) - thumbnail = self._html_search_regex( - r" Date: Sun, 1 May 2016 06:44:59 +0600 Subject: [PATCH 05/82] [tagesschau] Separate player extractor --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/tagesschau.py | 235 ++++++++++++++++++++--------- 2 files changed, 168 insertions(+), 72 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 737960a01..4aee53d6a 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -724,7 +724,10 @@ from .svt import ( from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE +from .tagesschau import ( + TagesschauPlayerIE, + TagesschauIE, +) from .tapely import TapelyIE from .tass import TassIE from .tdslifeway import TDSLifewayIE diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 6b71c8f81..a71fbad7d 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -6,10 +6,124 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + js_to_json, + parse_iso8601, parse_filesize, ) +class TagesschauPlayerIE(InfoExtractor): + IE_NAME = 'tagesschau:player' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?Paudio|video)/(?P=kind)-(?P\d+)~player(?:_[^/?#&]+)?\.html' + + _TESTS = [{ + 'url': 
'http://www.tagesschau.de/multimedia/video/video-179517~player.html', + 'md5': '8d09548d5c15debad38bee3a4d15ca21', + 'info_dict': { + 'id': '179517', + 'ext': 'mp4', + 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD', + 'thumbnail': 're:^https?:.*\.jpg$', + 'formats': 'mincount:6', + }, + }, { + 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', + 'info_dict': { + 'id': '29417', + 'ext': 'mp3', + 'title': 'Trabi - Bye, bye Rennpappe', + 'thumbnail': 're:^https?:.*\.jpg$', + 'formats': 'mincount:2', + }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html', + 'only_matching': True, + }] + + _FORMATS = { + 'xs': {'quality': 0}, + 's': {'width': 320, 'height': 180, 'quality': 1}, + 'm': {'width': 512, 'height': 288, 'quality': 2}, + 'l': {'width': 960, 'height': 540, 'quality': 3}, + 'xl': {'width': 1280, 'height': 720, 'quality': 4}, + 'xxl': {'quality': 5}, + } + + def _extract_via_api(self, kind, video_id): + info = self._download_json( + 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id), + video_id) + title = info['headline'] + formats = [] + for media in info['mediadata']: + for format_id, format_url in media.items(): + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls')) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'vcodec': 'none' if kind == 'audio' else None, + }) + self._sort_formats(formats) + timestamp = parse_iso8601(info.get('date')) + return { + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'formats': formats, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # kind = mobj.group('kind').lower() + # if kind == 'video': + # return self._extract_via_api(kind, video_id) + + # JSON api does not provide some audio formats (e.g. 
ogg) thus + # extractiong audio via webpage + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage).strip() + formats = [] + + for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage): + media = self._parse_json(js_to_json(media_json), video_id, fatal=False) + if not media: + continue + src = media.get('src') + if not src: + return + quality = media.get('quality') + kind = media.get('type', '').split('/')[0] + ext = determine_ext(src) + f = { + 'url': src, + 'format_id': '%s_%s' % (quality, ext) if quality else ext, + 'ext': ext, + 'vcodec': 'none' if kind == 'audio' else None, + } + f.update(self._FORMATS.get(quality, {})) + formats.append(f) + + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } + + class TagesschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' @@ -20,7 +134,7 @@ class TagesschauIE(InfoExtractor): 'id': '102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', - 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', + 'description': '18.07.2015 20:10 Uhr', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { @@ -29,18 +143,30 @@ class TagesschauIE(InfoExtractor): 'info_dict': { 'id': '5727', 'ext': 'mp4', - 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', + 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { - 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', - 'md5': 'aef45de271c4bf0a5db834aa40bf774c', + # exclusive audio + 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', + 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 'info_dict': { - 'id': '18407', + 'id': '29417', 'ext': 'mp3', - 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', - 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'title': 'Trabi - Bye, bye Rennpappe', + 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', + 'thumbnail': 're:^https?:.*\.jpg$', + }, + }, { + # audio in article + 'url': 'http://www.tagesschau.de/inland/bnd-303.html', + 'md5': 'e0916c623e85fc1d2b26b78f299d3958', + 'info_dict': { + 'id': '303', + 'ext': 'mp3', + 'title': 'Viele Baustellen für neuen BND-Chef', + 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', 'thumbnail': 're:^https?:.*\.jpg$', }, }, { @@ -71,19 +197,11 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html', - 'only_matching': True, }] - _FORMATS = { - 'xs': {'quality': 0}, - 's': {'width': 320, 'height': 180, 'quality': 1}, - 'm': {'width': 512, 'height': 288, 'quality': 2}, - 'l': {'width': 960, 'height': 540, 'quality': 3}, - 'xl': {'width': 1280, 'height': 720, 'quality': 4}, - 'xxl': {'quality': 5}, - } + @classmethod + def suitable(cls, url): + return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url) def _extract_formats(self, download_text, media_kind): links = re.finditer( @@ -140,64 +258,39 @@ class TagesschauIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) display_id = video_id.lstrip('-') + webpage = 
self._download_webpage(url, display_id) - player_url = self._html_search_meta( - 'twitter:player', webpage, 'player URL', default=None) - if player_url: - playerpage = self._download_webpage( - player_url, display_id, 'Downloading player page') + title = self._html_search_regex( + r']*class="headline"[^>]*>(.+?)', + webpage, 'title', default=None) or self._og_search_title(webpage) - formats = [] - for media in re.finditer( - r'''(?x) - (?P["\'])(?Phttp://media.+?)(?P=q_url) - ,\s*type:(?P["\'])(?Pvideo|audio)/(?P.+?)(?P=q_type) - (?:,\s*quality:(?P["\'])(?P.+?)(?P=q_quality))? - ''', playerpage): - url = media.group('url') - webpage_type = media.group('type') - ext = media.group('ext') - res = media.group('quality') - f = { - 'format_id': '%s_%s' % (res, ext) if res else ext, - 'url': url, - 'ext': ext, - 'vcodec': 'none' if webpage_type == 'audio' else None, - } - f.update(self._FORMATS.get(res, {})) - formats.append(f) - thumbnail = self._og_search_thumbnail(playerpage) - title = self._og_search_title(webpage).strip() - description = self._og_search_description(webpage).strip() - else: - title = self._html_search_regex( - r'(.*?)', webpage, 'title') + DOWNLOAD_REGEX = r'(?s)

<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p
' - DOWNLOAD_REGEX = r'(?s)

<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p
' - - webpage_type = self._og_search_property('type', webpage, default=None) - if webpage_type == 'website': # Article - entries = [] - for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>
.*?%s' % DOWNLOAD_REGEX, - webpage), 1): - entries.append({ - 'id': '%s-%d' % (display_id, num), - 'title': '%s' % entry_title, - 'formats': self._extract_formats(download_text, media_kind), - }) + webpage_type = self._og_search_property('type', webpage, default=None) + if webpage_type == 'website': # Article + entries = [] + for num, (entry_title, media_kind, download_text) in enumerate(re.findall( + r'(?s)<p[^>]+class="infotext"[^>]*>.*?<strong>(.+?)</strong>.*?</p>
.*?%s' % DOWNLOAD_REGEX, + webpage), 1): + entries.append({ + 'id': '%s-%d' % (display_id, num), + 'title': '%s' % entry_title, + 'formats': self._extract_formats(download_text, media_kind), + }) + if len(entries) > 1: return self.playlist_result(entries, display_id, title) - else: # Assume single video - download_text = self._search_regex( - DOWNLOAD_REGEX, webpage, 'download links', group='links') - media_kind = self._search_regex( - DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='links') - formats = self._extract_formats(download_text, media_kind) - thumbnail = self._og_search_thumbnail(webpage) - description = self._html_search_regex( - r'(?s)

<p class="teasertext">(.*?)</p>
', - webpage, 'description', default=None) + formats = entries[0]['formats'] + else: # Assume single video + download_text = self._search_regex( + DOWNLOAD_REGEX, webpage, 'download links', group='links') + media_kind = self._search_regex( + DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind') + formats = self._extract_formats(download_text, media_kind) + thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_regex( + r'(?s)

<p class="teasertext">(.*?)</p>
', + webpage, 'description', default=None) self._sort_formats(formats) From 651ad35ce0f0ee9d04db085c50c29441b47bc825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 06:57:19 +0600 Subject: [PATCH 06/82] [tagesschau] Relax _VALID_URL --- youtube_dl/extractor/tagesschau.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index a71fbad7d..f6102c224 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -125,7 +125,7 @@ class TagesschauPlayerIE(InfoExtractor): class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)?)(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', @@ -197,6 +197,9 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/100sekunden/index.html', + 'only_matching': True, }] @classmethod @@ -256,7 +259,8 @@ class TagesschauIE(InfoExtractor): return formats def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') display_id = video_id.lstrip('-') webpage = self._download_webpage(url, display_id) From 854cc54bc1d0488d8fa88bd5dfed6f7f8981847e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 07:01:55 +0600 Subject: [PATCH 07/82] [tagesschau] Expand video id --- youtube_dl/extractor/tagesschau.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index f6102c224..499bd260b 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -125,13 +125,13 @@ class TagesschauPlayerIE(InfoExtractor): class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)?)(?:~_?[^/#?]+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P[^/]+/(?:[^/]+/)*?(?P[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6', 'info_dict': { - 'id': '102143', + 'id': 'video-102143', 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'description': '18.07.2015 20:10 Uhr', @@ -141,7 +141,7 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', 'md5': '3c54c1f6243d279b706bde660ceec633', 'info_dict': { - 'id': '5727', + 'id': 'ts-5727', 'ext': 'mp4', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'description': 'md5:695c01bfd98b7e313c501386327aea59', @@ -152,7 +152,7 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html', 'md5': '76e6eec6ebd40740671cf0a2c88617e5', 'info_dict': { - 'id': '29417', + 'id': 'audio-29417', 'ext': 'mp3', 'title': 'Trabi - Bye, bye Rennpappe', 'description': 'md5:8687dda862cbbe2cfb2df09b56341317', @@ -163,7 +163,7 @@ class TagesschauIE(InfoExtractor): 'url': 'http://www.tagesschau.de/inland/bnd-303.html', 'md5': 'e0916c623e85fc1d2b26b78f299d3958', 
'info_dict': { - 'id': '303', + 'id': 'bnd-303', 'ext': 'mp3', 'title': 'Viele Baustellen für neuen BND-Chef', 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4', @@ -172,7 +172,7 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html', 'info_dict': { - 'id': '135', + 'id': 'afd-parteitag-135', 'title': 'Möchtegern-Underdog mit Machtanspruch', }, 'playlist_count': 2, From 68bb2fef9565159eba4a47f464b6b420cf2d5cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 07:15:23 +0600 Subject: [PATCH 08/82] [tagesschau] Restrict playlist entry regex --- youtube_dl/extractor/tagesschau.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 499bd260b..136e18f96 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -200,6 +200,10 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/100sekunden/index.html', 'only_matching': True, + }, { + # playlist article with collapsing sections + 'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html', + 'only_matching': True, }] @classmethod @@ -275,7 +279,7 @@ class TagesschauIE(InfoExtractor): if webpage_type == 'website': # Article entries = [] for num, (entry_title, media_kind, download_text) in enumerate(re.findall( - r'(?s)]+class="infotext"[^>]*>.*?(.+?).*?

.*?%s' % DOWNLOAD_REGEX, + r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>
.*?%s' % DOWNLOAD_REGEX, webpage), 1): entries.append({ 'id': '%s-%d' % (display_id, num), From 6f27bf1c7425d97eb07aee9f7e15d0066b0a74bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 08:08:51 +0600 Subject: [PATCH 09/82] Credit @blahgeek for xiami (#9079) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 07cade723..814fe9ec3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -168,3 +168,4 @@ José Joaquín Atria Viťas Strádal Kagami Hiiragi Philip Huppert +blahgeek From 4bd143a3a06264fcda5fa254709d404ccab6601c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 10:56:54 +0600 Subject: [PATCH 10/82] [postprocessor/ffmpeg] Simplify metadata preparation and add track related metafields (Closes #9357) --- youtube_dl/postprocessor/ffmpeg.py | 41 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1793a878c..fa99b0c2a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -389,23 +389,30 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): class FFmpegMetadataPP(FFmpegPostProcessor): def run(self, info): metadata = {} - if info.get('title') is not None: - metadata['title'] = info['title'] - if info.get('upload_date') is not None: - metadata['date'] = info['upload_date'] - if info.get('artist') is not None: - metadata['artist'] = info['artist'] - elif info.get('uploader') is not None: - metadata['artist'] = info['uploader'] - elif info.get('uploader_id') is not None: - metadata['artist'] = info['uploader_id'] - if info.get('description') is not None: - metadata['description'] = info['description'] - metadata['comment'] = info['description'] - if info.get('webpage_url') is not None: - metadata['purl'] = info['webpage_url'] - if info.get('album') is not None: - metadata['album'] = info['album'] + + def add(meta_list, info_list=None): + if not info_list: + info_list = meta_list + if not isinstance(meta_list, (list, tuple)): + meta_list = (meta_list,) + if not isinstance(info_list, (list, tuple)): + info_list = (info_list,) + for info_f in info_list: + if info.get(info_f) is not None: + for meta_f in meta_list: + metadata[meta_f] = info[info_f] + break + + add('title', ('track', 'title')) + add('date', 'upload_date') + add(('description', 'comment'), 'description') + add('purl', 'webpage_url') + add('track', 'track_number') + add('artist', ('artist', 'creator', 'uploader', 'uploader_id')) + add('genre') + add('album') + add('album_artist') + add('disc', 'disc_number') if not metadata: self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') From 0d66bd0eab436f7215f5da168b378127898ccd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 1 May 2016 13:56:51 +0600 Subject: [PATCH 11/82] [downloader/hls] Delegate extraction to ffmpeg when unsupported features detected --- youtube_dl/downloader/hls.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a01dac031..d7b34bde3 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,6 +4,7 @@ import os.path import re from .fragment import FragmentFD +from .external import FFmpegFD from ..compat import compat_urlparse from ..utils import ( @@ -17,12 +18,34 @@ class HlsFD(FragmentFD): FD_NAME = 'hlsnative' + @staticmethod + def can_download(manifest): + 
UNSUPPORTED_FEATURES = ( + r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] + r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] + r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 + # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 + # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + ) + return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + def real_download(self, filename, info_dict): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) manifest = self.ydl.urlopen(man_url).read() s = manifest.decode('utf-8', 'ignore') + + if not self.can_download(s): + self.report_warning( + 'hlsnative has detected features it does not support, ' + 'extraction will be delegated to ffmpeg') + fd = FFmpegFD(self.ydl, self.params) + for ph in self._progress_hooks: + fd.add_progress_hook(ph) + return fd.real_download(filename, info_dict) + fragment_urls = [] for line in s.splitlines(): line = line.strip() From 174aba3223d0b01ec30cc0a814c6c0813ce13f15 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 1 May 2016 10:19:14 +0200 Subject: [PATCH 12/82] release 2016.05.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- docs/supportedsites.md | 14 +++++++++----- youtube_dl/version.py | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c208eb689..a26ff1de4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.24 +[debug] youtube-dl version 2016.05.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 03875b8db..9fb43671f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -338,7 +338,6 @@ - **mailru**: Видео@Mail.Ru - **MakersChannel** - **MakerTV** - - **Malemotion** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** @@ -375,8 +374,8 @@ - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** - - **muzu.tv** - **Mwave** + - **MwaveMeetGreet** - **MySpace** - **MySpace:album** - **MySpass** @@ -554,7 +553,6 @@ - **SenateISVP** - **ServingSys** - **Sexu** - - **SexyKarma**: Sexy Karma and Watch Indian Porn - **Shahid** - **Shared**: shared.sx and vivo.sx - **ShareSix** @@ -567,8 +565,6 @@ - **smotri:broadcast**: Smotri.com broadcasts - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - - **SnagFilms** - - **SnagFilmsEmbed** - **Snotr** - **Sohu** - **soundcloud** @@ -610,6 +606,7 @@ - **Syfy** - **SztvHu** - **Tagesschau** + - **tagesschau:player** - **Tapely** - **Tass** - **TDSLifeway** @@ -725,6 +722,8 @@ - **Vidzi** - **vier** - **vier:videos** + - **ViewLift** + - **ViewLiftEmbed** - **Viewster** - **Viidea** - **viki** @@ -756,6 +755,7 @@ - **Walla** - **WashingtonPost** - **wat.tv** + - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** - **WDRMaus**: Sendung mit der Maus @@ -775,6 +775,10 @@ - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me - **XHamster** - **XHamsterEmbed** + - **xiami:album**: 虾米音乐 - 专辑 + - **xiami:artist**: 虾米音乐 - 歌手 + - **xiami:collection**: 虾米音乐 - 精选集 + - **xiami:song**: 虾米音乐 - **XMinus** - **XNXX** - **Xstream** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8befd9607..551160897 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.04.24' +__version__ = '2016.05.01' From e0da32df6e924535dbbe1e1c8ad05386dcf93029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 00:48:26 +0600 Subject: [PATCH 13/82] [vevo:playlist] Add extractor (Closes #9334, closes #9364) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/vevo.py | 74 +++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4aee53d6a..ef4431364 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -849,7 +849,10 @@ from .veehd import VeeHDIE from .veoh import VeohIE from .vessel import VesselIE from .vesti import VestiIE -from .vevo import VevoIE +from 
.vevo import ( + VevoIE, + VevoPlaylistIE, +) from .vgtv import ( BTArticleIE, BTVestlendingenIE, diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 147480f64..4ad1e87e4 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_etree_fromstring +from ..compat import ( + compat_etree_fromstring, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -18,7 +21,7 @@ class VevoIE(InfoExtractor): (currently used by MTVIE and MySpaceIE) ''' _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| + (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) @@ -301,3 +304,70 @@ class VevoIE(InfoExtractor): 'view_count': view_count, 'age_limit': age_limit, } + + +class VevoPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://www\.vevo\.com/watch/(?:playlist|genre)/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', + 'info_dict': { + 'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29', + 'title': 'Best-Of: Birdman', + }, + 'playlist_count': 10, + 'params': { + 'proxy': '52.53.186.253:8083', + 'no_check_certificate': True, + }, + }, { + 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0', + 'md5': '32dcdfddddf9ec6917fc88ca26d36282', + 'info_dict': { + 'id': 'USCMV1100073', + 'ext': 'mp4', + 'title': 'Y.U. MAD', + 'timestamp': 1323417600, + 'upload_date': '20111209', + 'uploader': 'Birdman', + }, + 'expected_warnings': ['Unable to download SMIL file'], + 'params': { + 'proxy': '52.53.186.253:8083', + 'no_check_certificate': True, + }, + }, { + 'url': 'http://www.vevo.com/watch/genre/rock?index=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + index = qs.get('index', [None])[0] + + if index: + video_id = self._search_regex( + r']+content=(["\'])vevo://video/(?P.+?)\1[^>]*>', + webpage, 'video id', default=None, group='id') + if video_id: + return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) + + playlists = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*', + webpage, 'initial store'), + playlist_id)['default']['playlists'] + + playlist = list(playlists.values())[0] + + entries = [ + self.url_result('vevo:%s' % src, VevoIE.ie_key()) + for src in playlist['isrcs']] + + return self.playlist_result( + entries, playlist.get('playlistId'), + playlist.get('name'), playlist.get('description')) From 0c9d288ba0db3c1309bc6090457e11375073f3eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 00:50:31 +0600 Subject: [PATCH 14/82] [vevo:playlist] Remove debug params --- youtube_dl/extractor/vevo.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4ad1e87e4..8431077ad 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -316,10 +316,6 @@ class VevoPlaylistIE(InfoExtractor): 'title': 'Best-Of: Birdman', }, 'playlist_count': 10, - 'params': { - 'proxy': '52.53.186.253:8083', - 'no_check_certificate': True, - }, }, { 
'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0', 'md5': '32dcdfddddf9ec6917fc88ca26d36282', @@ -332,10 +328,6 @@ class VevoPlaylistIE(InfoExtractor): 'uploader': 'Birdman', }, 'expected_warnings': ['Unable to download SMIL file'], - 'params': { - 'proxy': '52.53.186.253:8083', - 'no_check_certificate': True, - }, }, { 'url': 'http://www.vevo.com/watch/genre/rock?index=0', 'only_matching': True, From e2bd301ce7795597a7e3ef7f5a5446f9ec987883 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 01:00:42 +0600 Subject: [PATCH 15/82] [vevo:playlist] Fix genre playlists --- youtube_dl/extractor/vevo.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 8431077ad..2d1ff05e1 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -307,7 +307,7 @@ class VevoIE(InfoExtractor): class VevoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://www\.vevo\.com/watch/(?:playlist|genre)/(?P[^/?#&]+)' + _VALID_URL = r'https?://www\.vevo\.com/watch/(?Pplaylist|genre)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', @@ -316,6 +316,13 @@ class VevoPlaylistIE(InfoExtractor): 'title': 'Best-Of: Birdman', }, 'playlist_count': 10, + }, { + 'url': 'http://www.vevo.com/watch/genre/rock', + 'info_dict': { + 'id': 'rock', + 'title': 'Rock', + }, + 'playlist_count': 20, }, { 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0', 'md5': '32dcdfddddf9ec6917fc88ca26d36282', @@ -334,7 +341,9 @@ class VevoPlaylistIE(InfoExtractor): }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + playlist_kind = mobj.group('kind') webpage = self._download_webpage(url, playlist_id) @@ -352,9 +361,10 @@ class VevoPlaylistIE(InfoExtractor): self._search_regex( r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*', webpage, 'initial store'), - playlist_id)['default']['playlists'] + playlist_id)['default']['%ss' % playlist_kind] - playlist = list(playlists.values())[0] + playlist = (list(playlists.values())[0] + if playlist_kind == 'playlist' else playlists[playlist_id]) entries = [ self.url_result('vevo:%s' % src, VevoIE.ie_key()) From 516ea41a7dd5a350e93ea7cc5ca2c1fcbd0cb43b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 02:54:50 +0600 Subject: [PATCH 16/82] [vevo] Fix _call_api --- youtube_dl/extractor/vevo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 2d1ff05e1..35f974c4e 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -149,8 +149,8 @@ class VevoIE(InfoExtractor): auth_info = self._parse_json(webpage, video_id) self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] - def _call_api(self, path, video_id, note, errnote, fatal=True): - return self._download_json(self._api_url_template % path, video_id, note, errnote) + def _call_api(self, path, *args, **kwargs): + return self._download_json(self._api_url_template % path, *args, **kwargs) def _real_extract(self, url): video_id = self._match_id(url) From 9618c448247a6aa528b4bf2f289d3dd164c11417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 02:58:20 +0600 Subject: [PATCH 17/82] 
[vevo] Extract video versions from webpage as a last resort (Closes #8426, closes #9366) --- youtube_dl/extractor/vevo.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 35f974c4e..b6c6ba89f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -15,7 +15,16 @@ from ..utils import ( ) -class VevoIE(InfoExtractor): +class VevoBaseIE(InfoExtractor): + def _extract_json(self, webpage, video_id, item): + return self._parse_json( + self._search_regex( + r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*', + webpage, 'initial store'), + video_id)['default'][item] + + +class VevoIE(VevoBaseIE): ''' Accepts urls from vevo.com or in the format 'vevo:{id}' (currently used by MTVIE and MySpaceIE) @@ -186,7 +195,14 @@ class VevoIE(InfoExtractor): video_versions = self._call_api( 'video/%s/streams' % video_id, video_id, 'Downloading video versions info', - 'Failed to download video versions info') + 'Failed to download video versions info', + fatal=False) + + # Some videos are only available via webpage (e.g. + # https://github.com/rg3/youtube-dl/issues/9366) + if not video_versions: + webpage = self._download_webpage(url, video_id) + video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] timestamp = parse_iso8601(video_info.get('releaseDate')) artists = video_info.get('artists') @@ -306,7 +322,7 @@ class VevoIE(InfoExtractor): } -class VevoPlaylistIE(InfoExtractor): +class VevoPlaylistIE(VevoBaseIE): _VALID_URL = r'https?://www\.vevo\.com/watch/(?Pplaylist|genre)/(?P[^/?#&]+)' _TESTS = [{ @@ -357,11 +373,7 @@ class VevoPlaylistIE(InfoExtractor): if video_id: return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) - playlists = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*', - webpage, 'initial store'), - playlist_id)['default']['%ss' % playlist_kind] + playlists = self._extract_json(webpage, playlist_id, '%ss' % playlist_kind) playlist = (list(playlists.values())[0] if playlist_kind == 'playlist' else playlists[playlist_id]) From 8e7d0048886f374a58f0fe8ba021644d7074d02c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 03:06:48 +0600 Subject: [PATCH 18/82] [vevo] Add test for video only available via webpage --- youtube_dl/extractor/vevo.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index b6c6ba89f..4eb98e50d 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -84,6 +84,20 @@ class VevoIE(VevoBaseIE): 'uploader': 'K Camp', 'timestamp': 1449468000, }, + }, { + 'note': 'Only available via webpage', + 'url': 'http://www.vevo.com/watch/GBUV71600656', + 'md5': '67e79210613865b66a47c33baa5e37fe', + 'info_dict': { + 'id': 'GBUV71600656', + 'ext': 'mp4', + 'title': 'Viva Love', + 'upload_date': '20160428', + 'age_limit': 0, + 'uploader': 'ABC', + 'timestamp': 1461830400, + }, + 'expected_warnings': ['Failed to download video versions info'], }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' _SOURCE_TYPES = { From 881dbc86c4e70252a5b8e5c726a6f2a32ff878c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 03:28:58 +0600 Subject: [PATCH 19/82] [vevo] Extract track related metafields and add artists to title (Closes #1684) --- youtube_dl/extractor/vevo.py | 58 ++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 
deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 4eb98e50d..49cb3f479 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -42,11 +42,15 @@ class VevoIE(VevoBaseIE): 'info_dict': { 'id': 'GB1101300280', 'ext': 'mp4', - 'title': 'Somebody to Die For', + 'title': 'Hurts - Somebody to Die For', + 'timestamp': 1372057200, 'upload_date': '20130624', 'uploader': 'Hurts', - 'timestamp': 1372057200, + 'track': 'Somebody to Die For', + 'artist': 'Hurts', + 'genre': 'Pop', }, + 'expected_warnings': ['Unable to download SMIL file'], }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', @@ -54,23 +58,31 @@ class VevoIE(VevoBaseIE): 'info_dict': { 'id': 'USUV71302923', 'ext': 'mp4', - 'title': 'I Wish I Could Break Your Heart', + 'title': 'Cassadee Pope - I Wish I Could Break Your Heart', + 'timestamp': 1392796919, 'upload_date': '20140219', 'uploader': 'Cassadee Pope', - 'timestamp': 1392796919, + 'track': 'I Wish I Could Break Your Heart', + 'artist': 'Cassadee Pope', + 'genre': 'Country', }, + 'expected_warnings': ['Unable to download SMIL file'], }, { 'note': 'Age-limited video', 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', 'info_dict': { 'id': 'USRV81300282', 'ext': 'mp4', - 'title': 'Tunnel Vision (Explicit)', - 'upload_date': '20130703', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'age_limit': 18, - 'uploader': 'Justin Timberlake', 'timestamp': 1372888800, + 'upload_date': '20130703', + 'uploader': 'Justin Timberlake', + 'track': 'Tunnel Vision (Explicit)', + 'artist': 'Justin Timberlake', + 'genre': 'Pop', }, + 'expected_warnings': ['Unable to download SMIL file'], }, { 'note': 'No video_info', 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', @@ -78,11 +90,14 @@ class VevoIE(VevoBaseIE): 'info_dict': { 'id': 'USUV71503000', 'ext': 'mp4', - 'title': 'Till I Die', - 'upload_date': '20151207', + 'title': 'K Camp - Till I Die', 'age_limit': 18, - 'uploader': 'K Camp', 'timestamp': 1449468000, + 'upload_date': '20151207', + 'uploader': 'K Camp', + 'track': 'Till I Die', + 'artist': 'K Camp', + 'genre': 'Rap/Hip-Hop', }, }, { 'note': 'Only available via webpage', @@ -91,11 +106,14 @@ class VevoIE(VevoBaseIE): 'info_dict': { 'id': 'GBUV71600656', 'ext': 'mp4', - 'title': 'Viva Love', - 'upload_date': '20160428', + 'title': 'ABC - Viva Love', 'age_limit': 0, - 'uploader': 'ABC', 'timestamp': 1461830400, + 'upload_date': '20160428', + 'uploader': 'ABC', + 'track': 'Viva Love', + 'artist': 'ABC', + 'genre': 'Pop', }, 'expected_warnings': ['Failed to download video versions info'], }] @@ -184,8 +202,8 @@ class VevoIE(VevoBaseIE): video_info = response.get('video') or {} video_versions = video_info.get('videoVersions') uploader = None - timestamp = None view_count = None + timestamp = None formats = [] if not video_info: @@ -311,7 +329,9 @@ class VevoIE(VevoBaseIE): smil_parsed = True self._sort_formats(formats) - title = video_info['title'] + track = video_info['title'] + title = '%s - %s' % (uploader, track) if uploader else track + genre = video_info.get('genres', [None])[0] is_explicit = video_info.get('isExplicit') if is_explicit is True: @@ -333,6 +353,9 @@ class VevoIE(VevoBaseIE): 'duration': duration, 'view_count': view_count, 'age_limit': age_limit, + 'track': track, + 'artist': uploader, + 'genre': genre, } @@ -359,10 +382,13 @@ class VevoPlaylistIE(VevoBaseIE): 'info_dict': { 
'id': 'USCMV1100073', 'ext': 'mp4', - 'title': 'Y.U. MAD', + 'title': 'Birdman - Y.U. MAD', 'timestamp': 1323417600, 'upload_date': '20111209', 'uploader': 'Birdman', + 'track': 'Y.U. MAD', + 'artist': 'Birdman', + 'genre': 'Rap/Hip-Hop', }, 'expected_warnings': ['Unable to download SMIL file'], }, { From 78a3ff33ab686bc6fc75735e2b2a5935b80311be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 03:29:48 +0600 Subject: [PATCH 20/82] [vevo:playlist] Add fallback for playlist id --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 49cb3f479..bbe29fc51 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -423,5 +423,5 @@ class VevoPlaylistIE(VevoBaseIE): for src in playlist['isrcs']] return self.playlist_result( - entries, playlist.get('playlistId'), + entries, playlist.get('playlistId') or playlist_id, playlist.get('name'), playlist.get('description')) From 9508738f9a9d6fd3de2e60cd7ccb4c8631bea6f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 03:36:40 +0600 Subject: [PATCH 21/82] [vevo] Extract featured artist --- youtube_dl/extractor/vevo.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index bbe29fc51..63eab4148 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -201,6 +201,8 @@ class VevoIE(VevoBaseIE): json_url, video_id, 'Downloading video info', 'Unable to download info') video_info = response.get('video') or {} video_versions = video_info.get('videoVersions') + artist = None + featured_artist = None uploader = None view_count = None timestamp = None @@ -239,7 +241,7 @@ class VevoIE(VevoBaseIE): timestamp = parse_iso8601(video_info.get('releaseDate')) artists = video_info.get('artists') if artists: - uploader = artists[0]['name'] + artist = uploader = artists[0]['name'] view_count = int_or_none(video_info.get('views', {}).get('total')) for video_version in video_versions: @@ -292,7 +294,11 @@ class VevoIE(VevoBaseIE): scale=1000) artists = video_info.get('mainArtists') if artists: - uploader = artists[0]['artistName'] + artist = uploader = artists[0]['artistName'] + + featured_artists = video_info.get('featuredArtists') + if featured_artists: + featured_artist = featured_artists[0]['artistName'] smil_parsed = False for video_version in video_info['videoVersions']: @@ -330,7 +336,9 @@ class VevoIE(VevoBaseIE): self._sort_formats(formats) track = video_info['title'] - title = '%s - %s' % (uploader, track) if uploader else track + if featured_artist: + artist = '%s ft. 
%s' % (artist, featured_artist) + title = '%s - %s' % (artist, track) if artist else track genre = video_info.get('genres', [None])[0] is_explicit = video_info.get('isExplicit') From 686cc8963441c37105c0447f31c5ea21405be05a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 07:07:35 +0600 Subject: [PATCH 22/82] [discovery] Fix typo --- youtube_dl/extractor/discovery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 7c554ec14..55853f76f 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -71,7 +71,7 @@ class DiscoveryIE(InfoExtractor): entries = [] for idx, video_info in enumerate(info['playlist']): - subtitles = [] + subtitles = {} caption_url = video_info.get('captionsUrl') if caption_url: subtitles = { From 79a2e94e79e65cdf4898bc2dedb6a1bb4ca9af3c Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Mon, 2 May 2016 13:21:39 +1000 Subject: [PATCH 23/82] Instead of replacing accented characters with an underscore when sanitizing file names in restricted mode, replace them with their non-accented equivalents fixes #9347 --- test/test_utils.py | 9 +++++++-- youtube_dl/utils.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e16a6761b..0072ba241 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase): self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True)) self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True)) - tests = 'a\xe4b\u4e2d\u56fd\u7684c' - self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c') + tests = 'aäb\u4e2d\u56fd\u7684c' + self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c') self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#' @@ -155,6 +155,11 @@ class TestUtil(unittest.TestCase): self.assertTrue(sanitize_filename('-', restricted=True) != '') self.assertTrue(sanitize_filename(':', restricted=True) != '') + self.assertEqual(sanitize_filename( + 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), + 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') + pass + def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7bcc85e2b..f74f62268 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -14,8 +14,8 @@ import email.utils import errno import functools import gzip -import itertools import io +import itertools import json import locale import math @@ -24,8 +24,8 @@ import os import pipes import platform import re -import ssl import socket +import ssl import struct import subprocess import sys @@ -365,6 +365,11 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): + accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + if restricted and char in accents: + return accents[char] if char == '?' 
or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': From 31c4448f6ea8ec4236e0b67af33b53db1a93ef13 Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Mon, 2 May 2016 13:25:12 +1000 Subject: [PATCH 24/82] Instead of replacing accented characters with an underscore when sanitizing file names in restricted mode, replace them with their non-accented equivalents fixes #9347 --- test/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0072ba241..00ada95ec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -158,7 +158,6 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename( 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') - pass def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') From 5c9ced9504bd2ceb8e55a929124aad2091b23403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 18:19:00 +0600 Subject: [PATCH 25/82] [vevo] Improve genre extraction --- youtube_dl/extractor/vevo.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 63eab4148..3cb0343e6 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, + compat_str, compat_urlparse, ) from ..utils import ( @@ -116,6 +117,10 @@ class VevoIE(VevoBaseIE): 'genre': 'Pop', }, 'expected_warnings': ['Failed to download video versions info'], + }, { + # no genres available + 'url': 'http://www.vevo.com/watch/INS171400764', + 'only_matching': True, }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' _SOURCE_TYPES = { @@ -339,7 +344,11 @@ class VevoIE(VevoBaseIE): if featured_artist: artist = '%s ft. 
%s' % (artist, featured_artist) title = '%s - %s' % (artist, track) if artist else track - genre = video_info.get('genres', [None])[0] + + genres = video_info.get('genres') + genre = ( + genres[0] if genres and isinstance(genres, list) and + isinstance(genres[0], compat_str) else None) is_explicit = video_info.get('isExplicit') if is_explicit is True: From 7960b0563b957d418ddd36555275d98ba4668c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 18:35:50 +0600 Subject: [PATCH 26/82] [YoutubeDL] Properly process unable-to-download-error on python2 --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 055433362..4e57c9687 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1639,7 +1639,7 @@ class YoutubeDL(object): # Just a single file success = dl(filename, info_dict) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self.report_error('unable to download video data: %s' % str(err)) + self.report_error('unable to download video data: %s' % error_to_compat_str(err)) return except (OSError, IOError) as err: raise UnavailableVideoError(err) From df5f4e8888bc02f6064b9b92fbf4cfc4eedd4c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 18:47:35 +0600 Subject: [PATCH 27/82] [vevo] Remove superfluous code --- youtube_dl/extractor/vevo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 3cb0343e6..a6177f2cb 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -205,12 +205,10 @@ class VevoIE(VevoBaseIE): response = self._download_json( json_url, video_id, 'Downloading video info', 'Unable to download info') video_info = response.get('video') or {} - video_versions = video_info.get('videoVersions') artist = None featured_artist = None uploader = None view_count = None - timestamp = None formats = [] if not video_info: From f0e14fdd43bf8e86c5385220430eef842a10ccab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 20:05:06 +0600 Subject: [PATCH 28/82] [YoutubeDL] Skip non-relevant field types when building output template --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4e57c9687..2187dcc8f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -580,7 +580,7 @@ class YoutubeDL(object): is_id=(k == 'id')) template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items() - if v is not None) + if v is not None and not isinstance(v, (list, tuple, dict))) template_dict = collections.defaultdict(lambda: 'NA', template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) From 8a92e51c60fd122e676e4619e7e132b692292801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 2 May 2016 21:31:35 +0600 Subject: [PATCH 29/82] [extractor/common] Relax wording for creator metafield --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 61a5d124c..0843d89af 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -163,7 +163,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. 
     license: License name the video is licensed under.
-    creator: The main artist who created the video.
+    creator: The creator of the video.
     release_date: The date (YYYYMMDD) when the video was released.
     timestamp: UNIX timestamp of the moment the video became available.
     upload_date: Video upload date (YYYYMMDD).

From 6c52a86f54b230a3f08dd10a89f55b8af4d98ee3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?=
Date: Mon, 2 May 2016 21:32:57 +0600
Subject: [PATCH 30/82] [README.md] Update creator description

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ecf737047..50acb26a0 100644
--- a/README.md
+++ b/README.md
@@ -465,7 +465,7 @@ The basic usage is not to set any template arguments when downloading a single f
  - `display_id`: An alternative identifier for the video
  - `uploader`: Full name of the video uploader
  - `license`: License name the video is licensed under
- - `creator`: The main artist who created the video
+ - `creator`: The creator of the video
  - `release_date`: The date (YYYYMMDD) when the video was released
  - `timestamp`: UNIX timestamp of the moment the video became available
  - `upload_date`: Video upload date (YYYYMMDD)

From c587cbb793a6eda4fc7b1c7b4163e236abec1a00 Mon Sep 17 00:00:00 2001
From: Adam Thalhammer
Date: Tue, 3 May 2016 10:40:30 +1000
Subject: [PATCH 31/82] improved performance by extracting accented chars to top level

---
 youtube_dl/utils.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index f74f62268..a5922b2b5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = (
     'wav', 'f4f', 'f4m', 'm3u8', 'smil')
 
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
+
 
 def preferredencoding():
     """Get preferred encoding.
 
@@ -365,11 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False):
     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
     """
     def replace_insane(char):
-        accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
-                           itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
-                                           'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
-        if restricted and char in accents:
-            return accents[char]
+        if restricted and char in ACCENT_CHARS:
+            return ACCENT_CHARS[char]
         if char == '?' 
or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': From a0a309b9734ea3d5f66d4a52e42f5cc24df7f808 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 3 May 2016 16:06:28 +0800 Subject: [PATCH 32/82] [kuwo:category] Fix description and update test --- youtube_dl/extractor/kuwo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 3740869c7..616ed19e1 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -266,7 +266,6 @@ class KuwoCategoryIE(InfoExtractor): 'info_dict': { 'id': '86375', 'title': '八十年代精选', - 'description': '这些都是属于八十年代的回忆!', }, 'playlist_mincount': 24, } @@ -283,6 +282,8 @@ class KuwoCategoryIE(InfoExtractor): category_desc = remove_start( get_element_by_id('intro', webpage).strip(), '%s简介:' % category_name) + if category_desc == '暂无': + category_desc = None jsonm = self._parse_json(self._html_search_regex( r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) From 7759be38da1b5ad99a8ef04d9a1df21b989e3b8c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 3 May 2016 16:19:20 +0800 Subject: [PATCH 33/82] [xiami] Detect georestriction and skip tests --- youtube_dl/extractor/xiami.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index e4ed306b4..a6dfc4af9 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -9,6 +9,11 @@ from ..utils import int_or_none class XiamiBaseIE(InfoExtractor): _API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id' + def _download_webpage(self, *args, **kwargs): + webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) + if '>Xiami is currently not available in your country.<' in webpage: + self.raise_geo_restricted('Xiami is currently not available in your country') + def _extract_track(self, track, track_id=None): title = track['title'] track_url = self._decrypt(track['location']) @@ -81,7 +86,8 @@ class XiamiSongIE(XiamiBaseIE): 'ext': 'lrc', }], }, - } + }, + 'skip': 'Georestricted', }, { 'url': 'http://www.xiami.com/song/1775256504', 'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc', @@ -100,7 +106,8 @@ class XiamiSongIE(XiamiBaseIE): 'ext': 'lrc', }], }, - } + }, + 'skip': 'Georestricted', }] def _real_extract(self, url): @@ -124,6 +131,7 @@ class XiamiAlbumIE(XiamiPlaylistBaseIE): 'id': '2100300444', }, 'playlist_count': 10, + 'skip': 'Georestricted', }, { 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', 'only_matching': True, @@ -141,6 +149,7 @@ class XiamiArtistIE(XiamiPlaylistBaseIE): 'id': '2132', }, 'playlist_count': 20, + 'skip': 'Georestricted', } @@ -155,4 +164,5 @@ class XiamiCollectionIE(XiamiPlaylistBaseIE): 'id': '156527391', }, 'playlist_mincount': 29, + 'skip': 'Georestricted', } From 80bc4106aff21c46cf65c7310f1ed988518b3df5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 May 2016 15:09:23 +0600 Subject: [PATCH 34/82] [xfileshare] Add support for thevideobee.to (Closes #9374) --- youtube_dl/extractor/xfileshare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 2d1504eaa..472ed1b3f 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -16,7 +16,7 @@ class XFileShareIE(InfoExtractor): IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, 
fastvideo.in, realvid.net, filehoot.com and vidto.me' _VALID_URL = r'''(?x) https?://(?P(?:www\.)? - (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw))/ + (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw|thevideobee\.to))/ (?:embed-)?(?P[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? ''' From 41745523918dee174a3e642b629bcbc585931c4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 May 2016 15:35:32 +0600 Subject: [PATCH 35/82] [xfileshare] Refactor _VALID_URL and remove ded sites --- youtube_dl/extractor/xfileshare.py | 39 +++++++++++------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 472ed1b3f..4ab42d24e 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -13,12 +13,20 @@ from ..utils import ( class XFileShareIE(InfoExtractor): - IE_DESC = 'XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me' - _VALID_URL = r'''(?x) - https?://(?P(?:www\.)? - (?:daclips\.in|gorillavid\.in|movpod\.in|fastvideo\.in|realvid\.net|filehoot\.com|vidto\.me|powerwatch\.pw|thevideobee\.to))/ - (?:embed-)?(?P[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)? - ''' + _SITES = ( + ('daclips.in', 'DaClips'), + ('filehoot.com', 'FileHoot'), + ('gorillavid.in', 'GorillaVid'), + ('movpod.in', 'MovPod'), + ('powerwatch.pw', 'PowerWatch'), + ('rapidvideo.ws', 'Rapidvideo.ws'), + ('thevideobee.to', 'TheVideoBee'), + ('vidto.me', 'Vidto'), + ) + + IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) + _VALID_URL = (r'https?://(?P(?:www\.)?(?:%s))/(?:embed-)?(?P[0-9a-zA-Z]+)' + % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEX = r'>(?:404 - )?File Not Found<' @@ -43,25 +51,6 @@ class XFileShareIE(InfoExtractor): 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', 'thumbnail': 're:http://.*\.jpg', } - }, { - # video with countdown timeout - 'url': 'http://fastvideo.in/1qmdn1lmsmbw', - 'md5': '8b87ec3f6564a3108a0e8e66594842ba', - 'info_dict': { - 'id': '1qmdn1lmsmbw', - 'ext': 'mp4', - 'title': 'Man of Steel - Trailer', - 'thumbnail': 're:http://.*\.jpg', - }, - }, { - 'url': 'http://realvid.net/ctn2y6p2eviw', - 'md5': 'b2166d2cf192efd6b6d764c18fd3710e', - 'info_dict': { - 'id': 'ctn2y6p2eviw', - 'ext': 'flv', - 'title': 'rdx 1955', - 'thumbnail': 're:http://.*\.jpg', - }, }, { 'url': 'http://movpod.in/0wguyyxi1yca', 'only_matching': True, From 57d8e32a3ec7fe70522edad6fd0c2847b4e00944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 May 2016 16:58:11 +0600 Subject: [PATCH 36/82] [xfileshare] Add support for streamin.to --- youtube_dl/extractor/xfileshare.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index 4ab42d24e..769003735 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -22,6 +22,7 @@ class XFileShareIE(InfoExtractor): ('rapidvideo.ws', 'Rapidvideo.ws'), ('thevideobee.to', 'TheVideoBee'), ('vidto.me', 'Vidto'), + ('streamin.to', 'Streamin.To'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) From 964f49336fcf94b3a5399a026db3914b27a2a445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 May 2016 21:24:51 +0600 Subject: [PATCH 
37/82] [aol] Improve _VALID_URL (Closes #9381) --- youtube_dl/extractor/aol.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 24df8fe93..42c21bf41 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -12,7 +12,7 @@ from ..utils import ( class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/.*-)(?P[^/?-]+)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/(?:[^/]+/)*(?:[^/?#&]+-)?)(?P[^/?#&]+)' _TESTS = [{ # video with 5min ID @@ -53,6 +53,12 @@ class AolIE(InfoExtractor): }, { 'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763', 'only_matching': True, + }, { + 'url': 'http://on.aol.com/video/519442220', + 'only_matching': True, + }, { + 'url': 'aol-video:5707d6b8e4b090497b04f706', + 'only_matching': True, }] def _real_extract(self, url): From bc7e77a04be6094e64263f9c622cff3cd1fc13cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 3 May 2016 23:18:36 +0600 Subject: [PATCH 38/82] [vevo] Use raise_geo_restricted --- youtube_dl/extractor/vevo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index a6177f2cb..c0ef08c02 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -189,8 +189,8 @@ class VevoIE(VevoBaseIE): errnote='Unable to retrieve oauth token') if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: - raise ExtractorError( - '%s said: This page is currently unavailable in your region.' % self.IE_NAME, expected=True) + self.raise_geo_restricted( + '%s said: This page is currently unavailable in your region' % self.IE_NAME) auth_info = self._parse_json(webpage, video_id) self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] From e960c3c223acadb2fac81fb68595d902cf21e349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:25:39 +0600 Subject: [PATCH 39/82] [yandexmusic:playlist] Improve extraction (Closes #6801) --- youtube_dl/extractor/yandexmusic.py | 38 +++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index ce3723b55..22050add3 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -177,7 +177,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/users/(?P[^/]+)/playlists/(?P\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -201,19 +201,32 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + user = mobj.group('user') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, playlist_id) + playlist = self._download_json( + 'https://music.yandex.%s/handlers/playlist.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 
'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query={ + 'owner': user, + 'kinds': playlist_id, + 'light': 'true', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + })['playlist'] - mu = self._parse_json( - self._search_regex( - r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), - playlist_id) - - playlist = mu['pageData']['playlist'] tracks, track_ids = playlist['tracks'], playlist['trackIds'] - # tracks dictionary shipped with webpage is limited to 150 tracks, + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) @@ -222,10 +235,9 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'https://music.yandex.ru/handlers/track-entries.jsx', urlencode_postdata({ 'entries': ','.join(missing_track_ids), - 'lang': mu.get('settings', {}).get('lang', 'en'), - 'external-domain': 'music.yandex.ru', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, 'overembed': 'false', - 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', })) request.add_header('Referer', url) From 15fc0658f75403e76f9cb29dd4ce5d1e514d4bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:33:29 +0600 Subject: [PATCH 40/82] [yandexmusic:playlist] Modernize --- youtube_dl/extractor/yandexmusic.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 22050add3..0dda901a8 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -231,20 +231,21 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): if len(tracks) < len(track_ids): present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) - request = sanitized_Request( - 'https://music.yandex.ru/handlers/track-entries.jsx', - urlencode_postdata({ + missing_tracks = self._download_json( + 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, + query={ 'entries': ','.join(missing_track_ids), 'lang': tld, 'external-domain': 'music.yandex.%s' % tld, 'overembed': 'false', 'strict': 'true', - })) - request.add_header('Referer', url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - - missing_tracks = self._download_json( - request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + }) if missing_tracks: tracks.extend(missing_tracks) From d36724cca47c0dbbe691d5467c0c88e28a6627db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:34:37 +0600 Subject: [PATCH 41/82] [yandexmusic:playlist] Remove unused imports --- youtube_dl/extractor/yandexmusic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 0dda901a8..a9ebbbc04 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,8 +10,6 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, - sanitized_Request, - urlencode_postdata, ) From 203a3c0e6a589627f9b0f0c4c6ec56e24c27272c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 
May 2016 22:35:28 +0600 Subject: [PATCH 42/82] [yandexmusic:playlist] Make title optional --- youtube_dl/extractor/yandexmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index a9ebbbc04..6e7a23782 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -250,4 +250,4 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), - playlist['title'], playlist.get('description')) + playlist.get('title'), playlist.get('description')) From 2a48e6f01a20545cd117cacb78463dba87d97b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:45:01 +0600 Subject: [PATCH 43/82] [yandexmusic:playlist] Respect track order for long (>150) playlists --- youtube_dl/extractor/yandexmusic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 6e7a23782..283b55a91 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -222,13 +222,17 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks, track_ids = playlist['tracks'], playlist['trackIds'] + tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): - present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) - missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] missing_tracks = self._download_json( 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, playlist_id, 'Downloading missing tracks JSON', From aabdc83d6e9ea1e6a28c48cb228a644c55fc11b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 23:03:44 +0600 Subject: [PATCH 44/82] [udemy] Fix course enroll (Closes #9393) --- youtube_dl/extractor/udemy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index d1e6f2703..7ebd231f0 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -84,7 +84,8 @@ class UdemyIE(InfoExtractor): if enroll_url: webpage = self._download_webpage( combine_url(base_url, enroll_url), - course_id, 'Enrolling in the course') + course_id, 'Enrolling in the course', + headers={'Referer': base_url}) if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) From 75b81df3af31bdbe794b55924dd9ab85e1314d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 23:14:12 +0600 Subject: [PATCH 45/82] [udemy] Modernize --- youtube_dl/extractor/udemy.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 7ebd231f0..13e0cd237 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - 
compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -91,12 +90,12 @@ class UdemyIE(InfoExtractor): def _download_lecture(self, course_id, lecture_id): return self._download_json( - 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse_urlencode({ - 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - })), - lecture_id, 'Downloading lecture JSON') + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' + % (course_id, lecture_id), + lecture_id, 'Downloading lecture JSON', query={ + 'fields[lecture]': 'title,description,view_html,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + }) def _handle_error(self, response): if not isinstance(response, dict): @@ -156,13 +155,13 @@ class UdemyIE(InfoExtractor): 'password': password, }) - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Referer', self._ORIGIN_URL) - request.add_header('Origin', self._ORIGIN_URL) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + self._LOGIN_URL, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._ORIGIN_URL, + 'Origin': self._ORIGIN_URL, + }) if not is_logged(response): error = self._html_search_regex( From 9da526aae75fc7bbc07e791b9f4eab78c007636c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 23:18:48 +0600 Subject: [PATCH 46/82] [yandexmusic:playlist] Update test --- youtube_dl/extractor/yandexmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 283b55a91..0f78466e6 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -194,7 +194,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'id': '1036', 'title': 'Музыка 90-х', }, - 'playlist_count': 310, + 'playlist_mincount': 300, 'skip': 'Travis CI servers blocked by YandexMusic', }] From 4f8c56eb4e52d0a61a5facc6f22e36a2e420f4c9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 5 May 2016 17:55:37 +0800 Subject: [PATCH 47/82] [fczenit] Fix extraction and update test Closes #9359 --- youtube_dl/extractor/fczenit.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index f1f150ef2..8d1010b88 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse class FczenitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P[0-9]+)' _TEST = { - 'url': 'http://fc-zenit.ru/video/gl6785/', - 'md5': '458bacc24549173fe5a5aa29174a5606', + 'url': 'http://fc-zenit.ru/video/41044/', + 'md5': '0e3fab421b455e970fa1aa3891e57df0', 'info_dict': { - 'id': '6785', + 'id': '41044', 'ext': 'mp4', - 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', }, } @@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = 
self._download_webpage(url, video_id)
video_title = self._html_search_regex(r'
([^<]+)', webpage, 'title') + video_title = self._html_search_regex( + r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') - bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') - bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + video_items = self._parse_json(self._search_regex( + r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), + video_id) + + def merge_dicts(*dicts): + ret = {} + for a_dict in dicts: + ret.update(a_dict) + return ret formats = [{ - 'url': furl, - 'tbr': tbr, - } for furl, tbr in bitrates] + 'url': compat_urlparse.urljoin(url, video_url), + 'tbr': int(tbr), + } for tbr, video_url in merge_dicts(*video_items).items()] self._sort_formats(formats) From 758a05924180ad2d1197ee1a293f28363ace54aa Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 5 May 2016 13:12:28 +0100 Subject: [PATCH 48/82] [dailymail] Add new extractor(closes #2667) --- youtube_dl/extractor/dailymail.py | 61 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/dailymail.py diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py new file mode 100644 index 000000000..b60a1d813 --- /dev/null +++ b/youtube_dl/extractor/dailymail.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_protocol, +) + + +class DailyMailIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P[0-9]+)' + _TEST = { + 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', + 'md5': '2f639d446394f53f3a33658b518b6615', + 'info_dict': { + 'id': '1288527', + 'ext': 'mp4', + 'title': 'Turn any video into an impressionist masterpiece', + 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._search_regex( + r"data-opts='({.+?})'", webpage, 'video data'), video_id) + title = video_data['title'] + video_sources = self._download_json(video_data.get( + 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + formats = [] + for rendition in video_sources['renditions']: + rendition_url = rendition.get('url') + if not rendition_url: + continue + tbr = int_or_none(rendition.get('encodingRate'), 1000) + container = rendition.get('videoContainer') + is_hls = container == 'M2TS' + protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) + formats.append({ + 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), + 'url': rendition_url, + 'width': int_or_none(rendition.get('frameWidth')), + 'height': int_or_none(rendition.get('frameHeight')), + 'tbr': tbr, + 'vcodec': rendition.get('videoCodec'), + 'container': container, + 'protocol': protocol, + 'ext': 'mp4' if is_hls else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('descr'), + 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef4431364..aac85066f 100644 --- a/youtube_dl/extractor/extractors.py +++ 
b/youtube_dl/extractor/extractors.py @@ -157,6 +157,7 @@ from .cspan import CSpanIE from .ctsnews import CtsNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE +from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, From 7d08f6073d42c8623031615c4b7e0bb26136c128 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 5 May 2016 20:06:59 +0800 Subject: [PATCH 49/82] [kuwo:category] Update test --- youtube_dl/extractor/kuwo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 616ed19e1..11b31a699 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor): 'info_dict': { 'id': '86375', 'title': '八十年代精选', + 'description': '这些都是属于八十年代的回忆!', }, 'playlist_mincount': 24, } From b1c6a5bac8e3daf233cc79f0172d907905290f24 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 5 May 2016 20:50:39 +0800 Subject: [PATCH 50/82] [Makefile] Remove more media files in `make clean` --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 06cffcb71..c9ce216d1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete From ac12e888f9d76c75666822a79c125db8577e5fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 May 2016 21:02:54 +0600 Subject: [PATCH 51/82] [redtube] Extract all formats, duration, upload date and view count (Closes #9397) --- youtube_dl/extractor/redtube.py | 59 +++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 7ba41ba59..1e532286c 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,7 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int, + unified_strdate, +) class RedTubeIE(InfoExtractor): @@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor): 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', + 'upload_date': '20120831', + 'duration': 596, + 'view_count': int, 'age_limit': 18, } } @@ -24,12 +32,40 @@ class RedTubeIE(InfoExtractor): if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - video_url = self._html_search_regex( - r'', webpage, 'video URL') - video_title = self._html_search_regex( - r'

(.+?)

', - webpage, 'title') - video_thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_regex( + (r'

(?P.+?)</h1>', + r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), + webpage, 'title', group='title') + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + else: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + webpage, 'upload date', fatal=False)) + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + webpage, 'view count', fatal=False)) + # No self-labeling, but they describe themselves as # "Home of Videos Porno" @@ -37,9 +73,12 @@ class RedTubeIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, + 'formats': formats, } From 915620fd6894d92f89f9e5c9362d20f94e787e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 May 2016 21:34:06 +0600 Subject: [PATCH 52/82] [redtube] PEP 8 --- youtube_dl/extractor/redtube.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 1e532286c..721fc3a9e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -66,7 +66,6 @@ class RedTubeIE(InfoExtractor): r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', webpage, 'view count', fatal=False)) - # No self-labeling, but they describe themselves as # "Home of Videos Porno" age_limit = 18 From 6f59aa934bd5842a57d4c27ace0f3735be18ac27 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 6 May 2016 02:14:39 +0800 Subject: [PATCH 53/82] [periscope:user] Add new extractor for user pages Closes #9388 --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/periscope.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aac85066f..c9d1422e5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -561,7 +561,10 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .people import PeopleIE -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 514e9b433..0a4bc761d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -7,6 +7,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' + IE_NAME = 'periscope' _VALID_URL = 
r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ @@ -79,3 +80,39 @@ class PeriscopeIE(InfoExtractor): 'thumbnails': thumbnails, 'formats': formats, } + + +class PeriscopeUserIE(InfoExtractor): + _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + broadcast_data = self._parse_json(self._html_search_meta( + 'broadcast-data', webpage, default='{}'), user_id) + username = broadcast_data.get('user', {}).get('display_name') + user_broadcasts = self._parse_json( + self._html_search_meta('user-broadcasts', webpage, default='{}'), + user_id) + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) + for broadcast in user_broadcasts.get('broadcasts', [])] + + return self.playlist_result(entries, user_id, username) From 04e88ca2cac8f97e78cdee2825946e94b6173023 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 May 2016 15:02:40 +0100 Subject: [PATCH 54/82] [vk] improve extraction(fixes #7976) --- youtube_dl/extractor/biqle.py | 39 +++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/vk.py | 46 ++++++++++++++++-------------- 3 files changed, 64 insertions(+), 22 deletions(-) create mode 100644 youtube_dl/extractor/biqle.py diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py new file mode 100644 index 000000000..ae4579b33 --- /dev/null +++ b/youtube_dl/extractor/biqle.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BIQLEIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)' + _TESTS = [{ + 'url': 'http://www.biqle.ru/watch/847655_160197695', + 'md5': 'ad5f746a874ccded7b8f211aeea96637', + 'info_dict': { + 'id': '160197695', + 'ext': 'mp4', + 'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)', + 'uploader': 'Andrey Rogozin', + 'upload_date': '20110605', + } + }, { + 'url': 'https://biqle.org/watch/-44781847_168547604', + 'md5': '7f24e72af1db0edf7c1aaba513174f97', + 'info_dict': { + 'id': '168547604', + 'ext': 'mp4', + 'title': 'Ребенок в шоке от автоматической мойки', + 'uploader': 'Dmitry Kotov', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embed_url = self._proto_relative_url(self._search_regex( + r'<iframe.+?src="((?:http:)?//daxab\.com/[^"]+)".*?></iframe>', webpage, 'embed url')) + + return { + '_type': 'url_transparent', + 'url': embed_url, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9d1422e5..14b4f245f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -75,6 +75,7 @@ from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE from .biobiochiletv import BioBioChileTVIE +from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, diff --git 
a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 67220f1b7..041d93629 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -26,12 +26,16 @@ class VKIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)| + (?: + (?:m\.)?vk\.com/video_| + (?:www\.)?daxab.com/ + ) + ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| (?: (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| - (?:www\.)?biqle\.ru/watch/ + (?:www\.)?daxab.com/embed/ ) - (?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$) + (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? ) ''' _NETRC_MACHINE = 'vk' @@ -75,7 +79,8 @@ class VKIE(InfoExtractor): 'duration': 101, 'upload_date': '20120730', 'view_count': int, - } + }, + 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -142,7 +147,7 @@ class VKIE(InfoExtractor): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", - 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'description': 'md5:d9903938abdc74c738af77f527ca0596', 'duration': 178, 'upload_date': '20130116', 'uploader': "Children's Joy Foundation", @@ -173,11 +178,6 @@ class VKIE(InfoExtractor): 'url': 'https://vk.com/video205387401_164765225', 'only_matching': True, }, - { - # vk wrapper - 'url': 'http://www.biqle.ru/watch/847655_160197695', - 'only_matching': True, - }, { # pladform embed 'url': 'https://vk.com/video-76116461_171554880', @@ -217,20 +217,22 @@ class VKIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - if not video_id: + info_url = url + if video_id: + info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + # Some videos (removed?) can only be downloaded with list id specified + list_id = mobj.group('list_id') + if list_id: + info_url += '&list=%s' % list_id + else: + info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id - - # Some videos (removed?) 
can only be downloaded with list id specified - list_id = mobj.group('list_id') - if list_id: - info_url += '&list=%s' % list_id - info_page = self._download_webpage(info_url, video_id) error_message = self._html_search_regex( - r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'], info_page, 'error message', default=None) if error_message: raise ExtractorError(error_message, expected=True) @@ -305,17 +307,17 @@ class VKIE(InfoExtractor): view_count = None views = self._html_search_regex( r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', fatal=False) + info_page, 'view count', default=None) if views: view_count = str_to_int(self._search_regex( r'([\d,.]+)', views, 'view count', fatal=False)) formats = [] for k, v in data.items(): - if not k.startswith('url') and k != 'extra_data' or not v: + if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v: continue height = int_or_none(self._search_regex( - r'^url(\d+)', k, 'height', default=None)) + r'^(?:url|cache)(\d+)', k, 'height', default=None)) formats.append({ 'format_id': k, 'url': v, From abc97b5eda4ed4b36cec29e9966eb1bb7bcd97ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 May 2016 22:07:30 +0600 Subject: [PATCH 55/82] [utils] Allow empty attribute values in get_element_by_attribute (Closes #9415) --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5922b2b5..6e4573784 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -256,9 +256,9 @@ def get_element_by_attribute(attribute, value, html): m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s*> (?P<content>.*?) </\1> From 25cb7a0eebae0093a81fa1c930480fafa13feb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 May 2016 22:11:18 +0600 Subject: [PATCH 56/82] [youtube] Allow empty attribute values in description regex --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b7c3cb63f..f3f102c30 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1326,9 +1326,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if video_description: video_description = re.sub(r'''(?x) <a\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ - (?:[a-zA-Z-]+="[^"]+"\s+)*? + (?:[a-zA-Z-]+="[^"]*"\s+)*? 
class="(?:yt-uix-redirect-link|yt-uix-sessionlink[^"]*)"[^>]*> [^<]+\.{3}\s* </a> From 3e80e6f40d6ef76142340a2292ef2445dc79594b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 May 2016 23:35:58 +0600 Subject: [PATCH 57/82] [vevo] Allow request to api.vevo.com to fail (Closes #9417) I don't know whether this it's tempopary or api has just gone --- youtube_dl/extractor/vevo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c0ef08c02..30b3a9e7e 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -201,9 +201,10 @@ class VevoIE(VevoBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( - json_url, video_id, 'Downloading video info', 'Unable to download info') + json_url, video_id, 'Downloading video info', + 'Unable to download info', fatal=False) or {} video_info = response.get('video') or {} artist = None featured_artist = None @@ -212,7 +213,7 @@ class VevoIE(VevoBaseIE): formats = [] if not video_info: - if response.get('statusCode') != 909: + if response and response.get('statusCode') != 909: ytid = response.get('errorInfo', {}).get('ytid') if ytid: self.report_warning( From f745403b5b448c170710256a61b8505e09e77674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 May 2016 23:37:17 +0600 Subject: [PATCH 58/82] [vevo] Revert videoplayer.vevo.com to api.vevo.com --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 30b3a9e7e..c0632cd6a 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -201,7 +201,7 @@ class VevoIE(VevoBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( json_url, video_id, 'Downloading video info', 'Unable to download info', fatal=False) or {} From e2ee97dcd5c55e1c2aceae0d93fbfd64d0cc5ba3 Mon Sep 17 00:00:00 2001 From: inondle <qfulsher@gmai.com> Date: Fri, 6 May 2016 12:05:37 -0700 Subject: [PATCH 59/82] [liveleak] Adds support for thumbnails, updates tests --- youtube_dl/extractor/liveleak.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 29fba5f30..ea0565ac0 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'flv', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident' + 'title': 'Most unlucky car accident', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', @@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor): 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first 
Edward Snowden Interview (ENGLISH)', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', @@ -49,7 +51,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'mp4', 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.', 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' + 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', + 'thumbnail': 're:^https?://.*\.jpg$' } }] @@ -72,6 +75,7 @@ class LiveLeakIE(InfoExtractor): age_limit = int_or_none(self._search_regex( r'you confirm that you are ([0-9]+) years and over.', webpage, 'age limit', default=None)) + video_thumbnail = self._og_search_thumbnail(webpage) sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) @@ -124,4 +128,5 @@ class LiveLeakIE(InfoExtractor): 'uploader': video_uploader, 'formats': formats, 'age_limit': age_limit, + 'thumbnail': video_thumbnail, } From 3fd6332c056115e5de37b0789d907e9344c2ff5c Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 May 2016 15:12:20 +0100 Subject: [PATCH 60/82] [flickr] extract license field(closes #9425) --- youtube_dl/extractor/flickr.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0a3de1498..73ae3adee 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -27,10 +27,24 @@ class FlickrIE(InfoExtractor): 'comment_count': int, 'view_count': int, 'tags': list, + 'license': 'Attribution-ShareAlike', } } - _API_BASE_URL = 'https://api.flickr.com/services/rest?' + # https://help.yahoo.com/kb/flickr/SLN25525.html + _LICENSES = { + '0': 'All Rights Reserved', + '1': 'Attribution-NonCommercial-ShareAlike', + '2': 'Attribution-NonCommercial', + '3': 'Attribution-NonCommercial-NoDerivs', + '4': 'Attribution', + '5': 'Attribution-ShareAlike', + '6': 'Attribution-NoDerivs', + '7': 'No known copyright restrictions', + '8': 'United States government work', + '9': 'Public Domain Dedication (CC0)', + '10': 'Public Domain Work', + } def _call_api(self, method, video_id, api_key, note, secret=None): query = { @@ -87,7 +101,8 @@ class FlickrIE(InfoExtractor): 'uploader': owner.get('realname'), 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), - 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], + 'license': self._LICENSES.get(video_info.get('license')), } else: raise ExtractorError('not a video', expected=True) From cb1fa5881315ed998a366f47511b7a4b4ea067b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 May 2016 20:15:40 +0600 Subject: [PATCH 61/82] [flickr] Extract uploader URL (Closes #9426) --- youtube_dl/extractor/flickr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 73ae3adee..a8e1bf42a 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -24,6 +24,7 @@ class FlickrIE(InfoExtractor): 'upload_date': '20110423', 'uploader_id': '10922353@N03', 'uploader': 'Forest Wander', + 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', 'comment_count': int, 
'view_count': int, 'tags': list, @@ -89,6 +90,9 @@ class FlickrIE(InfoExtractor): self._sort_formats(formats) owner = video_info.get('owner', {}) + uploader_id = owner.get('nsid') + uploader_path = owner.get('path_alias') or uploader_id + uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None return { 'id': video_id, @@ -97,8 +101,9 @@ class FlickrIE(InfoExtractor): 'formats': formats, 'timestamp': int_or_none(video_info.get('dateuploaded')), 'duration': int_or_none(video_info.get('video', {}).get('duration')), - 'uploader_id': owner.get('nsid'), + 'uploader_id': uploader_id, 'uploader': owner.get('realname'), + 'uploader_url': uploader_url, 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], From a0904c5d8024c12b7f95b1126a6b8152a4e1021f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 8 May 2016 00:56:31 +0800 Subject: [PATCH 62/82] [telegraaf] Fix extractor (closes #9318) --- youtube_dl/extractor/telegraaf.py | 58 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 6f8333cfc..9092e9b85 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -2,14 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + determine_ext, + remove_end, +) class TelegraafIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html' _TEST = { 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', - 'md5': '83245a9779bcc4a24454bfd53c65b6dc', 'info_dict': { 'id': '24353229', 'ext': 'mp4', @@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 33, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - playlist_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage(url, video_id) + player_url = self._html_search_regex( + r'<iframe[^>]+src="([^"]+")', webpage, 'player URL') + player_page = self._download_webpage( + player_url, video_id, note='Download player webpage') playlist_url = self._search_regex( - r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') + playlist_data = self._download_json(playlist_url, video_id) + + item = playlist_data['items'][0] + formats = [] + locations = item['locations'] + for location in locations.get('adaptive', []): + manifest_url = location['src'] + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + continue + else: + self.report_warning('Unknown adaptive format %s' % ext) + for location in locations.get('progressive', []): + formats.append({ + 'url': location['sources'][0]['src'], + 'width': location.get('width'), + 'height': location.get('height'), + 'format_id': 'http-%s' % location['label'], + }) + + self._sort_formats(formats) - entries = 
self._extract_xspf_playlist(playlist_url, playlist_id) title = remove_end(self._og_search_title(webpage), ' - VIDEO') description = self._og_search_description(webpage) + duration = item.get('duration') + thumbnail = item.get('poster') - return self.playlist_result(entries, playlist_id, title, description) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': duration, + 'thumbnail': thumbnail, + } From e2eca6f65e9969c31b3374bd3688321f3e471cd7 Mon Sep 17 00:00:00 2001 From: Kevin Deldycke <kevin@deldycke.com> Date: Sat, 7 May 2016 20:03:25 +0200 Subject: [PATCH 63/82] Expand user's home in batch file path. --- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 737f6545d..7a0466077 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -86,7 +86,9 @@ def _real_main(argv=None): if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batchfd = io.open( + compat_expanduser(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') From 00c21c225decf648199013f2fa3385a1332037bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 00:11:44 +0600 Subject: [PATCH 64/82] Credit @kdeldycke for #9430 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 814fe9ec3..5f668338b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -169,3 +169,4 @@ Viťas Strádal Kagami Hiiragi Philip Huppert blahgeek +Kevin Deldycke From 5c24873a9e6a47e58b10eb0c0825e165604796f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 02:04:34 +0600 Subject: [PATCH 65/82] Credit @inondle for #9400 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 5f668338b..bf860b7f7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -170,3 +170,4 @@ Kagami Hiiragi Philip Huppert blahgeek Kevin Deldycke +inondle From f5436c5d9e4e65790440ada40476712ff430651b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 02:29:26 +0600 Subject: [PATCH 66/82] [downloader/external] Add temp fix ffmpeg m3u8 downloads (Closes #9394) --- youtube_dl/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 8d642fc3e..45f49c350 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -224,7 +224,7 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] - if protocol == 'm3u8': + if protocol in ('m3u8', 'm3u8_native'): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: From 3e169233daf76cd7585ebac12504f8e624b7693b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 04:36:57 +0600 Subject: [PATCH 67/82] Expanduser for more options with input files --- youtube_dl/YoutubeDL.py | 1 + youtube_dl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2187dcc8f..a96482e68 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2018,6 +2018,7 @@ class 
YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: + opts_cookiefile = compat_expanduser(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7a0466077..cbd84c3af 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -406,7 +406,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(opts.load_info_filename) + retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: From 9c072d38c6b0361d91e92c50cd0c753dc8ce3101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 06:52:42 +0600 Subject: [PATCH 68/82] [arte] Improve language preference (Closes #9401, closes #9162) --- youtube_dl/extractor/arte.py | 58 ++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 881cacfab..e37fdae13 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor): 'es': 'E[ESP]', } + langcode = LANGS.get(lang, lang) + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - langcode = LANGS.get(lang, lang) - lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] - lang_pref = None - if versionCode: - matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] - lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) - source_pref = 0 - if versionCode is not None: - # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A|E)', versionCode): - source_pref -= 10 - # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A|E)-STM\1', versionCode): - source_pref -= 9 + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 5.6.3 of + # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, 
with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor): 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), - 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': From 3b01a9fbb63e33325fa979db8a846d3e655e79e6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 8 May 2016 14:34:38 +0800 Subject: [PATCH 69/82] [litv] Add new extractor LiTV is a streaming platform providing free and paid legal contents in Taiwan. --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/litv.py | 137 +++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 youtube_dl/extractor/litv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 14b4f245f..7bacef184 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -384,6 +384,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .litv import LiTVIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py new file mode 100644 index 000000000..3356d015d --- /dev/null +++ b/youtube_dl/extractor/litv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + smuggle_url, + unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P<id>[^&]+)' + + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + + _TESTS = [{ + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041606', + 'title': '花千骨', + }, + 'playlist_count': 50, + }, { + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041610', + 'ext': 'mp4', + 'title': '花千骨第1集', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'episode_number': 1, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }] + + def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): + episode_title = view_data['title'] + content_id = season_list['contentId'] + + if prompt: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + + all_episodes = [ + self.url_result(smuggle_url( + self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), + {'force_noplaylist': True})) # To 
prevent infinite recursion + for episode in season_list['episode']] + + return self.playlist_result(all_episodes, content_id, episode_title) + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + + video_id = self._match_id(url) + + noplaylist = self._downloader.params.get('noplaylist') + noplaylist_prompt = True + if 'force_noplaylist' in data: + noplaylist = data['force_noplaylist'] + noplaylist_prompt = False + + webpage = self._download_webpage(url, video_id) + + view_data = dict(map(lambda t: (t[0], t[2]), re.findall( + r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', + webpage))) + + vod_data = self._parse_json(self._search_regex( + 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + video_id) + + season_list = list(vod_data.get('seasonList', {}).values()) + if season_list: + if not noplaylist: + return self._extract_playlist( + season_list[0], video_id, vod_data, view_data, + prompt=noplaylist_prompt) + + if noplaylist_prompt: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + # In browsers `getMainUrl` request is always issued. Usually this + # endpoint gives the same result as the data embedded in the webpage. + # If georestricted, there are no embedded data, so an extra request is + # necessary to get the error code + video_data = self._parse_json(self._search_regex( + r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', + webpage, 'video data', default='{}'), video_id) + if not video_data: + payload = { + 'assetId': view_data['assetId'], + 'watchDevices': vod_data['watchDevices'], + 'contentType': view_data['contentType'], + } + video_data = self._download_json( + 'https://www.litv.tv/vod/getMainUrl', video_id, + data=json.dumps(payload).encode('utf-8'), + headers={'Content-Type': 'application/json'}) + + if not video_data.get('fullpath'): + error_msg = video_data.get('errorMessage') + if error_msg == 'vod.error.outsideregionerror': + self.raise_geo_restricted('This video is available in Taiwan only') + if error_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) + raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + + formats = self._extract_m3u8_formats( + video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + for a_format in formats: + # LiTV HLS segments doesn't like compressions + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + + title = view_data['title'] + view_data.get('secondaryMark', '') + description = view_data.get('description') + thumbnail = view_data.get('imageFile') + categories = [item['name'] for item in vod_data.get('category', [])] + episode = int_or_none(view_data.get('episode')) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'episode_number': episode, + } From f23a92a0cecac0d4db60e086e429793556347271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 20:02:54 +0600 Subject: [PATCH 70/82] [mva] Add extractor (Closes #6667) --- youtube_dl/extractor/extractors.py | 4 + .../extractor/microsoftvirtualacademy.py | 192 ++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 youtube_dl/extractor/microsoftvirtualacademy.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7bacef184..a0bb3d4c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -409,6 +409,10 @@ from 
.metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py new file mode 100644 index 000000000..b7fea47ee --- /dev/null +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -0,0 +1,192 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_xpath, +) +from ..utils import ( + int_or_none, + parse_duration, + smuggle_url, + unsmuggle_url, + xpath_text, +) + + +class MicrosoftVirtualAcademyBaseIE(InfoExtractor): + def _extract_base_url(self, course_id, display_id): + return self._download_json( + 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, + display_id, 'Downloading course base URL') + + def _extract_chapter_and_title(self, title): + if not title: + return None, None + m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title) + return (int(m.group('chapter')), m.group('title')) if m else (None, title) + + +class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva' + IE_DESC = 'Microsoft Virtual Academy videos' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', + 'md5': '7826c44fc31678b12ad8db11f6b5abb9', + 'info_dict': { + 'id': 'gfVXISmEB_6804984382', + 'ext': 'mp4', + 'title': 'Course Introduction', + 'formats': 'mincount:3', + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + } + }, { + 'url': 'mva:11788:gfVXISmEB_6804984382', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + video_id = mobj.group('id') + + base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) + + settings = self._download_xml( + '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), + video_id, 'Downloading video settings XML') + + _, title = self._extract_chapter_and_title(xpath_text( + settings, './/Title', 'title', fatal=True)) + + formats = [] + + for sources in settings.findall(compat_xpath('.//MediaSources')): + if sources.get('videoType') == 'smoothstreaming': + continue + for source in sources.findall(compat_xpath('./MediaSource')): + video_url = source.text + if not video_url or not video_url.startswith('http'): + continue + video_mode = source.get('videoMode') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) + codec = source.get('codec') + acodec, vcodec = [None] * 2 + if codec: + codecs = codec.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs + elif len(codecs) == 1: + vcodec = codecs[0] + formats.append({ + 'url': video_url, + 'format_id': video_mode, + 'height': height, + 'acodec': acodec, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + subtitles = {} + for source in settings.findall(compat_xpath('.//MarkerResourceSource')): + subtitle_url = source.text + 
if not subtitle_url: + continue + subtitles.setdefault('en', []).append({ + 'url': '%s/%s' % (base_url, subtitle_url), + 'ext': source.get('type'), + }) + + return { + 'id': video_id, + 'title': title, + 'subtitles': subtitles, + 'formats': formats + } + + +class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva:course' + IE_DESC = 'Microsoft Virtual Academy courses' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'info_dict': { + 'id': '11788', + 'title': 'Microsoft Azure Fundamentals: Virtual Machines', + }, + 'playlist_count': 36, + }, { + # with emphasized chapters + 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', + 'info_dict': { + 'id': '16335', + 'title': 'Developing Windows 10 Games with Construct 2', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'only_matching': True, + }, { + 'url': 'mva:course:11788', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MicrosoftVirtualAcademyIE.suitable(url) else super( + MicrosoftVirtualAcademyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('id') + display_id = mobj.group('display_id') + + base_url = self._extract_base_url(course_id, display_id) + + manifest = self._download_json( + '%s/imsmanifestlite.json' % base_url, + display_id, 'Downloading course manifest JSON')['manifest'] + + organization = manifest['organizations']['organization'][0] + + entries = [] + for chapter in organization['item']: + chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) + chapter_id = chapter.get('@identifier') + for item in chapter.get('item', []): + item_id = item.get('@identifier') + if not item_id: + continue + metadata = item.get('resource', {}).get('metadata') or {} + if metadata.get('learningresourcetype') != 'Video': + continue + _, title = self._extract_chapter_and_title(item.get('title')) + duration = parse_duration(metadata.get('duration')) + description = metadata.get('description') + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url( + 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), + 'title': title, + 'description': description, + 'duration': duration, + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + }) + + title = organization.get('title') or manifest.get('metadata', {}).get('title') + + return self.playlist_result(entries, course_id, title) From c52f4efaee2386a72c3f6b694fb4f4c3132ced55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 20:10:20 +0600 Subject: [PATCH 71/82] [mva] Improve _VALID_URLs --- youtube_dl/extractor/microsoftvirtualacademy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py index b7fea47ee..afd3e98ec 100644 --- a/youtube_dl/extractor/microsoftvirtualacademy.py +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -31,7 +31,7 @@ class MicrosoftVirtualAcademyBaseIE(InfoExtractor): class 
MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): IE_NAME = 'mva' IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME _TESTS = [{ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', @@ -118,7 +118,7 @@ class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): IE_NAME = 'mva:course' IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME _TESTS = [{ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', From f1f6f5aa5e2a6d66fa54d35bf3e8b3626e85ee73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Cech?= <sleep_walker@suse.cz> Date: Sat, 7 May 2016 20:15:49 +0200 Subject: [PATCH 72/82] [ceskatelevize] Add support for live streams Live streams have no playlist title; use the title of the stream, which contains the TV channel name. The internal m3u8 handler doesn't seem to handle continuous streams well. Add a test for a live stream. Remove a no longer reachable test. --- youtube_dl/extractor/ceskatelevize.py | 35 +++++++++++++++++++-------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6652c8e42..b41888531 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -33,14 +33,13 @@ class CeskaTelevizeIE(InfoExtractor): 'skip_download': True, }, }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'info_dict': { - 'id': '61924494876844374', + 'id': 402, 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 
1945.', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, + 'title': 're:ČT Sport.*', + 'is_live': True, }, 'params': { # m3u8 download @@ -118,19 +117,21 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage) + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) playlist = self._download_json(req, playlist_id)['playlist'] playlist_len = len(playlist) entries = [] for item in playlist: + is_live = item['type'] == 'LIVE' formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8' if is_live else 'm3u8_native', + fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] @@ -145,14 +146,28 @@ class CeskaTelevizeIE(InfoExtractor): if subs: subtitles = self.extract_subtitles(episode_id, subs) + if playlist_len == 1: + if is_live: + # live streams has channel name in title + final_title = self._live_title(title) + elif playlist_title: + # title is always set (no KeyError caught) + # and gives good fallback + final_title = title + else: + final_title = playlist_title + else: + final_title = '%s (%s)' % (playlist_title, title) + entries.append({ 'id': item_id, - 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), + 'title': final_title, 'description': playlist_description if playlist_len == 1 else None, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) From 3951e7eb9305448aab6395f4303ed7ab19248c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:37:20 +0600 Subject: [PATCH 73/82] [ceskatelevize] Simplify, restore bonus video test and skip georestricted test (Closes #9431) --- youtube_dl/extractor/ceskatelevize.py | 29 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b41888531..5a58d1777 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -32,19 +32,34 @@ class CeskaTelevizeIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + 'info_dict': { + 'id': '61924494877028507', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'description': 'English Subtittles', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 81.3, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # live stream 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'info_dict': { 'id': 402, 'ext': 'mp4', - 'title': 're:ČT Sport.*', + 'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, }, 'params': { # m3u8 download 'skip_download': True, }, + 'skip': 'Georestricted to Czech Republic', }, { # video with 18+ caution trailer 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', @@ -125,7 +140,7 @@ class 
CeskaTelevizeIE(InfoExtractor): entries = [] for item in playlist: - is_live = item['type'] == 'LIVE' + is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( @@ -147,15 +162,9 @@ class CeskaTelevizeIE(InfoExtractor): subtitles = self.extract_subtitles(episode_id, subs) if playlist_len == 1: + final_title = playlist_title or title if is_live: - # live streams has channel name in title - final_title = self._live_title(title) - elif playlist_title: - # title is always set (no KeyError caught) - # and gives good fallback - final_title = title - else: - final_title = playlist_title + final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) From 965fefdcd879405c3e4b5604513719353ba8474a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:38:33 +0600 Subject: [PATCH 74/82] Credit @sleep-walker for #9431 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index bf860b7f7..5ca71ace7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -171,3 +171,4 @@ Philip Huppert blahgeek Kevin Deldycke inondle +Tomáš Čech From c15c47d19bfeeacd42f44dd7736f175711a91346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:45:03 +0600 Subject: [PATCH 75/82] [downloader/hls] Remove EXT-X-MEDIA-SEQUENCE from unsupported features for hlsnative --- youtube_dl/downloader/hls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index d7b34bde3..dcedc9a64 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -23,7 +23,9 @@ class HlsFD(FragmentFD): UNSUPPORTED_FEATURES = ( r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] - r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + # Live streams heuristic does not always work (e.g. geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + #r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 From 6104cc2985c36e996df1aae7cfcc686f3bae0b82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:55:37 +0600 Subject: [PATCH 76/82] [downloader/hls] Add event media playlists to unsupported features of hlsnative --- youtube_dl/downloader/hls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index dcedc9a64..a8279718b 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -26,9 +26,12 @@ class HlsFD(FragmentFD): # Live streams heuristic does not always work (e.g. geo restricted to Germany # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) #r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # event media playlists [4] # 1. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 ) return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) From fe40f9eef2483748ed83c9749f35220143d8cc9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 21:55:03 +0600 Subject: [PATCH 77/82] [compat] Add compat_setenv --- test/test_compat.py | 8 ++++++++ youtube_dl/compat.py | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/test/test_compat.py b/test/test_compat.py index 618668210..0d751a856 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,6 +13,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_setenv, compat_etree_fromstring, compat_expanduser, compat_shlex_split, @@ -31,6 +32,13 @@ class TestCompat(unittest.TestCase): else test_str.encode(get_filesystem_encoding())) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) + def test_compat_setenv(self): + test_var = 'YOUTUBE-DL-TEST' + test_str = 'тест' + compat_setenv(test_var, test_str) + compat_getenv(test_var) + self.assertEqual(compat_getenv(test_var), test_str) + def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0b6c5ca7a..12b53cdc8 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -373,6 +373,9 @@ compat_os_name = os._name if os.name == 'java' else os.name if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser + + def compat_setenv(key, value, env=os.environ): + env[key] = value else: # Environment variables should be decoded with filesystem encoding. # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) @@ -384,6 +387,12 @@ else: env = env.decode(get_filesystem_encoding()) return env + def compat_setenv(key, value, env=os.environ): + def encode(v): + from .utils import get_filesystem_encoding + return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v + env[encode(key)] = encode(value) + # HACK: The default implementations of os.path.expanduser from cpython do not decode # environment variables with filesystem encoding. We will work around this by # providing adjusted implementations. 
@@ -604,6 +613,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_setenv', 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', From 129263875403841da485ac74b09960d862d23f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 21:58:38 +0600 Subject: [PATCH 78/82] [test_compat] Use compat_setenv --- test/test_compat.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 0d751a856..afe6bd528 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -27,9 +27,7 @@ from youtube_dl.compat import ( class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' - os.environ['YOUTUBE-DL-TEST'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('YOUTUBE-DL-TEST', test_str) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) def test_compat_setenv(self): @@ -42,11 +40,9 @@ class TestCompat(unittest.TestCase): def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' - os.environ['HOME'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) - os.environ['HOME'] = old_home + compat_setenv('HOME', old_home) def test_all_present(self): import youtube_dl.compat From 20cfdcc910d0bc2ee4b0ee38bdf5e6ecb67e5731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:00:14 +0600 Subject: [PATCH 79/82] [test_compat] Avoid None values for compat_setenv --- test/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index afe6bd528..b20814249 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -42,7 +42,7 @@ class TestCompat(unittest.TestCase): test_str = 'C:\Documents and Settings\тест\Application Data' compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) - compat_setenv('HOME', old_home) + compat_setenv('HOME', old_home or '') def test_all_present(self): import youtube_dl.compat From e62d9c5caaa972ef4b1ed5d6ab5ee4a087a4ba95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:05:12 +0600 Subject: [PATCH 80/82] [downloader/external] Call ffmpeg with with HTTP_PROXY env variable set (#9437) --- youtube_dl/downloader/external.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 45f49c350..3a73cee1c 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,6 +6,7 @@ import sys import re from .common import FileDownloader +from ..compat import compat_setenv from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, @@ -198,6 +199,18 @@ class FFmpegFD(ExternalFD): '-headers', ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 'http://%s' % proxy + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the 
following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + compat_setenv('HTTP_PROXY', proxy, env=env) + protocol = info_dict.get('protocol') if protocol == 'rtmp': @@ -239,7 +252,7 @@ class FFmpegFD(ExternalFD): self._debug_cmd(args) - proc = subprocess.Popen(args, stdin=subprocess.PIPE) + proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() except KeyboardInterrupt: From fad7bbec3a1fb62964c8e6637dfd535fabe9c133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:15:55 +0600 Subject: [PATCH 81/82] [test_compat] Remove unused import --- test/test_compat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index b20814249..9adf75763 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -10,7 +10,6 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, compat_setenv, From 2937590e8b70384ef91bdadbb56a55897aab0837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:16:33 +0600 Subject: [PATCH 82/82] [downloader/hls] PEP 8 --- youtube_dl/downloader/hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a8279718b..62136ee54 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -25,7 +25,7 @@ class HlsFD(FragmentFD): r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] # Live streams heuristic does not always work (e.g. geo restricted to Germany # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) - #r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of # event media playlists [4] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4
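For reference, the hlsnative suitability check that patches 75, 76 and 82 keep adjusting reduces to a blacklist scan over the manifest text. Below is a minimal standalone sketch of that logic; the function name can_use_hlsnative and the sample manifests are illustrative only and not part of youtube-dl, and the feature list is abbreviated compared to the tuple shown in the diffs above.

import re

# Abbreviated stand-in for the UNSUPPORTED_FEATURES tuple in the diffs above.
UNSUPPORTED_FEATURES = (
    r'#EXT-X-KEY:METHOD=(?!NONE)',  # encrypted streams
    r'#EXT-X-BYTERANGE',            # playlists composed of byte ranges of media files
    r'#EXT-X-PLAYLIST-TYPE:EVENT',  # event playlists may keep growing
)


def can_use_hlsnative(manifest):
    # The native downloader is acceptable only if none of the unsupported
    # features occur anywhere in the manifest text.
    return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)


vod = '#EXTM3U\n#EXT-X-PLAYLIST-TYPE:VOD\n#EXTINF:10,\nseg0.ts\n#EXT-X-ENDLIST\n'
event = '#EXTM3U\n#EXT-X-PLAYLIST-TYPE:EVENT\n#EXTINF:10,\nseg0.ts\n'
print(can_use_hlsnative(vod))    # True  -> hand the playlist to hlsnative
print(can_use_hlsnative(event))  # False -> fall back to the ffmpeg downloader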