From 57ce8a6d08a05140230864eccbc52029f1fd46c1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 10 Aug 2016 14:17:22 +0100 Subject: [PATCH 001/218] [wat] improve extraction(#10281) add alternative method to extract http formats works even if the video is geo-restricted or removed from public access(most of the cases) --- youtube_dl/extractor/wat.py | 126 +++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 48fc438ed..9f1b8b4b5 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, unified_strdate, HEADRequest, + int_or_none, ) @@ -30,48 +31,58 @@ class WatIE(InfoExtractor): }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'fbc84e4378165278e743956d9c1bf16b', + 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', 'info_dict': { 'id': '11713075', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'upload_date': '20140816', - 'duration': 2910, }, - 'skip': "Ce contenu n'est pas disponible pour l'instant.", + 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], }, ] + _FORMATS = ( + (200, 416, 234), + (400, 480, 270), + (600, 640, 360), + (1200, 640, 360), + (1800, 960, 540), + (2500, 1280, 720), + ) + def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them - video_info = self._download_json( - 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media'] + video_data = self._download_json( + 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + video_info = video_data['media'] error_desc = video_info.get('error_desc') if 
error_desc: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + self.report_warning( + '%s returned error: %s' % (self.IE_NAME, error_desc)) chapters = video_info['chapters'] - first_chapter = chapters[0] + if chapters: + first_chapter = chapters[0] - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] + def video_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url + if video_id_for_chapter(first_chapter) != video_id: + self.to_screen('Multipart video detected') + entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] + return self.playlist_result(entries, video_id, video_info['title']) + # Otherwise we can continue and extract just one part, we have to use + # the video id for getting the video url + else: + first_chapter = video_info - date_diffusion = first_chapter.get('date_diffusion') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None + title = first_chapter['title'] def extract_url(path_template, url_type): req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) @@ -83,36 +94,61 @@ class WatIE(InfoExtractor): expected=True) return red_url - m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') - http_url = extract_url('android5/%s.mp4', 'http') - formats = [] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - formats.extend(m3u8_formats) - formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - 
for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - f = m3u8_format.copy() - f.update({ - 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - self._sort_formats(formats) + try: + http_url = extract_url('android5/%s.mp4', 'http') + m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + formats.extend(m3u8_formats) + formats.extend(self._extract_f4m_formats( + m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + for m3u8_format in m3u8_formats: + vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') + if not vbr or not abr: + continue + format_id = m3u8_format['format_id'].replace('hls', 'http') + fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) + if self._is_valid_url(fmt_url, video_id, format_id): + f = m3u8_format.copy() + f.update({ + 'url': fmt_url, + 'format_id': format_id, + 'protocol': 'http', + }) + formats.append(f) + self._sort_formats(formats) + except ExtractorError: + abr = 64 + for vbr, width, height in self._FORMATS: + tbr = vbr + abr + format_id = 'http-%s' % tbr + fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) + if self._is_valid_url(fmt_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': fmt_url, + 'vbr': vbr, + 'abr': abr, + 'width': width, + 'height': height, + }) + + date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') + upload_date = unified_strdate(date_diffusion) if date_diffusion else None + duration = None + files = video_info['files'] + 
if files: + duration = int_or_none(files[0].get('duration')) return { 'id': video_id, - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], + 'title': title, + 'thumbnail': first_chapter.get('preview'), + 'description': first_chapter.get('description'), + 'view_count': int_or_none(video_info.get('views')), 'upload_date': upload_date, - 'duration': video_info['files'][0]['duration'], + 'duration': duration, 'formats': formats, } From c3fa77bdef14643c966913c49f0400ebc1e46b10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Aug 2016 21:00:40 +0700 Subject: [PATCH 002/218] [formula1] Relax _VALID_URL (Closes #10283) --- youtube_dl/extractor/formula1.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py index 322c41e5a..8c417ab65 100644 --- a/youtube_dl/extractor/formula1.py +++ b/youtube_dl/extractor/formula1.py @@ -5,8 +5,8 @@ from .common import InfoExtractor class Formula1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P.+?)\.html' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P.+?)\.html' + _TESTS = [{ 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', 'md5': '8c79e54be72078b26b89e0e111c0502b', 'info_dict': { @@ -15,7 +15,10 @@ class Formula1IE(InfoExtractor): 'title': 'Race highlights - Spain 2016', }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) From 7f2ed4759513b153e526cd890fd5b8877f56f1c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 10 Aug 2016 21:07:43 +0700 
Subject: [PATCH 003/218] [rtlnl] Relax _VALID_URL (Closes #10282) --- youtube_dl/extractor/rtlnl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 4d612b5e3..f0250af8a 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -14,7 +14,7 @@ class RtlNlIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:www\.)? (?: - rtlxl\.nl/\#!/[^/]+/| + rtlxl\.nl/[^\#]*\#!/[^/]+/| rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= ) (?P[0-9a-f-]+)''' @@ -67,6 +67,9 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, }] def _real_extract(self, url): From 7f832413d6e4aa5aae4c904c42e0ecf4ae72aaf9 Mon Sep 17 00:00:00 2001 From: lkho Date: Tue, 9 Aug 2016 15:25:23 +0800 Subject: [PATCH 004/218] Preserve line endings for downloaded subtitle files --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 193f8db9f..fd7577bb8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1603,7 +1603,7 @@ class YoutubeDL(object): self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) From b1927f4e8a07a7893392135a71fdb6818295bbad Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 11 Aug 2016 
19:04:23 +0800 Subject: [PATCH 005/218] [YoutubeDL] Disable newline conversion when writing subtitles By default io.open() convert all '\n' occurrences to '\r\n' when writing files. If the content already contains '\r\n', it will be converted to '\r\r\n', breaking some video players. --- youtube_dl/YoutubeDL.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index fd7577bb8..e844dc98a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1603,6 +1603,8 @@ class YoutubeDL(object): self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_data) except (OSError, IOError): From e5f878c20573b258cad1974cc79a0526bcd1d46b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 11 Aug 2016 19:13:41 +0800 Subject: [PATCH 006/218] [ChangeLog] Add change log for #10269 [skip ci] --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index adbdc4f9b..b6ea39cba 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Core +* Subtitles are now written as is. Newline conversions are disabled. 
(#10268) + Recognize more formats in unified_timestamp Extractors From 30b25d382d1c2e06c19d8730ecbc0776a436d967 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 11 Aug 2016 21:42:55 +0700 Subject: [PATCH 007/218] [francetvinfo] Relax _VALID_URL --- youtube_dl/extractor/francetv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 7653975e3..3233f66d5 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -131,7 +131,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -206,6 +206,9 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'uploader_id': 'x2q2ez', }, 'add_ie': ['Dailymotion'], + }, { + 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', + 'only_matching': True, }] def _real_extract(self, url): From 0c070681c56589d44f81df8ed2165bca4333cef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Aug 2016 23:37:56 +0700 Subject: [PATCH 008/218] [chirbit] Fix extraction (Closes #10296) --- youtube_dl/extractor/chirbit.py | 54 ++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index b1eeaf101..b43518652 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,30 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 + from .common import InfoExtractor -from ..utils import ( - 
parse_duration, - int_or_none, -) +from ..utils import parse_duration class ChirbitIE(InfoExtractor): IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ - 'url': 'http://chirb.it/PrIPv5', - 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'url': 'http://chirb.it/be2abG', 'info_dict': { - 'id': 'PrIPv5', + 'id': 'be2abG', 'ext': 'mp3', - 'title': 'Фасадстрой', - 'duration': 52, - 'view_count': int, - 'comment_count': int, + 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', + 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', + 'duration': 306, + }, + 'params': { + 'skip_download': True, } }, { 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', 'only_matching': True, + }, { + 'url': 'https://chirb.it/wp/MN58c2', + 'only_matching': True, }] def _real_extract(self, url): @@ -33,27 +36,30 @@ class ChirbitIE(InfoExtractor): webpage = self._download_webpage( 'http://chirb.it/%s' % audio_id, audio_id) - audio_url = self._search_regex( - r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + data_fd = self._search_regex( + r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'data fd', group='url') + + # Reverse engineered from https://chirb.it/js/chirbit.player.js (look + # for soundURL) + audio_url = base64.b64decode( + data_fd[::-1].encode('ascii')).decode('utf-8') title = self._search_regex( - r'itemprop="name">([^<]+)', webpage, 'title') - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'itemprop="playCount"\s*>(\d+)', webpage, - 'listen count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'>(\d+) Comments?:', webpage, - 'comment count', fatal=False)) + r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') + description = self._search_regex( + r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>', + webpage, 'description', default=None) + 
duration = parse_duration(self._search_regex( + r'class=["\']c-length["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': audio_id, 'url': audio_url, 'title': title, + 'description': description, 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, } From 0aef0771f8cf18d97ce1b6b9123ce76bae45f3ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Aug 2016 23:47:27 +0700 Subject: [PATCH 009/218] [drtuber] Make dislike count optional (Closes #10297) --- youtube_dl/extractor/drtuber.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 639f9182c..4e5557703 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + NO_DEFAULT, + str_to_int, +) class DrTuberIE(InfoExtractor): @@ -17,7 +20,6 @@ class DrTuberIE(InfoExtractor): 'ext': 'mp4', 'title': 'hot perky blonde naked golf', 'like_count': int, - 'dislike_count': int, 'comment_count': int, 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], 'thumbnail': 're:https?://.*\.jpg$', @@ -43,18 +45,20 @@ class DrTuberIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) - def extract_count(id_, name): + def extract_count(id_, name, default=NO_DEFAULT): return str_to_int(self._html_search_regex( r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, - webpage, '%s count' % name, fatal=False)) + webpage, '%s count' % name, default=default, fatal=False)) like_count = extract_count('rate_likes', 'like') - dislike_count = extract_count('rate_dislikes', 'dislike') + dislike_count = extract_count('rate_dislikes', 'dislike', default=None) comment_count = extract_count('comments_count', 'comment') cats_str = 
self._search_regex( - r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False) - categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) + r'<div[^>]+class="categories_list">(.+?)</div>', + webpage, 'categories', fatal=False) + categories = [] if not cats_str else re.findall( + r'<a title="([^"]+)"', cats_str) return { 'id': video_id, From 367976d49fba48ea44fc5bf622adcc989896f29a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Aug 2016 23:47:52 +0700 Subject: [PATCH 010/218] [drtuber] Improve title extraction --- youtube_dl/extractor/drtuber.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 4e5557703..e8870c460 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -38,7 +38,9 @@ class DrTuberIE(InfoExtractor): r'<source src="([^"]+)"', webpage, 'video URL') title = self._html_search_regex( - [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'], + (r'class="title_watch"[^>]*><p>([^<]+)<', + r'<p[^>]+class="title_substrate">([^<]+)</p>', + r'<title>([^<]+) - \d+'), webpage, 'title') thumbnail = self._html_search_regex( From 0fd1b1624cc42412fe4701f9de09f49adfa467f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Aug 2016 23:52:17 +0700 Subject: [PATCH 011/218] [goldenmoustache] Remove extractor (Closes #10298) Now uses dailymotion --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/goldenmoustache.py | 48 ------------------------- 2 files changed, 1 insertion(+), 49 deletions(-) delete mode 100644 youtube_dl/extractor/goldenmoustache.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 387230be0..c0c18393f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -311,7 +311,6 @@ from 
.globo import ( ) from .godtube import GodTubeIE from .godtv import GodTVIE -from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE @@ -1004,6 +1003,7 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viu import ViuIE from .vk import ( VKIE, VKUserVideosIE, diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py deleted file mode 100644 index 0fb509724..000000000 --- a/youtube_dl/extractor/goldenmoustache.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class GoldenMoustacheIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?goldenmoustache\.com/(?P<display_id>[\w-]+)-(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.goldenmoustache.com/suricate-le-poker-3700/', - 'md5': '0f904432fa07da5054d6c8beb5efb51a', - 'info_dict': { - 'id': '3700', - 'ext': 'mp4', - 'title': 'Suricate - Le Poker', - 'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9', - 'thumbnail': 're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/', - 'md5': '27f0c50fb4dd5f01dc9082fc67cd5700', - 'info_dict': { - 'id': '55249', - 'ext': 'mp4', - 'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)', - 'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'data-src-type="mp4" data-src="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'<title>(.*?)(?: - Golden Moustache)?', webpage, 'title') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': 
description, - 'thumbnail': thumbnail, - } From a3be69b7f0d21c024e288a42864704f5c81d9dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Aug 2016 00:14:51 +0700 Subject: [PATCH 012/218] [viu] Remove from extractors --- youtube_dl/extractor/extractors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c0c18393f..acf4e5d62 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1003,7 +1003,6 @@ from .viki import ( VikiIE, VikiChannelIE, ) -from .viu import ViuIE from .vk import ( VKIE, VKUserVideosIE, From fff37cfd4f09db6bb9f35da59b7d73b4e72855ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Aug 2016 00:18:28 +0700 Subject: [PATCH 013/218] [ChangeLog] Actualize --- ChangeLog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ChangeLog b/ChangeLog index b6ea39cba..985dca7d0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -5,6 +5,14 @@ Core + Recognize more formats in unified_timestamp Extractors +- [goldenmoustache] Remove extractor (#10298) +* [drtuber] Improve title extraction +* [drtuber] Make dislike count optional (#10297) +* [chirbit] Fix extraction (#10296) +* [francetvinfo] Relax URL regular expression +* [rtlnl] Relax URL regular expression (#10282) +* [formula1] Relax URL regular expression (#10283) +* [wat] Improve extraction (#10281) * [ctsnews] Fix extraction From b0081562d240fbe2ad854c53b6e098fa7e626247 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Aug 2016 00:22:22 +0700 Subject: [PATCH 014/218] release 2016.08.12 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 - youtube_dl/version.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1c06ba36e..6fdb2f77b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 
+6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.10*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.10** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.12*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.12** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.10 +[debug] youtube-dl version 2016.08.12 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 985dca7d0..376d96d12 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.08.12 Core * Subtitles are now written as is. Newline conversions are disabled. 
(#10268) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a44167a94..8fb581d2b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -265,7 +265,6 @@ - **GloboArticle** - **GodTube** - **GodTV** - - **GoldenMoustache** - **Golem** - **GoogleDrive** - **Goshgay** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f7ad846d9..becf14458 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.10' +__version__ = '2016.08.12' From 990d533ee4a33f8c59921a4152817ff4835a974f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Aug 2016 00:56:16 +0700 Subject: [PATCH 015/218] [crunchyroll] Add support for HLS (Closes #10301) --- youtube_dl/extractor/crunchyroll.py | 32 +++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 90a64303d..6d3abb52f 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -114,6 +114,21 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', + 'info_dict': { + 'id': '702409', + 'ext': 'mp4', + 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', + 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'TV TOKYO', + 'upload_date': '20160508', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -336,9 +351,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if video_encode_id in video_encode_ids: continue 
video_encode_ids.append(video_encode_id) + + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + video_url = xpath_text(stream_info, './host') - video_play_path = xpath_text(stream_info, './file') - if not video_url or not video_play_path: + if not video_url: continue metadata = stream_info.find('./metadata') format_info = { @@ -353,7 +377,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text parsed_video_url = compat_urlparse.urlparse(video_url) direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) if self._is_valid_url(direct_video_url, video_id, video_format): format_info.update({ 'url': direct_video_url, @@ -363,7 +387,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text format_info.update({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_file, 'ext': 'flv', }) formats.append(format_info) From 3cddb8d6a776b09afd7f50772fa30cb536b1149a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Aug 2016 08:38:06 +0100 Subject: [PATCH 016/218] [pbs] check all http formats and remove unnecessary request - some of the quality that not reported in the documentation are available(4500k, 6500k) - the videoInfo request doesn't work for a long time --- youtube_dl/extractor/pbs.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f6f423597..6e2ef0fba 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -448,17 +448,6 @@ class 
PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) - try: - video_info = self._download_json( - 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, - display_id, 'Downloading video info JSON') - extract_redirect_urls(video_info) - info = video_info - except ExtractorError as e: - # videoInfo API may not work for some videos - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: - raise - # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -511,12 +500,12 @@ class PBSIE(InfoExtractor): formats)) if http_url: for m3u8_format in m3u8_formats: - bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) + bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) # extract only the formats that we know that they will be available as http format. # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications - if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): + if not bitrate or int(bitrate) < 400: continue - f_url = re.sub(r'\d+k|baseline', bitrate, http_url) + f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) # This may produce invalid links sometimes (e.g. 
# http://www.pbs.org/wgbh/frontline/film/suicide-plan) if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): From 98e698f1ff3fd467ff03e10a8f8881cd06345ca7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Aug 2016 12:30:02 +0100 Subject: [PATCH 017/218] [external/curl] respect more downloader options and display progress --- youtube_dl/downloader/external.py | 15 +++++++++++++++ youtube_dl/utils.py | 2 ++ 2 files changed, 17 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index fae245024..f0c30007f 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -96,6 +96,12 @@ class CurlFD(ExternalFD): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + cmd += self._option('--retry', 'retries') + cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--insecure', 'nocheckcertificate') @@ -103,6 +109,15 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd + def _call_downloader(self, tmpfilename, info_dict): + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + p = subprocess.Popen(cmd) + p.communicate() + return p.returncode + class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a03f7184d..b3b687a31 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2410,6 +2410,8 @@ def dfxp2srt(dfxp_data): def cli_option(params, command_option, param): param = params.get(param) + if param: + param = 
compat_str(param) return [command_option, param] if param is not None else [] From f0d3669437bb7f198ada9c0fead64d50a6e7a972 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 12 Aug 2016 18:05:49 +0100 Subject: [PATCH 018/218] [hgtv] Add new extractor(closes #3999) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hgtv.py | 48 ++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 youtube_dl/extractor/hgtv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index acf4e5d62..6420167f2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -325,6 +325,7 @@ from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py new file mode 100644 index 000000000..c3f0733cf --- /dev/null +++ b/youtube_dl/extractor/hgtv.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + smuggle_url, +) + + +class HGTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hgtv\.ca/[^/]+/video/(?P[^/]+)/video.html' + _TEST = { + 'url': 'http://www.hgtv.ca/homefree/video/overnight-success/video.html?v=738081859718&p=1&s=da#video', + 'md5': '', + 'info_dict': { + 'id': 'aFH__I_5FBOX', + 'ext': 'mp4', + 'title': 'Overnight Success', + 'description': 'After weeks of hard work, high stakes, breakdowns and pep talks, the final 2 contestants compete to win the ultimate dream.', + 'uploader': 'SHWM-NEW', + 'timestamp': 1470320034, + 'upload_date': '20160804', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = 
self._match_id(url) + webpage = self._download_webpage(url, display_id) + embed_vars = self._parse_json(self._search_regex( + r'(?s)embed_vars\s*=\s*({.*?});', + webpage, 'embed vars'), display_id, js_to_json) + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'http://link.theplatform.com/s/dtjsEC/%s?mbr=true&manifest=m3u' % embed_vars['pid'], { + 'force_smil_url': True + }), + 'series': embed_vars.get('show'), + 'season_number': int_or_none(embed_vars.get('season')), + 'episode_number': int_or_none(embed_vars.get('episode')), + 'ie_key': 'ThePlatform', + } From 794e5dcd7e24784c05e042e7e0655c584347f5c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 14:09:35 +0700 Subject: [PATCH 019/218] [sunporno] Fix metadata extraction (Closes #10316) --- youtube_dl/extractor/sunporno.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index e527aa971..4269f2a30 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -15,10 +15,10 @@ class SunPornoIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P\d+)' _TEST = { 'url': 'http://www.sunporno.com/videos/807778/', - 'md5': '6457d3c165fd6de062b99ef6c2ff4c86', + 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { 'id': '807778', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'md5:0a400058e8105d39e35c35e7c5184164', 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', 'thumbnail': 're:^https?://.*\.jpg$', @@ -40,7 +40,8 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'itemprop="duration">\s*(\d+:\d+)\s*<', + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*]+>\s*(\d+:\d+)\s*<'), webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( @@ -48,7 +49,7 @@ class SunPornoIE(InfoExtractor): webpage, 'view 
count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+) Comments?', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', fatal=False, default=None)) formats = [] quality = qualities(['mp4', 'flv']) From b69b2ff7366cee97202eb333bf06329bfb2e974e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 14:13:49 +0700 Subject: [PATCH 020/218] [sunporno] Add support for embed URLs --- youtube_dl/extractor/sunporno.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 4269f2a30..ef9be7926 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -12,8 +12,8 @@ from ..utils import ( class SunPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P\d+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P\d+)' + _TESTS = [{ 'url': 'http://www.sunporno.com/videos/807778/', 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { @@ -25,12 +25,16 @@ class SunPornoIE(InfoExtractor): 'duration': 302, 'age_limit': 18, } - } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) title = self._html_search_regex( r'([^<]+)', webpage, 'title') From bd6fb007de2323065bface4467539b509fbdb062 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 14:22:47 +0700 Subject: [PATCH 021/218] [24video] Fix comment count extraction --- youtube_dl/extractor/twentyfourvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 4025edf02..8b808d6d6 
100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -64,7 +64,7 @@ class TwentyFourVideoIE(InfoExtractor): r'(\d+) просмотр', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( - r'
(\d+) комментари', + r']+href="#tab-comments"[^>]*>(\d+) комментари', webpage, 'comment count', fatal=False)) # Sets some cookies From 6a26c5f9d5d7b32648e116be4ce902802994654e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 14:28:44 +0700 Subject: [PATCH 022/218] [muenchentv] Fix extraction (Closes #10313) --- youtube_dl/extractor/muenchentv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index b4e8ad17e..d9f176136 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -36,7 +36,7 @@ class MuenchenTVIE(InfoExtractor): title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( - r'(?s)\nplaylist:\s*(\[.*?}\]),related:', + r'(?s)\nplaylist:\s*(\[.*?}\]),', webpage, 'playlist configuration') data_json = js_to_json(data_js) data = json.loads(data_json)[0] From c366f8d30a177d2d44130c9d077b15a4c960c003 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 14:47:51 +0700 Subject: [PATCH 023/218] [24video] Add support for me and xxx TLDs --- youtube_dl/extractor/twentyfourvideo.py | 48 ++++++++++++------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 8b808d6d6..af92b713b 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -12,32 +12,32 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.net/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' - _TESTS = [ - { - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': 
'1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, + _TESTS = [{ + 'url': 'http://www.24video.net/video/view/1044982', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', + 'info_dict': { + 'id': '1044982', + 'ext': 'mp4', + 'title': 'Эротика каменного века', + 'description': 'Как смотрели порно в каменном веке.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'SUPERTELO', + 'duration': 31, + 'timestamp': 1275937857, + 'upload_date': '20100607', + 'age_limit': 18, + 'like_count': int, + 'dislike_count': int, }, - { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - } - ] + }, { + 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.me/video/view/1044982', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From f50365e91cfccf33d0b5696c7f989944bcf748e7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Aug 2016 09:08:57 +0100 Subject: [PATCH 024/218] [pbs] add test for videos with undocumented http formats and remove unused import --- youtube_dl/extractor/pbs.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 6e2ef0fba..335e44bdc 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( ExtractorError, determine_ext, @@ -334,6 +333,16 @@ class PBSIE(InfoExtractor): 'formats': 'mincount:8', }, }, + { + # has undocumented http formats(4500k and 6500k) + 
'url': 'http://www.pbs.org/video/2365815229/', + 'md5': '94635cd06b7133688e23f4b94e6637a5', + 'info_dict': { + 'id': '2365815229', + 'ext': 'mp4', + 'title': 'FRONTLINE - Mosquito Hunter', + }, + }, { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -501,14 +510,18 @@ class PBSIE(InfoExtractor): if http_url: for m3u8_format in m3u8_formats: bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) - # extract only the formats that we know that they will be available as http format. + # lower qualities(150k and 192k) are not available as http formats + # https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # we will try to extract any http format higher than than the lowest quality documented in # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications + # as there also undocumented http formats formats(4500k and 6500k) + # http://www.pbs.org/video/2365815229/ if not bitrate or int(bitrate) < 400: continue f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) # This may produce invalid links sometimes (e.g. # http://www.pbs.org/wgbh/frontline/film/suicide-plan) - if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): + if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): continue f = m3u8_format.copy() f.update({ From e581224843db95574ca65965f5c5a594a7ffd370 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 13 Aug 2016 16:32:07 +0800 Subject: [PATCH 025/218] [tapely] Remove extractor. 
It's shut down Closes #10323 --- ChangeLog | 5 ++ youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/tapely.py | 109 ----------------------------- 3 files changed, 5 insertions(+), 110 deletions(-) delete mode 100644 youtube_dl/extractor/tapely.py diff --git a/ChangeLog b/ChangeLog index 376d96d12..b1ce63d75 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +version + +Extractors +- [tapely] Remove extractor (#10323) + version 2016.08.12 Core diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6420167f2..104d8e37e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -812,7 +812,6 @@ from .tagesschau import ( TagesschauPlayerIE, TagesschauIE, ) -from .tapely import TapelyIE from .tass import TassIE from .tdslifeway import TDSLifewayIE from .teachertube import ( diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py deleted file mode 100644 index ed560bd24..000000000 --- a/youtube_dl/extractor/tapely.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - float_or_none, - parse_iso8601, - sanitized_Request, -) - - -class TapelyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P[A-Za-z0-9\-_]+)(?:/(?P\d+))?' 
- _API_URL = 'http://tape.ly/showtape?id={0:}' - _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' - _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' - _TESTS = [ - { - 'url': 'http://tape.ly/my-grief-as-told-by-water', - 'info_dict': { - 'id': 23952, - 'title': 'my grief as told by water', - 'thumbnail': 're:^https?://.*\.png$', - 'uploader_id': 16484, - 'timestamp': 1411848286, - 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', - }, - 'playlist_count': 13, - }, - { - 'url': 'http://tape.ly/my-grief-as-told-by-water/1', - 'md5': '79031f459fdec6530663b854cbc5715c', - 'info_dict': { - 'id': 258464, - 'title': 'Dreaming Awake (My Brightest Diamond)', - 'ext': 'm4a', - }, - }, - { - 'url': 'https://tapely.com/my-grief-as-told-by-water', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - playlist_url = self._API_URL.format(display_id) - request = sanitized_Request(playlist_url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - request.add_header('Accept', 'application/json') - request.add_header('Referer', url) - - playlist = self._download_json(request, display_id) - - tape = playlist['tape'] - - entries = [] - for s in tape['songs']: - song = s['song'] - entry = { - 'id': song['id'], - 'duration': float_or_none(song.get('songduration'), 1000), - 'title': song['title'], - } - if song['source'] == 'S3': - entry.update({ - 'url': self._S3_SONG_URL.format(song['filename']), - }) - entries.append(entry) - elif song['source'] == 'YT': - self.to_screen('YouTube video detected') - yt_id = song['filename'].replace('/youtube/', '') - entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) - entries.append(entry) - elif song['source'] == 'SC': - self.to_screen('SoundCloud song detected') - sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) - entry.update(self.url_result(sc_url, 'Soundcloud')) - 
entries.append(entry) - else: - self.report_warning('Unknown song source: %s' % song['source']) - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - try: - return entries[songnr] - except IndexError: - raise ExtractorError( - 'No song with index: %s' % mobj.group('songnr'), - expected=True) - - return { - '_type': 'playlist', - 'id': tape['id'], - 'display_id': display_id, - 'title': tape['name'], - 'entries': entries, - 'thumbnail': tape.get('image_url'), - 'description': clean_html(tape.get('subtext')), - 'like_count': tape.get('likescount'), - 'uploader_id': tape.get('user_id'), - 'timestamp': parse_iso8601(tape.get('published_at')), - } From cb55908e51b80d9a51664ec76dcfe05d739dadf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 15:47:20 +0700 Subject: [PATCH 026/218] [vbox7] Fix extraction (Closes #10309) --- youtube_dl/extractor/vbox7.py | 57 ++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index dff1bb702..326440758 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -12,7 +12,15 @@ from ..utils import ( class Vbox7IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P[^/]+)' - _TEST = { + _TESTS = [{ + 'url': 'http://vbox7.com/play:0946fff23c', + 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', + 'info_dict': { + 'id': '0946fff23c', + 'ext': 'mp4', + 'title': 'Борисов: Притеснен съм за бъдещето на България', + }, + }, { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { @@ -20,43 +28,38 @@ class Vbox7IE(InfoExtractor): 'ext': 'mp4', 'title': 'Смях! 
Чудо - чист за секунди - Скрита камера', }, - } + 'skip': 'georestricted', + }] def _real_extract(self, url): video_id = self._match_id(url) - # need to get the page 3 times for the correct jsSecretToken cookie - # which is necessary for the correct title - def get_session_id(): - redirect_page = self._download_webpage(url, video_id) - session_id_url = self._search_regex( - r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page, - 'session id url') - self._download_webpage( - compat_urlparse.urljoin(url, session_id_url), video_id, - 'Getting session id') + webpage = self._download_webpage(url, video_id) - get_session_id() - get_session_id() + title = self._html_search_regex( + r'(.*)', webpage, 'title').split('/')[0].strip() - webpage = self._download_webpage(url, video_id, - 'Downloading redirect page') + video_url = self._search_regex( + r'src\s*:\s*(["\'])(?P.+?.mp4.*?)\1', + webpage, 'video url', default=None, group='url') - title = self._html_search_regex(r'(.*)', - webpage, 'title').split('/')[0].strip() + thumbnail_url = self._og_search_thumbnail(webpage) - info_url = 'http://vbox7.com/play/magare.do' - data = urlencode_postdata({'as3': '1', 'vid': video_id}) - info_request = sanitized_Request(info_url, data) - info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') - if info_response is None: - raise ExtractorError('Unable to extract the media url') - (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + if not video_url: + info_response = self._download_webpage( + 'http://vbox7.com/play/magare.do', video_id, + 'Downloading info webpage', + data=urlencode_postdata({'as3': '1', 'vid': video_id}), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + final_url, thumbnail_url = map( + lambda x: x.split('=')[1], info_response.split('&')) + + if '/na.mp4' in video_url: + self.raise_geo_restricted() return { 
'id': video_id, - 'url': final_url, + 'url': self._proto_relative_url(video_url, 'http:'), 'title': title, 'thumbnail': thumbnail_url, } From 5f2c2b7936eb092e482309a5b9aa036028dbab2c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Aug 2016 09:53:46 +0100 Subject: [PATCH 027/218] [test_utils] add test for option with not str value --- test/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_utils.py b/test/test_utils.py index 724346886..74fcf91c0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -968,6 +968,7 @@ The first line self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) self.assertEqual(cli_option({}, '--proxy', 'proxy'), []) + self.assertEqual(cli_option({'retries': 10}, '--retries', 'retries'), ['--retries', '10']) def test_cli_valueless_option(self): self.assertEqual(cli_valueless_option( From acfccacad5555c21d649729c5e2cb237a70f46e6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Aug 2016 10:26:02 +0100 Subject: [PATCH 028/218] [downloader/external:curl] Clarify why CurlFD should not capture stderr --- youtube_dl/downloader/external.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index f0c30007f..cf4556221 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -114,6 +114,7 @@ class CurlFD(ExternalFD): self._debug_cmd(cmd) + # curl writes the progress to stderr so don't capture it. 
p = subprocess.Popen(cmd) p.communicate() return p.returncode From e97c55ee6aaf5170f86bc8146a20cef56e337a3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 16:29:05 +0700 Subject: [PATCH 029/218] [expotv] Improve extraction and update test --- youtube_dl/extractor/expotv.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index 1585a03bb..971c918a4 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -12,23 +10,22 @@ from ..utils import ( class ExpoTVIE(InfoExtractor): _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P[0-9]+)($|[?#])' _TEST = { - 'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', - 'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', + 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', + 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', 'info_dict': { - 'id': '17561', + 'id': '667916', 'ext': 'mp4', - 'upload_date': '20060212', - 'title': 'My Favorite Online Scrapbook Store', - 'view_count': int, - 'description': 'You\'ll find most everything you need at this virtual store front.', - 'uploader': 'Anna T.', + 'title': 'NYX Butter Lipstick Little Susie', + 'description': 'Goes on like butter, but looks better!', 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Stephanie S.', + 'upload_date': '20150520', + 'view_count': int, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_key = self._search_regex( @@ -66,7 +63,7 @@ class ExpoTVIE(InfoExtractor): fatal=False) upload_date = unified_strdate(self._search_regex( r'
Reviewed on ([0-9/.]+)
', webpage, 'upload date', - fatal=False)) + fatal=False), day_first=False) return { 'id': video_id, From 52aa7e7476415ec632053f85f9db0919f7bf75c3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 13 Aug 2016 17:36:14 +0800 Subject: [PATCH 030/218] [test_verbose_output] Fix tests under Python 3 --- test/test_verbose_output.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/test/test_verbose_output.py b/test/test_verbose_output.py index 4c77df242..96a66f7a0 100644 --- a/test/test_verbose_output.py +++ b/test/test_verbose_output.py @@ -22,10 +22,10 @@ class TestVerboseOutput(unittest.TestCase): '--password', 'secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('--username' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('--password' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'--username' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'--password' in serr) + self.assertTrue(b'secret' not in serr) def test_private_info_shortarg(self): outp = subprocess.Popen( @@ -35,10 +35,10 @@ class TestVerboseOutput(unittest.TestCase): '-p', 'secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('-u' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('-p' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'-u' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'-p' in serr) + self.assertTrue(b'secret' not in serr) def test_private_info_eq(self): outp = subprocess.Popen( @@ -48,10 +48,10 @@ class TestVerboseOutput(unittest.TestCase): '--password=secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('--username' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('--password' 
in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'--username' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'--password' in serr) + self.assertTrue(b'secret' not in serr) def test_private_info_shortarg_eq(self): outp = subprocess.Popen( @@ -61,10 +61,10 @@ class TestVerboseOutput(unittest.TestCase): '-p=secret', ], cwd=rootDir, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sout, serr = outp.communicate() - self.assertTrue('-u' in serr) - self.assertTrue('johnsmith' not in serr) - self.assertTrue('-p' in serr) - self.assertTrue('secret' not in serr) + self.assertTrue(b'-u' in serr) + self.assertTrue(b'johnsmith' not in serr) + self.assertTrue(b'-p' in serr) + self.assertTrue(b'secret' not in serr) if __name__ == '__main__': unittest.main() From cd29eaab955b930fc7ee595553d6351ad643569d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 16:45:34 +0700 Subject: [PATCH 031/218] [vbox7] Remove unused imports --- youtube_dl/extractor/vbox7.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 326440758..fa7899e6d 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -2,12 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) +from ..utils import urlencode_postdata class Vbox7IE(InfoExtractor): From c2a453b46177787d5cc17e09cedb3eca215ab159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 16:46:07 +0700 Subject: [PATCH 032/218] [imgur] Fix width and height extraction (Closes #10325) --- youtube_dl/extractor/imgur.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 85e9344aa..d23489dcf 100644 --- 
a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -50,12 +50,10 @@ class ImgurIE(InfoExtractor): webpage = self._download_webpage( compat_urlparse.urljoin(url, video_id), video_id) - width = int_or_none(self._search_regex( - r'(.*?)
', From db535435b30540029b292e7217fb443bcc670aab Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 13 Aug 2016 18:02:11 +0800 Subject: [PATCH 033/218] [bigflix] Remove an invalid test There's no video anymore --- youtube_dl/extractor/bigflix.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/youtube_dl/extractor/bigflix.py b/youtube_dl/extractor/bigflix.py index b19f35b5d..b4ce767af 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -11,15 +11,6 @@ from ..compat import compat_urllib_parse_unquote class BigflixIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P[0-9]+)' _TESTS = [{ - 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', - 'md5': 'dc1b4aebb46e3a7077ecc0d9f43f61e3', - 'info_dict': { - 'id': '16537', - 'ext': 'mp4', - 'title': 'Singham Returns', - 'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d', - } - }, { # 2 formats 'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070', 'info_dict': { From 77afa008dd14efd930f8504609815a8ad2fedc7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 19:55:09 +0700 Subject: [PATCH 034/218] [4tube] Fix metadata extraction (Closes #10321) --- youtube_dl/extractor/fourtube.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index fc4a5a0fb..9776c8422 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -43,14 +43,14 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'', + r'', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'', + r'', webpage, 'uploader', fatal=False) categories_html = self._search_regex( - r'(?s)>\s*Categories / Tags\s*.*?
    (.*?)
', + r'(?s)>\s*Categories / Tags\s*.*?
    (.*?)
', webpage, 'categories', fatal=False) categories = None if categories_html: @@ -59,10 +59,10 @@ class FourTubeIE(InfoExtractor): r'(?s)
  • (.*?)', categories_html)] view_count = str_to_int(self._search_regex( - r'', + r']+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', webpage, 'view count', fatal=False)) like_count = str_to_int(self._search_regex( - r'', + r']+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) From 647a7bf5e8355b34ed030827f53ea1e87ffc9131 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 20:49:16 +0700 Subject: [PATCH 035/218] [pornotube] Fix extraction (Closes #10322) --- youtube_dl/extractor/pornotube.py | 83 ++++++++++++++----------------- 1 file changed, 38 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5398e708b..63816c358 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, -) +from ..utils import int_or_none class PornotubeIE(InfoExtractor): @@ -31,59 +28,55 @@ class PornotubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Fetch origin token - js_config = self._download_webpage( - 'http://www.pornotube.com/assets/src/app/config.js', video_id, - note='Download JS config') - originAuthenticationSpaceKey = self._search_regex( - r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'", - js_config, 'originAuthenticationSpaceKey') + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] - # Fetch actual token - token_req_data = { - 'authenticationSpaceKey': 
originAuthenticationSpaceKey, - 'credentials': 'Clip Application', - } - token_req = sanitized_Request( - 'https://api.aebn.net/auth/v1/token/primal', - data=json.dumps(token_req_data).encode('utf-8')) - token_req.add_header('Content-Type', 'application/json') - token_req.add_header('Origin', 'http://www.pornotube.com') - token_answer = self._download_json( - token_req, video_id, note='Requesting primal token') - token = token_answer['tokenKey'] + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] - # Get video URL - delivery_req = sanitized_Request( - 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id) - delivery_req.add_header('Authorization', token) - delivery_info = self._download_json( - delivery_req, video_id, note='Downloading delivery information') - video_url = delivery_info['mediaUrl'] + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) - # Get additional info (title etc.) 
- info_req = sanitized_Request( - 'https://api.aebn.net/content/v1/clips/%s?expand=' - 'title,description,primaryImageNumber,startSecond,endSecond,' - 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,' - 'movie.studios,stars.name,studios.name,categories.name,' - 'clipActive,movieActive,publishDate,orientations' % video_id) - info_req.add_header('Authorization', token) info = self._download_json( - info_req, video_id, note='Downloading metadata') + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] timestamp = int_or_none(info.get('publishDate'), scale=1000) uploader = info.get('studios', [{}])[0].get('name') - movie_id = info['movie']['movieId'] - thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( - movie_id, movie_id, info['primaryImageNumber']) - categories = [c['name'] for c in info.get('categories')] + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] return { 'id': video_id, 'url': video_url, - 'title': info['title'], + 'title': title, 'description': info.get('description'), + 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'thumbnail': thumbnail, From 82997dad571988aae59d85db4355f5f1695efcbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 21:00:34 +0700 Subject: [PATCH 036/218] [franceculture] Fix extraction (Closes #10324) --- youtube_dl/extractor/extractors.py | 5 +- 
youtube_dl/extractor/franceculture.py | 98 +++++++-------------------- 2 files changed, 26 insertions(+), 77 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 104d8e37e..82d4ed153 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -272,10 +272,7 @@ from .fox import FOXIE from .foxgay import FoxgayIE from .foxnews import FoxNewsIE from .foxsports import FoxSportsIE -from .franceculture import ( - FranceCultureIE, - FranceCultureEmissionIE, -) +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index e2ca96283..186da0d3b 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -2,104 +2,56 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) from ..utils import ( determine_ext, - int_or_none, - ExtractorError, + unified_strdate, ) class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P[^/?#&]+)' _TEST = { - 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174', + 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { - 'id': '4795174', + 'id': 'rendez-vous-au-pays-des-geeks', + 'display_id': 'rendez-vous-au-pays-des-geeks', 'ext': 'mp3', 'title': 'Rendez-vous au pays des geeks', - 'alt_title': 'Carnet nomade | 13-14', - 'vcodec': 'none', + 'thumbnail': 're:^https?://.*\\.jpg$', 'upload_date': '20140301', - 'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', - 'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche', - 
'timestamp': 1393700400, + 'vcodec': 'none', } } - def _extract_from_player(self, url, video_id): - webpage = self._download_webpage(url, video_id) + def _real_extract(self, url): + display_id = self._match_id(url) - video_path = self._search_regex( - r']+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?]+href="([^"]+)"', + webpage, 'video path') + + title = self._og_search_title(webpage) + + upload_date = unified_strdate(self._search_regex( + '(?s)]+class="date"[^>]*>.*?]+class="inner"[^>]*>([^<]+)<', webpage, 'upload date', fatal=False)) thumbnail = self._search_regex( - r'\s+]+itemtype="https://schema.org/ImageObject"[^>]*>.*?]+data-pagespeed-(?:lazy|high-res)-src="([^"]+)"', webpage, 'thumbnail', fatal=False) - - display_id = self._search_regex( - r'emission-(.*?)', webpage, 'display_id') - - title = self._html_search_regex( - r'(.*?)', webpage, 'title') - alt_title = self._html_search_regex( - r'(.*?)', - webpage, 'alt_title', fatal=False) - description = self._html_search_regex( - r'(.*?)', - webpage, 'description', fatal=False) - uploader = self._html_search_regex( r'(?s)
    (.*?)', webpage, 'uploader', default=None) vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None return { - 'id': video_id, + 'id': display_id, + 'display_id': display_id, 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, 'vcodec': vcodec, 'uploader': uploader, - 'timestamp': timestamp, - 'title': title, - 'alt_title': alt_title, - 'thumbnail': thumbnail, - 'description': description, - 'display_id': display_id, + 'upload_date': upload_date, } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_from_player(url, video_id) - - -class FranceCultureEmissionIE(FranceCultureIE): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P[^?#]+)' - _TEST = { - 'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'info_dict': { - 'title': 'Jean-Gabriel Périot, cinéaste', - 'alt_title': 'Les Carnets de la création', - 'id': '5093239', - 'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'ext': 'mp3', - 'timestamp': 1444762500, - 'upload_date': '20151013', - 'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_path = self._html_search_regex( - r'[0-9]+)', video_path, 'new_id', group='id') - video_url = compat_urlparse.urljoin(url, video_path) - return self._extract_from_player(video_url, new_id) From 542130a5d914e5f02acb872c88866194c66a612d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 21:59:29 +0700 Subject: [PATCH 037/218] [pbs] Fix description extraction and update tests --- youtube_dl/extractor/pbs.py | 47 ++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 335e44bdc..09aef7fb9 
100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, js_to_json, strip_jsonp, + strip_or_none, unified_strdate, US_RATINGS, ) @@ -200,7 +201,7 @@ class PBSIE(InfoExtractor): 'id': '2365006249', 'ext': 'mp4', 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', + 'description': 'md5:31b664af3c65fd07fa460d306b837d00', 'duration': 3190, }, }, @@ -211,7 +212,7 @@ class PBSIE(InfoExtractor): 'id': '2365297690', 'ext': 'mp4', 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', + 'description': 'md5:5979a4d069b157f622d02bff62fbe654', 'duration': 5050, }, }, @@ -222,7 +223,7 @@ class PBSIE(InfoExtractor): 'id': '2201174722', 'ext': 'mp4', 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', - 'description': 'md5:95a19f568689d09a166dff9edada3301', + 'description': 'md5:86ab9a3d04458b876147b355788b8781', 'duration': 801, }, }, @@ -267,7 +268,7 @@ class PBSIE(InfoExtractor): 'display_id': 'player', 'ext': 'mp4', 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', + 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -293,13 +294,13 @@ class PBSIE(InfoExtractor): # "', webpage): url = self._search_regex( @@ -432,10 +428,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id, None + return video_id, display_id, None, description def _real_extract(self, url): - video_id, display_id, upload_date = self._extract_webpage(url) + video_id, display_id, upload_date, description = self._extract_webpage(url) if isinstance(video_id, list): entries = [self.url_result( @@ -564,11 +560,14 @@ class PBSIE(InfoExtractor): if alt_title: info['title'] = alt_title + ' - ' + 
re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) + description = info.get('description') or info.get( + 'program', {}).get('description') or description + return { 'id': video_id, 'display_id': display_id, 'title': info['title'], - 'description': info.get('description') or info.get('program', {}).get('description'), + 'description': description, 'thumbnail': info.get('image_url'), 'duration': int_or_none(info.get('duration')), 'age_limit': age_limit, From 5ec5461e1a805595c5fef4ae482e86d7d7872d8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 22:50:18 +0700 Subject: [PATCH 038/218] [pbs] Clarify comment on http formats --- youtube_dl/extractor/pbs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 09aef7fb9..b490ef74c 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -506,12 +506,12 @@ class PBSIE(InfoExtractor): if http_url: for m3u8_format in m3u8_formats: bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) - # lower qualities(150k and 192k) are not available as http formats - # https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 - # we will try to extract any http format higher than than the lowest quality documented in - # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications - # as there also undocumented http formats formats(4500k and 6500k) - # http://www.pbs.org/video/2365815229/ + # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), + # we won't try extracting them. + # Since summer 2016 higher quality formats (4500k and 6500k) are also available + # albeit they are not documented in [2]. + # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # 2. 
https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications if not bitrate or int(bitrate) < 400: continue f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) From a560f28c98445e2ae2528795609d5ac718ec5b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 23:01:35 +0700 Subject: [PATCH 039/218] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index b1ce63d75..5efcb2316 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,27 @@ version +Core +* Show progress for curl external downloader +* Forward more options to curl external downloader + Extractors +* [pbs] Fix description extraction +* [franceculture] Fix extraction (#10324) +* [pornotube] Fix extraction (#10322) +* [4tube] Fix metadata extraction (#10321) +* [imgur] Fix width and height extraction (#10325) +* [expotv] Improve extraction ++ [vbox7] Fix extraction (#10309) - [tapely] Remove extractor (#10323) +* [muenchentv] Fix extraction (#10313) ++ [24video] Add support for .me and .xxx TLDs +* [24video] Fix comment count extraction +* [sunporno] Add support for embed URLs +* [sunporno] Fix metadata extraction (#10316) ++ [hgtv] Add extractor for hgtv.ca (#3999) +- [pbs] Remove request to unavailable API ++ [pbs] Add support for high quality HTTP formats ++ [crunchyroll] Add support for HLS formats (#10301) version 2016.08.12 From 73a85620eeb2d595cd86f73357bc4cb081cb3bc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Aug 2016 23:17:11 +0700 Subject: [PATCH 040/218] release 2016.08.13 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +-- youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 6fdb2f77b..1e0d99b43 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you 
are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.12*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.12** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.13*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.13** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.12 +[debug] youtube-dl version 2016.08.13 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 5efcb2316..fc99b9f73 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.08.13 Core * Show progress for curl external downloader diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8fb581d2b..56fc41a40 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -238,7 +238,6 @@ - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** - - **FranceCultureEmission** - **FranceInter** - **francetv**: France 2, 3, 4, 5 and 
Ô - **francetvinfo.fr** @@ -277,6 +276,7 @@ - **HellPorno** - **Helsinki**: helsinki.fi - **HentaiStigma** + - **HGTV** - **HistoricFilms** - **history:topic**: History.com Topic - **hitbox** @@ -664,7 +664,6 @@ - **SztvHu** - **Tagesschau** - **tagesschau:player** - - **Tapely** - **Tass** - **TDSLifeway** - **teachertube**: teachertube.com videos diff --git a/youtube_dl/version.py b/youtube_dl/version.py index becf14458..cc93d22aa 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.12' +__version__ = '2016.08.13' From 097eba019d0d5cab93e9ce66e1b727b782d48250 Mon Sep 17 00:00:00 2001 From: phi Date: Sun, 14 Aug 2016 02:18:59 +0800 Subject: [PATCH 041/218] bug fix for extractor xiami.py Before applying this patch, when downloading resources from xiami.com, it crashes with these: Traceback (most recent call last): File "/home/phi/.local/bin/youtube-dl", line 11, in sys.exit(main()) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/__init__.py", line 433, in main _real_main(argv) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/__init__.py", line 423, in _real_main retcode = ydl.download(all_urls) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/YoutubeDL.py", line 1786, in download url, force_generic_extractor=self.params.get('force_generic_extractor', False)) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/YoutubeDL.py", line 691, in extract_info ie_result = ie.extract(url) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/extractor/common.py", line 347, in extract return self._real_extract(url) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/extractor/xiami.py", line 116, in _real_extract return self._extract_tracks(self._match_id(url))[0] File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/extractor/xiami.py", line 43, in _extract_tracks '%s/%s%s' % (self._API_BASE_URL, item_id, 
'/type/%s' % typ if typ else ''), item_id) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/extractor/common.py", line 562, in _download_json json_string, video_id, transform_source=transform_source, fatal=fatal) File "/home/phi/.local/lib/python3.5/site-packages/youtube_dl/extractor/common.py", line 568, in _parse_json return json.loads(json_string) File "/usr/lib/python3.5/json/__init__.py", line 312, in loads s.__class__.__name__)) TypeError: the JSON object must be str, not 'NoneType' This patch solves exactly this problem. --- youtube_dl/extractor/xiami.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index a6dfc4af9..86abef257 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -13,6 +13,7 @@ class XiamiBaseIE(InfoExtractor): webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') + return webpage def _extract_track(self, track, track_id=None): title = track['title'] From fafabc0712d95e6a5b2ac56e9375fe90060738f5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 14 Aug 2016 02:33:15 +0800 Subject: [PATCH 042/218] Update ChangeLog for #10342 [skip ci] --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index fc99b9f73..d04c5fc2a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [xiami] Fix extraction (#10342) + + version 2016.08.13 Core @@ -23,6 +29,7 @@ Extractors + [pbs] Add support for high quality HTTP formats + [crunchyroll] Add support for HLS formats (#10301) + version 2016.08.12 Core From aaf44a2f47f013e8d864ac9f98b2833904a8be78 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 13 Aug 2016 22:53:07 +0100 Subject: [PATCH 043/218] [uplynk] Add new extractor --- youtube_dl/downloader/hls.py | 6 +++ 
youtube_dl/extractor/extractors.py | 4 ++ youtube_dl/extractor/uplynk.py | 64 ++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100644 youtube_dl/extractor/uplynk.py diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 3b7bb3508..8d7971e5d 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -20,6 +20,7 @@ from ..utils import ( encodeFilename, sanitize_open, parse_m3u8_attributes, + update_url_query, ) @@ -82,6 +83,7 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -95,6 +97,8 @@ class HlsFD(FragmentFD): if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, line)) frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + if extra_param_to_segment_url: + frag_url = update_url_query(frag_url, extra_param_to_segment_url) success = ctx['dl'].download(frag_filename, {'url': frag_url}) if not success: return False @@ -120,6 +124,8 @@ class HlsFD(FragmentFD): if not re.match(r'^https?://', decrypt_info['URI']): decrypt_info['URI'] = compat_urlparse.urljoin( man_url, decrypt_info['URI']) + if extra_param_to_segment_url: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_param_to_segment_url) decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): media_sequence = int(line[22:]) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 82d4ed153..901847509 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -926,6 +926,10 @@ from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) from .urort import UrortIE from .urplay import URPlayIE from .usatoday import 
USATodayIE diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py new file mode 100644 index 000000000..a6a685c9d --- /dev/null +++ b/youtube_dl/extractor/uplynk.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class UplynkIE(InfoExtractor): + _VALID_URL = r'https?://.*?\.uplynk\.com/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P[^&]+))?' + _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + path, external_id, video_id, session_id = re.match(self._VALID_URL, url).groups() + display_id = video_id or external_id + formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4') + if session_id: + for f in formats: + f['extra_param_to_segment_url'] = { + 'pbs': session_id, + } + self._sort_formats(formats) + asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + if asset.get('error') == 1: + raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + + return { + 'id': asset['asset'], + 'title': asset['desc'], + 'thumbnail': asset.get('default_poster_url'), + 'duration': float_or_none(asset.get('duration')), + 'uploader_id': asset.get('owner'), + 'formats': formats, + } + + +class UplynkPreplayIE(InfoExtractor): + _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' + + def _real_extract(self, url): + path, external_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = 
video_id or external_id + preplay = self._download_json(url, display_id) + content_url = 'http://content.uplynk.com/%s.m3u8' % path + session_id = preplay.get('sid') + if session_id: + content_url += '?pbs=' + session_id + return self.url_result(content_url, 'Uplynk') From 320d597c21e7a0981f1dc9c4167fce53473ab488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Aug 2016 16:25:14 +0700 Subject: [PATCH 044/218] [vgtv] Detect geo restricted videos (#10348) --- youtube_dl/extractor/vgtv.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index b11cd254c..185756301 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -8,6 +8,7 @@ from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, + try_get, ) @@ -129,6 +130,11 @@ class VGTVIE(XstreamIE): 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', 'only_matching': True, }, + { + # geoblocked + 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -196,6 +202,12 @@ class VGTVIE(XstreamIE): info['formats'].extend(formats) + if not info['formats']: + properties = try_get( + data, lambda x: x['streamConfiguration']['properties'], list) + if properties and 'geoblocked' in properties: + raise self.raise_geo_restricted() + self._sort_formats(info['formats']) info.update({ From 2118fdd1a96ed7a904b53ed5aad50a203d0e0c70 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 11:48:13 +0100 Subject: [PATCH 045/218] [common] add separate method for getting netrc ligin info --- youtube_dl/extractor/common.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e47770c1d..9427ff449 100644 --- 
a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -662,6 +662,24 @@ class InfoExtractor(object): else: return res + def _get_netrc_login_info(self, netrc_machine=None): + username = None + password = None + netrc_machine = netrc_machine or self._NETRC_MACHINE + + if self._downloader.params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(netrc_machine) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + + return (username, password) + def _get_login_info(self): """ Get the login info as (username, password) @@ -679,16 +697,8 @@ class InfoExtractor(object): if downloader_params.get('username') is not None: username = downloader_params['username'] password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + else: + username, password = self._get_netrc_login_info() return (username, password) From 9771b1f901b19ad5ba6632a37fc6348e8e6e98dd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 11:52:48 +0100 Subject: [PATCH 046/218] [theplatform] use _get_netrc_login_info and fix session expiration check(#10345) --- youtube_dl/extractor/theplatform.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index bb3efc4ea..9ca765a5f 100644 --- a/youtube_dl/extractor/theplatform.py +++ 
b/youtube_dl/extractor/theplatform.py @@ -218,15 +218,16 @@ class ThePlatformIE(ThePlatformBaseIE): requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token: - token_expires = unified_timestamp(xml_text(authn_token, 'simpleTokenExpires').replace('_GMT', '')) - if token_expires and token_expires >= time.time(): + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(authn_token, 'simpleTokenExpires'))) + if token_expires and token_expires <= int(time.time()): authn_token = None + requestor_info = {} if not authn_token: # TODO add support for other TV Providers mso_id = 'DTV' - login_info = netrc.netrc().authenticators(mso_id) - if not login_info: - return None + username, password = self._get_netrc_login_info(mso_id) + if not username or not password: + return '' def post_form(form_page, note, data={}): post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') @@ -248,8 +249,8 @@ class ThePlatformIE(ThePlatformBaseIE): provider_login_page = post_form( provider_redirect_page, 'Downloading Provider Login Page') mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { - 'username': login_info[0], - 'password': login_info[2], + 'username': username, + 'password': password, }) post_form(mvpd_confirm_page, 'Confirming Login') From 884cdb6cd9c872ea68a03341e462b58e51fba58a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Aug 2016 20:49:11 +0700 Subject: [PATCH 047/218] [life:embed] Improve extraction --- youtube_dl/extractor/lifenews.py | 68 +++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index c2b4490c4..87120ecd1 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import 
InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -96,7 +99,7 @@ class LifeNewsIE(InfoExtractor): r']+>]+src=["\'](.+?)["\']', webpage) iframe_links = re.findall( - r']+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']', + r']+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']', webpage) if not video_urls and not iframe_links: @@ -164,9 +167,9 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'https?://embed\.life\.ru/embed/(?P[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P[\da-f]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', 'md5': 'b889715c9e49cb1981281d0e5458fbbe', 'info_dict': { @@ -175,30 +178,57 @@ class LifeEmbedIE(InfoExtractor): 'title': 'e50c2dec2867350528e2574c899b8291', 'thumbnail': 're:http://.*\.jpg', } - } + }, { + # with 1080p + 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + thumbnail = None formats = [] - for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): - video_url = compat_urlparse.urljoin(url, video_url) - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8')) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - 'preference': 1, - }) + + def extract_m3u8(manifest_url): + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8')) + + def extract_original(original_url): + formats.append({ + 'url': original_url, + 'format_id': determine_ext(original_url, None), + 'preference': 1, + }) + + playlist 
= self._parse_json( + self._search_regex( + r'options\s*=\s*({.+?});', webpage, 'options', default='{}'), + video_id).get('playlist', {}) + if playlist: + master = playlist.get('master') + if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': + extract_m3u8(compat_urlparse.urljoin(url, master)) + original = playlist.get('original') + if isinstance(original, compat_str): + extract_original(original) + thumbnail = playlist.get('image') + + # Old rendition fallback + if not formats: + for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): + video_url = compat_urlparse.urljoin(url, video_url) + if determine_ext(video_url) == 'm3u8': + extract_m3u8(video_url) + else: + extract_original(video_url) + self._sort_formats(formats) - thumbnail = self._search_regex( + thumbnail = thumbnail or self._search_regex( r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) return { From 1fd6e30988f44d372c7112c2d5e44c0d5cdbc4ed Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 17:55:56 +0100 Subject: [PATCH 048/218] [adobepass] create separate class for adobe pass authentication --- youtube_dl/extractor/adobepass.py | 124 +++++++++++++++++++++ youtube_dl/extractor/aenetworks.py | 5 +- youtube_dl/extractor/nationalgeographic.py | 2 +- youtube_dl/extractor/syfy.py | 4 +- youtube_dl/extractor/theplatform.py | 98 +--------------- 5 files changed, 134 insertions(+), 99 deletions(-) create mode 100644 youtube_dl/extractor/adobepass.py diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py new file mode 100644 index 000000000..4e59302ab --- /dev/null +++ b/youtube_dl/extractor/adobepass.py @@ -0,0 +1,124 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import time +import xml.etree.ElementTree as etree + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + urlencode_postdata, + unified_timestamp, +) + + +class AdobePass(InfoExtractor): + 
_SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' + _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + + @staticmethod + def _get_mvpd_resource(provider_id, title, guid, rating): + channel = etree.Element('channel') + channel_title = etree.SubElement(channel, 'title') + channel_title.text = provider_id + item = etree.SubElement(channel, 'item') + resource_title = etree.SubElement(item, 'title') + resource_title.text = title + resource_guid = etree.SubElement(item, 'guid') + resource_guid.text = guid + resource_rating = etree.SubElement(item, 'media:rating') + resource_rating.attrib = {'scheme': 'urn:v-chip'} + resource_rating.text = rating + return '' + etree.tostring(channel).decode() + '' + + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)' % (tag, tag), xml_str, tag) + + mvpd_headers = { + 'ap_42': 'anonymous', + 'ap_11': 'Linux i686', + 'ap_z': self._USER_AGENT, + 'User-Agent': self._USER_AGENT, + } + + guid = xml_text(resource, 'guid') + requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token: + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(authn_token, 'simpleTokenExpires'))) + if token_expires and token_expires <= int(time.time()): + authn_token = None + requestor_info = {} + if not authn_token: + # TODO add support for other TV Providers + mso_id = 'DTV' + username, password = self._get_netrc_login_info(mso_id) + if not username or not password: + return '' + + def post_form(form_page, note, data={}): + post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') + return self._download_webpage( + post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + 
provider_redirect_page = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + provider_login_page = post_form( + provider_redirect_page, 'Downloading Provider Login Page') + mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { + 'username': username, + 'password': password, + }) + post_form(mvpd_confirm_page, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + authn_token = unescapeHTML(xml_text(session, 'authnToken')) + requestor_info['authn_token'] = authn_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + authz_token = requestor_info.get(guid) + if not authz_token: + authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, + 'Retrieving Authorization Token', data=urlencode_postdata({ + 'resource_id': resource, + 'requestor_id': requestor_id, + 'authentication_token': authn_token, + 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), + 'userMeta': '1', + }), headers=mvpd_headers) + authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) + requestor_info[guid] = authz_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + mvpd_headers.update({ + 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), + 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), + }) + + return self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', + video_id, 'Retrieving Media Token', data=urlencode_postdata({ + 'authz_token': authz_token, + 'requestor_id': requestor_id, + 'session_guid': xml_text(authn_token, 
'simpleTokenAuthenticationGuid'), + 'hashed_guid': 'false', + }), headers=mvpd_headers) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 8f53050c9..6adb6d824 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -109,7 +109,10 @@ class AENetworksIE(AENetworksBaseIE): info = self._parse_theplatform_metadata(theplatform_metadata) if theplatform_metadata.get('AETN$isBehindWall'): requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] - resource = '%s%s%s%s' % (requestor_id, theplatform_metadata['title'], theplatform_metadata['AETN$PPL_pplProgramId'], theplatform_metadata['ratings'][0]['rating']) + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) query['auth'] = self._extract_mvpd_auth( url, video_id, requestor_id, resource) info.update(self._search_json_ld(webpage, video_id, fatal=False)) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 0027ff1b8..890e8d5bc 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -119,7 +119,7 @@ class NationalGeographicIE(ThePlatformIE): auth_resource_id = self._search_regex( r"video_auth_resourceId\s*=\s*'([^']+)'", webpage, 'auth resource id') - query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) or '' + query['auth'] = self._extract_mvpd_auth(url, display_id, 'natgeo', auth_resource_id) return { '_type': 'url_transparent', diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index 53723b66e..764287a64 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -40,7 +40,9 @@ class SyfyIE(ThePlatformIE): 'manifest': 'm3u', } if syfy_mpx.get('entitlement') == 'auth': - resource = 
'syfy<![CDATA[%s]]>%s%s' % (title, video_id, syfy_mpx.get('mpxRating', 'TV-14')) + resource = self._get_mvpd_resource( + 'syfy', title, video_id, + syfy_mpx.get('mpxRating', 'TV-14')) query['auth'] = self._extract_mvpd_auth( url, video_id, 'syfy', resource) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9ca765a5f..108ddd3a9 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -6,10 +6,10 @@ import time import hmac import binascii import hashlib -import netrc from .once import OnceIE +from .adobepass import AdobePass from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -25,9 +25,6 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, - unescapeHTML, - urlencode_postdata, - unified_timestamp, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -96,7 +93,7 @@ class ThePlatformBaseIE(OnceIE): return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE): +class ThePlatformIE(ThePlatformBaseIE, AdobePass): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? 
@@ -202,97 +199,6 @@ class ThePlatformIE(ThePlatformBaseIE): sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) - def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): - def xml_text(xml_str, tag): - return self._search_regex( - '<%s>(.+?)' % (tag, tag), xml_str, tag) - - mvpd_headers = { - 'ap_42': 'anonymous', - 'ap_11': 'Linux i686', - 'ap_z': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', - 'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0', - } - - guid = xml_text(resource, 'guid') - requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token: - token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(authn_token, 'simpleTokenExpires'))) - if token_expires and token_expires <= int(time.time()): - authn_token = None - requestor_info = {} - if not authn_token: - # TODO add support for other TV Providers - mso_id = 'DTV' - username, password = self._get_netrc_login_info(mso_id) - if not username or not password: - return '' - - def post_form(form_page, note, data={}): - post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') - return self._download_webpage( - post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) - - provider_redirect_page = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) - provider_login_page = post_form( - provider_redirect_page, 'Downloading Provider Login Page') - mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { - 'username': 
username, - 'password': password, - }) - post_form(mvpd_confirm_page, 'Confirming Login') - - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) - authn_token = unescapeHTML(xml_text(session, 'authnToken')) - requestor_info['authn_token'] = authn_token - self._downloader.cache.store('mvpd', requestor_id, requestor_info) - - authz_token = requestor_info.get(guid) - if not authz_token: - authorize = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, - 'Retrieving Authorization Token', data=urlencode_postdata({ - 'resource_id': resource, - 'requestor_id': requestor_id, - 'authentication_token': authn_token, - 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), - 'userMeta': '1', - }), headers=mvpd_headers) - authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) - requestor_info[guid] = authz_token - self._downloader.cache.store('mvpd', requestor_id, requestor_info) - - mvpd_headers.update({ - 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), - 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), - }) - - return self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', - video_id, 'Retrieving Media Token', data=urlencode_postdata({ - 'authz_token': authz_token, - 'requestor_id': requestor_id, - 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), - 'hashed_guid': 'false', - }), headers=mvpd_headers) - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) From d2ac04674d0d9085aedec229820c1d07082e5825 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 18:03:42 +0100 Subject: [PATCH 049/218] [viceland] Add new extractor(#8799) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/viceland.py | 100 +++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 
100644 youtube_dl/extractor/viceland.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 901847509..be96e34ba 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -958,6 +958,7 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .viceland import VicelandIE from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py new file mode 100644 index 000000000..c66e8eb95 --- /dev/null +++ b/youtube_dl/extractor/viceland.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time +import hashlib +import json + +from .adobepass import AdobePass +from ..compat import compat_HTTPError +from ..utils import ( + int_or_none, + parse_age_limit, + str_or_none, + parse_duration, + ExtractorError, + extract_attributes, +) + + +class VicelandIE(AdobePass): + _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P[a-f0-9]+)' + _TEST = { + # FIXME: fill the test after fixing delegation problem + 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', + 'info_dict': { + 'id': '57608447973ee7705f6fbd4e', + 'ext': 'mp4', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay', 'Uplynk'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + watch_hub_data = extract_attributes(self._search_regex( + r'(?s)()', webpage, 'watch hub')) + video_id = watch_hub_data['vms-id'] + title = watch_hub_data['video-title'] + + query = {} + if watch_hub_data.get('video-locked') == '1': + resource = self._get_mvpd_resource( + 'VICELAND', title, video_id, + watch_hub_data.get('video-rating')) + query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + + # signature generation algorithm is 
reverse engineered from signatureGenerator in + # webpack:///../shared/~/vice-player/dist/js/vice-player.js in + # https://www.viceland.com/assets/common/js/web.vendor.bundle.js + exp = int(time.time()) + 14400 + query.update({ + 'exp': exp, + 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + }) + + try: + preplay = self._download_json('https://www.viceland.com/en_us/preplay/%s' % video_id, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + error = json.loads(e.cause.read().decode()) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + + video_data = preplay['video'] + base = video_data['base'] + uplynk_preplay_url = preplay['preplayURL'] + episode = video_data.get('episode', {}) + channel = video_data.get('channel', {}) + + subtitles = {} + cc_url = preplay.get('ccURL') + if cc_url: + subtitles['en'] = [{ + 'url': cc_url, + }] + + return { + '_type': 'url_transparent', + 'url': uplynk_preplay_url, + 'id': video_id, + 'title': title, + 'description': base.get('body'), + 'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), + 'duration': parse_duration(video_data.get('video_duration') or watch_hub_data.get('video-duration')), + 'timestamp': int_or_none(video_data.get('created_at')), + 'age_limit': parse_age_limit(video_data.get('video_rating')), + 'series': video_data.get('show_title') or watch_hub_data.get('show-title'), + 'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')), + 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), + 'season_number': int_or_none(watch_hub_data.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + 'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'), + 'uploader_id': str_or_none(channel.get('id')), + 'subtitles': subtitles, + 'ie_key': 'UplynkPreplay', 
+ } From 9fa57892790ce205634f6a7c83de2b9e52ab5284 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 19:04:23 +0100 Subject: [PATCH 050/218] [viceland] fix info extraction(closes #8799) --- youtube_dl/extractor/uplynk.py | 11 +++++++---- youtube_dl/extractor/viceland.py | 7 +++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py index a6a685c9d..4313bc9cb 100644 --- a/youtube_dl/extractor/uplynk.py +++ b/youtube_dl/extractor/uplynk.py @@ -26,8 +26,8 @@ class UplynkIE(InfoExtractor): }, } - def _real_extract(self, url): - path, external_id, video_id, session_id = re.match(self._VALID_URL, url).groups() + def _extract_uplynk_info(self, uplynk_content_url): + path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() display_id = video_id or external_id formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4') if session_id: @@ -49,8 +49,11 @@ class UplynkIE(InfoExtractor): 'formats': formats, } + def _real_extract(self, url): + return self._extract_uplynk_info(url) -class UplynkPreplayIE(InfoExtractor): + +class UplynkPreplayIE(UplynkIE): _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' def _real_extract(self, url): @@ -61,4 +64,4 @@ class UplynkPreplayIE(InfoExtractor): session_id = preplay.get('sid') if session_id: content_url += '?pbs=' + session_id - return self.url_result(content_url, 'Uplynk') + return self._extract_uplynk_info(content_url) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index c66e8eb95..f72294b51 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -25,6 +25,13 @@ class VicelandIE(AdobePass): 'info_dict': { 'id': '57608447973ee7705f6fbd4e', 'ext': 'mp4', + 'title': 'CYBERWAR (Trailer)', + 'description': 'Tapping into the geopolitics of hacking 
and surveillance, Ben Makuch travels the world to meet with hackers, government officials, and dissidents to investigate the ecosystem of cyberwarfare.', + 'age_limit': 14, + 'timestamp': 1466008539, + 'upload_date': '20160615', + 'uploader_id': '11', + 'uploader': 'Viceland', }, 'params': { # m3u8 download From 6103f59095bd1e514e43b3f84f4633e27ee09b69 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 19:08:35 +0100 Subject: [PATCH 051/218] [viceland] remove outdated comment --- youtube_dl/extractor/viceland.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index f72294b51..0be8a792f 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -20,7 +20,6 @@ from ..utils import ( class VicelandIE(AdobePass): _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P[a-f0-9]+)' _TEST = { - # FIXME: fill the test after fixing delegation problem 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', 'info_dict': { 'id': '57608447973ee7705f6fbd4e', From e811bcf8f820d92b6629920b7c3c5a902815e6d1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 14 Aug 2016 20:12:53 +0100 Subject: [PATCH 052/218] [viceland] raise ExtractorError for errors other than HTTP 400 --- youtube_dl/extractor/viceland.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 0be8a792f..814a72fa2 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -70,6 +70,7 @@ class VicelandIE(AdobePass): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: error = json.loads(e.cause.read().decode()) raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise video_data = preplay['video'] base = video_data['base'] From 7e60ce9cf7b104c15fcc4c495166dc57b950b987 Mon Sep 17 00:00:00 2001 From: Remita Amine 
Date: Sun, 14 Aug 2016 21:24:33 +0100 Subject: [PATCH 053/218] [adobepass] clear cache in case of pendingLogout errors --- youtube_dl/extractor/adobepass.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 4e59302ab..d315bfbc1 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -90,6 +90,9 @@ class AdobePass(InfoExtractor): '_method': 'GET', 'requestor_id': requestor_id, }), headers=mvpd_headers) + if ' Date: Sun, 14 Aug 2016 21:25:43 +0100 Subject: [PATCH 054/218] [adobepass] fix check for pendingLogout errors --- youtube_dl/extractor/adobepass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index d315bfbc1..cf3a15cbb 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -90,7 +90,7 @@ class AdobePass(InfoExtractor): '_method': 'GET', 'requestor_id': requestor_id, }), headers=mvpd_headers) - if ' Date: Sun, 14 Aug 2016 22:45:43 +0100 Subject: [PATCH 055/218] [uplynk,viceland] update tests and change uplynk extractors names --- youtube_dl/extractor/uplynk.py | 3 +++ youtube_dl/extractor/viceland.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py index 4313bc9cb..ae529f690 100644 --- a/youtube_dl/extractor/uplynk.py +++ b/youtube_dl/extractor/uplynk.py @@ -11,6 +11,7 @@ from ..utils import ( class UplynkIE(InfoExtractor): + IE_NAME = 'uplynk' _VALID_URL = r'https?://.*?\.uplynk\.com/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P[^&]+))?' 
_TEST = { 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', @@ -54,7 +55,9 @@ class UplynkIE(InfoExtractor): class UplynkPreplayIE(UplynkIE): + IE_NAME = 'uplynk:preplay' _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?Pext/[0-9a-f]{32}/(?P[^/?&]+)|(?P[0-9a-f]{32}))\.json' + _TEST = None def _real_extract(self, url): path, external_id, video_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 814a72fa2..da766d8db 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -36,7 +36,7 @@ class VicelandIE(AdobePass): # m3u8 download 'skip_download': True, }, - 'add_ie': ['UplynkPreplay', 'Uplynk'], + 'add_ie': ['UplynkPreplay'], } def _real_extract(self, url): From 1a57b8c18c9bdaf5e231f2178499041446b57a3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Aug 2016 08:25:24 +0700 Subject: [PATCH 056/218] [zippcast] Remove extractor (Closes #10332) ZippCast is shut down --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/zippcast.py | 94 ------------------------------ 2 files changed, 95 deletions(-) delete mode 100644 youtube_dl/extractor/zippcast.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index be96e34ba..15bc0a675 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1109,4 +1109,3 @@ from .zingmp3 import ( ZingMp3SongIE, ZingMp3AlbumIE, ) -from .zippcast import ZippCastIE diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py deleted file mode 100644 index de819376d..000000000 --- a/youtube_dl/extractor/zippcast.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - str_to_int, -) - - -class ZippCastIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P[0-9a-zA-Z]+)' - _TESTS = [{ - # m3u8, hq direct link - 'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81', - 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6', - 'info_dict': { - 'id': 'c9cfd5c7e44dbc29c81', - 'ext': 'mp4', - 'title': '[Vinesauce] Vinny - Digital Space Traveler', - 'description': 'Muted on youtube, but now uploaded in it\'s original form.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'vinesauce', - 'view_count': int, - 'categories': ['Entertainment'], - 'tags': list, - }, - }, { - # f4m, lq ipod direct link - 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775', - 'only_matching': True, - }, { - 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.zippcast.com/video/%s' % video_id, video_id) - - formats = [] - video_url = self._search_regex( - r']+src=(["\'])(?P.+?)\1', webpage, - 'video url', default=None, group='url') - if video_url: - formats.append({ - 'url': video_url, - 'format_id': 'http', - 'preference': 0, # direct link is almost always of worse quality - }) - src_url = self._search_regex( - r'src\s*:\s*(?:escape\()?(["\'])(?Phttp://.+?)\1', - webpage, 'src', default=None, group='url') - ext = determine_ext(src_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage) - uploader = self._search_regex( - r']+href="https?://[^/]+/profile/[^>]+>([^<]+)', - webpage, 'uploader', fatal=False) - 
thumbnail = self._og_search_thumbnail(webpage) - view_count = str_to_int(self._search_regex( - r'>([\d,.]+) views!', webpage, 'view count', fatal=False)) - - categories = re.findall( - r']+href="https?://[^/]+/categories/[^"]+">([^<]+),?<', - webpage) - tags = re.findall( - r']+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<', - webpage) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'view_count': view_count, - 'categories': categories, - 'tags': tags, - 'formats': formats, - } From b6c4e36728e8f60ae7f4910a9b7027a2b702e8dc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Aug 2016 13:29:01 +0800 Subject: [PATCH 057/218] [jwplatform] Parse video_id from JWPlayer data And remove a mysterious comma from 115c65793af4c56c8f1986d2640105fc7e760c13 --- youtube_dl/extractor/jwplatform.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 2a499bb77..ce3126943 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -30,7 +30,7 @@ class JWPlatformBaseIE(InfoExtractor): return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) - def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None): + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None): # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 if 'playlist' not in jwplayer_data: @@ -43,6 +43,8 @@ class JWPlatformBaseIE(InfoExtractor): if 'sources' not in video_data: video_data['sources'] = [video_data] + this_video_id = video_id or video_data['mediaid'] + formats = [] for source in video_data['sources']: source_url = self._proto_relative_url(source['file']) @@ 
-52,7 +54,7 @@ class JWPlatformBaseIE(InfoExtractor): ext = mimetype2ext(source_type) or determine_ext(source_url) if source_type == 'hls' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): formats.append({ @@ -68,7 +70,7 @@ class JWPlatformBaseIE(InfoExtractor): 'ext': ext, } if source_url.startswith('rtmp'): - a_format['ext'] = 'flv', + a_format['ext'] = 'flv' # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as # of jwplayer.flash.swf @@ -95,7 +97,7 @@ class JWPlatformBaseIE(InfoExtractor): }) entries.append({ - 'id': video_id, + 'id': this_video_id, 'title': video_data['title'] if require_title else video_data.get('title'), 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), From 5c2d08722139118d8de27d43d6210e18ab1da9d5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Aug 2016 13:31:08 +0800 Subject: [PATCH 058/218] [sendtonews] Fix extraction --- youtube_dl/extractor/sendtonews.py | 103 +++++++++++++++-------------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 1c636f672..2dbe490bb 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -4,33 +4,43 @@ from __future__ import unicode_literals import re from .jwplatform import JWPlatformBaseIE -from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, - parse_duration, + float_or_none, + parse_iso8601, + update_url_query, ) class SendtoNewsIE(JWPlatformBaseIE): - _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P[^#]+)' + _VALID_URL = 
r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P[0-9A-Za-z-]+)' _TEST = { # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ - 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes', + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'duration': 49, + 'id': 'GxfCe0Zo7D-175909-5588' }, + 'playlist_count': 9, + # test the first video only to prevent lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '198180', + 'ext': 'mp4', + 'title': 'Recap: CLE 5, LAA 4', + 'description': '8/14/16: Naquin, Almonte lead Indians in 5-4 win', + 'duration': 57.343, + 'thumbnail': 're:https?://.*\.jpg$', + 'upload_date': '20160815', + 'timestamp': 1471221961, + }, + }], 'params': { # m3u8 download 'skip_download': True, }, } - _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s' + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @classmethod def _extract_url(cls, webpage): @@ -39,48 +49,41 @@ class SendtoNewsIE(JWPlatformBaseIE): .*\bSC=(?P[0-9a-zA-Z-]+).* \1>''', webpage) if mobj: - sk, mk, pk = mobj.group('SC').split('-') - return cls._URL_TEMPLATE % (sk, mk, pk) + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - params = compat_parse_qs(mobj.group('query')) + playlist_id = self._match_id(url) - if 'SK' not in params or 'MK' not in params or 'PK' not in params: - raise ExtractorError('Invalid URL', expected=True) + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, 
playlist_id) - video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]]) + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, rtmp_params={'no_resume': True}) - webpage = self._download_webpage(url, video_id) + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'], + 'description': video.get('S_fullStory'), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) - jwplayer_data_str = self._search_regex( - r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data') - js_vars = { - 'w': 1024, - 'h': 768, - 'modeVar': 'html5', - } - for name, val in js_vars.items(): - js_val = '%d' % val if isinstance(val, int) else '"%s"' % val - jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val) - - info_dict = self._parse_jwplayer_data( - self._parse_json(jwplayer_data_str, video_id), - video_id, require_title=False, rtmp_params={'no_resume': True}) - - title = self._html_search_regex( - r']+class="embedTitle">([^<]+)
    ', webpage, 'title') - description = self._html_search_regex( - r']+class="embedSubTitle">([^<]+)', webpage, - 'description', fatal=False) - duration = parse_duration(self._html_search_regex( - r']+class="embedDetails">([0-9:]+)', webpage, - 'duration', fatal=False)) - - info_dict.update({ - 'title': title, - 'description': description, - 'duration': duration, - }) - - return info_dict + return self.playlist_result(entries, playlist_id) From 760845ce9965b57484f232a162b9bb4ad3a505a7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Aug 2016 13:37:37 +0800 Subject: [PATCH 059/218] [cbslocal] Adapt to SendtoNewsIE --- youtube_dl/extractor/cbslocal.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 008c5fe32..4bcd104af 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -41,13 +41,8 @@ class CBSLocalIE(AnvatoIE): 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', 'info_dict': { 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'upload_date': '20160516', - 'timestamp': 1463433840, - 'duration': 49, }, + 'playlist_count': 9, 'params': { # m3u8 download 'skip_download': True, @@ -60,12 +55,11 @@ class CBSLocalIE(AnvatoIE): sendtonews_url = SendtoNewsIE._extract_url(webpage) if sendtonews_url: - info_dict = { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, sendtonews_url), - } - else: - info_dict = self._extract_anvato_videos(webpage, display_id) + return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) time_str = self._html_search_regex( r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) 
From 6d8ec8c3b7381c40afd89f9c118ae770997703d0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Aug 2016 13:39:43 +0800 Subject: [PATCH 060/218] [ChangeLog] Update for CBSLocal and related changes --- ChangeLog | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ChangeLog b/ChangeLog index d04c5fc2a..32504dab5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,9 @@ version Extractors +* [cbslocal] Fix extraction for SendtoNews-based videos +* [sendtonews] Fix extraction +* [jwplatform] Now can parse video_id from JWPlayer data * [xiami] Fix extraction (#10342) From 69eb4d699fe3f6d84acc7882e427e661040faecb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Aug 2016 20:29:22 +0800 Subject: [PATCH 061/218] [cbsnews] Remove invalid tests. CBS Live videos gets deleted soon. --- youtube_dl/extractor/cbsnews.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 9328e3e20..9d3b75526 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -70,7 +70,8 @@ class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P[\da-z_-]+)' - _TESTS = [{ + # Live videos get deleted soon. 
See http://www.cbsnews.com/live/ for the latest examples + _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { 'id': 'clinton-sanders-prepare-to-face-off-in-nh', @@ -78,15 +79,8 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'title': 'Clinton, Sanders Prepare To Face Off In NH', 'duration': 334, }, - 'skip': 'Video gone, redirected to http://www.cbsnews.com/live/', - }, { - 'url': 'http://www.cbsnews.com/live/video/video-shows-intense-paragliding-accident/', - 'info_dict': { - 'id': 'video-shows-intense-paragliding-accident', - 'ext': 'flv', - 'title': 'Video Shows Intense Paragliding Accident', - }, - }] + 'skip': 'Video gone', + } def _real_extract(self, url): video_id = self._match_id(url) From bf90c46790bac92e8a61ee0514cf3c41a8c048e9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Aug 2016 16:33:35 +0100 Subject: [PATCH 062/218] [fxnetworks] Add new extractor(closes #9462) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/fxnetworks.py | 49 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 youtube_dl/extractor/fxnetworks.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 15bc0a675..07928c530 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,6 +287,7 @@ from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .fusion import FusionIE +from .fxnetworks import FXNetworksIE from .gameinformer import GameInformerIE from .gameone import ( GameOneIE, diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py new file mode 100644 index 000000000..70bc186a3 --- /dev/null +++ b/youtube_dl/extractor/fxnetworks.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePass +from ..utils import ( + update_url_query, + extract_attributes, + 
parse_age_limit, + smuggle_url, +) + + +class FXNetworksIE(AdobePass): + _VALID_URL = r'https?://(?:www\.)?fxnetworks\.com/video/(?P\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = extract_attributes(self._search_regex( + r'()', webpage, 'video data')) + player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', fatal=False) + release_url = video_data['rel'] + title = video_data['data-title'] + rating = video_data.get('data-rating') + query = { + 'mbr': 'true', + } + if player_type == 'movies': + query.update({ + 'manifest': 'm3u', + }) + else: + query.update({ + 'switch': 'http', + }) + if video_data.get('data-req-auth') == '1': + resource = self._get_mvpd_resource( + video_data['data-channel'], title, + video_data.get('data-guid'), rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'thumbnail': video_data.get('data-large-thumb'), + 'age_limit': parse_age_limit(rating), + 'ie_key': 'ThePlatform', + } From cbef4d5c9ff5013d0c10b960e1690805724120cd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Aug 2016 17:10:45 +0100 Subject: [PATCH 063/218] [fxnetworks] add test and check geo restriction --- youtube_dl/extractor/fxnetworks.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 70bc186a3..940e7427c 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -12,10 +12,27 @@ from ..utils import ( class FXNetworksIE(AdobePass): _VALID_URL = r'https?://(?:www\.)?fxnetworks\.com/video/(?P\d+)' + _TEST = { + 'url': 'http://www.fxnetworks.com/video/719841347694', + 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'info_dict': { + 'id': 
'719841347694', + 'ext': 'mp4', + 'title': 'Vanpage', + 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'age_limit': 14, + 'uploader': 'NEWA-FNG-FX', + 'upload_date': '20160706', + 'timestamp': 1467844741, + }, + 'add_ie': ['ThePlatform'], + } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if 'The content you are trying to access is not available in your region.' in webpage: + self.raise_geo_restricted() video_data = extract_attributes(self._search_regex( r'()', webpage, 'video data')) player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', fatal=False) @@ -42,6 +59,7 @@ class FXNetworksIE(AdobePass): return { '_type': 'url_transparent', 'id': video_id, + 'title': title, 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), 'thumbnail': video_data.get('data-large-thumb'), 'age_limit': parse_age_limit(rating), From 818ac213eb80e18f472ecdf2406569bafd4cccaf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 15 Aug 2016 21:36:34 +0100 Subject: [PATCH 064/218] [adobepass] add IE suffix to the extractor and remove duplicate constant --- youtube_dl/extractor/adobepass.py | 2 +- youtube_dl/extractor/fxnetworks.py | 4 ++-- youtube_dl/extractor/nationalgeographic.py | 6 +++--- youtube_dl/extractor/syfy.py | 4 ++-- youtube_dl/extractor/theplatform.py | 5 ++--- youtube_dl/extractor/viceland.py | 4 ++-- 6 files changed, 12 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index cf3a15cbb..2c9f8817b 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -13,7 +13,7 @@ from ..utils import ( ) -class AdobePass(InfoExtractor): +class AdobePassIE(InfoExtractor): _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) 
Gecko/20100101 Firefox/47.0' diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 940e7427c..3ec3b0b46 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePass +from .adobepass import AdobePassIE from ..utils import ( update_url_query, extract_attributes, @@ -10,7 +10,7 @@ from ..utils import ( ) -class FXNetworksIE(AdobePass): +class FXNetworksIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?fxnetworks\.com/video/(?P\d+)' _TEST = { 'url': 'http://www.fxnetworks.com/video/719841347694', diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 890e8d5bc..1dcf27afe 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .theplatform import ThePlatformIE +from .adobepass import AdobePassIE from ..utils import ( smuggle_url, url_basename, @@ -65,7 +65,7 @@ class NationalGeographicVideoIE(InfoExtractor): } -class NationalGeographicIE(ThePlatformIE): +class NationalGeographicIE(AdobePassIE): IE_NAME = 'natgeo' _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P[^/?]+)' @@ -131,7 +131,7 @@ class NationalGeographicIE(ThePlatformIE): } -class NationalGeographicEpisodeGuideIE(ThePlatformIE): +class NationalGeographicEpisodeGuideIE(InfoExtractor): IE_NAME = 'natgeo:episodeguide' _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P[^/]+)/episode-guide' _TESTS = [ diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index 764287a64..cc81f6003 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -1,13 +1,13 @@ from __future__ import unicode_literals -from .theplatform import ThePlatformIE +from 
.adobepass import AdobePassIE from ..utils import ( update_url_query, smuggle_url, ) -class SyfyIE(ThePlatformIE): +class SyfyIE(AdobePassIE): _VALID_URL = r'https?://www\.syfy\.com/(?:[^/]+/)?videos/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 108ddd3a9..eda899497 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -9,7 +9,7 @@ import hashlib from .once import OnceIE -from .adobepass import AdobePass +from .adobepass import AdobePassIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -93,7 +93,7 @@ class ThePlatformBaseIE(OnceIE): return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE, AdobePass): +class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? 
@@ -164,7 +164,6 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePass): 'url': 'http://player.theplatform.com/p/NnzsPC/onsite_universal/select/media/guid/2410887629/2928790?fwsitesection=nbc_the_blacklist_video_library&autoPlay=true&carouselID=137781', 'only_matching': True, }] - _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' @classmethod def _extract_urls(cls, webpage): diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index da766d8db..8742b607a 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -5,7 +5,7 @@ import time import hashlib import json -from .adobepass import AdobePass +from .adobepass import AdobePassIE from ..compat import compat_HTTPError from ..utils import ( int_or_none, @@ -17,7 +17,7 @@ from ..utils import ( ) -class VicelandIE(AdobePass): +class VicelandIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P[a-f0-9]+)' _TEST = { 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', From 254e64a20aa37a033cb200bc6f1aa9daf57eead8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Aug 2016 04:36:23 +0700 Subject: [PATCH 065/218] [bbc:playlist] Add support for pagination (Closes #10349) --- youtube_dl/extractor/bbc.py | 48 ++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 83e6d024c..16a97a76d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..utils import ( @@ -17,6 +18,7 @@ from ..utils import ( from ..compat import ( compat_etree_fromstring, compat_HTTPError, + compat_urlparse, ) @@ -1056,19 +1058,35 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, 
webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r']+class=(["\'])pagination_+next\1[^>]*>]+href=(["\'])(?P(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] - title, description = self._extract_title_and_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): @@ -1094,6 +1112,24 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, + }, { + # explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 24, + }, { + # all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 142, }] def _extract_title_and_description(self, webpage): From 
4f640f28901be8a3ce57e77ead404d751e36d208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Aug 2016 04:43:10 +0700 Subject: [PATCH 066/218] [bbc:playlist] Fix tests --- youtube_dl/extractor/bbc.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 16a97a76d..deb9cc1c0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1112,24 +1112,6 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', }, 'playlist_mincount': 10, - }, { - # explicit page - 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', - 'info_dict': { - 'id': 'b00mfl7n', - 'title': 'Bohemian Icons', - 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', - }, - 'playlist_mincount': 24, - }, { - # all pages - 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', - 'info_dict': { - 'id': 'b00mfl7n', - 'title': 'Bohemian Icons', - 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', - }, - 'playlist_mincount': 142, }] def _extract_title_and_description(self, webpage): @@ -1153,6 +1135,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 7, + }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, }, { 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', 'only_matching': 
True, From fb64adcbd37a660da92687878831d08e82ae748c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Aug 2016 04:45:21 +0700 Subject: [PATCH 067/218] [adobepass] PEP 8 --- youtube_dl/extractor/adobepass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 2c9f8817b..9e3a3e362 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -131,4 +131,4 @@ class AdobePassIE(InfoExtractor): if ' Date: Tue, 16 Aug 2016 13:43:33 +0100 Subject: [PATCH 068/218] [amcnetworks] Add new extractor --- youtube_dl/extractor/amcnetworks.py | 72 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 73 insertions(+) create mode 100644 youtube_dl/extractor/amcnetworks.py diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py new file mode 100644 index 000000000..be9552541 --- /dev/null +++ b/youtube_dl/extractor/amcnetworks.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + update_url_query, + parse_age_limit, +) + + +class AMCNetworksIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies/|shows/[^/]+/(?:full-episodes/)?season-\d+/episode-\d+(?:-(?:[^/]+/)?|/))(?P[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', + 'md5': '', + 'info_dict': { + 'id': 's3MX01Nl4vPH', + 'ext': 'mp4', + 'title': 'Step 1', + 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. 
Starring Marc Maron and Constance Zimmer.', + 'age_limit': 17, + 'upload_date': '20160505', + 'timestamp': 1462468831, + 'uploader': 'AMCN', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', + 'only_matching': True, + }, { + 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', + 'only_matching': True, + }, { + 'url': 'http://www.ifc.com/movies/chaos', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + media_url = self._search_regex(r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', webpage, 'media url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), display_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + video_id = theplatform_metadata['pid'] + title = theplatform_metadata['title'] + rating = theplatform_metadata['ratings'][0]['rating'] + auth_required = self._search_regex(r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') + if auth_required == 'true': + requestor_id = self._search_regex(r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', webpage, 'requestor id') + resource = self._get_mvpd_resource(requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, requestor_id, resource) + media_url = update_url_query(media_url, query) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'subtiles': subtitles, + 'formats': formats, + 'age_limit': 
parse_age_limit(parse_age_limit(rating)), + }) + return info diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 07928c530..a5e0805b2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -29,6 +29,7 @@ from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE From 837e56c8eefa725ca72feca9431050cdda571c57 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 16 Aug 2016 14:49:32 +0100 Subject: [PATCH 069/218] [amcnetworks] extract episode metadata --- youtube_dl/extractor/amcnetworks.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index be9552541..26f46acb5 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -5,6 +5,7 @@ from .theplatform import ThePlatformIE from ..utils import ( update_url_query, parse_age_limit, + int_or_none, ) @@ -16,7 +17,7 @@ class AMCNetworksIE(ThePlatformIE): 'info_dict': { 'id': 's3MX01Nl4vPH', 'ext': 'mp4', - 'title': 'Step 1', + 'title': 'Maron - Season 4 - Step 1', 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. 
Starring Marc Maron and Constance Zimmer.', 'age_limit': 17, 'upload_date': '20160505', @@ -69,4 +70,22 @@ class AMCNetworksIE(ThePlatformIE): 'formats': formats, 'age_limit': parse_age_limit(parse_age_limit(rating)), }) + ns_keys = theplatform_metadata.get('$xmlns', {}).keys() + if ns_keys: + ns = list(ns_keys)[0] + series = theplatform_metadata.get(ns + '$show') + season_number = int_or_none(theplatform_metadata.get(ns + '$season')) + episode = theplatform_metadata.get(ns + '$episodeTitle') + episode_number = int_or_none(theplatform_metadata.get(ns + '$episode')) + if season_number: + title = 'Season %d - %s' % (season_number, title) + if series: + title = '%s - %s' % (series, title) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) return info From 70a2829fee4203ebeb399481304d289ff92adf29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Aug 2016 21:17:52 +0700 Subject: [PATCH 070/218] [xvideos] Fix HLS extraction (Closes #10356) --- youtube_dl/extractor/xvideos.py | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 1dfe031ca..b2ef15119 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -42,24 +42,24 @@ class XVideosIE(InfoExtractor): video_url = compat_urllib_parse_unquote(self._search_regex( r'flv_url=(.+?)&', webpage, 'video URL', default='')) if video_url: - formats.append({'url': video_url}) + formats.append({ + 'url': video_url, + 'format_id': 'flv', + }) - player_args = self._search_regex( - r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None) - if player_args: - for arg in player_args.split(','): - format_url = self._search_regex( - r'(["\'])(?Phttps?://.+?)\1', arg, 'url', - default=None, group='url') - if not format_url: - continue - ext = 
determine_ext(format_url) - if ext == 'mp4': - formats.append({'url': format_url}) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for kind, _, format_url in re.findall( + r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage): + format_id = kind.lower() + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif format_id in ('urllow', 'urlhigh'): + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]), + 'quality': -2 if format_id.endswith('low') else None, + }) self._sort_formats(formats) From 98affc1a482ab41466c76cfded41949c4db58f67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Aug 2016 21:20:15 +0700 Subject: [PATCH 071/218] [xvideos] Fix test --- youtube_dl/extractor/xvideos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index b2ef15119..30825daae 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -15,10 +15,10 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P[0-9]+)(?:.*)' _TEST = { 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', - 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', + 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { 'id': '4588838', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Biker Takes his Girl', 'age_limit': 18, } From 11f502fac145b4592f47c025ee8317fe44020db0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 16 Aug 2016 16:19:36 +0100 Subject: [PATCH 072/218] [theplatform] extract subtitles with multiple formats from the metadata --- youtube_dl/extractor/theplatform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index eda899497..23067e8c6 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -73,10 +73,10 @@ class ThePlatformBaseIE(OnceIE): if isinstance(captions, list): for caption in captions: lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles[lang] = [{ + subtitles.setdefault(lang, []).append({ 'ext': mimetype2ext(mime), 'url': src, - }] + }) return { 'title': info['title'], From 2cabee2a7d4c94aa2f4f2e84a3c68eb97cdf9cce Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 16 Aug 2016 16:20:07 +0100 Subject: [PATCH 073/218] [amcnetworks] fix typo --- youtube_dl/extractor/amcnetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 26f46acb5..c739d2c99 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -66,7 +66,7 @@ class AMCNetworksIE(ThePlatformIE): self._sort_formats(formats) info.update({ 'id': video_id, - 'subtiles': subtitles, + 'subtitles': subtitles, 'formats': formats, 'age_limit': parse_age_limit(parse_age_limit(rating)), }) From 53fef319f14896ce497d309f661ceb586d7b4d90 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 16 Aug 2016 16:21:04 +0100 Subject: [PATCH 074/218] [fxnetworks] extend _VALID_URL to support simpsonsworld.com --- youtube_dl/extractor/fxnetworks.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 3ec3b0b46..629897317 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -11,8 +11,8 @@ from ..utils import ( class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fxnetworks\.com/video/(?P\d+)' - _TEST = { + _VALID_URL = 
r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P\d+)' + _TESTS = [{ 'url': 'http://www.fxnetworks.com/video/719841347694', 'md5': '1447d4722e42ebca19e5232ab93abb22', 'info_dict': { @@ -26,7 +26,10 @@ class FXNetworksIE(AdobePassIE): 'timestamp': 1467844741, }, 'add_ie': ['ThePlatform'], - } + }, { + 'url': 'http://www.simpsonsworld.com/video/716094019682', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +38,7 @@ class FXNetworksIE(AdobePassIE): self.raise_geo_restricted() video_data = extract_attributes(self._search_regex( r'()', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', fatal=False) + player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) release_url = video_data['rel'] title = video_data['data-title'] rating = video_data.get('data-rating') From 6e7e4a6edf6c4ffd56d908ade7f0bfe2bff738b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 00:19:43 +0700 Subject: [PATCH 075/218] [mtg] Add support for viafree URLs (#10358) --- youtube_dl/extractor/tvplay.py | 41 ++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 150bde663..d82bf67b4 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -20,16 +20,25 @@ from ..utils import ( class TVPlayIE(InfoExtractor): - IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)https?://(?:www\.)? 
- (?:tvplay(?:\.skaties)?\.lv/parraides| - (?:tv3play|play\.tv3)\.lt/programos| - tv3play(?:\.tv3)?\.ee/sisu| - tv(?:3|6|8|10)play\.se/program| - (?:(?:tv3play|viasat4play|tv6play)\.no|tv3play\.dk)/programmer| - play\.novatv\.bg/programi - )/[^/]+/(?P\d+) - ''' + IE_NAME = 'mtg' + IE_DESC = 'MTG services' + _VALID_URL = r'''(?x) + (?: + mtg:| + https?:// + (?:www\.)? + (?: + tvplay(?:\.skaties)?\.lv/parraides| + (?:tv3play|play\.tv3)\.lt/programos| + tv3play(?:\.tv3)?\.ee/sisu| + (?:tv(?:3|6|8|10)play|viafree)\.se/program| + (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| + play\.novatv\.bg/programi + ) + /(?:[^/]+/)+ + ) + (?P\d+) + ''' _TESTS = [ { 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', @@ -197,6 +206,14 @@ class TVPlayIE(InfoExtractor): { 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', 'only_matching': True, + }, + { + 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', + 'only_matching': True, + }, + { + 'url': 'mtg:418113', + 'only_matching': True, } ] @@ -204,13 +221,13 @@ class TVPlayIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON') + 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') title = video['title'] try: streams = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, + 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: From b35b0d73d853c52ca96ccf4488a4f8960a12e2ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 00:21:30 +0700 Subject: [PATCH 076/218] [viafree] Add extractor (Closes #10358) --- youtube_dl/extractor/extractors.py | 5 ++- 
youtube_dl/extractor/tvplay.py | 53 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a5e0805b2..55c639158 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -898,7 +898,10 @@ from .tvp import ( TVPIE, TVPSeriesIE, ) -from .tvplay import TVPlayIE +from .tvplay import ( + TVPlayIE, + ViafreeIE, +) from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index d82bf67b4..c8ec2465c 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -311,3 +311,56 @@ class TVPlayIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class ViafreeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + viafree\. + (?: + (?:dk|no)/programmer| + se/program + ) + /(?:[^/]+/)+(?P[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'info_dict': { + 'id': '395375', + 'ext': 'mp4', + 'title': 'Husräddarna S02E02', + 'description': 'md5:4db5c933e37db629b5a2f75dfb34829e', + 'series': 'Husräddarna', + 'season': 'Säsong 2', + 'season_number': 2, + 'duration': 2576, + 'timestamp': 1400596321, + 'upload_date': '20140520', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, 
video_id) + + video_id = self._search_regex( + r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](?P\d{6,})', + webpage, 'video id') + + return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) From 502d87c5464f1894a8777873b9d11b76ba5a6375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 00:32:28 +0700 Subject: [PATCH 077/218] [mtg] Improve view count extraction --- youtube_dl/extractor/tvplay.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index c8ec2465c..4186e82db 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -15,6 +15,7 @@ from ..utils import ( int_or_none, parse_iso8601, qualities, + try_get, update_url_query, ) @@ -203,6 +204,11 @@ class TVPlayIE(InfoExtractor): 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', 'only_matching': True, }, + { + # views is null + 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', + 'only_matching': True, + }, { 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', 'only_matching': True, @@ -306,7 +312,7 @@ class TVPlayIE(InfoExtractor): 'season_number': season_number, 'duration': int_or_none(video.get('duration')), 'timestamp': parse_iso8601(video.get('created_at')), - 'view_count': int_or_none(video.get('views', {}).get('total')), + 'view_count': try_get(video, lambda x: x['views']['total'], int), 'age_limit': int_or_none(video.get('age_limit', 0)), 'formats': formats, 'subtitles': subtitles, From 9c0fa60bf375959c7d8582f655b441c534865c03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 00:42:02 +0700 Subject: [PATCH 078/218] [vbox7] Add support for embed URLs --- youtube_dl/extractor/vbox7.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 
fa7899e6d..8e6d7efe7 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -6,7 +6,7 @@ from ..utils import urlencode_postdata class Vbox7IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/(?:play:|emb/external\.php\?.*?\bvid=)(?P[\da-fA-F]+)' _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', @@ -24,15 +24,19 @@ class Vbox7IE(InfoExtractor): 'title': 'Смях! Чудо - чист за секунди - Скрита камера', }, 'skip': 'georestricted', + }, { + 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://vbox7.com/play:%s' % video_id, video_id) title = self._html_search_regex( - r'(.*)', webpage, 'title').split('/')[0].strip() + r'(.+?)', webpage, 'title').split('/')[0].strip() video_url = self._search_regex( r'src\s*:\s*(["\'])(?P.+?.mp4.*?)\1', From 2a1321a272c7b410db25654cdfdc33c3cd8bd440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 01:02:59 +0700 Subject: [PATCH 079/218] [vbox7:generic] Add support for vbox7 embeds --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ youtube_dl/extractor/vbox7.py | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 50500ce0e..197ab9531 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -72,6 +72,7 @@ from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE from .soundcloud import SoundcloudIE +from .vbox7 import Vbox7IE class GenericIE(InfoExtractor): @@ -1373,6 +1374,18 @@ class GenericIE(InfoExtractor): }, 'add_ie': [ArkenaIE.ie_key()], }, + { + 'url': 
'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', + 'info_dict': { + 'id': '1c7141f46c', + 'ext': 'mp4', + 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [Vbox7IE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2239,6 +2252,11 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # Look for VBOX7 embeds + vbox7_url = Vbox7IE._extract_url(webpage) + if vbox7_url: + return self.url_result(vbox7_url, Vbox7IE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index 8e6d7efe7..e17988573 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -1,6 +1,8 @@ # encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import urlencode_postdata @@ -29,6 +31,14 @@ class Vbox7IE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + ']+src=(?P["\'])(?P(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', + webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): video_id = self._match_id(url) From 8652770bd23ff5f46c5687d94f71cec08d2c5886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 05:44:46 +0700 Subject: [PATCH 080/218] [keezmovies] Improve and modernize --- youtube_dl/extractor/keezmovies.py | 138 
+++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 126ca13df..ad2f8a8c8 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -3,64 +3,124 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..aes import aes_decrypt_text +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( - sanitized_Request, - url_basename, + determine_ext, + ExtractorError, + int_or_none, + str_to_int, + strip_or_none, ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P[0-9]+)(?:[/?&]|$)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', 'info_dict': { 'id': '1214711', + 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', 'ext': 'mp4', 'title': 'Petite Asian Lady Mai Playing In Bathtub', - 'age_limit': 18, 'thumbnail': 're:^https?://.*\.jpg$', + 'view_count': int, + 'age_limit': 18, } - } + }, { + 'url': 'http://www.keezmovies.com/video/1214711', + 'only_matching': True, + }] - def _real_extract(self, url): - video_id = self._match_id(url) + def _extract_info(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - # embedded video - mobj = re.search(r'href="([^"]+)">', webpage) - if mobj: - embedded_url = mobj.group(1) - return self.url_result(embedded_url) - - video_title = self._html_search_regex( - r'

    ]*>([^<]+)', webpage, 'title') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id) + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) formats = [] - for height in (180, 240, 480): - if flashvars.get('quality_%dp' % height): - video_url = flashvars['quality_%dp' % height] - a_format = { - 'url': video_url, - 'height': height, - 'format_id': '%dp' % height, - } - filename_parts = url_basename(video_url).split('_') - if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]): - a_format['tbr'] = int(filename_parts[1][:-1]) - formats.append(a_format) + format_urls = set() - age_limit = self._rta_search(webpage) + title = None + thumbnail = None + duration = None + encrypted = False - return { + def extract_format(format_url, height=None): + if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + format_url = aes_decrypt_text( + video_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = 
flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?Phttp.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(compat_urllib_parse_unquote(video_url)) + + if not formats: + if 'title="This video is no longer available"' in webpage: + raise ExtractorError( + 'Video %s is no longer available' % video_id, expected=True) + + self._sort_formats(formats) + + if not title: + title = self._html_search_regex( + r']*>([^<]+)', webpage, 'title') + + return webpage, { 'id': video_id, - 'title': video_title, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, 'formats': formats, - 'age_limit': age_limit, - 'thumbnail': flashvars.get('image_url') } + + def _real_extract(self, url): + webpage, info = self._extract_info(url) + info['view_count'] = str_to_int(self._search_regex( + r'([\d,.]+) Views?', webpage, 'view count', fatal=False)) + return info From 6be17c08703ad8ec89c6fb62f31f280956694cee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 05:45:49 +0700 Subject: [PATCH 081/218] [mofosex] Extract all formats and modernize (Closes #10335) --- youtube_dl/extractor/mofosex.py | 81 +++++++++++++++++---------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index e47c80119..e3bbe5aa8 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,53 +1,56 @@ from __future__ import unicode_literals -import os -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, +from ..utils import ( + int_or_none, + str_to_int, + unified_strdate, ) -from ..utils import sanitized_Request +from .keezmovies import KeezMoviesIE 
-class MofosexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pmofosex\.com/videos/(?P[0-9]+)/.*?\.html)' - _TEST = { - 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', - 'md5': '1b2eb47ac33cc75d4a80e3026b613c5a', +class MofosexIE(KeezMoviesIE): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P\d+)/(?P[^/?#&.]+)\.html' + _TESTS = [{ + 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', + 'md5': '39a15853632b7b2e5679f92f69b78e91', 'info_dict': { - 'id': '5018', + 'id': '318131', + 'display_id': 'amateur-teen-playing-and-masturbating-318131', 'ext': 'mp4', - 'title': 'Japanese Teen Music Video', + 'title': 'amateur teen playing and masturbating', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20121114', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, 'age_limit': 18, } - } + }, { + # This video is no longer available + 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' + mobj.group('url') + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + view_count = str_to_int(self._search_regex( + r'VIEWS:\s*([\d,.]+)', webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + upload_date = unified_strdate(self._html_search_regex( + r'Added:([^<]+)', webpage, 'upload date', fatal=False)) - video_title = self._html_search_regex(r'

    (.+?)<', webpage, 'title') - video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url')) - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) + info.update({ + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'upload_date': upload_date, + 'thumbnail': self._og_search_thumbnail(webpage), + }) - age_limit = self._rta_search(webpage) - - return { - 'id': video_id, - 'title': video_title, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'age_limit': age_limit, - } + return info From 8804f10e6b580db38df7301a174cb48ea374f9eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 05:46:45 +0700 Subject: [PATCH 082/218] [tube8] Modernize --- youtube_dl/extractor/tube8.py | 60 ++++++----------------------------- 1 file changed, 10 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1d9271d1e..4053f6c21 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -1,18 +1,13 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, - sanitized_Request, str_to_int, ) -from ..aes import aes_decrypt_text +from .keezmovies import KeezMoviesIE -class Tube8IE(InfoExtractor): +class Tube8IE(KeezMoviesIE): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P[^/]+)/(?P\d+)' _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', @@ -33,47 +28,17 @@ class Tube8IE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - 
req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, display_id) + if not info['title']: + info['title'] = self._html_search_regex( + r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') - flashvars = self._parse_json( - self._search_regex( - r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'), - video_id) - - formats = [] - for key, video_url in flashvars.items(): - if not isinstance(video_url, compat_str) or not video_url.startswith('http'): - continue - height = self._search_regex( - r'quality_(\d+)[pP]', key, 'height', default=None) - if not height: - continue - if flashvars.get('encrypted') is True: - video_url = aes_decrypt_text( - video_url, flashvars['video_title'], 32).decode('utf-8') - formats.append({ - 'url': video_url, - 'format_id': '%sp' % height, - 'height': int(height), - }) - self._sort_formats(formats) - - thumbnail = flashvars.get('image_url') - - title = self._html_search_regex( - r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( r'>Description:\s*(.+?)\s*<', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'\s*(.+?)\s*<', webpage, 'uploader', fatal=False) - duration = int_or_none(flashvars.get('video_duration')) like_count = int_or_none(self._search_regex( r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) @@ -86,18 +51,13 @@ class Tube8IE(InfoExtractor): r'(\d+)', webpage, 'comment count', fatal=False)) - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, + info.update({ 'description': description, - 'thumbnail': thumbnail, 'uploader': uploader, - 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } + }) + + return info From ab19b46b88bb54971b973176976d8d189222a6d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 06:02:12 +0700 Subject: [PATCH 083/218] 
[extremetube] Modernize --- youtube_dl/extractor/extremetube.py | 73 +++++++---------------------- 1 file changed, 16 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3403581fd..b4fd9334a 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,22 +1,17 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, - str_to_int, -) +from ..utils import str_to_int +from .keezmovies import KeezMoviesIE -class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P[^/#?&]+)' +class ExtremeTubeIE(KeezMoviesIE): + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?:(?P[^/]+)-)(?P\d+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', + 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', 'info_dict': { - 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', + 'id': '652431', + 'display_id': 'music-video-14-british-euro-brit-european-cumshots-swallow', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', @@ -35,58 +30,22 @@ class ExtremeTubeIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + if not info['title']: + info['title'] = self._search_regex( + r']+title="([^"]+)"[^>]*>', webpage, 'title') - video_title = self._html_search_regex( - r'

    ]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( r'Uploaded by:\s*\s*(.+?)\s*', webpage, 'uploader', fatal=False) - view_count = str_to_int(self._html_search_regex( + view_count = str_to_int(self._search_regex( r'Views:\s*\s*([\d,\.]+)', webpage, 'view count', fatal=False)) - flash_vars = self._parse_json( - self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), - video_id) - - formats = [] - for quality_key, video_url in flash_vars.items(): - height = int_or_none(self._search_regex( - r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) - if not height: - continue - f = { - 'url': video_url, - } - mobj = re.search( - r'/(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) - if mobj: - height = int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'height': height, - 'tbr': bitrate, - }) - else: - f.update({ - 'format_id': '%dp' % height, - 'height': height, - }) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_title, - 'formats': formats, + info.update({ 'uploader': uploader, 'view_count': view_count, - 'age_limit': 18, - } + }) + + return info From a44694ab4e1ee6ac496ea09c3759923c03b9430c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 06:19:22 +0700 Subject: [PATCH 084/218] [ChangeLog] Actualize --- ChangeLog | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 32504dab5..8f27019c1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,28 @@ version +Core ++ Add _get_netrc_login_info + Extractors +* [mofosex] Extract all formats (#10335) ++ [generic] Add support for vbox7 embeds ++ [vbox7] Add support for embed URLs ++ [viafree] Add extractor (#10358) ++ [mtg] Add support for viafree URLs (#10358) +* [theplatform] Extract all subtitles per language ++ [xvideos] Fix HLS extraction (#10356) 
++ [amcnetworks] Add extractor ++ [bbc:playlist] Add support for pagination (#10349) ++ [fxnetworks] Add extractor (#9462) * [cbslocal] Fix extraction for SendtoNews-based videos * [sendtonews] Fix extraction -* [jwplatform] Now can parse video_id from JWPlayer data +* [jwplatform] Extract video id from JWPlayer data +- [zippcast] Remove extractor (#10332) ++ [viceland] Add extractor (#8799) ++ [adobepass] Add base extractor for Adobe Pass Authentication +* [life:embed] Improve extraction +* [vgtv] Detect geo restricted videos (#10348) ++ [uplynk] Add extractor * [xiami] Fix extraction (#10342) From b3d7dce42952cf23b8f9ea883c75736dadfee12e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 06:21:21 +0700 Subject: [PATCH 085/218] release 2016.08.17 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 9 +++++++-- youtube_dl/version.py | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1e0d99b43..ae28d83d5 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.13*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.13** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.17** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.13 +[debug] youtube-dl version 2016.08.17 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 8f27019c1..354306a97 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.08.17 Core + Add _get_netrc_login_info diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 56fc41a40..189b9301d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -35,6 +35,7 @@ - **AlJazeera** - **Allocine** - **AlphaPorno** + - **AMCNetworks** - **AnimeOnDemand** - **anitube.se** - **AnySex** @@ -247,6 +248,7 @@ - **Funimation** - **FunnyOrDie** - **Fusion** + - **FXNetworks** - **GameInformer** - **GameOne** - **gameone:playlist** @@ -398,6 +400,7 @@ - **Moviezine** - **MPORA** - **MSN** + - **mtg**: MTG services - **MTV** - **mtv.de** - **mtvservices:embedded** @@ -731,7 +734,6 @@ - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** - - **TVPlay**: TV3Play and related services - **Tweakers** - **twitch:chapter** - **twitch:clips** @@ -748,6 +750,8 @@ - **UDNEmbed**: 聯合影音 - **Unistra** - **uol.com.br** + - **uplynk** + - **uplynk:preplay** - **Urort**: NRK P3 Urørt - **URPlay** - **USAToday** @@ -765,7 +769,9 @@ - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, 
Aftenposten and Aftonbladet - **vh1.com** + - **Viafree** - **Vice** + - **Viceland** - **ViceShow** - **Vidbit** - **Viddler** @@ -887,4 +893,3 @@ - **ZDFChannel** - **zingmp3:album**: mp3.zing.vn albums - **zingmp3:song**: mp3.zing.vn songs - - **ZippCast** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cc93d22aa..cf5950117 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.13' +__version__ = '2016.08.17' From 92cd9fd56574f22087a8f8df52192df1d4c11a21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 07:01:32 +0700 Subject: [PATCH 086/218] [keezmovies] Make display_id optional --- youtube_dl/extractor/keezmovies.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index ad2f8a8c8..b002c0dd1 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -39,7 +39,8 @@ class KeezMoviesIE(InfoExtractor): def _extract_info(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + display_id = (mobj.group('display_id') if 'display_id' + in mobj.groupdict() else None) or mobj.group('id') webpage = self._download_webpage( url, display_id, headers={'Cookie': 'age_verified=1'}) From b505e98784b2c1cc07f734e9709702ee9d01287e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Aug 2016 07:02:13 +0700 Subject: [PATCH 087/218] [extremetube] Revert display_id --- youtube_dl/extractor/extremetube.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index b4fd9334a..445f9438d 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -5,13 +5,12 @@ from .keezmovies import KeezMoviesIE class 
ExtremeTubeIE(KeezMoviesIE): - _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?:(?P[^/]+)-)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', 'info_dict': { - 'id': '652431', - 'display_id': 'music-video-14-british-euro-brit-european-cumshots-swallow', + 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', From 7273e5849b27cb7d0f4d5f40e7801cab2da85ae3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 17 Aug 2016 11:03:09 +0100 Subject: [PATCH 088/218] [discoverygo] extend _VALID_URL to support other networks --- youtube_dl/extractor/discoverygo.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index adb68b96c..cba709935 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -11,7 +11,17 @@ from ..utils import ( class DiscoveryGoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoverygo\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'''(?x)https?://(?:www\.)?(?: + discovery| + investigationdiscovery| + discoverylife| + animalplanet| + ahctv| + destinationamerica| + sciencechannel| + tlc| + velocitychannel + )go\.com/(?:[^/]+/)*(?P[^/?#&]+)''' _TEST = { 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', 'info_dict': { From 4e9fee101508fe90c5b103738d1b6458e40affd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 04:37:14 +0700 Subject: [PATCH 089/218] [hgtvcom:show] Add extractor (Closes #10365) --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/hgtv.py | 31 
++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 55c639158..e61bb11c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -324,7 +324,10 @@ from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE -from .hgtv import HGTVIE +from .hgtv import ( + HGTVIE, + HGTVComShowIE, +) from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py index c3f0733cf..69543bff2 100644 --- a/youtube_dl/extractor/hgtv.py +++ b/youtube_dl/extractor/hgtv.py @@ -46,3 +46,34 @@ class HGTVIE(InfoExtractor): 'episode_number': int_or_none(embed_vars.get('episode')), 'ie_key': 'ThePlatform', } + + +class HGTVComShowIE(InfoExtractor): + IE_NAME = 'hgtv.com:show' + _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos', + 'info_dict': { + 'id': 'flip-or-flop-full-episodes-videos', + 'title': 'Flip or Flop Full Episodes', + }, + 'playlist_mincount': 15, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + r'(?s)data-module=["\']video["\'][^>]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?) 
Date: Thu, 18 Aug 2016 04:39:31 +0700 Subject: [PATCH 090/218] [keezmovies] PEP 8 --- youtube_dl/extractor/keezmovies.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index b002c0dd1..588a4d0ec 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -39,8 +39,9 @@ class KeezMoviesIE(InfoExtractor): def _extract_info(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = (mobj.group('display_id') if 'display_id' - in mobj.groupdict() else None) or mobj.group('id') + display_id = (mobj.group('display_id') + if 'display_id' in mobj.groupdict() + else None) or mobj.group('id') webpage = self._download_webpage( url, display_id, headers={'Cookie': 'age_verified=1'}) From 08a42f9c741aa37a599e6fe54ec8b9660df117e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 05:22:23 +0700 Subject: [PATCH 091/218] [vk] Fix authentication on python3 --- youtube_dl/extractor/vk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 3ee66e23e..634d17d91 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -52,8 +52,9 @@ class VKBaseIE(InfoExtractor): # what actually happens. # We will workaround this VK issue by resetting the remixlhk cookie to # the first one manually. 
- cookies = url_handle.headers.get('Set-Cookie') - if cookies: + for header, cookies in url_handle.headers.items(): + if header.lower() != 'set-cookie': + continue if sys.version_info[0] >= 3: cookies = cookies.encode('iso-8859-1') cookies = cookies.decode('utf-8') @@ -61,6 +62,7 @@ class VKBaseIE(InfoExtractor): if remixlhk: value, domain = remixlhk.groups() self._set_cookie(domain, 'remixlhk', value) + break login_page = self._download_webpage( 'https://login.vk.com/?act=login', None, From 51815886a98503593524ec6ffa778ff19d840e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 06:14:05 +0700 Subject: [PATCH 092/218] [vk:wallpost] Fix audio extraction --- youtube_dl/extractor/vk.py | 66 ++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 634d17d91..cd22df25a 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -1,6 +1,7 @@ # encoding: utf-8 from __future__ import unicode_literals +import collections import re import json import sys @@ -16,7 +17,6 @@ from ..utils import ( get_element_by_class, int_or_none, orderedSet, - parse_duration, remove_start, str_to_int, unescapeHTML, @@ -447,6 +447,9 @@ class VKWallPostIE(VKBaseIE): 'skip_download': True, }, }], + 'params': { + 'usenetrc': True, + }, 'skip': 'Requires vk account credentials', }, { # single YouTube embed, no leading - @@ -456,6 +459,9 @@ class VKWallPostIE(VKBaseIE): 'title': 'Sergey Gorbunov - Wall post 85155021_6319', }, 'playlist_count': 1, + 'params': { + 'usenetrc': True, + }, 'skip': 'Requires vk account credentials', }, { # wall page URL @@ -483,37 +489,41 @@ class VKWallPostIE(VKBaseIE): raise ExtractorError('VK said: %s' % error, expected=True) description = clean_html(get_element_by_class('wall_post_text', webpage)) - uploader = clean_html(get_element_by_class( - 'fw_post_author', webpage)) or 
self._og_search_description(webpage) + uploader = clean_html(get_element_by_class('author', webpage)) thumbnail = self._og_search_thumbnail(webpage) entries = [] - for audio in re.finditer(r'''(?sx) - ]+ - id=(?P["\'])audio_info(?P\d+_\d+).*?(?P=q1)[^>]+ - value=(?P["\'])(?Phttp.+?)(?P=q2) - .+? - ''', webpage): - audio_html = audio.group(0) - audio_id = audio.group('id') - duration = parse_duration(get_element_by_class('duration', audio_html)) - track = self._html_search_regex( - r']+id=["\']title%s[^>]*>([^<]+)' % audio_id, - audio_html, 'title', default=None) - artist = self._html_search_regex( - r'>([^<]+)\s*&ndash', audio_html, - 'artist', default=None) - entries.append({ - 'id': audio_id, - 'url': audio.group('url'), - 'title': '%s - %s' % (artist, track) if artist and track else audio_id, - 'thumbnail': thumbnail, - 'duration': duration, - 'uploader': uploader, - 'artist': artist, - 'track': track, - }) + audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage) + if audio_ids: + al_audio = self._download_webpage( + 'https://vk.com/al_audio.php', post_id, + note='Downloading audio info', fatal=False, + data=urlencode_postdata({ + 'act': 'reload_audio', + 'al': '1', + 'ids': ','.join(audio_ids) + })) + if al_audio: + Audio = collections.namedtuple( + 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration']) + audios = self._parse_json( + self._search_regex( + r'(.+?)', al_audio, 'audios', default='[]'), + post_id, fatal=False, transform_source=unescapeHTML) + if isinstance(audios, list): + for audio in audios: + a = Audio._make(audio[:6]) + entries.append({ + 'id': '%s_%s' % (a.user_id, a.id), + 'url': a.url, + 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id, + 'thumbnail': thumbnail, + 'duration': a.duration, + 'uploader': uploader, + 'artist': a.artist, + 'track': a.track, + }) for video in re.finditer( r']+href=(["\'])(?P/video(?:-?[\d_]+).*?)\1', webpage): From b0c8f2e9c8946f8aab4be0d1435e504aac0d317f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Wed, 17 Aug 2016 12:45:24 +0200 Subject: [PATCH 093/218] [DBTV:generic] Add support for embeds --- youtube_dl/extractor/dbtv.py | 6 ++++++ youtube_dl/extractor/generic.py | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index caff8842e..73dba5e2a 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -38,6 +38,12 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r']+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/lazyplayer/\d+.*?)\1', + webpage)] + def _real_extract(self, url): video_id, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 197ab9531..1b71f7ac8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -73,6 +73,7 @@ from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE from .soundcloud import SoundcloudIE from .vbox7 import Vbox7IE +from .dbtv import DBTVIE class GenericIE(InfoExtractor): @@ -1386,6 +1387,11 @@ class GenericIE(InfoExtractor): }, 'add_ie': [Vbox7IE.ie_key()], }, + { + # DBTV embeds + 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', + 'playlist_mincount': 3, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2257,6 +2263,11 @@ class GenericIE(InfoExtractor): if vbox7_url: return self.url_result(vbox7_url, Vbox7IE.ie_key()) + # Look for DBTV embeds + dbtv_urls = DBTVIE._extract_urls(webpage) + if dbtv_urls: + return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') From b0d578ff7b54c521776cf8d1e050dc198bbc26e6 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 21:30:55 +0700 Subject: [PATCH 094/218] [dbtv] Relax embed regex --- youtube_dl/extractor/dbtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index 73dba5e2a..6d880d43d 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -41,7 +41,7 @@ class DBTVIE(InfoExtractor): @staticmethod def _extract_urls(webpage): return [url for _, url in re.findall( - r']+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/lazyplayer/\d+.*?)\1', + r']+src=(["\'])((?:https?:)?//(?:www\.)?dbtv\.no/(?:lazy)?player/\d+.*?)\1', webpage)] def _real_extract(self, url): From fd3ec986a4217319d0cc345c5e2eb910d90be6f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 21:35:41 +0700 Subject: [PATCH 095/218] [generic] Fix dbtv test (Closes #10364) --- youtube_dl/extractor/generic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1b71f7ac8..506892b11 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1390,6 +1390,10 @@ class GenericIE(InfoExtractor): { # DBTV embeds 'url': 'http://www.dagbladet.no/2016/02/23/nyheter/nordlys/ski/troms/ver/43254897/', + 'info_dict': { + 'id': '43254897', + 'title': 'Etter ett års planlegging, klaffet endelig alt: - Jeg måtte ta en liten dans', + }, 'playlist_mincount': 3, }, # { From 13585d7682ef6351bfcd463cf1802bc8fbadaf43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 23:32:00 +0700 Subject: [PATCH 096/218] [utils] Recognize lowercase units in parse_filesize --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 74fcf91c0..cb578cd53 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -817,7 +817,9 @@ class 
TestUtil(unittest.TestCase): self.assertEqual(parse_filesize('2 MiB'), 2097152) self.assertEqual(parse_filesize('5 GB'), 5000000000) self.assertEqual(parse_filesize('1.2Tb'), 1200000000000) + self.assertEqual(parse_filesize('1.2tb'), 1200000000000) self.assertEqual(parse_filesize('1,24 KB'), 1240) + self.assertEqual(parse_filesize('1,24 kb'), 1240) def test_parse_count(self): self.assertEqual(parse_count(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b3b687a31..35362e767 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1508,34 +1508,42 @@ def parse_filesize(s): 'KB': 1000, 'kB': 1024, 'Kb': 1000, + 'kb': 1000, 'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2, + 'mb': 1000 ** 2, 'GiB': 1024 ** 3, 'GB': 1000 ** 3, 'gB': 1024 ** 3, 'Gb': 1000 ** 3, + 'gb': 1000 ** 3, 'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4, + 'tb': 1000 ** 4, 'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5, + 'pb': 1000 ** 5, 'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6, + 'eb': 1000 ** 6, 'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7, + 'zb': 1000 ** 7, 'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8, + 'yb': 1000 ** 8, } return lookup_unit_table(_UNIT_TABLE, s) From 850837b67ada7cf0a139117a7335aa40990cd0d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 23:52:41 +0700 Subject: [PATCH 097/218] [porncom] Add extractor (Closes #2251, closes #10251) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/porncom.py | 89 ++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 youtube_dl/extractor/porncom.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e61bb11c3..6c5d46015 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -642,6 +642,7 @@ from .podomatic import PodomaticIE 
from .pokemon import PokemonIE from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE +from .porncom import PornComIE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py new file mode 100644 index 000000000..4baf79688 --- /dev/null +++ b/youtube_dl/extractor/porncom.py @@ -0,0 +1,89 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + parse_filesize, + str_to_int, +) + + +class PornComIE(InfoExtractor): + _VALID_URL = r'https?://(?:[a-zA-Z]+\.)?porn\.com/videos/(?:(?P[^/]+)-)?(?P\d+)' + _TESTS = [{ + 'url': 'http://www.porn.com/videos/teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec-2603339', + 'md5': '3f30ce76267533cd12ba999263156de7', + 'info_dict': { + 'id': '2603339', + 'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec', + 'ext': 'mp4', + 'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 551, + 'view_count': int, + 'age_limit': 18, + }, + }, { + 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + config = self._parse_json( + self._search_regex( + r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=', + webpage, 'config', default='{}'), + display_id, transform_source=js_to_json, fatal=False) + + if config: + title = config['title'] + formats = [{ + 'url': stream['url'], + 'format_id': stream.get('id'), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]', stream.get('id') or '', 'height', default=None)) + } for stream in config['streams'] if 
stream.get('url')] + thumbnail = (compat_urlparse.urljoin( + config['thumbCDN'], config['poster']) + if config.get('thumbCDN') and config.get('poster') else None) + duration = int_or_none(config.get('length')) + else: + title = self._search_regex( + (r'([^<]+)', r']*>([^<]+)

    '), + webpage, 'title') + formats = [{ + 'url': compat_urlparse.urljoin(url, format_url), + 'format_id': '%sp' % height, + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + } for format_url, height, filesize in re.findall( + r']+href="(/download/[^"]+)">MPEG4 (\d+)p]*>(\d+\s+[a-zA-Z]+)<', + webpage)] + thumbnail = None + duration = None + + self._sort_formats(formats) + + view_count = str_to_int(self._search_regex( + r'class=["\']views["\'][^>]*>

    ([\d,.]+)', webpage, 'view count')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + } From 8b2dc4c3287e5e90f339af687f3a272818c94fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 18 Aug 2016 23:59:13 +0700 Subject: [PATCH 098/218] [options] Remove output template description from --help Same reasons as for --format --- youtube_dl/options.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index d32a9e32c..5d62deef4 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -628,22 +628,7 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '-o', '--output', dest='outtmpl', metavar='TEMPLATE', - help=('Output filename template. Use %(title)s to get the title, ' - '%(uploader)s for the uploader name, %(uploader_id)s for the uploader nickname if different, ' - '%(autonumber)s to get an automatically incremented number, ' - '%(ext)s for the filename extension, ' - '%(format)s for the format description (like "22 - 1280x720" or "HD"), ' - '%(format_id)s for the unique id of the format (like YouTube\'s itags: "137"), ' - '%(upload_date)s for the upload date (YYYYMMDD), ' - '%(extractor)s for the provider (youtube, metacafe, etc), ' - '%(id)s for the video id, ' - '%(playlist_title)s, %(playlist_id)s, or %(playlist)s (=title if present, ID otherwise) for the playlist the video is in, ' - '%(playlist_index)s for the position in the playlist. ' - '%(height)s and %(width)s for the width and height of the video format. ' - '%(resolution)s for a textual description of the resolution of the video format. ' - '%% for a literal percent. ' - 'Use - to output to stdout. 
Can also be used to download to a different directory, ' - 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) + help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', From 93a63b36f1c52a9981050e393d1876d6162abb49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 19 Aug 2016 00:13:24 +0700 Subject: [PATCH 099/218] [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 354306a97..7e8bb834d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Core +- Remove output template description from --help +* Recognize lowercase units in parse_filesize + +Extractors ++ [porncom] Add extractor for porn.com (#2251, #10251) ++ [generic] Add support for DBTV embeds +* [vk:wallpost] Fix audio extraction for new site layout +* [vk] Fix authentication ++ [hgtvcom:show] Add extractor for hgtv.com shows (#10365) ++ [discoverygo] Add support for another GO network sites + + version 2016.08.17 Core From bd1bcd3ea079889cfd7cd44c0ea750ac9d432e41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 19 Aug 2016 00:15:12 +0700 Subject: [PATCH 100/218] release 2016.08.19 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 28 ++-------------------------- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 5 files changed, 9 insertions(+), 31 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index ae28d83d5..7af3c7099 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.17*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. 
Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.17** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.19*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.19** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.17 +[debug] youtube-dl version 2016.08.19 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7e8bb834d..e99ffcec6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.08.19 Core - Remove output template description from --help diff --git a/README.md b/README.md index cabbbef76..952db7abb 100644 --- a/README.md +++ b/README.md @@ -201,32 +201,8 @@ which means you can modify it, redistribute it or use it however you like. -a, --batch-file FILE File containing URLs to download ('-' for stdin) --id Use only video ID in file name - -o, --output TEMPLATE Output filename template. 
Use %(title)s to - get the title, %(uploader)s for the - uploader name, %(uploader_id)s for the - uploader nickname if different, - %(autonumber)s to get an automatically - incremented number, %(ext)s for the - filename extension, %(format)s for the - format description (like "22 - 1280x720" or - "HD"), %(format_id)s for the unique id of - the format (like YouTube's itags: "137"), - %(upload_date)s for the upload date - (YYYYMMDD), %(extractor)s for the provider - (youtube, metacafe, etc), %(id)s for the - video id, %(playlist_title)s, - %(playlist_id)s, or %(playlist)s (=title if - present, ID otherwise) for the playlist the - video is in, %(playlist_index)s for the - position in the playlist. %(height)s and - %(width)s for the width and height of the - video format. %(resolution)s for a textual - description of the resolution of the video - format. %% for a literal percent. Use - to - output to stdout. Can also be used to - download to a different directory, for - example with -o '/my/downloads/%(uploader)s - /%(title)s-%(id)s.%(ext)s' . 
+ -o, --output TEMPLATE Output filename template, see the "OUTPUT + TEMPLATE" for all the info --autonumber-size NUMBER Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 189b9301d..edf192138 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -279,6 +279,7 @@ - **Helsinki**: helsinki.fi - **HentaiStigma** - **HGTV** + - **hgtv.com:show** - **HistoricFilms** - **history:topic**: History.com Topic - **hitbox** @@ -523,6 +524,7 @@ - **podomatic** - **Pokemon** - **PolskieRadio** + - **PornCom** - **PornHd** - **PornHub**: PornHub and Thumbzilla - **PornHubPlaylist** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index cf5950117..691f2c591 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.17' +__version__ = '2016.08.19' From 9e5751b9fe72f7425e4cb3f22a56b6a95b59e41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 19 Aug 2016 01:13:45 +0700 Subject: [PATCH 101/218] [globo:article] Relax _VALID_URL and video id regex (Closes #10379) --- youtube_dl/extractor/globo.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 3de8356f6..dbacbfc61 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -396,12 +396,12 @@ class GloboIE(InfoExtractor): class GloboArticleIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)\.html' + _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)(?:\.html)?' 
_VIDEOID_REGEXES = [ r'\bdata-video-id=["\'](\d{7,})', r'\bdata-player-videosids=["\'](\d{7,})', - r'\bvideosIDs\s*:\s*["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\']?(\d{7,})', r'\bdata-id=["\'](\d{7,})', r']+\bid=["\'](\d{7,})', ] @@ -423,6 +423,9 @@ class GloboArticleIE(InfoExtractor): }, { 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', 'only_matching': True, + }, { + 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', + 'only_matching': True, }] @classmethod From e4659b45474acb563db0ab4284abdfc80837307e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 19 Aug 2016 20:37:17 +0800 Subject: [PATCH 102/218] [utils] Correct octal/hexadecimal number detection in js_to_json --- ChangeLog | 6 ++++++ test/test_utils.py | 3 +++ youtube_dl/utils.py | 6 +++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index e99ffcec6..98a3dbca3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Core +* Fix js_to_json(): correct octal or hexadecimal number detection + + version 2016.08.19 Core diff --git a/test/test_utils.py b/test/test_utils.py index cb578cd53..b83da93b4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -712,6 +712,9 @@ class TestUtil(unittest.TestCase): inp = '''{"foo":101}''' self.assertEqual(js_to_json(inp), '''{"foo":101}''') + inp = '''{"duration": "00:01:07"}''' + self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''') + def test_js_to_json_edgecases(self): on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}") self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 35362e767..0c36c1b80 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2038,14 +2038,14 @@ def js_to_json(code): }.get(m.group(0), m.group(0)), v[1:-1]) INTEGER_TABLE = ( - (r'^0[xX][0-9a-fA-F]+', 16), - 
(r'^0+[0-7]+', 8), + (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16), + (r'^(0+[0-7]+)\s*:?$', 8), ) for regex, base in INTEGER_TABLE: im = re.match(regex, v) if im: - i = int(im.group(0), base) + i = int(im.group(1), base) return '"%d":' % i if v.endswith(':') else '%d' % i return '"%s"' % v From b82232036a019e340b715779108c3f4caea8a78d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 19 Aug 2016 20:39:28 +0800 Subject: [PATCH 103/218] [n-tv.de] Fix extraction (closes #10331) --- ChangeLog | 3 +++ youtube_dl/extractor/ntvde.py | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 98a3dbca3..6281fe325 100644 --- a/ChangeLog +++ b/ChangeLog @@ -3,6 +3,9 @@ version Core * Fix js_to_json(): correct octal or hexadecimal number detection +Extractors +* [n-tv.de] Fix extraction (#10331) + version 2016.08.19 diff --git a/youtube_dl/extractor/ntvde.py b/youtube_dl/extractor/ntvde.py index a83e85cb8..d28a81542 100644 --- a/youtube_dl/extractor/ntvde.py +++ b/youtube_dl/extractor/ntvde.py @@ -1,6 +1,8 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( @@ -40,8 +42,8 @@ class NTVDeIE(InfoExtractor): timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp')) vdata = self._parse_json(self._search_regex( r'(?s)\$\(\s*"\#player"\s*\)\s*\.data\(\s*"player",\s*(\{.*?\})\);', - webpage, 'player data'), - video_id, transform_source=js_to_json) + webpage, 'player data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'advertising:\s*{[^}]+},', '', s))) duration = parse_duration(vdata.get('duration')) formats = [] From 55af45fcab4295a92d56180cdbebe7b47e094bc3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 19 Aug 2016 23:12:30 +0800 Subject: [PATCH 104/218] [radiobremen] Update _TEST (closes #10337) --- youtube_dl/extractor/radiobremen.py | 10 +++++----- 1 file changed, 5 
insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/radiobremen.py b/youtube_dl/extractor/radiobremen.py index 0cbb15f08..19a751da0 100644 --- a/youtube_dl/extractor/radiobremen.py +++ b/youtube_dl/extractor/radiobremen.py @@ -13,15 +13,15 @@ class RadioBremenIE(InfoExtractor): IE_NAME = 'radiobremen' _TEST = { - 'url': 'http://www.radiobremen.de/mediathek/index.html?id=114720', + 'url': 'http://www.radiobremen.de/mediathek/?id=141876', 'info_dict': { - 'id': '114720', + 'id': '141876', 'ext': 'mp4', - 'duration': 1685, + 'duration': 178, 'width': 512, - 'title': 'buten un binnen vom 22. Dezember', + 'title': 'Druck auf Patrick Öztürk', 'thumbnail': 're:https?://.*\.jpg$', - 'description': 'Unter anderem mit diesen Themen: 45 Flüchtlinge sind in Worpswede angekommen +++ Freies Internet für alle: Bremer arbeiten an einem flächendeckenden W-Lan-Netzwerk +++ Aktivisten kämpfen für das Unibad +++ So war das Wetter 2014 +++', + 'description': 'Gegen den SPD-Bürgerschaftsabgeordneten Patrick Öztürk wird wegen Beihilfe zum gewerbsmäßigen Betrug ermittelt. 
Am Donnerstagabend sollte er dem Vorstand des SPD-Unterbezirks Bremerhaven dazu Rede und Antwort stehen.', }, } From 520251c093f5e0fe6af5e57203a0452aef0682ac Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 19 Aug 2016 23:53:47 +0800 Subject: [PATCH 105/218] [extractor/common] Recognize m3u8 manifests in HTML5 multimedia tags --- ChangeLog | 1 + youtube_dl/extractor/common.py | 36 +++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 6281fe325..450351231 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Core +* Support m3u8 manifests in HTML5 multimedia tags * Fix js_to_json(): correct octal or hexadecimal number detection Extractors diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9427ff449..07d58afe7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1695,7 +1695,7 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _parse_html5_media_entries(self, base_url, webpage): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1710,6 +1710,21 @@ class InfoExtractor(object): return f return {} + def _media_formats(src, cur_media_type): + full_url = absolute_url(src) + if determine_ext(full_url) == 'm3u8': + is_plain_url = False + formats = self._extract_m3u8_formats( + full_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=m3u8_id) + else: + is_plain_url = True + formats = [{ + 'url': full_url, + 'vcodec': 'none' if cur_media_type == 'audio' else None, + }] + return is_plain_url, formats + entries = [] for media_tag, media_type, media_content in re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage): media_info = { @@ -1719,10 +1734,8 @@ class InfoExtractor(object): media_attributes = 
extract_attributes(media_tag) src = media_attributes.get('src') if src: - media_info['formats'].append({ - 'url': absolute_url(src), - 'vcodec': 'none' if media_type == 'audio' else None, - }) + _, formats = _media_formats(src) + media_info['formats'].extend(formats) media_info['thumbnail'] = media_attributes.get('poster') if media_content: for source_tag in re.findall(r']+>', media_content): @@ -1730,12 +1743,13 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - f = parse_content_type(source_attributes.get('type')) - f.update({ - 'url': absolute_url(src), - 'vcodec': 'none' if media_type == 'audio' else None, - }) - media_info['formats'].append(f) + is_plain_url, formats = _media_formats(src, media_type) + if is_plain_url: + f = parse_content_type(source_attributes.get('type')) + f.update(formats[0]) + media_info['formats'].append(f) + else: + media_info['formats'].extend(formats) for track_tag in re.findall(r']+>', media_content): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') From ecc90093f9c3793439832f4c9d279605da3489a7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 19 Aug 2016 23:56:09 +0800 Subject: [PATCH 106/218] [vuclip] Adapt to the new API and update _TEST --- youtube_dl/extractor/vuclip.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index b73da5cd0..55e087bdb 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -17,12 +17,12 @@ class VuClipIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' _TEST = { - 'url': 'http://m.vuclip.com/w?cid=922692425&fid=70295&z=1010&nvar&frm=index.html', + 'url': 'http://m.vuclip.com/w?cid=1129900602&bu=8589892792&frm=w&z=34801&op=0&oc=843169247§ion=recommend', 'info_dict': { - 'id': '922692425', + 'id': '1129900602', 'ext': '3gp', - 'title': 'The Toy Soldiers - Hollywood Movie 
Trailer', - 'duration': 177, + 'title': 'Top 10 TV Convicts', + 'duration': 733, } } @@ -54,7 +54,7 @@ class VuClipIE(InfoExtractor): 'url': video_url, }] else: - formats = self._parse_html5_media_entries(url, webpage)[0]['formats'] + formats = self._parse_html5_media_entries(url, webpage, video_id)[0]['formats'] title = remove_end(self._html_search_regex( r'(.*?)-\s*Vuclip', webpage, 'title').strip(), ' - Video') From a9a3b4a081a6793f0dd0b40be8429a2aa3c1c36d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 20 Aug 2016 00:08:23 +0800 Subject: [PATCH 107/218] [miomio] Adapt to the new API and update _TESTS The test case is from #9680 --- youtube_dl/extractor/miomio.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 937ba0f28..ec1b4c4fe 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -25,10 +25,7 @@ class MioMioIE(InfoExtractor): 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', 'duration': 5923, }, - 'params': { - # The server provides broken file - 'skip_download': True, - } + 'skip': 'Unable to load videos', }, { 'url': 'http://www.miomio.tv/watch/cc184024/', 'info_dict': { @@ -47,16 +44,12 @@ class MioMioIE(InfoExtractor): 'skip': 'Unable to load videos', }, { # new 'h5' player - 'url': 'http://www.miomio.tv/watch/cc273295/', - 'md5': '', + 'url': 'http://www.miomio.tv/watch/cc273997/', + 'md5': '0b27a4b4495055d826813f8c3a6b2070', 'info_dict': { - 'id': '273295', + 'id': '273997', 'ext': 'mp4', - 'title': 'アウト×デラックス 20160526', - }, - 'params': { - # intermittent HTTP 500 - 'skip_download': True, + 'title': 'マツコの知らない世界【劇的進化SP!ビニール傘&冷凍食品2016】 1_2 - 16 05 31', }, }] @@ -116,7 +109,7 @@ class MioMioIE(InfoExtractor): player_webpage = self._download_webpage( player_url, video_id, note='Downloading player webpage', headers={'Referer': url}) - entries = self._parse_html5_media_entries(player_url, player_webpage) + 
entries = self._parse_html5_media_entries(player_url, player_webpage, video_id) http_headers = {'Referer': player_url} else: http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} From 70852b47ca101f0b4acc76eb3213b763a14b3602 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 20 Aug 2016 00:12:32 +0800 Subject: [PATCH 108/218] [utils] Recognize units with full names in parse_filename Reference: https://en.wikipedia.org/wiki/Template:Quantities_of_bytes --- ChangeLog | 4 +++- test/test_utils.py | 1 + youtube_dl/utils.py | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 450351231..b36e4438c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,9 @@ version Core -* Support m3u8 manifests in HTML5 multimedia tags ++ Recognize file size strings with full unit names (for example "8.5 + megabytes") ++ Support m3u8 manifests in HTML5 multimedia tags * Fix js_to_json(): correct octal or hexadecimal number detection Extractors diff --git a/test/test_utils.py b/test/test_utils.py index b83da93b4..d16ea7f77 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -823,6 +823,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_filesize('1.2tb'), 1200000000000) self.assertEqual(parse_filesize('1,24 KB'), 1240) self.assertEqual(parse_filesize('1,24 kb'), 1240) + self.assertEqual(parse_filesize('8.5 megabytes'), 8500000) def test_parse_count(self): self.assertEqual(parse_count(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0c36c1b80..41ca562f1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1504,46 +1504,63 @@ def parse_filesize(s): _UNIT_TABLE = { 'B': 1, 'b': 1, + 'bytes': 1, 'KiB': 1024, 'KB': 1000, 'kB': 1024, 'Kb': 1000, 'kb': 1000, + 'kilobytes': 1000, + 'kibibytes': 1024, 'MiB': 1024 ** 2, 'MB': 1000 ** 2, 'mB': 1024 ** 2, 'Mb': 1000 ** 2, 'mb': 1000 ** 2, + 'megabytes': 1000 ** 2, + 'mebibytes': 1024 ** 2, 'GiB': 1024 ** 3, 'GB': 1000 ** 3, 
'gB': 1024 ** 3, 'Gb': 1000 ** 3, 'gb': 1000 ** 3, + 'gigabytes': 1000 ** 3, + 'gibibytes': 1024 ** 3, 'TiB': 1024 ** 4, 'TB': 1000 ** 4, 'tB': 1024 ** 4, 'Tb': 1000 ** 4, 'tb': 1000 ** 4, + 'terabytes': 1000 ** 4, + 'tebibytes': 1024 ** 4, 'PiB': 1024 ** 5, 'PB': 1000 ** 5, 'pB': 1024 ** 5, 'Pb': 1000 ** 5, 'pb': 1000 ** 5, + 'petabytes': 1000 ** 5, + 'pebibytes': 1024 ** 5, 'EiB': 1024 ** 6, 'EB': 1000 ** 6, 'eB': 1024 ** 6, 'Eb': 1000 ** 6, 'eb': 1000 ** 6, + 'exabytes': 1000 ** 6, + 'exbibytes': 1024 ** 6, 'ZiB': 1024 ** 7, 'ZB': 1000 ** 7, 'zB': 1024 ** 7, 'Zb': 1000 ** 7, 'zb': 1000 ** 7, + 'zettabytes': 1000 ** 7, + 'zebibytes': 1024 ** 7, 'YiB': 1024 ** 8, 'YB': 1000 ** 8, 'yB': 1024 ** 8, 'Yb': 1000 ** 8, 'yb': 1000 ** 8, + 'yottabytes': 1000 ** 8, + 'yobibytes': 1024 ** 8, } return lookup_unit_table(_UNIT_TABLE, s) From 19f35402c5296e93213d56034d85698087ce3fe1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 20 Aug 2016 00:18:22 +0800 Subject: [PATCH 109/218] [snotr] Fix extraction (closes #10338) --- ChangeLog | 1 + youtube_dl/extractor/snotr.py | 38 +++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/ChangeLog b/ChangeLog index b36e4438c..13c3d3ffc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,7 @@ Core * Fix js_to_json(): correct octal or hexadecimal number detection Extractors +* [snotr] Fix extraction (#10338) * [n-tv.de] Fix extraction (#10331) diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index 0d1ab07f8..3bb78cb84 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -5,9 +5,9 @@ import re from .common import InfoExtractor from ..utils import ( - float_or_none, - str_to_int, parse_duration, + parse_filesize, + str_to_int, ) @@ -17,21 +17,24 @@ class SnotrIE(InfoExtractor): 'url': 'http://www.snotr.com/video/13708/Drone_flying_through_fireworks', 'info_dict': { 'id': '13708', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Drone flying 
through fireworks!', - 'duration': 247, - 'filesize_approx': 98566144, + 'duration': 248, + 'filesize_approx': 40700000, 'description': 'A drone flying through Fourth of July Fireworks', - } + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'expected_warnings': ['description'], }, { 'url': 'http://www.snotr.com/video/530/David_Letteman_-_George_W_Bush_Top_10', 'info_dict': { 'id': '530', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'David Letteman - George W. Bush Top 10', 'duration': 126, - 'filesize_approx': 8912896, + 'filesize_approx': 8500000, 'description': 'The top 10 George W. Bush moments, brought to you by David Letterman!', + 'thumbnail': 're:^https?://.*\.jpg$', } }] @@ -43,26 +46,27 @@ class SnotrIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - video_url = 'http://cdn.videos.snotr.com/%s.flv' % video_id + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] view_count = str_to_int(self._html_search_regex( - r'

    \nViews:\n([\d,\.]+)

    ', + r']*>\s*]*>Views:\s*]*>([\d,\.]+)', webpage, 'view count', fatal=False)) duration = parse_duration(self._html_search_regex( - r'

    \nLength:\n\s*([0-9:]+).*?

    ', + r']*>\s*]*>Length:\s*]*>([\d:]+)', webpage, 'duration', fatal=False)) - filesize_approx = float_or_none(self._html_search_regex( - r'

    \nFilesize:\n\s*([0-9.]+)\s*megabyte

    ', - webpage, 'filesize', fatal=False), invscale=1024 * 1024) + filesize_approx = parse_filesize(self._html_search_regex( + r']*>\s*]*>Filesize:\s*]*>([^<]+)', + webpage, 'filesize', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'description': description, 'title': title, - 'url': video_url, 'view_count': view_count, 'duration': duration, 'filesize_approx': filesize_approx, - } + }) + + return info_dict From 39e1c4f08c4cfca81943e73523bd66b890f5aff2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 20 Aug 2016 00:52:37 +0800 Subject: [PATCH 110/218] [litv] Support 'promo' URLs (closes #10385) --- ChangeLog | 1 + youtube_dl/extractor/litv.py | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 13c3d3ffc..a8d8d05a3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,7 @@ Core * Fix js_to_json(): correct octal or hexadecimal number detection Extractors ++ [litv] Support 'promo' URLs (#10385) * [snotr] Fix extraction (#10338) * [n-tv.de] Fix extraction (#10331) diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py index 3356d015d..05c6579f1 100644 --- a/youtube_dl/extractor/litv.py +++ b/youtube_dl/extractor/litv.py @@ -14,7 +14,7 @@ from ..utils import ( class LiTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P[^&]+)' + _VALID_URL = r'https?://www\.litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' @@ -27,6 +27,7 @@ class LiTVIE(InfoExtractor): 'playlist_count': 50, }, { 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'md5': '969e343d9244778cb29acec608e53640', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', @@ -37,7 +38,16 @@ class LiTVIE(InfoExtractor): }, 'params': { 'noplaylist': True, - 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to 
Taiwan', + }, { + 'url': 'https://www.litv.tv/promo/miyuezhuan/?content_id=VOD00044841&', + 'md5': '88322ea132f848d6e3e18b32a832b918', + 'info_dict': { + 'id': 'VOD00044841', + 'ext': 'mp4', + 'title': '芈月傳第1集 霸星芈月降世楚國', + 'description': '楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', }, 'skip': 'Georestricted to Taiwan', }] @@ -92,13 +102,18 @@ class LiTVIE(InfoExtractor): # endpoint gives the same result as the data embedded in the webpage. # If georestricted, there are no embedded data, so an extra request is # necessary to get the error code + if 'assetId' not in view_data: + view_data = self._download_json( + 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, + query={'contentId': video_id}, + headers={'Accept': 'application/json'}) video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) if not video_data: payload = { 'assetId': view_data['assetId'], - 'watchDevices': vod_data['watchDevices'], + 'watchDevices': view_data['watchDevices'], 'contentType': view_data['contentType'], } video_data = self._download_json( @@ -115,7 +130,8 @@ class LiTVIE(InfoExtractor): raise ExtractorError('Unexpected result from %s' % self.IE_NAME) formats = self._extract_m3u8_formats( - video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + video_data['fullpath'], video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') for a_format in formats: # LiTV HLS segments doesn't like compressions a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True From 5b9d187cc6545c0c5209a4db5525b1023ca8ea41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= Date: Fri, 19 Aug 2016 22:59:26 +0200 Subject: [PATCH 111/218] [imdb] Improve title extraction and make thumbnail non-fatal --- youtube_dl/extractor/imdb.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) 
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 0acce9f4c..3a6a6f5ad 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( mimetype2ext, qualities, + remove_end, ) @@ -19,7 +20,7 @@ class ImdbIE(InfoExtractor): 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb', + 'title': 'Ice Age: Continental Drift Trailer (No. 2)', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } }, { @@ -83,10 +84,10 @@ class ImdbIE(InfoExtractor): return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), 'formats': formats, 'description': descr, - 'thumbnail': format_info['slate'], + 'thumbnail': format_info.get('slate'), } From 4245f55880c42e670cebd5a8a2b10929be834682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 20 Aug 2016 06:18:20 +0700 Subject: [PATCH 112/218] [dotsub] Replace test (Closes #10386) --- youtube_dl/extractor/dotsub.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index e9ca236d4..fd64d1a7f 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -10,18 +10,18 @@ from ..utils import ( class DotsubIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P[^/]+)' _TEST = { - 'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27', - 'md5': '0914d4d69605090f623b7ac329fea66e', + 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09', + 'md5': '21c7ff600f545358134fea762a6d42b6', 'info_dict': { - 'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27', + 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09', 'ext': 'flv', - 'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary', - 
'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074', - 'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p', - 'duration': 3169, - 'uploader': '4v4l0n42', - 'timestamp': 1292248482.625, - 'upload_date': '20101213', + 'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever', + 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6', + 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p', + 'duration': 198, + 'uploader': 'liuxt', + 'timestamp': 1385778501.104, + 'upload_date': '20131130', 'view_count': int, } } From dabe15701b3c12ef7e6af1f3333e1d3e39149592 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 20 Aug 2016 13:25:32 +0100 Subject: [PATCH 113/218] [cbs, cbsnews] fix extraction(fixes #10393) --- youtube_dl/extractor/cbs.py | 47 ++++++++++++++++++++----------- youtube_dl/extractor/cbsnews.py | 9 +++--- youtube_dl/extractor/cbssports.py | 3 ++ 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index a23173d6f..c72ed2dbb 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,6 +4,7 @@ from .theplatform import ThePlatformFeedIE from ..utils import ( int_or_none, find_xpath_attr, + ExtractorError, ) @@ -17,19 +18,6 @@ class CBSBaseIE(ThePlatformFeedIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info( - 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { - 'series': entry.get('cbs$SeriesTitle'), - 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), - 'episode': entry.get('cbs$EpisodeTitle'), - 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), - }, { - 'StreamPack': { - 'manifest': 'm3u', - } - }) - class CBSIE(CBSBaseIE): _VALID_URL = 
r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' @@ -38,7 +26,6 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', - 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -47,7 +34,10 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': { + # m3u8 download + 'skip_download': True, + }, '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -56,8 +46,31 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' + + def _extract_video_info(self, guid): + path = 'dJ5BDC/media/guid/2198311517/' + guid + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path + formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid) + for r in ('HLS&formats=M3U', 'RTMP', 'WIFI', '3G'): + try: + tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' % r.split('&')[0]) + formats.extend(tp_formats) + except ExtractorError: + continue + self._sort_formats(formats) + metadata = self._download_theplatform_metadata(path, guid) + info = self._parse_theplatform_metadata(metadata) + info.update({ + 'id': guid, + 'formats': formats, + 'subtitles': subtitles, + 'series': 
metadata.get('cbs$SeriesTitle'), + 'season_number': int_or_none(metadata.get('cbs$SeasonNumber')), + 'episode': metadata.get('cbs$EpisodeTitle'), + 'episode_number': int_or_none(metadata.get('cbs$EpisodeNumber')), + }) + return info def _real_extract(self, url): content_id = self._match_id(url) - return self._extract_video_info('byGuid=%s' % content_id, content_id) + return self._extract_video_info(content_id) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 9d3b75526..4aa6917a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .cbs import CBSBaseIE +from .cbs import CBSIE from ..utils import ( parse_duration, ) -class CBSNewsIE(CBSBaseIE): +class CBSNewsIE(CBSIE): IE_DESC = 'CBS News' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' @@ -35,7 +35,8 @@ class CBSNewsIE(CBSBaseIE): 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', - 'upload_date': '19700101', + 'upload_date': '20140404', + 'timestamp': 1396650660, 'uploader': 'CBSI-NEW', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, @@ -63,7 +64,7 @@ class CBSNewsIE(CBSBaseIE): item = video_info['item'] if 'item' in video_info else video_info guid = item['mpxRefId'] - return self._extract_video_info('byGuid=%s' % guid, guid) + return self._extract_video_info(guid) class CBSNewsLiveVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 78ca44b02..bf7915626 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE): } }] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): 
video_id = self._match_id(url) return self._extract_video_info('byId=%s' % video_id, video_id) From 292a2301bf0b99be81640c4511d78ebc3c622dad Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 20 Aug 2016 19:00:25 +0100 Subject: [PATCH 114/218] [cnn] add support for money.cnn.com videos(closes #2797) --- youtube_dl/extractor/cnn.py | 41 ++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 53489a14e..220bb55e8 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -11,7 +11,7 @@ from ..utils import ( class CNNIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ + _VALID_URL = r'''(?x)https?://(?:(?Pedition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ (?P.+?/(?P[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ @@ -45,19 +45,46 @@ class CNNIE(InfoExtractor): 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', } + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? 
Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + } }, { 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', 'only_matching': True, }, { 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, }] + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - path = mobj.group('path') - page_title = mobj.group('title') - info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path + sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + info_url = config['data_src'] % path info = self._download_xml(info_url, page_title) formats = [] @@ -66,7 +93,7 @@ class CNNIE(InfoExtractor): (?:_(?P<bitrate>[0-9]+)k)? 
''') for f in info.findall('files/file'): - video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) + video_url = config['media_src'] + f.text.strip() fdct = { 'format_id': f.attrib['bitrate'], 'url': video_url, @@ -146,7 +173,7 @@ class CNNBlogsIE(InfoExtractor): class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)' + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' _TEST = { 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', From e25586e47163c83e519ae0af9aa6d8fbc3d58ef4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 20 Aug 2016 20:02:49 +0100 Subject: [PATCH 115/218] [cultureunplugged] fix extraction(closes #10330) --- youtube_dl/extractor/cultureunplugged.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py index 9c764fe68..9f26fa587 100644 --- a/youtube_dl/extractor/cultureunplugged.py +++ b/youtube_dl/extractor/cultureunplugged.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals import re +import time from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + HEADRequest, +) class CultureUnpluggedIE(InfoExtractor): @@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request + self._request_webpage(HEADRequest( + 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( 'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) From 5b1d85754ee2f1a8b94c979bc5122b4130ef8cc7 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 08:07:26 +0700 Subject: [PATCH 116/218] [YoutubeDL] Autocalculate ext when ext is None --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e844dc98a..0b3e3da82 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1299,7 +1299,7 @@ class YoutubeDL(object): for subtitle_format in subtitle: if subtitle_format.get('url'): subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if 'ext' not in subtitle_format: + if subtitle_format.get('ext') is None: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): @@ -1354,7 +1354,7 @@ class YoutubeDL(object): note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing - if 'ext' not in format: + if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) From d8f30a7e6606d2300dfffb8fc0aaf8d6a0c79b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 08:10:49 +0700 Subject: [PATCH 117/218] [kaltura] Remove unused code --- youtube_dl/extractor/kaltura.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ddf1165ff..66c7b36bc 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -122,18 +122,6 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id, service_url=None): - actions = [{ - 'apiVersion': '3.1', - 'expiry': 86400, - 'format': 1, - 'service': 'session', - 'action': 'startWidgetSession', - 'widgetId': '_%s' % partner_id, - }] - return self._kaltura_api_call( - 
video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id, service_url=None): actions = [ { From a80944675010617cc0124c57ab597f9d9004c0d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 08:26:45 +0700 Subject: [PATCH 118/218] [kaltura] Add subtitles support when entry_id is unknown beforehand (Closes #10279) --- youtube_dl/extractor/kaltura.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 66c7b36bc..a8ce6dda2 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -196,6 +196,17 @@ class KalturaIE(InfoExtractor): reference_id)['entryResult'] info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] entry_id = info['id'] + # Unfortunately, data returned in kalturaIframePackageData lacks + # captions so we will try requesting the complete data using + # regular approach since we now know the entry_id + try: + _, info, flavor_assets, captions = self._get_video_info( + entry_id, partner_id) + except ExtractorError: + # Regular scenario failed but we already have everything + # extracted apart from captions and can process at least + # with this + pass else: raise ExtractorError('Invalid URL', expected=True) ks = params.get('flashvars[ks]', [None])[0] From fddaa76a599a7df00dc94dd5663d43c881f8fee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 08:27:53 +0700 Subject: [PATCH 119/218] [kaltura] Assume ttml to be default subtitles' extension --- youtube_dl/extractor/kaltura.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a8ce6dda2..15f2fe24f 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -266,7 +266,7 @@ class 
KalturaIE(InfoExtractor): continue subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), - 'ext': caption.get('fileExt'), + 'ext': caption.get('fileExt', 'ttml'), }) return { From 2c6acdfd2d31b7ce9500e9efe411620c61059b98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 08:37:01 +0700 Subject: [PATCH 120/218] [kaltura] Add test for #10279 --- youtube_dl/extractor/kaltura.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 15f2fe24f..e0f7366c2 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -67,6 +67,27 @@ class KalturaIE(InfoExtractor): # video with subtitles 'url': 'kaltura:111032:1_cw786r8q', 'only_matching': True, + }, + { + # video with ttml subtitles (no fileExt) + 'url': 'kaltura:1926081:0_l5ye1133', + 'info_dict': { + 'id': '0_l5ye1133', + 'ext': 'mp4', + 'title': 'What Can You Do With Python?', + 'upload_date': '20160221', + 'uploader_id': 'stork', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'params': { + 'skip_download': True, + }, } ] From db29af6d36b3d16614355dac70f22c4f2d8410d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C3=A9stin=20Reed?= <trox1972@users.noreply.github.com> Date: Fri, 19 Aug 2016 12:53:34 +0200 Subject: [PATCH 121/218] [charlierose] Add new extractor --- youtube_dl/extractor/charlierose.py | 45 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 46 insertions(+) create mode 100644 youtube_dl/extractor/charlierose.py diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py new file mode 100644 index 000000000..ba1d1b833 --- /dev/null +++ 
b/youtube_dl/extractor/charlierose.py @@ -0,0 +1,45 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class CharlieRoseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' + _TEST = { + 'url': 'https://charlierose.com/videos/27996', + 'info_dict': { + 'id': '27996', + 'ext': 'mp4', + 'title': 'Remembering Zaha Hadid', + 'thumbnail': 're:^https?://.*\.jpg\?\d+', + 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + _PLAYER_BASE = 'https://charlierose.com/video/player/%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(self._PLAYER_BASE % video_id, video_id) + + title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') + + entries = self._parse_html5_media_entries(self._PLAYER_BASE % video_id, webpage, video_id)[0] + formats = entries['formats'] + + self._sort_formats(formats) + self._remove_duplicate_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'subtitles': entries.get('subtitles'), + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6c5d46015..d4d90c1f8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -134,6 +134,7 @@ from .ccc import CCCIE from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE from .chilloutzone import ChilloutzoneIE from .chirbit import ( From d164a0d41bdc95caa2b1458b9f51381de7d6a5a7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 21 Aug 2016 20:00:48 
+0800 Subject: [PATCH 122/218] [README.md] Add a format selection example using comma Ref: #10399 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 952db7abb..a10aaf35c 100644 --- a/README.md +++ b/README.md @@ -645,7 +645,11 @@ $ youtube-dl -f 'best[filesize<50M]' # Download best format available via direct link over HTTP/HTTPS protocol $ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]' + +# Download the best video format and the best audio format without merging them +$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s' ``` +Note that in the last example, an output template is recommended as bestvideo and bestaudio may have the same file name. # VIDEO SELECTION From 3d47ee0a9eb37b2c91dfae80c7f22fda0242dd61 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 21 Aug 2016 14:09:18 +0100 Subject: [PATCH 123/218] [zingmp3] fix extraction and add support for video clips(closes #10041) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/zingmp3.py | 127 +++++++++++++++++------------ 2 files changed, 76 insertions(+), 56 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6c5d46015..20fb23527 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1114,7 +1114,4 @@ from .youtube import ( ) from .zapiks import ZapiksIE from .zdf import ZDFIE, ZDFChannelIE -from .zingmp3 import ( - ZingMp3SongIE, - ZingMp3AlbumIE, -) +from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 437eecb67..bd708b42c 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -4,13 +4,17 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + update_url_query, +) class 
ZingMp3BaseInfoExtractor(InfoExtractor): - def _extract_item(self, item, fatal=True): - error_message = item.find('./errormessage').text + def _extract_item(self, item, page_type, fatal=True): + error_message = item.get('msg') if error_message: if not fatal: return @@ -18,25 +22,48 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - title = item.find('./title').text.strip() - source = item.find('./source').text - extension = item.attrib['type'] - thumbnail = item.find('./backimage').text + formats = [] + for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])): + if not source_url or source_url == 'require vip': + continue + if not re.match(r'https?://', source_url): + source_url = '//' + source_url + source_url = self._proto_relative_url(source_url, 'http:') + quality_num = int_or_none(quality) + f = { + 'format_id': quality, + 'url': source_url, + } + if page_type == 'video': + f.update({ + 'height': quality_num, + 'ext': 'mp4', + }) + else: + f.update({ + 'abr': quality_num, + 'ext': 'mp3', + }) + formats.append(f) + + cover = item.get('cover') return { - 'title': title, - 'url': source, - 'ext': extension, - 'thumbnail': thumbnail, + 'title': (item.get('name') or item.get('title')).strip(), + 'formats': formats, + 'thumbnail': 'http:/' + cover if cover else None, + 'artist': item.get('artist'), } - def _extract_player_xml(self, player_xml_url, id, playlist_title=None): - player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML') - items = player_xml.findall('./item') + def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None): + player_json = self._download_json(player_json_url, id, 'Downloading Player JSON') + items = player_json['data'] + if 'item' in items: + items = items['item'] if len(items) == 1: # one single song - data = self._extract_item(items[0]) + data = 
self._extract_item(items[0], page_type) data['id'] = id return data @@ -45,7 +72,7 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): entries = [] for i, item in enumerate(items, 1): - entry = self._extract_item(item, fatal=False) + entry = self._extract_item(item, page_type, fatal=False) if not entry: continue entry['id'] = '%s-%d' % (id, i) @@ -59,8 +86,8 @@ class ZingMp3BaseInfoExtractor(InfoExtractor): } -class ZingMp3SongIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P<slug>[^/]+)/(?P<song_id>\w+)\.html' +class ZingMp3IE(ZingMp3BaseInfoExtractor): + _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html' _TESTS = [{ 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -70,51 +97,47 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor): 'ext': 'mp3', 'thumbnail': 're:^https?://.*\.jpg$', }, - }] - IE_NAME = 'zingmp3:song' - IE_DESC = 'mp3.zing.vn songs' - - def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - song_id = matched.group('song_id') - - webpage = self._download_webpage( - 'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id) - - player_xml_url = self._search_regex( - r'&xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url') - - return self._extract_player_xml(player_xml_url, song_id) - - -class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor): - _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html' - _TESTS = [{ + }, { + 'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html', + 'md5': '870295a9cd8045c0e15663565902618d', + 'info_dict': { + 'id': 'ZW6BAEA0', + 'title': 'Let It Go (Frozen OST)', + 'ext': 'mp4', + }, + }, { 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { '_type': 'playlist', 'id': 'ZWZBWDAF', - 'title': 'Lâu Đài Tình Ái - 
Bằng Kiều ft. Minh Tuyết | Album 320 lossless', + 'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless', }, 'playlist_count': 10, + 'skip': 'removed at the request of the owner', }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, }] - IE_NAME = 'zingmp3:album' - IE_DESC = 'mp3.zing.vn albums' + IE_NAME = 'zingmp3' + IE_DESC = 'mp3.zing.vn' def _real_extract(self, url): - matched = re.match(self._VALID_URL, url) - slug = matched.group('slug') - album_id = matched.group('album_id') + page_id = self._match_id(url) - webpage = self._download_webpage( - 'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id) - player_xml_url = self._search_regex( - r'&xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url') + webpage = self._download_webpage(url, page_id) - return self._extract_player_xml( - player_xml_url, album_id, - playlist_title=self._og_search_title(webpage)) + player_json_url = self._search_regex([ + r'data-xml="([^"]+)', + r'&xmlURL=([^&]+)&' + ], webpage, 'player xml url') + + playlist_title = None + page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type') + if page_type == 'video': + player_json_url = update_url_query(player_json_url, {'format': 'json'}) + else: + player_json_url = player_json_url.replace('/xml/', '/html5xml/') + if page_type == 'album': + playlist_title = self._og_search_title(webpage) + + return self._extract_player_json(player_json_url, page_id, page_type, playlist_title) From 92d4cfa358bacff0e79da30ffb0908c7096e82f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 21:01:01 +0700 Subject: [PATCH 124/218] [kaltura] Fallback ext calculation on caption's format --- youtube_dl/extractor/kaltura.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index e0f7366c2..6a8464998 100644 --- 
a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -36,6 +36,12 @@ class KalturaIE(InfoExtractor): ''' _SERVICE_URL = 'http://cdnapi.kaltura.com' _SERVICE_BASE = '/api_v3/index.php' + # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php + _CAPTION_TYPES = { + 1: 'srt', + 2: 'ttml', + 3: 'vtt', + } _TESTS = [ { 'url': 'kaltura:269692:1_1jc2y3e4', @@ -285,9 +291,12 @@ class KalturaIE(InfoExtractor): # Continue if caption is not ready if f.get('status') != 2: continue + if not caption.get('id'): + continue + caption_format = int_or_none(caption.get('format')) subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), - 'ext': caption.get('fileExt', 'ttml'), + 'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml', }) return { From b1e676fde81d33116f6739006d9aa0b68eebc072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 21:28:02 +0700 Subject: [PATCH 125/218] [twitch] Modernize --- youtube_dl/extractor/twitch.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 890f55180..4b5b2030c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -20,7 +20,6 @@ from ..utils import ( orderedSet, parse_duration, parse_iso8601, - sanitized_Request, urlencode_postdata, ) @@ -50,8 +49,8 @@ class TwitchBaseIE(InfoExtractor): for cookie in self._downloader.cookiejar: if cookie.name == 'api_token': headers['Twitch-Api-Token'] = cookie.value - request = sanitized_Request(url, headers=headers) - response = super(TwitchBaseIE, self)._download_json(request, video_id, note) + response = super(TwitchBaseIE, self)._download_json( + url, video_id, note, 
headers=headers) self._handle_error(response) return response @@ -82,11 +81,10 @@ class TwitchBaseIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(redirect_url, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', redirect_url) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={'Referer': redirect_url}) error_message = self._search_regex( r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>', From e3f6b569096ba6faa8de230333849817c8b31a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 22:09:29 +0700 Subject: [PATCH 126/218] [twitch] Refactor API calls --- youtube_dl/extractor/twitch.py | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4b5b2030c..f0a9370c8 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -41,7 +41,7 @@ class TwitchBaseIE(InfoExtractor): '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), expected=True) - def _download_json(self, url, video_id, note='Downloading JSON metadata'): + def _call_api(self, path, item_id, note): headers = { 'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2', 'X-Requested-With': 'XMLHttpRequest', @@ -49,8 +49,8 @@ class TwitchBaseIE(InfoExtractor): for cookie in self._downloader.cookiejar: if cookie.name == 'api_token': headers['Twitch-Api-Token'] = cookie.value - response = super(TwitchBaseIE, self)._download_json( - url, video_id, note, headers=headers) + response = self._download_json( + '%s/%s' % (self._API_BASE, path), item_id, note) self._handle_error(response) return response @@ -107,14 +107,14 @@ class TwitchBaseIE(InfoExtractor): 
class TwitchItemBaseIE(TwitchBaseIE): def _download_info(self, item, item_id): - return self._extract_info(self._download_json( - '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id, + return self._extract_info(self._call_api( + 'kraken/videos/%s%s' % (item, item_id), item_id, 'Downloading %s info JSON' % self._ITEM_TYPE)) def _extract_media(self, item_id): info = self._download_info(self._ITEM_SHORTCUT, item_id) - response = self._download_json( - '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id, + response = self._call_api( + 'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id, 'Downloading %s playlist JSON' % self._ITEM_TYPE) entries = [] chunks = response['chunks'] @@ -244,8 +244,8 @@ class TwitchVodIE(TwitchItemBaseIE): item_id = self._match_id(url) info = self._download_info(self._ITEM_SHORTCUT, item_id) - access_token = self._download_json( - '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, + access_token = self._call_api( + 'api/vods/%s/access_token' % item_id, item_id, 'Downloading %s access token' % self._ITEM_TYPE) formats = self._extract_m3u8_formats( @@ -273,12 +273,12 @@ class TwitchVodIE(TwitchItemBaseIE): class TwitchPlaylistBaseIE(TwitchBaseIE): - _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE + _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d' _PAGE_LIMIT = 100 def _extract_playlist(self, channel_id): - info = self._download_json( - '%s/kraken/channels/%s' % (self._API_BASE, channel_id), + info = self._call_api( + 'kraken/channels/%s' % channel_id, channel_id, 'Downloading channel info JSON') channel_name = info.get('display_name') or info.get('name') entries = [] @@ -287,8 +287,8 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): broken_paging_detected = False counter_override = None for counter in itertools.count(1): - response = self._download_json( - self._PLAYLIST_URL % (channel_id, offset, limit), + response = 
self._call_api( + self._PLAYLIST_PATH % (channel_id, offset, limit), channel_id, 'Downloading %s videos JSON page %s' % (self._PLAYLIST_TYPE, counter_override or counter)) @@ -343,7 +343,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): IE_NAME = 'twitch:past_broadcasts' _VALID_URL = r'%s/(?P<id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true' + _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcasts=true' _PLAYLIST_TYPE = 'past broadcasts' _TEST = { @@ -387,8 +387,8 @@ class TwitchStreamIE(TwitchBaseIE): def _real_extract(self, url): channel_id = self._match_id(url) - stream = self._download_json( - '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id, + stream = self._call_api( + 'kraken/streams/%s' % channel_id, channel_id, 'Downloading stream JSON').get('stream') # Fallback on profile extraction if stream is offline @@ -403,8 +403,8 @@ class TwitchStreamIE(TwitchBaseIE): # JSON and fallback to lowercase if it's not available. 
channel_id = stream.get('channel', {}).get('name') or channel_id.lower() - access_token = self._download_json( - '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id, + access_token = self._call_api( + 'api/channels/%s/access_token' % channel_id, channel_id, 'Downloading channel access token') query = { From efe470e2614d8a50a5cc2d14551e9bc4fc41cc8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 21 Aug 2016 22:45:50 +0700 Subject: [PATCH 127/218] [twitch] Renew authentication --- youtube_dl/extractor/twitch.py | 38 +++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index f0a9370c8..359a8859c 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,6 +7,7 @@ import random from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_parse_qs, compat_str, compat_urllib_parse_urlencode, @@ -14,6 +15,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, ExtractorError, int_or_none, js_to_json, @@ -62,9 +64,17 @@ class TwitchBaseIE(InfoExtractor): if username is None: return + def fail(message): + raise ExtractorError( + 'Unable to login. 
Twitch said: %s' % message, expected=True) + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') + # Some TOR nodes and public proxies are blocked completely + if 'blacklist_message' in login_page: + fail(clean_html(login_page)) + login_form = self._hidden_inputs(login_page) login_form.update({ @@ -81,20 +91,24 @@ class TwitchBaseIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(redirect_url, post_url) - response = self._download_webpage( - post_url, None, 'Logging in as %s' % username, - data=urlencode_postdata(login_form), - headers={'Referer': redirect_url}) + headers = {'Referer': redirect_url} - error_message = self._search_regex( - r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>', - response, 'error message', default=None) - if error_message: - raise ExtractorError( - 'Unable to login. Twitch said: %s' % error_message, expected=True) + try: + response = self._download_json( + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + fail(response['message']) + raise - if '>Reset your password<' in response: - self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit') + if response.get('redirect'): + self._download_webpage( + response['redirect'], None, 'Downloading login redirect page', + headers=headers) def _prefer_source(self, formats): try: From 9b8c554ea70ee970009de2628bafe7fd7390bf9e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 21 Aug 2016 17:55:47 +0100 Subject: [PATCH 128/218] [firsttv] fix extraction(closes #9249) --- youtube_dl/extractor/firsttv.py | 133 ++++++++++---------------------- 1 file changed, 39 insertions(+), 94 deletions(-) diff --git 
a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 88bca1007..af7de10b7 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,130 +2,75 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_xpath +from ..compat import compat_urlparse from ..utils import ( int_or_none, qualities, unified_strdate, - xpath_attr, - xpath_element, - xpath_text, - xpath_with_ns, ) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ - # single format via video_materials.json API - 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': '82a2777648acae812d58b3f5bd42882b', + 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', + 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', 'info_dict': { - 'id': '35930', + 'id': '40049', 'ext': 'mp4', 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', - 'description': 'md5:357933adeede13b202c7c21f91b871b2', + 'description': 'md5:36a39c1d19618fec57d12efe212a8370', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', 'upload_date': '20150212', 'duration': 2694, }, }, { - # multiple formats via video_materials.json API - 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', - 'info_dict': { - 'id': '113641', - 'ext': 'mp4', - 'title': 'Весенняя аллергия. Доброе утро. 
Фрагмент выпуска от 07.04.2016', - 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'upload_date': '20160407', - 'duration': 179, - 'formats': 'mincount:3', - }, - 'params': { - 'skip_download': True, - }, - }, { - # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API - 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', - 'md5': '519d306c5b5669761fd8906c39dbee23', - 'info_dict': { - 'id': '47038', - 'ext': 'mp4', - 'title': '"Побег". Второй сезон. 3 серия', - 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'upload_date': '20120516', - 'duration': 3080, - }, - }, { - 'url': 'http://www.1tv.ru/videoarchive/9967', - 'only_matching': True, + 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', + 'only_matching': 'true', }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - # Videos with multiple formats only available via this API - video = self._download_json( - 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, - video_id, fatal=False) - - description, thumbnail, upload_date, duration = [None] * 4 - - if video: - item = video[0] - title = item['title'] - quality = qualities(('ld', 'sd', 'hd', )) - formats = [{ - 'url': f['src'], - 'format_id': f.get('name'), - 'quality': quality(f.get('name')), - } for f in item['mbr'] if f.get('src')] - thumbnail = item.get('poster') - else: - # Some videos are not available via video_materials.json - video = self._download_xml( - 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, - video_id) - - NS_MAP = { - 'media': 'http://search.yahoo.com/mrss/', - } - - item = xpath_element(video, './channel/item', fatal=True) - title = xpath_text(item, './title', fatal=True) - formats = [{ - 'url': content.attrib['url'], - } for content in 
item.findall( - compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] - thumbnail = xpath_attr( - item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + webpage = self._download_webpage(url, display_id) + playlist_url = compat_urlparse.urljoin(url, self._search_regex( + r'data-playlist-url="([^"]+)', webpage, 'playlist url')) + item = self._download_json(playlist_url, display_id)[0] + video_id = item['id'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [] + for f in item.get('mbr', []): + src = f.get('src') + if not src: + continue + fname = f.get('name') + formats.append({ + 'url': src, + 'format_id': fname, + 'quality': quality(fname), + }) self._sort_formats(formats) - webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) - if webpage: - title = self._html_search_regex( - (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', - r"'title'\s*:\s*'([^']+)'"), - webpage, 'title', default=None) or title - description = self._html_search_regex( - r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', - webpage, 'description', default=None) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - duration = int_or_none(self._html_search_meta( - 'video:duration', webpage, 'video duration', fatal=False)) - upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) + title = self._html_search_regex( + (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or item['title'] + description = self._html_search_regex( + r'<div class="descr">\s*<div> </div>\s*<p>([^<]*)</p></div>', + webpage, 'description', default=None) or self._html_search_meta( + 'description', webpage, 'description') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', 
fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'thumbnail': thumbnail, + 'thumbnail': item.get('poster') or self._og_search_thumbnail(webpage), 'title': title, 'description': description, 'upload_date': upload_date, From 526656726b13f47a33c36e56821136b90d6decf1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 22 Aug 2016 02:06:47 +0800 Subject: [PATCH 129/218] [charlierose] Simplify and improve --- youtube_dl/extractor/charlierose.py | 33 +++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index ba1d1b833..817f7128f 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -6,20 +6,25 @@ from ..utils import remove_end class CharlieRoseIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', + 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', 'info_dict': { 'id': '27996', 'ext': 'mp4', 'title': 'Remembering Zaha Hadid', 'thumbnail': 're:^https?://.*\.jpg\?\d+', 'description': 'We revisit past conversations with Zaha Hadid, in memory of the world renowned Iraqi architect.', + 'subtitles': { + 'en': [{ + 'ext': 'vtt', + }], + }, }, - 'params': { - # m3u8 download - 'skip_download': True, - } - } + }, { + 'url': 'https://charlierose.com/videos/27996', + 'only_matching': True, + }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' @@ -29,17 +34,17 @@ class CharlieRoseIE(InfoExtractor): title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') - entries = self._parse_html5_media_entries(self._PLAYER_BASE % video_id, webpage, video_id)[0] - formats = entries['formats'] + info_dict = self._parse_html5_media_entries( + self._PLAYER_BASE % video_id, webpage, 
video_id)[0] - self._sort_formats(formats) - self._remove_duplicate_formats(formats) + self._sort_formats(info_dict['formats']) + self._remove_duplicate_formats(info_dict['formats']) - return { + info_dict.update({ 'id': video_id, 'title': title, - 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), 'description': self._og_search_description(webpage), - 'subtitles': entries.get('subtitles'), - } + }) + + return info_dict From d0fa172e5fc1d676834252dcd395ec495b20b0bc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 21 Aug 2016 19:11:51 +0100 Subject: [PATCH 130/218] [firsttv] keep a test videos with multiple formats --- youtube_dl/extractor/firsttv.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index af7de10b7..332d12020 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -16,6 +16,7 @@ class FirstTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ + # single format 'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015', 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', 'info_dict': { @@ -28,8 +29,21 @@ class FirstTVIE(InfoExtractor): 'duration': 2694, }, }, { + # multiple formats 'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016', - 'only_matching': 'true', + 'info_dict': { + 'id': '364746', + 'ext': 'mp4', + 'title': 'Весенняя аллергия. Доброе утро. 
Фрагмент выпуска от 07.04.2016', + 'description': 'md5:a242eea0031fd180a4497d52640a9572', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): From ad120ae1c57fe3ff0c7f5559d280cb8230a2b38c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 22 Aug 2016 02:18:46 +0800 Subject: [PATCH 131/218] [extractor/common] Change the default m3u8 protocol in HTML5 Helper functions should have consistent default values --- ChangeLog | 1 + youtube_dl/extractor/charlierose.py | 3 ++- youtube_dl/extractor/common.py | 6 +++--- youtube_dl/extractor/snotr.py | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index a8d8d05a3..383ff59ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,7 @@ Core * Fix js_to_json(): correct octal or hexadecimal number detection Extractors ++ [charlierose] Add new extractor (#10382) + [litv] Support 'promo' URLs (#10385) * [snotr] Fix extraction (#10338) * [n-tv.de] Fix extraction (#10331) diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 817f7128f..4bf2cf7b0 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -35,7 +35,8 @@ class CharlieRoseIE(InfoExtractor): title = remove_end(self._og_search_title(webpage), ' - Charlie Rose') info_dict = self._parse_html5_media_entries( - self._PLAYER_BASE % video_id, webpage, video_id)[0] + self._PLAYER_BASE % video_id, webpage, video_id, + m3u8_entry_protocol='m3u8_native')[0] self._sort_formats(info_dict['formats']) self._remove_duplicate_formats(info_dict['formats']) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 07d58afe7..ba4c03d3d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1695,7 +1695,7 @@ class InfoExtractor(object): 
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1715,8 +1715,8 @@ class InfoExtractor(object): if determine_ext(full_url) == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( - full_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=m3u8_id) + full_url, video_id, ext='mp4', + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) else: is_plain_url = True formats = [{ diff --git a/youtube_dl/extractor/snotr.py b/youtube_dl/extractor/snotr.py index 3bb78cb84..4819fe5b4 100644 --- a/youtube_dl/extractor/snotr.py +++ b/youtube_dl/extractor/snotr.py @@ -46,7 +46,8 @@ class SnotrIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] + info_dict = self._parse_html5_media_entries( + url, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] view_count = str_to_int(self._html_search_regex( r'<p[^>]*>\s*<strong[^>]*>Views:</strong>\s*<span[^>]*>([\d,\.]+)', From cf143c4d977915c993f4aa467b491a6c284bb569 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 22 Aug 2016 03:31:33 +0700 Subject: [PATCH 132/218] [ivi] Add support for 720p and 1080p --- youtube_dl/extractor/ivi.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index 472d72b4c..f5ab5f4af 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -1,4 +1,4 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -8,7 +8,7 @@ from .common 
import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, + qualities, ) @@ -49,11 +49,27 @@ class IviIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, 'skip': 'Only works from Russia', + }, + { + # with MP4-HD720 format + 'url': 'http://www.ivi.ru/watch/146500', + 'md5': 'd63d35cdbfa1ea61a5eafec7cc523e1e', + 'info_dict': { + 'id': '146500', + 'ext': 'mp4', + 'title': 'Кукла', + 'description': 'md5:ffca9372399976a2d260a407cc74cce6', + 'duration': 5599, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'skip': 'Only works from Russia', } ] # Sorted by quality - _KNOWN_FORMATS = ['MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', 'MP4-SHQ'] + _KNOWN_FORMATS = ( + 'MP4-low-mobile', 'MP4-mobile', 'FLV-lo', 'MP4-lo', 'FLV-hi', 'MP4-hi', + 'MP4-SHQ', 'MP4-HD720', 'MP4-HD1080') def _real_extract(self, url): video_id = self._match_id(url) @@ -69,10 +85,9 @@ class IviIE(InfoExtractor): ] } - request = sanitized_Request( - 'http://api.digitalaccess.ru/api/json/', json.dumps(data)) video_json = self._download_json( - request, video_id, 'Downloading video JSON') + 'http://api.digitalaccess.ru/api/json/', video_id, + 'Downloading video JSON', data=json.dumps(data)) if 'error' in video_json: error = video_json['error'] @@ -84,11 +99,13 @@ class IviIE(InfoExtractor): result = video_json['result'] + quality = qualities(self._KNOWN_FORMATS) + formats = [{ 'url': x['url'], - 'format_id': x['content_format'], - 'preference': self._KNOWN_FORMATS.index(x['content_format']), - } for x in result['files'] if x['content_format'] in self._KNOWN_FORMATS] + 'format_id': x.get('content_format'), + 'quality': quality(x.get('content_format')), + } for x in result['files'] if x.get('url')] self._sort_formats(formats) From 3d897cc791781430f371da98f2f3a05a0b856c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 22 Aug 2016 03:34:27 +0700 Subject: [PATCH 133/218] [ivi] Fix episode number 
extraction --- youtube_dl/extractor/ivi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py index f5ab5f4af..7c8cb21c2 100644 --- a/youtube_dl/extractor/ivi.py +++ b/youtube_dl/extractor/ivi.py @@ -132,7 +132,7 @@ class IviIE(InfoExtractor): webpage, 'season number', default=None)) episode_number = int_or_none(self._search_regex( - r'<meta[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)', + r'[^>]+itemprop="episode"[^>]*>\s*<meta[^>]+itemprop="episodeNumber"[^>]+content="(\d+)', webpage, 'episode number', default=None)) description = self._og_search_description(webpage, default=None) or self._html_search_meta( From afbab5688e837d9b1617119b1ac26b4a4e343bed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 22 Aug 2016 04:15:46 +0700 Subject: [PATCH 134/218] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 383ff59ea..ee9b9500f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,16 +1,28 @@ version <unreleased> Core -+ Recognize file size strings with full unit names (for example "8.5 - megabytes") -+ Support m3u8 manifests in HTML5 multimedia tags -* Fix js_to_json(): correct octal or hexadecimal number detection +* Improve formats and subtitles extension auto calculation ++ Recognize full unit names in parse_filesize ++ Add support for m3u8 manifests in HTML5 multimedia tags +* Fix octal/hexadecimal number detection in js_to_json Extractors ++ [ivi] Add support for 720p and 1080p + [charlierose] Add new extractor (#10382) +* [1tv] Fix extraction (#9249) +* [twitch] Renew authentication +* [kaltura] Improve subtitles extension calculation ++ [zingmp3] Add support for video clips +* [zingmp3] Fix extraction (#10041) +* [kaltura] Improve subtitles extraction (#10279) +* [cultureunplugged] Fix extraction (#10330) ++ [cnn] 
Add support for money.cnn.com (#2797) +* [cbsnews] Fix extraction (#10362) +* [cbs] Fix extraction (#10393) + [litv] Support 'promo' URLs (#10385) * [snotr] Fix extraction (#10338) * [n-tv.de] Fix extraction (#10331) +* [globo:article] Relax URL and video id regular expressions (#10379) version 2016.08.19 From 6d2679ee26eb6ad0587d01e40ca7a17a6edd6e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 22 Aug 2016 04:17:34 +0700 Subject: [PATCH 135/218] release 2016.08.22 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7af3c7099..7dcca18a1 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.19*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.19** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.19 +[debug] youtube-dl version 2016.08.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ee9b9500f..a8202d3de 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2016.08.22 Core * Improve formats and subtitles extension auto calculation diff --git a/docs/supportedsites.md b/docs/supportedsites.md index edf192138..ca96d2b07 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -121,6 +121,7 @@ - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 + - **CharlieRose** - **Chaturbate** - **Chilloutzone** - **chirbit** @@ -893,5 +894,4 @@ - **Zapiks** - **ZDF** - **ZDFChannel** - - **zingmp3:album**: mp3.zing.vn albums - - **zingmp3:song**: mp3.zing.vn songs + - **zingmp3**: mp3.zing.vn diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 691f2c591..e33d32e97 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.19' +__version__ = '2016.08.22' From 55d119e2a10ccbfadc12b9af30c495f46874c2a3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 22 Aug 2016 00:06:39 +0100 Subject: [PATCH 136/218] [abc:iview] Add new 
extractor(closes #6148) --- youtube_dl/extractor/abc.py | 63 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 ++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index b584277be..879ded88d 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -7,6 +7,8 @@ from ..utils import ( ExtractorError, js_to_json, int_or_none, + update_url_query, + parse_iso8601, ) @@ -93,3 +95,64 @@ class ABCIE(InfoExtractor): 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), } + + +class ABCIViewIE(InfoExtractor): + IE_NAME = 'abc.net.au:iview' + _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P<id>[^/?#]+)' + + _TESTS = [{ + 'url': 'http://iview.abc.net.au/programs/gardening-australia/FA1505V024S00', + 'md5': '979d10b2939101f0d27a06b79edad536', + 'info_dict': { + 'id': 'FA1505V024S00', + 'ext': 'mp4', + 'title': 'Series 27 Ep 24', + 'description': 'md5:b28baeae7504d1148e1d2f0e3ed3c15d', + 'upload_date': '20160820', + 'uploader_id': 'abc1', + 'timestamp': 1471719600, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_params = self._parse_json(self._search_regex( + r'videoParams\s*=\s*({.+?});', webpage, 'video params'), video_id) + title = video_params['title'] + stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') + + formats = [] + f4m_url = stream.get('hds-unmetered') or stream['hds-metered'] + formats.extend(self._extract_f4m_formats( + update_url_query(f4m_url, {'hdcore': '3.7.0'}), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_m3u8_formats(f4m_url.replace( + 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + subtitles = {} + src_vtt = 
stream.get('captions', {}).get('src-vtt') + if src_vtt: + subtitles['en'] = [{ + 'url': src_vtt, + 'ext': 'vtt', + }] + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image:src'], webpage), + 'duration': int_or_none(video_params.get('eventDuration')), + 'timestamp': parse_iso8601(video_params.get('pubDate'), ' '), + 'series': video_params.get('seriesTitle'), + 'series_id': video_params.get('seriesHouseNumber') or video_id[:7], + 'episode_number': int_or_none(self._html_search_meta('episodeNumber', webpage)), + 'episode': self._html_search_meta('episode_title', webpage), + 'uploader_id': video_params.get('channel'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b0644be11..8e405ad72 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1,7 +1,10 @@ # flake8: noqa from __future__ import unicode_literals -from .abc import ABCIE +from .abc import ( + ABCIE, + ABCIViewIE, +) from .abc7news import Abc7NewsIE from .abcnews import ( AbcNewsIE, From 96229e5f95a5be622a694b464085bdea59134ccf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 22 Aug 2016 13:56:09 +0800 Subject: [PATCH 137/218] [mtvservices:embedded] Update config URL All starts from #10363. The test case in mtvservices:embedded uses config.xml, while the video from #10363 and the test case in generic.py is broken. Both uses index.html for fetching the feed URL. 
--- youtube_dl/extractor/mtv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 2f455680e..200f340de 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -257,8 +257,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) site_id = uri.replace(video_id, '') - config_url = ('http://media.mtvnservices.com/pmt/e1/players/{0}/' - 'context4/context5/config.xml'.format(site_id)) + config_url = ('http://media.mtvnservices.com/pmt-arc/e1/players/{0}/' + 'context52/config.xml'.format(site_id)) config_doc = self._download_xml(config_url, video_id) feed_node = config_doc.find('.//feed') feed_url = feed_node.text.strip().split('?')[0] From c7c43a93ba4abbd2175ab0891b63def7e25aa385 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 22 Aug 2016 07:47:25 +0100 Subject: [PATCH 138/218] [common] add helper method to extract akamai m3u8 and f4m formats --- youtube_dl/extractor/common.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ba4c03d3d..8ed16deee 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1765,6 +1765,18 @@ class InfoExtractor(object): entries.append(media_info) return entries + def _extract_akamai_formats(self, manifest_url, video_id): + formats = [] + f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + formats.extend(self._extract_f4m_formats( + update_url_query(f4m_url, {'hdcore': '3.7.0'}), + video_id, f4m_id='hds', fatal=False)) + m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + return formats + def 
_live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() From ad316425840315b40405a55243635fcfbcae5f19 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 22 Aug 2016 07:48:40 +0100 Subject: [PATCH 139/218] [nrk,abc:iview] use _extract_akamai_formats --- youtube_dl/extractor/abc.py | 10 +--------- youtube_dl/extractor/nrk.py | 14 ++------------ 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 879ded88d..c7b6df7d0 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -7,7 +7,6 @@ from ..utils import ( ExtractorError, js_to_json, int_or_none, - update_url_query, parse_iso8601, ) @@ -123,14 +122,7 @@ class ABCIViewIE(InfoExtractor): title = video_params['title'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - formats = [] - f4m_url = stream.get('hds-unmetered') or stream['hds-metered'] - formats.extend(self._extract_f4m_formats( - update_url_query(f4m_url, {'hdcore': '3.7.0'}), - video_id, f4m_id='hds', fatal=False)) - formats.extend(self._extract_m3u8_formats(f4m_url.replace( - 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 6ded5bd45..ed42eb301 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -14,16 +14,6 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - def _extract_formats(self, manifest_url, video_id, fatal=True): - formats = [] - formats.extend(self._extract_f4m_formats( - manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', - video_id, f4m_id='hds', fatal=fatal)) - formats.extend(self._extract_m3u8_formats(manifest_url.replace( - 
'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), - video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal)) - return formats - def _real_extract(self, url): video_id = self._match_id(url) @@ -45,7 +35,7 @@ class NRKBaseIE(InfoExtractor): asset_url = asset.get('url') if not asset_url: continue - formats = self._extract_formats(asset_url, video_id, fatal=False) + formats = self._extract_akamai_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) @@ -69,7 +59,7 @@ class NRKBaseIE(InfoExtractor): if not entries: media_url = data.get('mediaUrl') if media_url: - formats = self._extract_formats(media_url, video_id) + formats = self._extract_akamai_formats(media_url, video_id) self._sort_formats(formats) duration = parse_duration(data.get('duration')) entries = [{ From 7367bdef23a3db4691ba99f01613b7759340f05e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 22 Aug 2016 23:10:06 +0100 Subject: [PATCH 140/218] [awaan] fix extraction, modernize, rename the extractors and add test for live stream --- youtube_dl/extractor/{dcn.py => awaan.py} | 89 +++++++++++------------ youtube_dl/extractor/extractors.py | 12 +-- 2 files changed, 50 insertions(+), 51 deletions(-) rename youtube_dl/extractor/{dcn.py => awaan.py} (75%) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/awaan.py similarity index 75% rename from youtube_dl/extractor/dcn.py rename to youtube_dl/extractor/awaan.py index b8542820a..bdf23c6a9 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/awaan.py @@ -12,46 +12,41 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, - sanitized_Request, smuggle_url, unsmuggle_url, urlencode_postdata, ) -class DCNIE(InfoExtractor): +class AWAANIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' 
def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() if video_id and int(video_id) > 0: return self.url_result( - 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo') + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo') elif season_id and int(season_id) > 0: return self.url_result(smuggle_url( - 'http://www.dcndigital.ae/program/season/%s' % season_id, - {'show_id': show_id}), 'DCNSeason') + 'http://awaan.ae/program/season/%s' % season_id, + {'show_id': show_id}), 'AWAANSeason') else: return self.url_result( - 'http://www.dcndigital.ae/program/%s' % show_id, 'DCNSeason') + 'http://awaan.ae/program/%s' % show_id, 'AWAANSeason') -class DCNBaseIE(InfoExtractor): - def _extract_video_info(self, video_data, video_id, is_live): +class AWAANBaseIE(InfoExtractor): + def _parse_video_data(self, video_data, video_id, is_live): title = video_data.get('title_en') or video_data['title_ar'] img = video_data.get('img') - thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None - duration = int_or_none(video_data.get('duration')) - description = video_data.get('description_en') or video_data.get('description_ar') - timestamp = parse_iso8601(video_data.get('create_time'), ' ') return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': video_data.get('description_en') or video_data.get('description_ar'), + 'thumbnail': 'http://admin.mangomolo.com/analytics/%s' % img if img else None, + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, } @@ -75,11 +70,12 @@ class DCNBaseIE(InfoExtractor): return formats -class DCNVideoIE(DCNBaseIE): - IE_NAME = 'dcn:video' +class AWAANVideoIE(AWAANBaseIE): + IE_NAME = 'awaan:video' _VALID_URL = 
r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', + 'md5': '5f61c33bfc7794315c671a62d43116aa', 'info_dict': { 'id': '17375', @@ -90,10 +86,6 @@ class DCNVideoIE(DCNBaseIE): 'timestamp': 1227504126, 'upload_date': '20081124', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', 'only_matching': True, @@ -102,11 +94,10 @@ class DCNVideoIE(DCNBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - request = sanitized_Request( + video_data = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - video_data = self._download_json(request, video_id) - info = self._extract_video_info(video_data, video_id, False) + video_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(video_data, video_id, False) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' 
+ @@ -121,19 +112,31 @@ class DCNVideoIE(DCNBaseIE): return info -class DCNLiveIE(DCNBaseIE): - IE_NAME = 'dcn:live' +class AWAANLiveIE(AWAANBaseIE): + IE_NAME = 'awaan:live' _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' + _TEST = { + 'url': 'http://awaan.ae/live/6/dubai-tv', + 'info_dict': { + 'id': '6', + 'ext': 'mp4', + 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'upload_date': '20150107', + 'timestamp': 1420588800, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } def _real_extract(self, url): channel_id = self._match_id(url) - request = sanitized_Request( + channel_data = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - - channel_data = self._download_json(request, channel_id) - info = self._extract_video_info(channel_data, channel_id, True) + channel_id, headers={'Origin': 'http://awaan.ae'}) + info = self._parse_video_data(channel_data, channel_id, True) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
+ @@ -148,8 +151,8 @@ class DCNLiveIE(DCNBaseIE): return info -class DCNSeasonIE(InfoExtractor): - IE_NAME = 'dcn:season' +class AWAANSeasonIE(InfoExtractor): + IE_NAME = 'awaan:season' _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', @@ -170,21 +173,17 @@ class DCNSeasonIE(InfoExtractor): data['season'] = season_id show_id = smuggled_data.get('show_id') if show_id is None: - request = sanitized_Request( + season = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - season = self._download_json(request, season_id) + season_id, headers={'Origin': 'http://awaan.ae'}) show_id = season['id'] data['show_id'] = show_id - request = sanitized_Request( + show = self._download_json( 'http://admin.mangomolo.com/analytics/index.php/plus/show', - urlencode_postdata(data), - { - 'Origin': 'http://www.dcndigital.ae', + show_id, data=urlencode_postdata(data), headers={ + 'Origin': 'http://awaan.ae', 'Content-Type': 'application/x-www-form-urlencoded' }) - - show = self._download_json(request, show_id) if not season_id: season_id = show['default_season'] for season in show['seasons']: @@ -195,6 +194,6 @@ class DCNSeasonIE(InfoExtractor): for video in show['videos']: video_id = compat_str(video['id']) entries.append(self.url_result( - 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo', video_id)) + 'http://awaan.ae/media/%s' % video_id, 'AWAANVideo', video_id)) return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8e405ad72..04cd23bdb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -71,6 
+71,12 @@ from .atttechchannel import ATTTechChannelIE from .audimedia import AudiMediaIE from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE @@ -200,12 +206,6 @@ from .daum import ( DaumUserIE, ) from .dbtv import DBTVIE -from .dcn import ( - DCNIE, - DCNVideoIE, - DCNLiveIE, - DCNSeasonIE, -) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE From 3083e4dc070d6378456f9b20ebd5cbf9ee9d92af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 23 Aug 2016 07:22:14 +0700 Subject: [PATCH 141/218] [eagleplatform] Improve detection of embedded videos (Closes #10409) --- youtube_dl/extractor/eagleplatform.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 12d28d3b9..d4dfda8cd 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -52,11 +52,24 @@ class EaglePlatformIE(InfoExtractor): @staticmethod def _extract_url(webpage): + # Regular iframe embedding mobj = re.search( r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', webpage) if mobj is not None: return mobj.group('url') + # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + <script[^>]+ + src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) + .+? 
+ <div[^>]+ + class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+ + data-id=["\'](?P<id>\d+) + ''', webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() @staticmethod def _handle_error(response): From fb009b7f534e600e98b93e062198ade5826b5800 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 23 Aug 2016 10:28:28 +0100 Subject: [PATCH 142/218] [bravotv] correct clip info extraction and add support for adobe pass auth(closes #10407) --- youtube_dl/extractor/bravotv.py | 81 +++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 541c76944..31763b4c6 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -1,31 +1,74 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import smuggle_url +from .adobepass import AdobePassIE +from ..utils import ( + smuggle_url, + update_url_query, + int_or_none, +) -class BravoTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P<id>[^/?]+)' - _TEST = { +class BravoTVIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _TESTS = [{ 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', - 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'md5': '9086d0b7ef0ea2aabc4781d75f4e5863', 'info_dict': { - 'id': 'LitrBdX64qLn', + 'id': 'zHyk1_HU_mPy', 'ext': 'mp4', - 'title': 'Last Chance Kitchen Returns', - 'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13', - 'timestamp': 1448926740, - 'upload_date': '20151130', + 'title': 'LCK Ep 12: Fishy Finale', + 'description': 'S13/E12: Two eliminated chefs have just 12 minutes to cook up a delicious fish dish.', 'uploader': 'NBCU-BRAV', + 'upload_date': '20160302', + 'timestamp': 1456945320, } - } + }, { + 'url': 
'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') - release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') - return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), - {'force_smil_url': True}), 'ThePlatform', release_pid) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + settings = self._parse_json(self._search_regex( + r'jQuery\.extend\([^,]+,\s*({.+})\);', webpage, 'drupal settings'), + display_id) + info = {} + query = { + 'mbr': 'true', + } + account_pid, release_pid = [None] * 2 + tve = settings.get('sharedTVE') + if tve: + query['manifest'] = 'm3u' + account_pid = 'HNK2IC' + release_pid = tve['release_pid'] + if tve.get('entitlement') == 'auth': + adobe_pass = settings.get('adobePass', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'bravo'), + tve['title'], release_pid, tve.get('rating')) + query['auth'] = self._extract_mvpd_auth( + url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + else: + shared_playlist = settings['shared_playlist'] + account_pid = shared_playlist['account_pid'] + metadata = shared_playlist['video_metadata'][shared_playlist['default_clip']] + release_pid = metadata['release_pid'] + info.update({ + 'title': metadata['title'], + 'description': metadata.get('description'), + 'season_number': int_or_none(metadata.get('season_num')), + 'episode_number': int_or_none(metadata.get('episode_num')), + }) + query['switch'] = 'progressive' + info.update({ + '_type': 'url_transparent', + 'id': release_pid, + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % 
(account_pid, release_pid), + query), {'force_smil_url': True}), + 'ie_key': 'ThePlatform', + }) + return info From 18b6216150fa39d5e3cdbf353339e1c010bcee8d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 23 Aug 2016 21:55:58 +0800 Subject: [PATCH 143/218] [openload] Fix extraction (closes #10408) Thanks @yokrysty for the algorithm --- ChangeLog | 6 +++ youtube_dl/extractor/openload.py | 81 +++++++------------------------- 2 files changed, 24 insertions(+), 63 deletions(-) diff --git a/ChangeLog b/ChangeLog index a8202d3de..651d4d5d7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [openload] Fix extraction (#10408) + + version 2016.08.22 Core diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 4e80ca9ff..e181d0b3a 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,12 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals, division -import math - from .common import InfoExtractor -from ..compat import compat_chr +from ..compat import ( + compat_chr, + compat_ord, +) from ..utils import ( - decode_png, determine_ext, ExtractorError, ) @@ -42,71 +42,26 @@ class OpenloadIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) - if 'File not found' in webpage: + if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True) - # The following extraction logic is proposed by @Belderak and @gdkchan - # and declared to be used freely in youtube-dl - # See https://github.com/rg3/youtube-dl/issues/9706 + # The following decryption algorithm is written by @yokrysty and + # declared to be freely used in youtube-dl + # See https://github.com/rg3/youtube-dl/issues/10408 + enc_data = 
self._html_search_regex( + r'<span[^>]+id="hiddenurl"[^>]*>([^<]+)</span>', webpage, 'encrypted data') - numbers_js = self._download_webpage( - 'https://openload.co/assets/js/obfuscator/n.js', video_id, - note='Downloading signature numbers') - signums = self._search_regex( - r'window\.signatureNumbers\s*=\s*[\'"](?P<data>[a-z]+)[\'"]', - numbers_js, 'signature numbers', group='data') + video_url_chars = [] - linkimg_uri = self._search_regex( - r'<img[^>]+id="linkimg"[^>]+src="([^"]+)"', webpage, 'link image') - linkimg = self._request_webpage( - linkimg_uri, video_id, note=False).read() + for c in enc_data: + j = compat_ord(c) + if j >= 33 and j <= 126: + j = ((j + 14) % 94) + 33 + video_url_chars += compat_chr(j) - width, height, pixels = decode_png(linkimg) - - output = '' - for y in range(height): - for x in range(width): - r, g, b = pixels[y][3 * x:3 * x + 3] - if r == 0 and g == 0 and b == 0: - break - else: - output += compat_chr(r) - output += compat_chr(g) - output += compat_chr(b) - - img_str_length = len(output) // 200 - img_str = [[0 for x in range(img_str_length)] for y in range(10)] - - sig_str_length = len(signums) // 260 - sig_str = [[0 for x in range(sig_str_length)] for y in range(10)] - - for i in range(10): - for j in range(img_str_length): - begin = i * img_str_length * 20 + j * 20 - img_str[i][j] = output[begin:begin + 20] - for j in range(sig_str_length): - begin = i * sig_str_length * 26 + j * 26 - sig_str[i][j] = signums[begin:begin + 26] - - parts = [] - # TODO: find better names for str_, chr_ and sum_ - str_ = '' - for i in [2, 3, 5, 7]: - str_ = '' - sum_ = float(99) - for j in range(len(sig_str[i])): - for chr_idx in range(len(img_str[i][j])): - if sum_ > float(122): - sum_ = float(98) - chr_ = compat_chr(int(math.floor(sum_))) - if sig_str[i][j][chr_idx] == chr_ and j >= len(str_): - sum_ += float(2.5) - str_ += img_str[i][j][chr_idx] - parts.append(str_.replace(',', '')) - - video_url = 'https://openload.co/stream/%s~%s~%s~%s' % 
(parts[3], parts[1], parts[2], parts[0]) + video_url = 'https://openload.co/stream/%s?mime=true' % ''.join(video_url_chars) title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, From ccb6570e9e625ff5e9adf88729e745acadcaff0e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 23 Aug 2016 17:31:08 +0100 Subject: [PATCH 144/218] [syfy,bravotv] restrict drupal settings regex --- youtube_dl/extractor/bravotv.py | 2 +- youtube_dl/extractor/syfy.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 31763b4c6..a25d500e4 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -32,7 +32,7 @@ class BravoTVIE(AdobePassIE): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( - r'jQuery\.extend\([^,]+,\s*({.+})\);', webpage, 'drupal settings'), + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), display_id) info = {} query = { diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index cc81f6003..ab8bab5cd 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -31,7 +31,7 @@ class SyfyIE(AdobePassIE): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) syfy_mpx = list(self._parse_json(self._search_regex( - r'jQuery\.extend\([^,]+,\s*({.+})\);', webpage, 'drupal settings'), + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', webpage, 'drupal settings'), display_id)['syfy']['syfy_mpx'].values())[0] video_id = syfy_mpx['mpxGUID'] title = syfy_mpx['episodeTitle'] From 1212e9972fce69df6bd871a5c301294427299cbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 00:25:21 +0700 Subject: [PATCH 145/218] [youtube] Fix 
authentication (#10392) --- youtube_dl/extractor/youtube.py | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 268080ba6..38556d86e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -91,36 +91,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if login_page is False: return - galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"', - login_page, 'Login GALX parameter') + login_form = self._hidden_inputs(login_page) - # Log in - login_form_strs = { - 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', + login_form.update({ 'Email': username, - 'GALX': galx, 'Passwd': password, - - 'PersistentCookie': 'yes', - '_utf8': '霱', - 'bgresponse': 'js_disabled', - 'checkConnection': '', - 'checkedDomains': 'youtube', - 'dnConn': '', - 'pstMsg': '0', - 'rmShown': '1', - 'secTok': '', - 'signIn': 'Sign in', - 'timeStmp': '', - 'service': 'youtube', - 'uilel': '3', - 'hl': 'en_US', - } + }) login_results = self._download_webpage( self._PASSWORD_CHALLENGE_URL, None, note='Logging in', errnote='unable to log in', fatal=False, - data=urlencode_postdata(login_form_strs)) + data=urlencode_postdata(login_form)) if login_results is False: return False From 05bddcc512cd5058f1af1d5985979b70bdcf4711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 01:29:50 +0700 Subject: [PATCH 146/218] [youtube] Fix authentication (2) (Closes #10392) --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 38556d86e..d5d5b7334 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -94,6 +94,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): login_form = self._hidden_inputs(login_page) 
login_form.update({ + 'checkConnection': 'youtube', 'Email': username, 'Passwd': password, }) From 6e52bbb41320e1b6f4b7a16a5e651d945ac14611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 01:36:27 +0700 Subject: [PATCH 147/218] [ChangeLog] Actualize --- ChangeLog | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ChangeLog b/ChangeLog index 651d4d5d7..07ab5867f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,14 @@ version <unreleased> Extractors +* [youtube] Fix authentication (#10392) * [openload] Fix extraction (#10408) ++ [bravotv] Add support for Adobe Pass (#10407) +* [bravotv] Fix clip info extraction (#10407) +* [eagleplatform] Improve embedded videos detection (#10409) +* [awaan] Fix extraction +* [mtvservices:embedded] Update config URL ++ [abc:iview] Add extractor (#6148) version 2016.08.22 From c86f51ee38b2063ad4eec2f0bb6e3d3551be0855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 01:38:46 +0700 Subject: [PATCH 148/218] release 2016.08.24 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 9 +++++---- youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7dcca18a1..00f593783 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.22** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.24*. 
If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.24** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.22 +[debug] youtube-dl version 2016.08.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 07ab5867f..b4f6dbe08 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2016.08.24 Extractors * [youtube] Fix authentication (#10392) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ca96d2b07..08db56fa9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -16,6 +16,7 @@ - **9gag** - **9now.com.au** - **abc.net.au** + - **abc.net.au:iview** - **Abc7News** - **abcnews** - **abcnews:video** @@ -66,6 +67,10 @@ - **audiomack** - **audiomack:album** - **auroravid**: AuroraVid + - **AWAAN** + - **awaan:live** + - **awaan:season** + - **awaan:video** - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -172,10 +177,6 @@ - **daum.net:playlist** - **daum.net:user** - **DBTV** - - **DCN** - - **dcn:live** - - **dcn:season** - - **dcn:video** - **DctpTv** - **DeezerPlaylist** - **defense.gouv.fr** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 
e33d32e97..c1194124e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.22' +__version__ = '2016.08.24' From 8c3e35dd441ceed682da885368f5cd97afb1816e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 08:41:52 +0700 Subject: [PATCH 149/218] [pluralsight] Add support for subtitles (Closes #9681) --- youtube_dl/extractor/pluralsight.py | 74 ++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 9aab77645..afd3217d9 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -import re -import json -import random import collections +import json +import os +import random +import re from .common import InfoExtractor from ..compat import ( @@ -12,10 +13,12 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, int_or_none, parse_duration, qualities, sanitized_Request, + srt_subtitles_timecode, urlencode_postdata, ) @@ -91,6 +94,51 @@ class PluralsightIE(PluralsightBaseIE): if all(p not in response for p in ('__INITIAL_STATE__', '"currentUser"')): raise ExtractorError('Unable to log in') + def _get_subtitles(self, author, clip_id, lang, name, duration, video_id): + captions_post = { + 'a': author, + 'cn': clip_id, + 'lc': lang, + 'm': name, + } + captions = self._download_json( + '%s/training/Player/Captions' % self._API_BASE, video_id, + 'Downloading captions JSON', 'Unable to download captions JSON', + fatal=False, data=json.dumps(captions_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) + if captions: + return { + lang: [{ + 'ext': 'json', + 'data': json.dumps(captions), + }, { + 'ext': 'srt', + 'data': self._convert_subtitles(duration, captions), + }] + } + + 
@staticmethod + def _convert_subtitles(duration, subs): + srt = '' + for num, current in enumerate(subs): + current = subs[num] + start, text = float_or_none( + current.get('DisplayTimeOffset')), current.get('Text') + if start is None or text is None: + continue + end = duration if num == len(subs) - 1 else float_or_none( + subs[num + 1].get('DisplayTimeOffset')) + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + return srt + def _real_extract(self, url): qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) @@ -138,6 +186,8 @@ class PluralsightIE(PluralsightBaseIE): if not clip: raise ExtractorError('Unable to resolve clip') + title = '%s - %s' % (module['title'], clip['title']) + QUALITIES = { 'low': {'width': 640, 'height': 480}, 'medium': {'width': 848, 'height': 640}, @@ -225,18 +275,20 @@ class PluralsightIE(PluralsightBaseIE): formats.append(f) self._sort_formats(formats) - # TODO: captions - # http://www.pluralsight.com/training/Player/ViewClip + cap = true - # or - # http://www.pluralsight.com/training/Player/Captions - # { a = author, cn = clip_id, lc = end, m = name } + duration = int_or_none( + clip.get('duration')) or parse_duration(clip.get('formattedDuration')) + + # TODO: other languages? 
+ subtitles = self.extract_subtitles( + author, clip_id, 'en', name, duration, display_id) return { 'id': clip.get('clipName') or clip['name'], - 'title': '%s - %s' % (module['title'], clip['title']), - 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), + 'title': title, + 'duration': duration, 'creator': author, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, } From 30317f4887178082809706ce8ac9cb989014c8fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 08:52:12 +0700 Subject: [PATCH 150/218] [pluralsight] Modernize and make more robust --- youtube_dl/extractor/pluralsight.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index afd3217d9..ea5caefa9 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, qualities, - sanitized_Request, srt_subtitles_timecode, urlencode_postdata, ) @@ -78,12 +77,10 @@ class PluralsightIE(PluralsightBaseIE): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) error = self._search_regex( r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', @@ -128,6 +125,8 @@ class PluralsightIE(PluralsightBaseIE): continue end = duration if num == len(subs) - 1 else float_or_none( subs[num + 1].get('DisplayTimeOffset')) + if end is None: + continue srt += os.linesep.join( ( '%d' 
% num, @@ -246,13 +245,12 @@ class PluralsightIE(PluralsightBaseIE): 'mt': ext, 'q': '%dx%d' % (f['width'], f['height']), } - request = sanitized_Request( - '%s/training/Player/ViewClip' % self._API_BASE, - json.dumps(clip_post).encode('utf-8')) - request.add_header('Content-Type', 'application/json;charset=utf-8') format_id = '%s-%s' % (ext, quality) clip_url = self._download_webpage( - request, display_id, 'Downloading %s URL' % format_id, fatal=False) + '%s/training/Player/ViewClip' % self._API_BASE, display_id, + 'Downloading %s URL' % format_id, fatal=False, + data=json.dumps(clip_post).encode('utf-8'), + headers={'Content-Type': 'application/json;charset=utf-8'}) # Pluralsight tracks multiple sequential calls to ViewClip API and start # to return 429 HTTP errors after some time (see From 6d94cbd2f43548575b32907724f48331df1693ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 10:07:06 +0700 Subject: [PATCH 151/218] [ChangeLog] Actualize --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index b4f6dbe08..b63f49ae1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors ++ [pluralsight] Add support for subtitles (#9681) + + version 2016.08.24 Extractors From d38b27dd9b108a7518dd291c5c231a53abd3f2df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 24 Aug 2016 10:11:04 +0700 Subject: [PATCH 152/218] release 2016.08.24.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 00f593783..15acc025a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.24*. 
If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.24.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.24.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.24 +[debug] youtube-dl version 2016.08.24.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index b63f49ae1..4f3f1265f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2016.08.24.1 Extractors + [pluralsight] Add support for subtitles (#9681) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c1194124e..7447d3d7e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.24' +__version__ = '2016.08.24.1' From 97653f81b2565c752f2c107fc44167a93c3eef42 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> 
Date: Wed, 24 Aug 2016 21:18:56 +0800 Subject: [PATCH 153/218] [bilibili] Mark as broken Bilibili now uses emscripten, which is very difficult for reverse engineering. I don't expect it to be fixed in near future, so I mark it as broken. Ref: #10375 --- youtube_dl/extractor/bilibili.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index d8eb71821..d87c38a02 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -21,6 +21,8 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): + _WORKING = False + _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)' _TESTS = [{ From 0c75abbb7bb9135d145805e86c87a5a43b69ac15 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 24 Aug 2016 23:58:22 +0800 Subject: [PATCH 154/218] [mtvservices:embedded] Use another endpoint to get feed URL Closes #10363 In the original mtvservices:embedded test case, config.xml is still used to get the feed URL. Some other examples, including test_Generic_40 (http://www.vulture.com/2016/06/new-key-peele-sketches-released.html), and the video mentioned in #10363, use another endpoint to get the feed URL. The 'index.html' approach works for the original test case, too. So I didn't keep the old approach. 
--- ChangeLog | 6 ++++++ youtube_dl/extractor/bet.py | 5 ++--- youtube_dl/extractor/mtv.py | 27 +++++++++++++-------------- youtube_dl/extractor/nick.py | 5 ++--- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4f3f1265f..c3cc8f38f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) + + version 2016.08.24.1 Extractors diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index bd3ee2e2e..1f8ef0303 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate -from ..compat import compat_urllib_parse_urlencode class BetIE(MTVServicesInfoExtractor): @@ -53,9 +52,9 @@ class BetIE(MTVServicesInfoExtractor): _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'uuid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 200f340de..bdda68819 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_str, compat_xpath, ) @@ -14,12 +13,13 @@ from ..utils import ( fix_xml_ampersands, float_or_none, HEADRequest, + RegexNotFoundError, sanitized_Request, strip_or_none, timeconvert, unescapeHTML, + update_url_query, url_basename, - RegexNotFoundError, xpath_text, ) @@ -36,6 +36,11 @@ class MTVServicesInfoExtractor(InfoExtractor): def _id_from_uri(uri): return uri.split(':')[-1] + @staticmethod + def _remove_template_parameter(url): + # Remove the templates, like &device={device} + return 
re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) + # This was originally implemented for ComedyCentral, but it also works here @classmethod def _transform_rtmp_url(cls, rtmp_video_url): @@ -117,9 +122,7 @@ class MTVServicesInfoExtractor(InfoExtractor): video_id = self._id_from_uri(uri) self.report_extraction(video_id) content_el = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))) - mediagen_url = content_el.attrib['url'] - # Remove the templates, like &device={device} - mediagen_url = re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', mediagen_url) + mediagen_url = self._remove_template_parameter(content_el.attrib['url']) if 'acceptMethods' not in mediagen_url: mediagen_url += '&' if '?' in mediagen_url else '?' mediagen_url += 'acceptMethods=fms' @@ -178,12 +181,12 @@ class MTVServicesInfoExtractor(InfoExtractor): data = {'uri': uri} if self._LANG: data['lang'] = self._LANG - return compat_urllib_parse_urlencode(data) + return data def _get_videos_info(self, uri): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) - info_url = feed_url + '?' 
+ self._get_feed_query(uri) + info_url = update_url_query(feed_url, self._get_feed_query(uri)) return self._get_videos_info_from_url(info_url, video_id) def _get_videos_info_from_url(self, url, video_id): @@ -256,13 +259,9 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): def _get_feed_url(self, uri): video_id = self._id_from_uri(uri) - site_id = uri.replace(video_id, '') - config_url = ('http://media.mtvnservices.com/pmt-arc/e1/players/{0}/' - 'context52/config.xml'.format(site_id)) - config_doc = self._download_xml(config_url, video_id) - feed_node = config_doc.find('.//feed') - feed_url = feed_node.text.strip().split('?')[0] - return feed_url + config = self._download_json( + 'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri, video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 9c54846e1..64730a624 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import update_url_query @@ -59,10 +58,10 @@ class NickIE(MTVServicesInfoExtractor): }] def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ + return { 'feed': 'nick_arc_player_prime', 'mgid': uri, - }) + } def _extract_mgid(self, webpage): return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') From 08773689f37341f8c70c3fd298f5910235b8c151 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 25 Aug 2016 01:29:32 +0800 Subject: [PATCH 155/218] [kickstarter] Silent the warning for og:description Closes #10415 --- youtube_dl/extractor/kickstarter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/kickstarter.py 
b/youtube_dl/extractor/kickstarter.py index 9f1ade2e4..c61e78622 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -37,7 +37,6 @@ class KickStarterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Power Drive 2000', }, - 'expected_warnings': ['OpenGraph description'], }] def _real_extract(self, url): @@ -67,6 +66,6 @@ class KickStarterIE(InfoExtractor): 'id': video_id, 'url': video_url, 'title': title, - 'description': self._og_search_description(webpage), + 'description': self._og_search_description(webpage, default=None), 'thumbnail': thumbnail, } From 0c6422cdd649c6f39cb2d8680e29f91da18d8c57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Aug 2016 07:34:55 +0700 Subject: [PATCH 156/218] [README.md] Add FAQ entry for streaming to player --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a10aaf35c..52e53803e 100644 --- a/README.md +++ b/README.md @@ -730,7 +730,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/). ### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. @@ -816,6 +816,12 @@ Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. N Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. 
YouTube, CloudFlare). +### How do I stream directly to media player? + +You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: + + youtube-dl -o - http://www.youtube.com/watch?v=BaW_jenozKcj | vlc - + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. From 073ac1225f6fe28905e11f29f2d23f4b4db50f9c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 08:33:16 +0100 Subject: [PATCH 157/218] [utils] add ac-3 to the list of audio codecs in parse_codecs --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 41ca562f1..1091f17f3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2183,7 +2183,7 @@ def parse_codecs(codecs_str): if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): if not vcodec: vcodec = full_codec - elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'): if not acodec: acodec = full_codec else: From 07ea9c9b05359aef14472dfa66a6578d21c88e96 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 08:37:41 +0100 Subject: [PATCH 158/218] [downloader/hls] fill IV with zeros for IVs shorter than 16-octet --- 
youtube_dl/downloader/hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 8d7971e5d..8dd1b898e 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -120,7 +120,7 @@ class HlsFD(FragmentFD): decrypt_info = parse_m3u8_attributes(line[11:]) if decrypt_info['METHOD'] == 'AES-128': if 'IV' in decrypt_info: - decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) if not re.match(r'^https?://', decrypt_info['URI']): decrypt_info['URI'] = compat_urlparse.urljoin( man_url, decrypt_info['URI']) From f39ffc5877e4e9f112fa26ff21079f179b4aec46 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 08:39:23 +0100 Subject: [PATCH 159/218] [common] extract formats from #EXT-X-MEDIA tags --- youtube_dl/extractor/common.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ed16deee..da0af29ec 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1202,30 +1202,45 @@ class InfoExtractor(object): 'preference': preference, }] last_info = None - last_media = None for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = parse_m3u8_attributes(line) + media = parse_m3u8_attributes(line) + media_type = media.get('TYPE') + if media_type in ('VIDEO', 'AUDIO'): + media_url = media.get('URI') + if media_url: + format_id = [] + for v in (media.get('GROUP-ID'), media.get('NAME')): + if v: + format_id.append(v) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': format_url(media_url), + 'language': media.get('LANGUAGE'), + 'vcodec': 'none' if media_type == 'AUDIO' else None, + 'ext': ext, + 
'protocol': entry_protocol, + 'preference': preference, + }) elif line.startswith('#') or not line.strip(): continue else: if last_info is None: formats.append({'url': format_url(line)}) continue - tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) + tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None - # Despite specification does not mention NAME attribute for - # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') or last_media_name # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. if not live: + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF it still sometimes may be present + stream_name = last_info.get('NAME') format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), @@ -1252,9 +1267,6 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) - if last_media is not None: - f['m3u8_media'] = last_media - last_media = None formats.append(f) last_info = {} return formats From 75fa990dc669563b51f22eeddd2f33acc41c8599 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 08:46:54 +0100 Subject: [PATCH 160/218] [YoutubeDL] add fallback value for thumbnails values in thumbnails sorting --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 0b3e3da82..c499c1da4 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1256,8 +1256,8 @@ class YoutubeDL(object): info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] if 
thumbnails: thumbnails.sort(key=lambda t: ( - t.get('preference'), t.get('width'), t.get('height'), - t.get('id'), t.get('url'))) + t.get('preference') or -1, t.get('width') or -1, t.get('height') or -1, + t.get('id') or '', t.get('url'))) for i, t in enumerate(thumbnails): t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): From 30afe4aeb25576225d3f3ca486983b5ad9258aa0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 08:49:15 +0100 Subject: [PATCH 161/218] [cbc] Add support for watch.cbc.ca --- youtube_dl/extractor/cbc.py | 172 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 2 + 2 files changed, 174 insertions(+) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index a87e97140..d71fddf58 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -9,10 +9,19 @@ from ..utils import ( js_to_json, smuggle_url, try_get, + xpath_text, + xpath_element, + xpath_with_ns, + find_xpath_attr, + parse_iso8601, + parse_age_limit, + int_or_none, + ExtractorError, ) class CBCIE(InfoExtractor): + IE_NAME = 'cbc.ca' _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?!player/)(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ # with mediaId @@ -114,6 +123,7 @@ class CBCIE(InfoExtractor): class CBCPlayerIE(InfoExtractor): + IE_NAME = 'cbc.ca:player' _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', @@ -167,3 +177,165 @@ class CBCPlayerIE(InfoExtractor): }), 'id': video_id, } + + +class CBCWatchBaseIE(InfoExtractor): + _device_id = None + _device_token = None + _API_BASE_URL = 'https://api-cbc.cloud.clearleap.com/cloffice/client/' + _NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', + } + + def _call_api(self, path, video_id): + url = path if path.startswith('http') else 
self._API_BASE_URL + path + result = self._download_xml(url, video_id, headers={ + 'X-Clearleap-DeviceId': self._device_id, + 'X-Clearleap-DeviceToken': self._device_token, + }) + error_message = xpath_text(result, 'userMessage') or xpath_text(result, 'systemMessage') + if error_message: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message)) + return result + + def _real_initialize(self): + if not self._device_id or not self._device_token: + device = self._downloader.cache.load('cbcwatch', 'device') or {} + self._device_id, self._device_token = device.get('id'), device.get('token') + if not self._device_id or not self._device_token: + result = self._download_xml( + self._API_BASE_URL + 'device/register', + None, data=b'<device><type>web</type></device>') + self._device_id = xpath_text(result, 'deviceId', fatal=True) + self._device_token = xpath_text(result, 'deviceToken', fatal=True) + self._downloader.cache.store( + 'cbcwatch', 'device', { + 'id': self._device_id, + 'token': self._device_token, + }) + + def _parse_rss_feed(self, rss): + channel = xpath_element(rss, 'channel', fatal=True) + + def _add_ns(path): + return xpath_with_ns(path, self._NS_MAP) + + entries = [] + for item in channel.findall('item'): + guid = xpath_text(item, 'guid', fatal=True) + title = xpath_text(item, 'title', fatal=True) + + media_group = xpath_element(item, _add_ns('media:group'), fatal=True) + content = xpath_element(media_group, _add_ns('media:content'), fatal=True) + content_url = content.attrib['url'] + + thumbnails = [] + for thumbnail in media_group.findall(_add_ns('media:thumbnail')): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'id': thumbnail.get('profile'), + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + timestamp = None + release_date = find_xpath_attr( + item, _add_ns('media:credit'), 'role', 'releaseDate') + if release_date 
is not None: + timestamp = parse_iso8601(release_date.text) + + entries.append({ + '_type': 'url_transparent', + 'url': content_url, + 'id': guid, + 'title': title, + 'description': xpath_text(item, 'description'), + 'timestamp': timestamp, + 'duration': int_or_none(content.get('duration')), + 'age_limit': parse_age_limit(xpath_text(item, _add_ns('media:rating'))), + 'episode': xpath_text(item, _add_ns('clearleap:episode')), + 'episode_number': int_or_none(xpath_text(item, _add_ns('clearleap:episodeInSeason'))), + 'series': xpath_text(item, _add_ns('clearleap:series')), + 'season_number': int_or_none(xpath_text(item, _add_ns('clearleap:season'))), + 'thumbnails': thumbnails, + 'ie_key': 'CBCWatchVideo', + }) + + return self.playlist_result( + entries, xpath_text(channel, 'guid'), + xpath_text(channel, 'title'), + xpath_text(channel, 'description')) + + +class CBCWatchVideoIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch:video' + _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self._call_api(url, video_id) + + m3u8_url = xpath_text(result, 'url', fatal=True) + formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) + if len(formats) < 2: + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + # Despite metadata in m3u8 all video+audio formats are + # actually video-only (no audio) + for f in formats: + if f.get('acodec') != 'none' and f.get('vcodec') != 'none': + f['acodec'] = 'none' + self._sort_formats(formats) + + info = { + 'id': video_id, + 'title': video_id, + 'formats': formats, + } + + rss = xpath_element(result, 'rss') + if rss: + info.update(self._parse_rss_feed(rss)['entries'][0]) + del info['url'] + del info['_type'] + del info['ie_key'] + return info + + +class 
CBCWatchIE(CBCWatchBaseIE): + IE_NAME = 'cbc.ca:watch' + _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' + _TESTS = [{ + 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', + 'info_dict': { + 'id': '38e815a-009e3ab12e4', + 'ext': 'mp4', + 'title': 'Customer (Dis)Service', + 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', + 'upload_date': '20160219', + 'timestamp': 1455840000, + }, + 'params': { + # m3u8 download + 'skip_download': True, + 'format': 'bestvideo', + }, + 'skip': 'Geo-restricted to Canada', + }, { + 'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', + 'info_dict': { + 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', + 'title': 'Arthur', + 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', + }, + 'playlist_mincount': 30, + 'skip': 'Geo-restricted to Canada', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + rss = self._call_api('web/browse/' + video_id, video_id) + return self._parse_rss_feed(rss) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 04cd23bdb..a58145e3e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -130,6 +130,8 @@ from .carambatv import ( from .cbc import ( CBCIE, CBCPlayerIE, + CBCWatchVideoIE, + CBCWatchIE, ) from .cbs import CBSIE from .cbslocal import CBSLocalIE From f70e9229e623eb041ad514605ceca484b176b850 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 09:11:23 +0100 Subject: [PATCH 162/218] [discoverygo] detect when video needs authentication(closes #10425) --- youtube_dl/extractor/discoverygo.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index cba709935..e86d16d36 100644 --- 
a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -7,6 +7,7 @@ from ..utils import ( int_or_none, parse_age_limit, unescapeHTML, + ExtractorError, ) @@ -53,7 +54,13 @@ class DiscoveryGoIE(InfoExtractor): title = video['name'] - stream = video['stream'] + stream = video.get('stream') + if not stream: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.' + if video.get('authenticated') is True else 'Unable to find stream', + expected=True) STREAM_URL_SUFFIX = 'streamUrl' formats = [] for stream_kind in ('', 'hds'): From 5c13c285660c2811206c5bb29acf43b114ab31e3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 09:55:23 +0100 Subject: [PATCH 163/218] raise unexpected error when no stream found --- youtube_dl/extractor/adultswim.py | 11 ++++++----- youtube_dl/extractor/discoverygo.py | 11 ++++++----- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 3f7f8c036..96599048f 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -162,11 +162,12 @@ class AdultSwimIE(InfoExtractor): elif video_info.get('videoPlaybackID'): segment_ids = [video_info['videoPlaybackID']] else: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.' - if video_info.get('auth') is True else 'Unable to find stream or clips', - expected=True) + if video_info.get('auth') is True: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. 
You may want to use --cookies.', expected=True) + else: + raise ExtractorError('Unable to find stream or clips') episode_id = video_info['id'] episode_title = video_info['title'] diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py index e86d16d36..c4e83b2c3 100644 --- a/youtube_dl/extractor/discoverygo.py +++ b/youtube_dl/extractor/discoverygo.py @@ -56,11 +56,12 @@ class DiscoveryGoIE(InfoExtractor): stream = video.get('stream') if not stream: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.' - if video.get('authenticated') is True else 'Unable to find stream', - expected=True) + if video.get('authenticated') is True: + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported. You may want to use --cookies.', expected=True) + else: + raise ExtractorError('Unable to find stream') STREAM_URL_SUFFIX = 'streamUrl' formats = [] for stream_kind in ('', 'hds'): From d37708fc861b3534c522f2892b5cd2ee716e1035 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 11:53:47 +0100 Subject: [PATCH 164/218] [YoutubeDL] check only for None Value in thumbnails sorting --- youtube_dl/YoutubeDL.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c499c1da4..805733fb7 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1256,8 +1256,10 @@ class YoutubeDL(object): info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}] if thumbnails: thumbnails.sort(key=lambda t: ( - t.get('preference') or -1, t.get('width') or -1, t.get('height') or -1, - t.get('id') or '', t.get('url'))) + t.get('preference') if t.get('preference') is not None else -1, + t.get('width') if t.get('width') is not None else -1, + t.get('height') if t.get('height') is 
not None else -1, + t.get('id') if t.get('id') is not None else '', t.get('url'))) for i, t in enumerate(thumbnails): t['url'] = sanitize_url(t['url']) if t.get('width') and t.get('height'): From 6a76b53355947eef2a534d8f2505ed683db8754f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Aug 2016 18:05:01 +0700 Subject: [PATCH 165/218] [README.md] Quote URL in streaming to player FAQ entry --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 52e53803e..1aa267752 100644 --- a/README.md +++ b/README.md @@ -820,7 +820,7 @@ Passing cookies to youtube-dl is a good way to workaround login when a particula You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: - youtube-dl -o - http://www.youtube.com/watch?v=BaW_jenozKcj | vlc - + youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - ### Can you add support for this anime video site, or site which shows current movies for free? From ea01cdbf61c9a689e7914dd2d06371f3ef73b490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Aug 2016 18:17:45 +0700 Subject: [PATCH 166/218] [README.md] Clarify how to export cookies from browser for cookies FAQ entry --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1aa267752..0bb7b791f 100644 --- a/README.md +++ b/README.md @@ -812,7 +812,11 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt ### How do I pass cookies to youtube-dl? -Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. 
Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. + +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/ru/firefox/addon/export-cookies/) (for Firefox). + +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare). 
From f26a298247fe19bc8114d6f7a280140dfabee984 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Aug 2016 18:19:41 +0700 Subject: [PATCH 167/218] [README.md] Use en-US URL in cookies FAQ entry --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0bb7b791f..04f423c17 100644 --- a/README.md +++ b/README.md @@ -814,7 +814,7 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. -In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/ru/firefox/addon/export-cookies/) (for Firefox). +In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. 
From 4c8f9c2577da2f4ba7300d44613599e96cde5c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Aug 2016 18:27:15 +0700 Subject: [PATCH 168/218] [README.md] Add comments in sample configuration for clarity --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 04f423c17..e01b71cff 100644 --- a/README.md +++ b/README.md @@ -412,11 +412,19 @@ You can configure youtube-dl by placing any supported command line option to a c For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory: ``` --x ---no-mtime ---proxy 127.0.0.1:3128 --o ~/Movies/%(title)s.%(ext)s # Lines starting with # are comments + +# Always extract audio +-x + +# Do not copy the mtime +--no-mtime + +# Use this proxy +--proxy 127.0.0.1:3128 + +# Save all videos under Movies directory in your home directory +-o ~/Movies/%(title)s.%(ext)s ``` Note that options in configuration file are just the same options aka switches used in regular command line calls thus there **must be no whitespace** after `-` or `--`, e.g. `-o` or `--proxy` but not `- o` or `-- proxy`. From 5a3efcd27c1262cc7132f7e1a092524b580788ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Aug 2016 18:57:31 +0700 Subject: [PATCH 169/218] [README.md] Add FAQ entry for download archive --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index e01b71cff..1b9e2a989 100644 --- a/README.md +++ b/README.md @@ -834,6 +834,18 @@ You will first need to tell youtube-dl to stream media to stdout with `-o -`, an youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - +### How do I download only new videos from playlist? + +Use the download archive feature. 
With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special *download archive file*. Each subsequent run with the same `--download-archive` will download only new videos that are not yet in download archive (if any) and also record them in download archive. Note that only successful downloads are recorded in download archive. + +For example, first run will download complete `PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re` playlist and create download archive `archive.txt`: + + youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + +Each subsequent run will only download new videos if any: + + youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. 
From c1f62dd338e0965507ee0976bc88885fdb0fa780 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 25 Aug 2016 14:45:01 +0200 Subject: [PATCH 170/218] [README] Clean up grammar in --download-archive paragraph --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1b9e2a989..20241307f 100644 --- a/README.md +++ b/README.md @@ -834,15 +834,15 @@ You will first need to tell youtube-dl to stream media to stdout with `-o -`, an youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - -### How do I download only new videos from playlist? +### How do I download only new videos from a playlist? -Use the download archive feature. With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special *download archive file*. Each subsequent run with the same `--download-archive` will download only new videos that are not yet in download archive (if any) and also record them in download archive. Note that only successful downloads are recorded in download archive. +Use the download-archive feature. With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special file. Each subsequent run with the same `--download-archive` will download only new videos and skip all videos that have been downloaded before. Note that only successful downloads are recorded in the file. 
-For example, first run will download complete `PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re` playlist and create download archive `archive.txt`: +For example, at first, youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" -Each subsequent run will only download new videos if any: +will download the complete `PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re` playlist and create a file `archive.txt`. Each subsequent run will only download new videos if any: youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" From dc2c37f3162da534281f5f3758231e4c2cb8d1b2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 25 Aug 2016 20:45:57 +0800 Subject: [PATCH 171/218] [spankbang] Fix description and uploader (closes #10339) --- ChangeLog | 1 + youtube_dl/extractor/spankbang.py | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index c3cc8f38f..5fb596e33 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [spankbang] Fix description and uploader (#10339) * [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 50433d0f6..186d22b7d 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -14,7 +14,7 @@ class SpankBangIE(InfoExtractor): 'id': '3vvn', 'ext': 'mp4', 'title': 'fantasy solo', - 'description': 'dillion harper masturbates on a bed', + 'description': 'Watch fantasy solo free HD porn video - 05 minutes - dillion harper masturbates on a bed free adult movies.', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'age_limit': 18, @@ -44,12 +44,10 @@ class SpankBangIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') - description = self._search_regex( - 
r'class="desc"[^>]*>([^<]+)', - webpage, 'description', default=None) + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) uploader = self._search_regex( - r'class="user"[^>]*>([^<]+)', + r'class="user"[^>]*><img[^>]+>([^<]+)', webpage, 'uploader', fatal=False) age_limit = self._rta_search(webpage) From b54a2da4333556baa3b34fc595060223181320d1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 25 Aug 2016 22:22:31 +0800 Subject: [PATCH 172/218] [crackle] Fix extraction and update _TESTS (closes #10333) --- ChangeLog | 1 + youtube_dl/extractor/crackle.py | 58 ++++++++++++++++++++++++--------- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5fb596e33..0789549c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [crackle] Fix extraction (#10333) * [spankbang] Fix description and uploader (#10339) * [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 79238cce7..21f94d33c 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,5 +1,7 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import unicode_literals, division + +import re from .common import InfoExtractor from ..utils import int_or_none @@ -8,12 +10,22 @@ from ..utils import int_or_none class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _TEST = { - 'url': 'http://www.crackle.com/the-art-of-more/2496419', + 'url': 'http://www.crackle.com/comedians-in-cars-getting-coffee/2498934', 'info_dict': { - 'id': '2496419', + 'id': '2498934', 'ext': 'mp4', - 'title': 'Heavy Lies the Head', - 'description': 'md5:bb56aa0708fe7b9a4861535f15c3abca', + 'title': 'Everybody Respects A Bloody Nose', + 'description': 'Jerry is 
kaffeeklatsching in L.A. with funnyman J.B. Smoove (Saturday Night Live, Real Husbands of Hollywood). They’re headed for brew at 10 Speed Coffee in a 1964 Studebaker Avanti.', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 906, + 'series': 'Comedians In Cars Getting Coffee', + 'season_number': 8, + 'episode_number': 4, + 'subtitles': { + 'en-US': [{ + 'ext': 'ttml', + }] + }, }, 'params': { # m3u8 download @@ -21,11 +33,6 @@ class CrackleIE(InfoExtractor): } } - # extracted from http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx - _SUBTITLE_SERVER = 'http://web-us-az.crackle.com' - _UPLYNK_OWNER_ID = 'e8773f7770a44dbd886eee4fca16a66b' - _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' - # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx _MEDIA_FILE_SLOTS = { 'c544.flv': { @@ -48,19 +55,22 @@ class CrackleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + + config_doc = self._download_xml( + 'http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx?site=16', + video_id, 'Downloading config') + item = self._download_xml( 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, video_id).find('i') title = item.attrib['t'] - thumbnail = None subtitles = {} formats = self._extract_m3u8_formats( - 'http://content.uplynk.com/ext/%s/%s.m3u8' % (self._UPLYNK_OWNER_ID, video_id), + 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), video_id, 'mp4', m3u8_id='hls', fatal=None) path = item.attrib.get('p') if path: - thumbnail = self._THUMBNAIL_TEMPLATE % path http_base_url = 'http://ahttp.crackle.com/' + path for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): formats.append({ @@ -76,20 +86,36 @@ class CrackleIE(InfoExtractor): if locale not in subtitles: subtitles[locale] = [] subtitles[locale] = [{ - 'url': '%s/%s%s_%s.xml' % (self._SUBTITLE_SERVER, path, 
locale, v), + 'url': '%s/%s%s_%s.xml' % (config_doc.attrib['strSubtitleServer'], path, locale, v), 'ext': 'ttml', }] self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + media_details = self._download_json( + 'https://web-api-us.crackle.com/Service.svc/details/media/%s/TW?format=json' % video_id, + video_id, fatal=False) + thumbnails = [] + if media_details: + for key, value in media_details.items(): + mobj = re.match('^Thumbnail_(\d+)x(\d+)$', key) + if mobj: + width, height = list(map(int, mobj.groups())) + thumbnails.append({ + 'id': '%dp' % height, + 'url': value, + 'width': width, + 'height': height, + }) + return { 'id': video_id, 'title': title, 'description': item.attrib.get('d'), - 'duration': int(item.attrib.get('r'), 16) if item.attrib.get('r') else None, + 'duration': int(item.attrib.get('r'), 16) / 1000 if item.attrib.get('r') else None, 'series': item.attrib.get('sn'), 'season_number': int_or_none(item.attrib.get('se')), 'episode_number': int_or_none(item.attrib.get('ep')), - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } From 20bad91d765284e06f8a8c600a122857d23efeea Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 25 Aug 2016 22:38:06 +0800 Subject: [PATCH 173/218] [downloader/external] Clarify that ffmpeg doesn't support SOCKS Ref: #10304 --- youtube_dl/downloader/external.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index cf4556221..17f12e970 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -220,6 +220,11 @@ class FFmpegFD(ExternalFD): if proxy: if not re.match(r'^[\da-zA-Z]+://', proxy): proxy = 'http://%s' % proxy + + if proxy.startswith('socks'): + self.report_warning( + '%s does not support SOCKS proxies. Downloading may fail.' 
% self.get_basename()) + # Since December 2015 ffmpeg supports -http_proxy option (see # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) # We could switch to the following code if we are able to detect version properly From a0f071a50dc611a66a5fc8ceceb0b455a88f1cb0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 25 Aug 2016 19:40:56 +0100 Subject: [PATCH 174/218] [usanetwork] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/usanetwork.py | 76 ++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 youtube_dl/extractor/usanetwork.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a58145e3e..74d916e64 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -947,6 +947,7 @@ from .uplynk import ( ) from .urort import UrortIE from .urplay import URPlayIE +from .usanetwork import USANetworkIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( diff --git a/youtube_dl/extractor/usanetwork.py b/youtube_dl/extractor/usanetwork.py new file mode 100644 index 000000000..823340776 --- /dev/null +++ b/youtube_dl/extractor/usanetwork.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .adobepass import AdobePassIE +from ..utils import ( + extract_attributes, + smuggle_url, + update_url_query, +) + + +class USANetworkIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', + 'md5': '33c0d2ba381571b414024440d08d57fd', + 'info_dict': { + 'id': '3086229', + 'ext': 'mp4', + 'title': 'HPE Cybersecurity', + 'description': 'The more we digitize our world, the more vulnerable we are.', + 'upload_date': '20160818', + 'timestamp': 1471535460, + 
'uploader': 'NBCU-USA', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_params = extract_attributes(self._search_regex( + r'(<div[^>]+data-usa-tve-player-container[^>]*>)', webpage, 'player params')) + video_id = player_params['data-mpx-guid'] + title = player_params['data-episode-title'] + + account_pid, path = re.search( + r'data-src="(?:https?)?//player\.theplatform\.com/p/([^/]+)/.*?/(media/guid/\d+/\d+)', + webpage).groups() + + query = { + 'mbr': 'true', + } + if player_params.get('data-is-full-episode') == '1': + query['manifest'] = 'm3u' + + if player_params.get('data-entitlement') == 'auth': + adobe_pass = {} + drupal_settings = self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings', fatal=False) + if drupal_settings: + drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) + if drupal_settings: + adobe_pass = drupal_settings.get('adobePass', {}) + resource = self._get_mvpd_resource( + adobe_pass.get('adobePassResourceId', 'usa'), + title, video_id, player_params.get('data-episode-rating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) + + info = self._search_json_ld(webpage, video_id, default={}) + info.update({ + '_type': 'url_transparent', + 'url': smuggle_url(update_url_query( + 'http://link.theplatform.com/s/%s/%s' % (account_pid, path), + query), {'force_smil_url': True}), + 'id': video_id, + 'title': title, + 'series': player_params.get('data-show-title'), + 'episode': title, + 'ie_key': 'ThePlatform', + }) + return info From e3faecde30d85f54c1a341350cba609d3f5b6691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Aug 2016 03:43:13 +0700 Subject: [PATCH 175/218] [trutube] Remove extractor (Closes #10438) --- youtube_dl/extractor/extractors.py | 1 - 
youtube_dl/extractor/trutube.py | 26 -------------------------- 2 files changed, 27 deletions(-) delete mode 100644 youtube_dl/extractor/trutube.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 74d916e64..717ba9375 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -873,7 +873,6 @@ from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trollvids import TrollvidsIE -from .trutube import TruTubeIE from .tube8 import Tube8IE from .tubitv import TubiTvIE from .tudou import ( diff --git a/youtube_dl/extractor/trutube.py b/youtube_dl/extractor/trutube.py deleted file mode 100644 index d55e0c563..000000000 --- a/youtube_dl/extractor/trutube.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .nuevo import NuevoBaseIE - - -class TruTubeIE(NuevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?trutube\.tv/(?:video/|nuevo/player/embed\.php\?v=)(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://trutube.tv/video/14880/Ramses-II-Proven-To-Be-A-Red-Headed-Caucasoid-', - 'md5': 'c5b6e301b0a2040b074746cbeaa26ca1', - 'info_dict': { - 'id': '14880', - 'ext': 'flv', - 'title': 'Ramses II - Proven To Be A Red Headed Caucasoid', - 'thumbnail': 're:^http:.*\.jpg$', - } - }, { - 'url': 'https://trutube.tv/nuevo/player/embed.php?v=14880', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_nuevo( - 'https://trutube.tv/nuevo/player/config.php?v=%s' % video_id, - video_id) From 298a120ab76008c900e30de50dc738dd63e79fb4 Mon Sep 17 00:00:00 2001 From: Aleksander Nitecki <ixendr@itogi.re> Date: Thu, 25 Aug 2016 20:21:06 +0200 Subject: [PATCH 176/218] [nhk] Add extractor for VoD. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nhk.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 youtube_dl/extractor/nhk.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 717ba9375..8d88d6cb4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -541,6 +541,7 @@ from .nextmedia import ( ) from .nfb import NFBIE from .nfl import NFLIE +from .nhk import NhkVodIE from .nhl import ( NHLVideocenterIE, NHLNewsIE, diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py new file mode 100644 index 000000000..90e935351 --- /dev/null +++ b/youtube_dl/extractor/nhk.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class NhkVodIE(InfoExtractor): + _VALID_URL = r'http://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P<id>.+)\.html' + _TESTS = [{ + 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815.html', + 'info_dict': { + 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', + 'ext': 'flv', + 'title': '[nhkworld]VOD;2009-251-2016;TOKYO FASHION EXPRESS;The Kimono as Global Fashion;en', + }, + 'params': { + 'skip_download': True # Videos available only for a limited period of time. 
+ }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + embed_code = self._search_regex( + r'''nw_vod_ooplayer\('movie-area', '([^']+)'\);''', + webpage, + 'ooyala embed code') + + return self.url_result('ooyala:' + embed_code, 'Ooyala') From f9b373afda2a936c4f8303671f3160c532ccae67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Aug 2016 04:48:40 +0700 Subject: [PATCH 177/218] [nhk:vod] Improve extraction (Closes #10424) --- youtube_dl/extractor/nhk.py | 43 +++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 90e935351..691bdfa4e 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -4,26 +4,47 @@ from .common import InfoExtractor class NhkVodIE(InfoExtractor): - _VALID_URL = r'http://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P<id>.+)\.html' - _TESTS = [{ + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P<id>.+?)\.html' + _TEST = { + # Videos available only for a limited period of time. Visit + # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815.html', 'info_dict': { 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', 'ext': 'flv', - 'title': '[nhkworld]VOD;2009-251-2016;TOKYO FASHION EXPRESS;The Kimono as Global Fashion;en', + 'title': 'TOKYO FASHION EXPRESS - The Kimono as Global Fashion', + 'description': 'md5:db338ee6ce8204f415b754782f819824', + 'series': 'TOKYO FASHION EXPRESS', + 'episode': 'The Kimono as Global Fashion', }, - 'params': { - 'skip_download': True # Videos available only for a limited period of time. 
- }, - }] + 'skip': 'Videos available only for a limited period of time', + } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) embed_code = self._search_regex( - r'''nw_vod_ooplayer\('movie-area', '([^']+)'\);''', - webpage, - 'ooyala embed code') + r'nw_vod_ooplayer\([^,]+,\s*(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'ooyala embed code', group='id') - return self.url_result('ooyala:' + embed_code, 'Ooyala') + title = self._search_regex( + r'<div[^>]+class=["\']episode-detail["\']>\s*<h\d+>([^<]+)', + webpage, 'title', default=None) + description = self._html_search_regex( + r'(?s)<p[^>]+class=["\']description["\'][^>]*>(.+?)</p>', + webpage, 'description', default=None) + series = self._search_regex( + r'<h2[^>]+class=["\']detail-top-player-title[^>]+><a[^>]+>([^<]+)', + webpage, 'series', default=None) + + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % embed_code, + 'title': '%s - %s' % (series, title) if series and title else title, + 'description': description, + 'series': series, + 'episode': title, + } From c9de980106990485fd9bff9a86d463349fe1d384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Aug 2016 04:49:52 +0700 Subject: [PATCH 178/218] Credit @Xender for nhk:vod (#10424) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 1fd4be785..b9a602c12 100644 --- a/AUTHORS +++ b/AUTHORS @@ -181,3 +181,4 @@ Nehal Patel Rob van Bekkum Petr Zvoníček Pratyush Singh +Aleksander Nitecki From 6b18a24e6ee39ab2fdb5e3d9e1cf2eec547ca3f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Aug 2016 05:57:52 +0700 Subject: [PATCH 179/218] [tnaflix] Fix extraction (Closes #10434) --- youtube_dl/extractor/tnaflix.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py 
b/youtube_dl/extractor/tnaflix.py index 7ddf77767..77d56b8ca 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, parse_duration, str_to_int, + unescapeHTML, xpath_text, ) @@ -80,7 +81,8 @@ class TNAFlixNetworkBaseIE(InfoExtractor): if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = 'https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s' % (inputs['vkey'], inputs['nkey']) + cfg_url = ('https://cdn-fck.tnaflix.com/tnaflix/%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' + % (inputs['vkey'], inputs['nkey'], video_id)) cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', @@ -89,7 +91,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): formats = [] def extract_video_url(vl): - return re.sub('speed=\d+', 'speed=', vl.text) + return re.sub('speed=\d+', 'speed=', unescapeHTML(vl.text)) video_link = cfg_xml.find('./videoLink') if video_link is not None: @@ -201,7 +203,7 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): _TESTS = [{ # anonymous uploader, no categories 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': '7e569419fe6d69543d01e6be22f5f7c4', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', 'info_dict': { 'id': '553878', 'display_id': 'Carmella-Decesare-striptease', @@ -215,11 +217,11 @@ class TNAFlixIE(TNAFlixNetworkBaseIE): }, { # non-anonymous uploader, categories 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': 'fcba2636572895aba116171a899a5658', + 'md5': '0f5d4d490dbfd117b8607054248a07c0', 'info_dict': { 'id': '6538', 'display_id': 'Educational-xxx-video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Educational xxx video', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': 're:https?://.*\.jpg$', From b281aad2dc658e3c6535579d75b42a5634487b83 Mon Sep 17 00:00:00 2001 From: steven7851 <steven7851@msn.com> Date: Fri, 26 Aug 2016 07:32:54 +0800 Subject: [PATCH 180/218] 
[douyutv] Use new api use lapi for flv info, and html5 api for room info #10153 #10318 --- youtube_dl/extractor/douyutv.py | 87 ++++++++++++++++----------------- 1 file changed, 43 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index ce6962755..33efc993e 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals import hashlib import time +import uuid from .common import InfoExtractor from ..utils import (ExtractorError, unescapeHTML) -from ..compat import (compat_str, compat_basestring) +from ..compat import (compat_str, compat_basestring, compat_urllib_parse_urlencode) class DouyuTVIE(InfoExtractor): @@ -21,7 +22,6 @@ class DouyuTVIE(InfoExtractor): 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', - 'uploader_id': '431925', 'is_live': True, }, 'params': { @@ -37,7 +37,6 @@ class DouyuTVIE(InfoExtractor): 'description': 'md5:746a2f7a253966a06755a912f0acc0d2', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'douyu小漠', - 'uploader_id': '3769985', 'is_live': True, }, 'params': { @@ -54,7 +53,6 @@ class DouyuTVIE(InfoExtractor): 'description': 're:.*m7show@163\.com.*', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': '7师傅', - 'uploader_id': '431925', 'is_live': True, }, 'params': { @@ -75,19 +73,39 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id"\s*:\s*(\d+),', page, 'room id') - config = None + room_url = 'http://m.douyu.com/html5/live?roomId=%s' % room_id + room_content = self._download_webpage(room_url, video_id) + room_json = self._parse_json(room_content, video_id, fatal=False) + + room = room_json['data'] + + show_status = room.get('show_status') + # 1 = live, 2 = offline + if show_status == '2': + raise ExtractorError( + 'Live stream is offline', expected=True) + + flv_json = None # Douyu API sometimes returns error "Unable to load the 
requested class: eticket_redis_cache" # Retry with different parameters - same parameters cause same errors for i in range(5): - prefix = 'room/%s?aid=android&client_sys=android&time=%d' % ( - room_id, int(time.time())) - auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest() + tt = int(time.time() / 60) + did = uuid.uuid4().hex.upper() - config_page = self._download_webpage( - 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth), - video_id) + # Decompile core.swf in webpage by ffdec "Search SWFs in memory" + # core.swf is encrypted originally, but ffdec can dump memory to get the decrypted one + # If API changes in the future, just use this way to update + sign_content = '{room_id}{did}A12Svb&%1UUmf@hC{tt}'.format(room_id = room_id, did = did, tt = tt) + sign = hashlib.md5((sign_content).encode('utf-8')).hexdigest() + + payload = {'cdn': 'ws', 'rate': '0', 'tt': tt, 'did': did, 'sign': sign} + flv_data = compat_urllib_parse_urlencode(payload) + + flv_request_url = 'http://www.douyu.com/lapi/live/getPlay/%s' % room_id + flv_content = self._download_webpage(flv_request_url, video_id, data=flv_data, + headers={'Content-Type': 'application/x-www-form-urlencoded'}) try: - config = self._parse_json(config_page, video_id, fatal=False) + flv_json = self._parse_json(flv_content, video_id, fatal=False) except ExtractorError: # Wait some time before retrying to get a different time() value self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. 
' @@ -95,54 +113,35 @@ class DouyuTVIE(InfoExtractor): continue else: break - if config is None: + if flv_json is None: raise ExtractorError('Unable to fetch API result') - data = config['data'] + flv = flv_json['data'] - error_code = config.get('error', 0) + error_code = flv_json.get('error', 0) if error_code is not 0: error_desc = 'Server reported error %i' % error_code - if isinstance(data, (compat_str, compat_basestring)): - error_desc += ': ' + data + if isinstance(flv, (compat_str, compat_basestring)): + error_desc += ': ' + flv raise ExtractorError(error_desc, expected=True) - show_status = data.get('show_status') - # 1 = live, 2 = offline - if show_status == '2': - raise ExtractorError( - 'Live stream is offline', expected=True) + base_url = flv['rtmp_url'] + live_path = flv['rtmp_live'] - base_url = data['rtmp_url'] - live_path = data['rtmp_live'] + video_url = '%s/%s' % (base_url, live_path) - title = self._live_title(unescapeHTML(data['room_name'])) - description = data.get('show_details') - thumbnail = data.get('room_src') - - uploader = data.get('nickname') - uploader_id = data.get('owner_uid') - - multi_formats = data.get('rtmp_multi_bitrate') - if not isinstance(multi_formats, dict): - multi_formats = {} - multi_formats['live'] = live_path - - formats = [{ - 'url': '%s/%s' % (base_url, format_path), - 'format_id': format_id, - 'preference': 1 if format_id == 'live' else 0, - } for format_id, format_path in multi_formats.items()] - self._sort_formats(formats) + title = self._live_title(unescapeHTML(room['room_name'])) + description = room.get('notice') + thumbnail = room.get('room_src') + uploader = room.get('nickname') return { 'id': room_id, 'display_id': video_id, + 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, - 'uploader_id': uploader_id, - 'formats': formats, 'is_live': True, } From 906b87cf5f6ccf28ebd75d6a92367d7c238f2ad9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan 
<yan12125@gmail.com> Date: Fri, 26 Aug 2016 19:58:17 +0800 Subject: [PATCH 181/218] [crackle] Revert to template-based thumbnail extraction To reduce to number of HTTP requests --- youtube_dl/extractor/crackle.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 21f94d33c..cc68f1c00 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals, division -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -34,6 +32,7 @@ class CrackleIE(InfoExtractor): } # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx + _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' _MEDIA_FILE_SLOTS = { 'c544.flv': { 'width': 544, @@ -69,8 +68,10 @@ class CrackleIE(InfoExtractor): formats = self._extract_m3u8_formats( 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), video_id, 'mp4', m3u8_id='hls', fatal=None) + thumbnail = None path = item.attrib.get('p') if path: + thumbnail = self._THUMBNAIL_TEMPLATE % path http_base_url = 'http://ahttp.crackle.com/' + path for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): formats.append({ @@ -91,22 +92,6 @@ class CrackleIE(InfoExtractor): }] self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) - media_details = self._download_json( - 'https://web-api-us.crackle.com/Service.svc/details/media/%s/TW?format=json' % video_id, - video_id, fatal=False) - thumbnails = [] - if media_details: - for key, value in media_details.items(): - mobj = re.match('^Thumbnail_(\d+)x(\d+)$', key) - if mobj: - width, height = list(map(int, mobj.groups())) - thumbnails.append({ - 'id': '%dp' % height, - 'url': value, - 'width': width, - 'height': height, - }) - return { 'id': video_id, 
'title': title, @@ -115,7 +100,7 @@ class CrackleIE(InfoExtractor): 'series': item.attrib.get('sn'), 'season_number': int_or_none(item.attrib.get('se')), 'episode_number': int_or_none(item.attrib.get('ep')), - 'thumbnails': thumbnails, + 'thumbnail': thumbnail, 'subtitles': subtitles, 'formats': formats, } From 3b4b82d4cec702fc06e2d6b38a44dd0c7bd77a5b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 27 Aug 2016 01:16:39 +0800 Subject: [PATCH 182/218] [douyutv] Simplify --- youtube_dl/extractor/douyutv.py | 86 +++++++++++++++------------------ 1 file changed, 39 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 33efc993e..e366e17e6 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -4,9 +4,16 @@ from __future__ import unicode_literals import hashlib import time import uuid + from .common import InfoExtractor -from ..utils import (ExtractorError, unescapeHTML) -from ..compat import (compat_str, compat_basestring, compat_urllib_parse_urlencode) +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) +from ..utils import ( + ExtractorError, + unescapeHTML, +) class DouyuTVIE(InfoExtractor): @@ -63,6 +70,10 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] + # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf + # is encrypted originally, but ffdec can dump memory to get the decrypted one. 
+ _API_KEY = 'A12Svb&%1UUmf@hC' + def _real_extract(self, url): video_id = self._match_id(url) @@ -73,60 +84,41 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id"\s*:\s*(\d+),', page, 'room id') - room_url = 'http://m.douyu.com/html5/live?roomId=%s' % room_id - room_content = self._download_webpage(room_url, video_id) - room_json = self._parse_json(room_content, video_id, fatal=False) + room = self._download_json( + 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, + note='Downloading room info')['data'] - room = room_json['data'] - - show_status = room.get('show_status') # 1 = live, 2 = offline - if show_status == '2': - raise ExtractorError( - 'Live stream is offline', expected=True) + if room.get('show_status') == '2': + raise ExtractorError('Live stream is offline', expected=True) - flv_json = None - # Douyu API sometimes returns error "Unable to load the requested class: eticket_redis_cache" - # Retry with different parameters - same parameters cause same errors - for i in range(5): - tt = int(time.time() / 60) - did = uuid.uuid4().hex.upper() + tt = compat_str(int(time.time() / 60)) + did = uuid.uuid4().hex.upper() - # Decompile core.swf in webpage by ffdec "Search SWFs in memory" - # core.swf is encrypted originally, but ffdec can dump memory to get the decrypted one - # If API changes in the future, just use this way to update - sign_content = '{room_id}{did}A12Svb&%1UUmf@hC{tt}'.format(room_id = room_id, did = did, tt = tt) - sign = hashlib.md5((sign_content).encode('utf-8')).hexdigest() + sign_content = ''.join((room_id, did, self._API_KEY, tt)) + sign = hashlib.md5((sign_content).encode('utf-8')).hexdigest() - payload = {'cdn': 'ws', 'rate': '0', 'tt': tt, 'did': did, 'sign': sign} - flv_data = compat_urllib_parse_urlencode(payload) + flv_data = compat_urllib_parse_urlencode({ + 'cdn': 'ws', + 'rate': '0', + 'tt': tt, + 'did': did, + 'sign': sign, + }) - flv_request_url = 
'http://www.douyu.com/lapi/live/getPlay/%s' % room_id - flv_content = self._download_webpage(flv_request_url, video_id, data=flv_data, - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - try: - flv_json = self._parse_json(flv_content, video_id, fatal=False) - except ExtractorError: - # Wait some time before retrying to get a different time() value - self._sleep(1, video_id, msg_template='%(video_id)s: Error occurs. ' - 'Waiting for %(timeout)s seconds before retrying') - continue - else: - break - if flv_json is None: - raise ExtractorError('Unable to fetch API result') + video_info = self._download_json( + 'http://www.douyu.com/lapi/live/getPlay/%s' % room_id, video_id, + data=flv_data, note='Downloading video info', + headers={'Content-Type': 'application/x-www-form-urlencoded'}) - flv = flv_json['data'] - - error_code = flv_json.get('error', 0) + error_code = video_info.get('error', 0) if error_code is not 0: - error_desc = 'Server reported error %i' % error_code - if isinstance(flv, (compat_str, compat_basestring)): - error_desc += ': ' + flv - raise ExtractorError(error_desc, expected=True) + raise ExtractorError( + '%s reported error %i' % (self.IE_NAME, error_code), + expected=True) - base_url = flv['rtmp_url'] - live_path = flv['rtmp_live'] + base_url = video_info['data']['rtmp_url'] + live_path = video_info['data']['rtmp_live'] video_url = '%s/%s' % (base_url, live_path) From 92c27a0dbf19eff211e7ffdd8db5895387e75529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Aug 2016 02:35:49 +0700 Subject: [PATCH 183/218] [periscope:user] Fix extraction (Closes #10453) --- youtube_dl/extractor/periscope.py | 47 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 75f5884a9..6c640089d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -8,7 +8,14 @@ 
from ..utils import ( ) -class PeriscopeIE(InfoExtractor): +class PeriscopeBaseIE(InfoExtractor): + def _call_api(self, method, query, item_id): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s' % method, + item_id, query=query) + + +class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' @@ -34,14 +41,11 @@ class PeriscopeIE(InfoExtractor): 'only_matching': True, }] - def _call_api(self, method, value): - return self._download_json( - 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) - def _real_extract(self, url): token = self._match_id(url) - broadcast_data = self._call_api('getBroadcastPublic', token) + broadcast_data = self._call_api( + 'getBroadcastPublic', {'broadcast_id': token}, token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -61,7 +65,8 @@ class PeriscopeIE(InfoExtractor): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - stream = self._call_api('getAccessPublic', token) + stream = self._call_api( + 'getAccessPublic', {'broadcast_id': token}, token) formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): @@ -88,7 +93,7 @@ class PeriscopeIE(InfoExtractor): } -class PeriscopeUserIE(InfoExtractor): +class PeriscopeUserIE(PeriscopeBaseIE): _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' @@ -106,26 +111,34 @@ class PeriscopeUserIE(InfoExtractor): } def _real_extract(self, url): - user_id = self._match_id(url) + user_name = self._match_id(url) - webpage = self._download_webpage(url, user_id) + webpage = self._download_webpage(url, user_name) data_store = self._parse_json( unescapeHTML(self._search_regex( r'data-store=(["\'])(?P<data>.+?)\1', webpage, 'data store', default='{}', group='data')), - user_id) + user_name) - user = data_store.get('User', 
{}).get('user', {}) - title = user.get('display_name') or user.get('username') + user = list(data_store['UserCache']['users'].values())[0]['user'] + user_id = user['id'] + session_id = data_store['SessionToken']['broadcastHistory']['token']['session_id'] + + broadcasts = self._call_api( + 'getUserBroadcastsPublic', + {'user_id': user_id, 'session_id': session_id}, + user_name)['broadcasts'] + + broadcast_ids = [ + broadcast['id'] for broadcast in broadcasts if broadcast.get('id')] + + title = user.get('display_name') or user.get('username') or user_name description = user.get('description') - broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or - data_store.get('BroadcastCache', {}).get('broadcastIds', [])) - entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) + 'https://www.periscope.tv/%s/%s' % (user_name, broadcast_id)) for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) From d7aae610f6674d96971246f916973158374f88b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Aug 2016 07:00:15 +0700 Subject: [PATCH 184/218] [ChangeLog] Actualize --- ChangeLog | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 0789549c0..4062c2021 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,23 @@ version <unreleased> +Core ++ Add warning message that ffmpeg doesn't support SOCKS +* Improve thumbnail sorting ++ Extract formats from #EXT-X-MEDIA tags in _extract_m3u8_formats +* Fill IV with leading zeros for IVs shorter than 16 octets in hlsnative ++ Add ac-3 to the list of audio codecs in parse_codecs + Extractors +* [periscope:user] Fix extraction (#10453) +* [douyutv] Fix extraction (#10153, #10318, #10444) ++ [nhk:vod] Add extractor for www3.nhk.or.jp on demand (#4437, #10424) +- [trutube] Remove extractor (#10438) ++ [usanetwork] Add extractor for 
usanetwork.com * [crackle] Fix extraction (#10333) -* [spankbang] Fix description and uploader (#10339) +* [spankbang] Fix description and uploader extraction (#10339) +* [discoverygo] Detect cable provider restricted videos (#10425) ++ [cbc] Add support for watch.cbc.ca +* [kickstarter] Silent the warning for og:description (#10415) * [mtvservices:embedded] Fix extraction for the new 'edge' player (#10363) From 71e90766b5f7d57bdbe20b71c32ce5a8f66aecc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Aug 2016 07:09:03 +0700 Subject: [PATCH 185/218] [README.md] Fix typo in download archive FAQ entry --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 20241307f..87465aa5e 100644 --- a/README.md +++ b/README.md @@ -836,7 +836,7 @@ You will first need to tell youtube-dl to stream media to stdout with `-o -`, an ### How do I download only new videos from a playlist? -Use download-archive feature. With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special file. Each subsequent run with the same `--download-archive` will download only new videos that and skip all videos that have been downloaded before. Note that only successful downloads are recorded in the file. +Use download-archive feature. With this feature you should initially download the complete playlist with `--download-archive /path/to/download/archive/file.txt` that will record identifiers of all the videos in a special file. Each subsequent run with the same `--download-archive` will download only new videos and skip all videos that have been downloaded before. Note that only successful downloads are recorded in the file. 
For example, at first, From 1198fe14a1eff1047652c51163266246577e3682 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Aug 2016 07:24:08 +0700 Subject: [PATCH 186/218] release 2016.08.28 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 11 +++++++---- youtube_dl/version.py | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 15acc025a..a2fe59f80 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.24.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.24.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.24.1 +[debug] youtube-dl version 2016.08.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 4062c2021..d3496b5dc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2016.08.28 Core + Add warning message that ffmpeg doesn't support SOCKS diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 08db56fa9..bf08697be 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -89,7 +89,7 @@ - **Bet** - **Bigflix** - **Bild**: Bild.de - - **BiliBili** + - **BiliBili** (Currently broken) - **BioBioChileTV** - **BIQLE** - **BleacherReport** @@ -115,8 +115,10 @@ - **Canvas** - **CarambaTV** - **CarambaTVPage** - - **CBC** - - **CBCPlayer** + - **cbc.ca** + - **cbc.ca:player** + - **cbc.ca:watch** + - **cbc.ca:watch:video** - **CBS** - **CBSInteractive** - **CBSLocal** @@ -448,6 +450,7 @@ - **NextMediaActionNews**: 蘋果日報 - 動新聞 - **nfb**: National Film Board of Canada - **nfl.com** + - **NhkVod** - **nhl.com** - **nhl.com:news**: NHL news - **nhl.com:videocenter** @@ -713,7 +716,6 @@ - **TrailerAddict** (Currently broken) - **Trilulilu** - **trollvids** - - **TruTube** - **Tube8** - **TubiTv** - **tudou** @@ -758,6 +760,7 @@ - **uplynk:preplay** - 
**Urort**: NRK P3 Urørt - **URPlay** + - **USANetwork** - **USAToday** - **ustream** - **ustream:channel** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7447d3d7e..ee30ca2ad 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.24.1' +__version__ = '2016.08.28' From 39efc6e3e048a8323c36efcdf6b7434259a35e44 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 28 Aug 2016 15:46:11 +0800 Subject: [PATCH 187/218] [generic] Update some _TESTS --- youtube_dl/extractor/generic.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 506892b11..c6e655c84 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -104,7 +104,8 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' - ] + ], + 'skip': 'URL invalid', }, # Direct download with broken HEAD { @@ -268,7 +269,8 @@ class GenericIE(InfoExtractor): 'params': { # m3u8 downloads 'skip_download': True, - } + }, + 'skip': 'video gone', }, # m3u8 served with Content-Type: text/plain { @@ -283,7 +285,8 @@ class GenericIE(InfoExtractor): 'params': { # m3u8 downloads 'skip_download': True, - } + }, + 'skip': 'video gone', }, # google redirect { @@ -368,6 +371,7 @@ class GenericIE(InfoExtractor): 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. 
To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, 'add_ie': ['BrightcoveLegacy'], + 'skip': 'video gone', }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -421,6 +425,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'movie expired', }, # embed.ly video { @@ -448,6 +453,8 @@ class GenericIE(InfoExtractor): 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, + # HEAD requests lead to endless 301, while GET is OK + 'expected_warnings': ['301'], }, # RUTV embed { @@ -522,6 +529,9 @@ class GenericIE(InfoExtractor): 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', }, 'playlist_mincount': 7, + # This forum does not allow <iframe> syntaxes anymore + # Now HTML tags are displayed as-is + 'skip': 'No videos on this page', }, # Embedded TED video { @@ -570,7 +580,8 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': 'Requires rtmpdump' - } + }, + 'skip': 'video gone', }, # francetv embed { From 40eec6b15cd3135b24cb42fde5ccf62e9a1f0807 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 28 Aug 2016 20:27:08 +0800 Subject: [PATCH 188/218] [openload] Fix extraction (closes #10408) Thanks to @yokrysty again! 
--- ChangeLog | 6 ++++++ youtube_dl/extractor/openload.py | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index d3496b5dc..5d7a052a5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [openload] Fix extraction (#10408) + + version 2016.08.28 Core diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index e181d0b3a..c8dde7ae3 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -55,10 +55,12 @@ class OpenloadIE(InfoExtractor): video_url_chars = [] - for c in enc_data: + for idx, c in enumerate(enc_data): j = compat_ord(c) if j >= 33 and j <= 126: j = ((j + 14) % 94) + 33 + if idx == len(enc_data) - 1: + j += 2 video_url_chars += compat_chr(j) video_url = 'https://openload.co/stream/%s?mime=true' % ''.join(video_url_chars) From 04b32c8f9679004d11ee97c2b7beecaedf1b477b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 28 Aug 2016 22:06:31 +0800 Subject: [PATCH 189/218] [bilibili] Fix extraction (closes #10375) Thanks @gdkchan for the algorithm --- ChangeLog | 1 + youtube_dl/extractor/bilibili.py | 98 ++++++++++++-------------------- 2 files changed, 36 insertions(+), 63 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5d7a052a5..e055976c5 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [bilibili] Fix extraction (#10375) * [openload] Fix extraction (#10408) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index d87c38a02..a332fbb69 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,35 +1,26 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime +import hashlib import re from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_parse_qs, - compat_xml_parse_error, -) +from ..compat import 
compat_parse_qs from ..utils import ( - ExtractorError, int_or_none, float_or_none, - xpath_text, + unified_timestamp, ) class BiliBiliIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { - 'id': '1554319', + 'id': '1074402', 'ext': 'mp4', 'title': '【金坷垃】金泡沫', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', @@ -43,24 +34,28 @@ class BiliBiliIE(InfoExtractor): }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { - 'id': '1507019', + 'id': '1041170', 'ext': 'mp4', 'title': '【BD1080P】刀语【诸神&异域】', 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'duration': 3382.259, 'timestamp': 1396530060, 'upload_date': '20140403', + 'thumbnail': 're:^https?://.+\.jpg', 'uploader': '枫叶逝去', 'uploader_id': '520116', }, }, { 'url': 'http://www.bilibili.com/video/av4808130/', 'info_dict': { - 'id': '7802182', + 'id': '4808130', 'ext': 'mp4', 'title': '【长篇】哆啦A梦443【钉铛】', 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'duration': 1493.995, 'timestamp': 1464564180, 'upload_date': '20160529', + 'thumbnail': 're:^https?://.+\.jpg', 'uploader': '喜欢拉面', 'uploader_id': '151066', }, @@ -68,12 +63,14 @@ class BiliBiliIE(InfoExtractor): # Missing upload time 'url': 'http://www.bilibili.com/video/av1867637/', 'info_dict': { - 'id': '2880301', + 'id': '1867637', 'ext': 'mp4', 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', + 'duration': 5760.0, 'uploader': '黑夜为猫', 'uploader_id': '610729', + 'thumbnail': 're:^https?://.+\.jpg', }, 'params': { # Just to test metadata extraction @@ -82,86 +79,61 @@ class BiliBiliIE(InfoExtractor): 'expected_warnings': ['upload time'], }] - # BiliBili blocks keys 
from time to time. The current key is extracted from - # the Android client - # TODO: find the sign algorithm used in the flash player - _APP_KEY = '86385cdc024c0f6c' + _APP_KEY = '6f90a59ac58a4123' + _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - params = compat_parse_qs(self._search_regex( + cid = compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters')) - cid = params['cid'][0] + webpage, 'player parameters'))['cid'][0] - info_xml_str = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play', - cid, query={'appkey': self._APP_KEY, 'cid': cid}, - note='Downloading video info page') + payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) + sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - err_msg = None - durls = None - info_xml = None - try: - info_xml = compat_etree_fromstring(info_xml_str.encode('utf-8')) - except compat_xml_parse_error: - info_json = self._parse_json(info_xml_str, video_id, fatal=False) - err_msg = (info_json or {}).get('error_text') - else: - err_msg = xpath_text(info_xml, './message') - - if info_xml is not None: - durls = info_xml.findall('./durl') - if not durls: - if err_msg: - raise ExtractorError('%s said: %s' % (self.IE_NAME, err_msg), expected=True) - else: - raise ExtractorError('No videos found!') + video_info = self._download_json( + 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), + video_id, note='Downloading video info page') entries = [] - for durl in durls: - size = xpath_text(durl, ['./filesize', './size']) + for idx, durl in enumerate(video_info['durl']): formats = [{ - 'url': durl.find('./url').text, - 'filesize': int_or_none(size), 
+ 'url': durl['url'], + 'filesize': int_or_none(durl['size']), }] - for backup_url in durl.findall('./backup_url/url'): + for backup_url in durl['backup_url']: formats.append({ - 'url': backup_url.text, + 'url': backup_url, # backup URLs have lower priorities - 'preference': -2 if 'hd.mp4' in backup_url.text else -3, + 'preference': -2 if 'hd.mp4' in backup_url else -3, }) self._sort_formats(formats) entries.append({ - 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), - 'duration': int_or_none(xpath_text(durl, './length'), 1000), + 'id': '%s_part%s' % (video_id, idx), + 'duration': float_or_none(durl.get('length'), 1000), 'formats': formats, }) title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') description = self._html_search_meta('description', webpage) - datetime_str = self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False) - timestamp = None - if datetime_str: - timestamp = calendar.timegm(datetime.datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M').timetuple()) + timestamp = unified_timestamp(self._html_search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False)) # TODO 'view_count' requires deobfuscating Javascript info = { - 'id': compat_str(cid), + 'id': video_id, 'title': title, 'description': description, 'timestamp': timestamp, 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), - 'duration': float_or_none(xpath_text(info_xml, './timelength'), scale=1000), + 'duration': float_or_none(video_info.get('timelength'), scale=1000), } uploader_mobj = re.search( From 98908bcf7c50d034042ab86223b7689e91b589ba Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 28 Aug 2016 22:49:46 +0800 Subject: [PATCH 190/218] [openload] Update algorithm again (#10408) --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 
c8dde7ae3..03baf8e32 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -60,7 +60,7 @@ class OpenloadIE(InfoExtractor): if j >= 33 and j <= 126: j = ((j + 14) % 94) + 33 if idx == len(enc_data) - 1: - j += 2 + j += 1 video_url_chars += compat_chr(j) video_url = 'https://openload.co/stream/%s?mime=true' % ''.join(video_url_chars) From 2982514072594b1f708abdf654b31da77c0bfa81 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 28 Aug 2016 16:43:15 +0100 Subject: [PATCH 191/218] [turner,nba,cnn,adultswim] add base extractor to parse cvp feeds --- youtube_dl/extractor/adultswim.py | 70 +++---------- youtube_dl/extractor/cnn.py | 97 +++++------------- youtube_dl/extractor/nba.py | 70 +++---------- youtube_dl/extractor/turner.py | 163 ++++++++++++++++++++++++++++++ 4 files changed, 214 insertions(+), 186 deletions(-) create mode 100644 youtube_dl/extractor/turner.py diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 96599048f..ef3cc2a61 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -3,16 +3,11 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - ExtractorError, - float_or_none, - xpath_text, -) +from .turner import TurnerBaseIE +from ..utils import ExtractorError -class AdultSwimIE(InfoExtractor): +class AdultSwimIE(TurnerBaseIE): _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?' 
_TESTS = [{ @@ -96,7 +91,8 @@ class AdultSwimIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': ['Unable to download f4m manifest'], }] @staticmethod @@ -176,57 +172,23 @@ class AdultSwimIE(InfoExtractor): entries = [] for part_num, segment_id in enumerate(segment_ids): - segment_url = 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id - + segement_info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, + segment_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }) segment_title = '%s - %s' % (show_title, episode_title) if len(segment_ids) > 1: segment_title += ' Part %d' % (part_num + 1) - - idoc = self._download_xml( - segment_url, segment_title, - 'Downloading segment information', 'Unable to download segment information') - - segment_duration = float_or_none( - xpath_text(idoc, './/trt', 'segment duration').strip()) - - formats = [] - file_els = idoc.findall('.//files/file') or idoc.findall('./files/file') - - unique_urls = [] - unique_file_els = [] - for file_el in file_els: - media_url = file_el.text - if not media_url or determine_ext(media_url) == 'f4m': - continue - if file_el.text not in unique_urls: - unique_urls.append(file_el.text) - unique_file_els.append(file_el) - - for file_el in unique_file_els: - bitrate = file_el.attrib.get('bitrate') - ftype = file_el.attrib.get('type') - media_url = file_el.text - if determine_ext(media_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, segment_title, 'mp4', preference=0, - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': '%s_%s' % (bitrate, ftype), - 'url': file_el.text.strip(), - # The bitrate may not be a number (for example: 'iphone') - 'tbr': int(bitrate) if bitrate.isdigit() 
else None, - }) - - self._sort_formats(formats) - - entries.append({ + segement_info.update({ 'id': segment_id, 'title': segment_title, - 'formats': formats, - 'duration': segment_duration, - 'description': episode_description + 'description': episode_description, }) + entries.append(segement_info) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 220bb55e8..1bf87f6ea 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -3,14 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, - url_basename, -) +from .turner import TurnerBaseIE +from ..utils import url_basename -class CNNIE(InfoExtractor): +class CNNIE(TurnerBaseIE): _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' @@ -18,43 +15,50 @@ class CNNIE(InfoExtractor): 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', 'md5': '3e6121ea48df7e2259fe73a0628605c4', 'info_dict': { - 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', + 'id': 'nadal-1-on-1', 'ext': 'mp4', 'title': 'Nadal wins 8th French Open title', 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', 'duration': 135, 'upload_date': '20130609', }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', 'info_dict': { - 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', + 'id': 'sot-student-gives-epic-speech', 'ext': 'mp4', 'title': "Student's epic speech stuns new freshmen", 
'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", 'upload_date': '20130821', - } + }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', 'md5': 'f14d02ebd264df951feb2400e2c25a1b', 'info_dict': { - 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', + 'id': 'growing-america-nashville-salemtown-board-episode-1', 'ext': 'mp4', 'title': 'Nashville Ep. 1: Hand crafted skateboards', 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', - } + }, + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', 'info_dict': { - 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'id': 'netflix-stunning-stats', 'ext': 'mp4', 'title': '5 stunning stats about Netflix', 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', 'upload_date': '20160819', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', 'only_matching': True, @@ -84,67 +88,12 @@ class CNNIE(InfoExtractor): if sub_domain not in ('money', 'edition'): sub_domain = 'edition' config = self._CONFIG[sub_domain] - info_url = config['data_src'] % path - info = self._download_xml(info_url, page_title) - - formats = [] - rex = re.compile(r'''(?x) - (?P<width>[0-9]+)x(?P<height>[0-9]+) - (?:_(?P<bitrate>[0-9]+)k)? 
- ''') - for f in info.findall('files/file'): - video_url = config['media_src'] + f.text.strip() - fdct = { - 'format_id': f.attrib['bitrate'], - 'url': video_url, - } - - mf = rex.match(f.attrib['bitrate']) - if mf: - fdct['width'] = int(mf.group('width')) - fdct['height'] = int(mf.group('height')) - fdct['tbr'] = int_or_none(mf.group('bitrate')) - else: - mf = rex.search(f.text) - if mf: - fdct['width'] = int(mf.group('width')) - fdct['height'] = int(mf.group('height')) - fdct['tbr'] = int_or_none(mf.group('bitrate')) - else: - mi = re.match(r'ios_(audio|[0-9]+)$', f.attrib['bitrate']) - if mi: - if mi.group(1) == 'audio': - fdct['vcodec'] = 'none' - fdct['ext'] = 'm4a' - else: - fdct['tbr'] = int(mi.group(1)) - - formats.append(fdct) - - self._sort_formats(formats) - - thumbnails = [{ - 'height': int(t.attrib['height']), - 'width': int(t.attrib['width']), - 'url': t.text, - } for t in info.findall('images/image')] - - metas_el = info.find('metas') - upload_date = ( - metas_el.attrib.get('version') if metas_el is not None else None) - - duration_el = info.find('length') - duration = parse_duration(duration_el.text) - - return { - 'id': info.attrib['id'], - 'title': info.find('headline').text, - 'formats': formats, - 'thumbnails': thumbnails, - 'description': info.find('description').text, - 'duration': duration, - 'upload_date': upload_date, - } + return self._extract_cvp_info( + config['data_src'] % path, page_title, { + 'default': { + 'media_src': config['media_src'], + } + }) class CNNBlogsIE(InfoExtractor): diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index d896b0d04..aabd5b670 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,25 +1,20 @@ from __future__ import unicode_literals import functools -import os.path import re -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..compat import ( compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( - int_or_none, 
OnDemandPagedList, - parse_duration, remove_start, - xpath_text, - xpath_attr, ) -class NBAIE(InfoExtractor): +class NBAIE(TurnerBaseIE): _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)+(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', @@ -59,7 +54,7 @@ class NBAIE(InfoExtractor): 'ext': 'mp4', 'title': 'Practice: Doc Rivers - 2/16/16', 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160217', + 'upload_date': '20160216', 'timestamp': 1455672000, }, 'params': { @@ -80,7 +75,7 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { - 'id': 'Wigginsmp4', + 'id': 'Wigginsmp4-3462601', 'ext': 'mp4', 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', @@ -145,53 +140,12 @@ class NBAIE(InfoExtractor): if path.startswith('video/teams'): path = 'video/channels/proxy/' + path[6:] - video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) - video_id = os.path.splitext(xpath_text(video_info, 'slug'))[0] - title = xpath_text(video_info, 'headline') - description = xpath_text(video_info, 'description') - duration = parse_duration(xpath_text(video_info, 'length')) - timestamp = int_or_none(xpath_attr(video_info, 'dateCreated', 'uts')) - - thumbnails = [] - for image in video_info.find('images'): - thumbnails.append({ - 'id': image.attrib.get('cut'), - 'url': image.text, - 'width': int_or_none(image.attrib.get('width')), - 'height': int_or_none(image.attrib.get('height')), + return self._extract_cvp_info( + 'http://www.nba.com/%s.xml' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, }) - - formats = [] - for video_file in 
video_info.findall('.//file'): - video_url = video_file.text - if video_url.startswith('/'): - continue - if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) - elif video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False)) - else: - key = video_file.attrib.get('bitrate') - format_info = { - 'format_id': key, - 'url': video_url, - } - mobj = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key) - if mobj: - format_info.update({ - 'width': int(mobj.group(1)), - 'height': int(mobj.group(2)), - 'tbr': int_or_none(mobj.group(3)), - }) - formats.append(format_info) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'thumbnails': thumbnails, - 'formats': formats, - } diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py new file mode 100644 index 000000000..0d4271f11 --- /dev/null +++ b/youtube_dl/extractor/turner.py @@ -0,0 +1,163 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + int_or_none, + determine_ext, + parse_duration, + xpath_attr, + update_url_query, +) + + +class TurnerBaseIE(InfoExtractor): + def _extract_cvp_info(self, data_src, video_id, path_data={}): + video_data = self._download_xml(data_src, video_id) + video_id = video_data.attrib['id'].split('/')[-1].split('.')[0] + title = xpath_text(video_data, 'headline', fatal=True) + # rtmp_src = xpath_text(video_data, 'akamai/src') + # if rtmp_src: + # splited_rtmp_src = rtmp_src.split(',') + # if len(splited_rtmp_src) == 2: + # rtmp_src = splited_rtmp_src[1] + # aifp = xpath_text(video_data, 'akamai/aifp', default='') + + tokens = {} + urls = [] + formats = [] + rex = re.compile(r'''(?x) + (?P<width>[0-9]+)x(?P<height>[0-9]+) + 
(?:_(?P<bitrate>[0-9]+))? + ''') + for video_file in video_data.findall('files/file'): + video_url = video_file.text.strip() + if not video_url: + continue + ext = determine_ext(video_url) + if video_url.startswith('/mp4:protected/'): + continue + # TODO Correct extraction for these files + # protected_path_data = path_data.get('protected') + # if not protected_path_data or not rtmp_src: + # continue + # protected_path = self._search_regex( + # r'/mp4:(.+)\.[a-z0-9]', video_url, 'secure path') + # auth = self._download_webpage( + # protected_path_data['tokenizer_src'], query={ + # 'path': protected_path, + # 'videoId': video_id, + # 'aifp': aifp, + # }) + # token = xpath_text(auth, 'token') + # if not token: + # continue + # video_url = rtmp_src + video_url + '?' + token + elif video_url.startswith('/secure/'): + secure_path_data = path_data.get('secure') + if not secure_path_data: + continue + video_url = secure_path_data['media_src'] + video_url + secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' + token = tokens.get(secure_path) + if not token: + auth = self._download_xml( + secure_path_data['tokenizer_src'], video_id, query={ + 'path': secure_path, + 'videoId': video_id, + }) + token = xpath_text(auth, 'token') + if not token: + continue + tokens[secure_path] = token + video_url = video_url + '?hdnea=' + token + elif not re.match('https?://', video_url): + base_path_data = path_data.get(ext, path_data.get('default', {})) + media_src = base_path_data.get('media_src') + if not media_src: + continue + video_url = media_src + video_url + if video_url in urls: + continue + urls.append(video_url) + format_id = video_file.attrib['bitrate'] + if ext == 'smil': + formats.extend(self._extract_smil_formats(video_url, video_id, fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + 
update_url_query(video_url, {'hdcore': '3.7.0'}), + video_id, f4m_id=format_id, fatal=False)) + else: + f = { + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + } + mobj = rex.search(format_id + video_url) + if mobj: + f.update({ + 'width': int(mobj.group('width')), + 'height': int(mobj.group('height')), + 'tbr': int_or_none(mobj.group('bitrate')), + }) + elif format_id.isdigit(): + f['tbr'] = int(format_id) + else: + mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) + if mobj: + if mobj.group(1) == 'audio': + f.update({ + 'vcodec': 'none', + 'ext': 'm4a', + }) + else: + f['tbr'] = int(mobj.group(1)) + formats.append(f) + self._sort_formats(formats) + + subtitles = {} + for source in video_data.findall('closedCaptions/source'): + for track in source.findall('track'): + source_url = source.get('url') + if not source_url: + continue + subtitles.set_default(source.get('lang') or source.get('label') or 'en', []).append({ + 'url': source_url, + 'ext': { + 'scc': 'scc', + 'webvtt': 'vtt', + 'smptett': 'tt', + }.get(source.get('format')) + }) + + thumbnails = [{ + 'id': image.get('cut'), + 'url': image.text, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in video_data.findall('images/image')] + + timestamp = None + if 'cnn.com' not in data_src: + timestamp = int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': xpath_text(video_data, 'description'), + 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), + 'timestamp': timestamp, + 'upload_date': xpath_attr(video_data, 'metas', 'version'), + 'series': xpath_text(video_data, 'showTitle'), + 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), + 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + } From 
ec65b391cbb0bc42a78515915e61602f4d1ae1f9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 28 Aug 2016 16:47:59 +0100 Subject: [PATCH 192/218] [cartoonnetwork] Add new extractor(#10110) --- youtube_dl/extractor/cartoonnetwork.py | 36 ++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/cartoonnetwork.py diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py new file mode 100644 index 000000000..813f53644 --- /dev/null +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE + + +class CartoonNetworkIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?cartoonnetwork\.com/video/(?:[^/]+/)+(?P<id>[^/?#]+)-(?:clip|episode)\.html' + _TEST = { + 'url': 'http://www.cartoonnetwork.com/video/teen-titans-go/starfire-the-cat-lady-clip.html', + 'info_dict': { + 'id': '8a250ab04ed07e6c014ef3f1e2f9016c', + 'ext': 'mp4', + 'title': 'Starfire the Cat Lady', + 'description': 'Robin decides to become a cat so that Starfire will finally love him.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() + query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id + return self._extract_cvp_info( + 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?' 
+ query, video_id, { + 'secure': { + 'media_src': 'http://apple-secure.cdn.turner.com/toon/big', + 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', + }, + }) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8d88d6cb4..6eb495b07 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -127,6 +127,7 @@ from .carambatv import ( CarambaTVIE, CarambaTVPageIE, ) +from .cartoonnetwork import CartoonNetworkIE from .cbc import ( CBCIE, CBCPlayerIE, From b3eaeded12f470afd6f0cb851e6b7dd2ee78b7c5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 28 Aug 2016 16:50:32 +0100 Subject: [PATCH 193/218] [tbs] Add new extractor(#10222) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tbs.py | 59 ++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/tbs.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6eb495b07..06c6746ff 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -824,6 +824,7 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE +from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachertube import ( TeacherTubeIE, diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py new file mode 100644 index 000000000..79b00e376 --- /dev/null +++ b/youtube_dl/extractor/tbs.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE +from ..utils import ( + extract_attributes, + ExtractorError, +) + + +class TBSIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html' + _TESTS = [{ + 'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html', + 'md5': 
'9e61d680e2285066ade7199e6408b2ee', + 'info_dict': { + 'id': '2007318', + 'ext': 'mp4', + 'title': 'Theatrical Trailer', + 'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.', + } + }, { + 'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html', + 'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56', + 'info_dict': { + 'id': '1538823', + 'ext': 'mp4', + 'title': 'You Better Run', + 'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.', + } + }] + + def _real_extract(self, url): + domain, display_id = re.match(self._VALID_URL, url).groups() + site = domain[:3] + webpage = self._download_webpage(url, display_id) + video_params = extract_attributes(self._search_regex(r'(<[^>]+id="page-video"[^>]*>)', webpage, 'video params')) + if video_params.get('isAuthRequired') == 'true': + raise ExtractorError( + 'This video is only available via cable service provider subscription that' + ' is not currently supported.', expected=True) + query = None + clip_id = video_params.get('clipid') + if clip_id: + query = 'id=' + clip_id + else: + query = 'titleId=' + video_params['titleid'] + return self._extract_cvp_info( + 'http://www.%s.com/service/cvpXml?%s' % (domain, query), display_id, { + 'default': { + 'media_src': 'http://ht.cdn.turner.com/%s/big' % site, + }, + 'secure': { + 'media_src': 'http://apple-secure.cdn.turner.com/%s/big' % site, + 'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, + }, + }) From 5bc8a73af69f4aac8b2df6f7c23ecfb4ee72e518 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 28 Aug 2016 17:08:26 +0100 Subject: [PATCH 194/218] [cartoonnetwork] make extraction work for more videos in the website some videos require `networkName=CN2` to be present in the feed url --- 
youtube_dl/extractor/cartoonnetwork.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index 813f53644..b3f30b1ca 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -28,7 +28,7 @@ class CartoonNetworkIE(TurnerBaseIE): id_type, video_id = re.search(r"_cnglobal\.cvp(Video|Title)Id\s*=\s*'([^']+)';", webpage).groups() query = ('id' if id_type == 'Video' else 'titleId') + '=' + video_id return self._extract_cvp_info( - 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?' + query, video_id, { + 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { 'secure': { 'media_src': 'http://apple-secure.cdn.turner.com/toon/big', 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', From b8079a40bc61326b17a672b073dce6cdfa791fb5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 28 Aug 2016 17:51:53 +0100 Subject: [PATCH 195/218] [turner] fix secure m3u8 formats downloading --- youtube_dl/downloader/hls.py | 11 +++++++---- youtube_dl/extractor/turner.py | 15 +++++++++++++-- youtube_dl/extractor/uplynk.py | 4 +--- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 8dd1b898e..baaff44d5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -83,7 +83,10 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + extra_query = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -97,8 +100,8 @@ class HlsFD(FragmentFD): if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, 
line)) frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) - if extra_param_to_segment_url: - frag_url = update_url_query(frag_url, extra_param_to_segment_url) + if extra_query: + frag_url = update_url_query(frag_url, extra_query) success = ctx['dl'].download(frag_filename, {'url': frag_url}) if not success: return False @@ -124,8 +127,8 @@ class HlsFD(FragmentFD): if not re.match(r'^https?://', decrypt_info['URI']): decrypt_info['URI'] = compat_urlparse.urljoin( man_url, decrypt_info['URI']) - if extra_param_to_segment_url: - decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_param_to_segment_url) + if extra_query: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): media_sequence = int(line[22:]) diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 0d4271f11..108caa9d8 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -11,6 +11,7 @@ from ..utils import ( parse_duration, xpath_attr, update_url_query, + compat_urlparse, ) @@ -87,8 +88,18 @@ class TurnerBaseIE(InfoExtractor): if ext == 'smil': formats.extend(self._extract_smil_formats(video_url, video_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + if m3u8_formats: + # Sometimes final URLs inside m3u8 are unsigned, let's fix this + # ourselves + qs = compat_urlparse.urlparse(video_url).query + if qs: + query = compat_urlparse.parse_qs(qs) + for m3u8_format in m3u8_formats: + m3u8_format['url'] = update_url_query(m3u8_format['url'], query) + m3u8_format['extra_param_to_segment_url'] = qs + formats.extend(m3u8_formats) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( 
update_url_query(video_url, {'hdcore': '3.7.0'}), diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py index ae529f690..2cd22cf8a 100644 --- a/youtube_dl/extractor/uplynk.py +++ b/youtube_dl/extractor/uplynk.py @@ -33,9 +33,7 @@ class UplynkIE(InfoExtractor): formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4') if session_id: for f in formats: - f['extra_param_to_segment_url'] = { - 'pbs': session_id, - } + f['extra_param_to_segment_url'] = 'pbs=' + session_id self._sort_formats(formats) asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) if asset.get('error') == 1: From 9ba1e1dcc0dc27d36f3f396cb608cef7cd50e48a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 29 Aug 2016 08:26:07 +0700 Subject: [PATCH 196/218] [played] Remove extractor (Closes #10470) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/played.py | 60 ------------------------------ 2 files changed, 61 deletions(-) delete mode 100644 youtube_dl/extractor/played.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 06c6746ff..20e85703f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -636,7 +636,6 @@ from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .pinkbike import PinkbikeIE from .pladform import PladformIE -from .played import PlayedIE from .playfm import PlayFMIE from .plays import PlaysTVIE from .playtvak import PlaytvakIE diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py deleted file mode 100644 index 57c875ef0..000000000 --- a/youtube_dl/extractor/played.py +++ /dev/null @@ -1,60 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import os.path - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, - 
urlencode_postdata, -) - - -class PlayedIE(InfoExtractor): - IE_NAME = 'played.to' - _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)' - - _TEST = { - 'url': 'http://played.to/j2f2sfiiukgt', - 'md5': 'c2bd75a368e82980e7257bf500c00637', - 'info_dict': { - 'id': 'j2f2sfiiukgt', - 'ext': 'flv', - 'title': 'youtube-dl_test_video.mp4', - }, - 'skip': 'Removed for copyright infringement.', # oh wow - } - - def _real_extract(self, url): - video_id = self._match_id(url) - orig_webpage = self._download_webpage(url, video_id) - - m_error = re.search( - r'(?s)Reason for deletion:.*?<b class="err"[^>]*>(?P<msg>[^<]+)</b>', orig_webpage) - if m_error: - raise ExtractorError(m_error.group('msg'), expected=True) - - data = self._hidden_inputs(orig_webpage) - - self._sleep(2, video_id) - - post = urlencode_postdata(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = sanitized_Request(url, post, headers) - webpage = self._download_webpage( - req, video_id, note='Downloading video page ...') - - title = os.path.splitext(data['fname'])[0] - - video_url = self._search_regex( - r'file: "?(.+?)",', webpage, 'video URL') - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } From 93b84045994ca88b486901f54de1102347a67537 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 29 Aug 2016 07:56:54 +0100 Subject: [PATCH 197/218] [generic,vodplatform] improve embed regex --- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/extractor/vodplatform.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c6e655c84..24b217715 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2243,11 +2243,11 @@ class GenericIE(InfoExtractor): # Look for VODPlatform embeds mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vod-platform\.net/embed/[^/?#]+)', + 
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', webpage) if mobj is not None: return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'VODPlatform') + self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py index b49542b16..7bdd8b1dc 100644 --- a/youtube_dl/extractor/vodplatform.py +++ b/youtube_dl/extractor/vodplatform.py @@ -6,7 +6,7 @@ from ..utils import unescapeHTML class VODPlatformIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/embed/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/[eE]mbed/(?P<id>[^/?#]+)' _TEST = { # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', From 6c9b71bc0862560cbb9c4c2d9ec295072c208838 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 29 Aug 2016 19:05:38 +0800 Subject: [PATCH 198/218] [downloader/external] Recommend --hls-prefer-native for SOCKS users Related: #10490 --- youtube_dl/downloader/external.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 17f12e970..0aeae3b8f 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -223,7 +223,8 @@ class FFmpegFD(ExternalFD): if proxy.startswith('socks'): self.report_warning( - '%s does not support SOCKS proxies. Downloading may fail.' % self.get_basename()) + '%s does not support SOCKS proxies. Downloading is likely to fail. ' + 'Consider adding --hls-prefer-native to your command.' 
% self.get_basename()) # Since December 2015 ffmpeg supports -http_proxy option (see # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) From 547993dcd09dd46fda2fd429ed0ed72db7263503 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 29 Aug 2016 21:52:41 +0700 Subject: [PATCH 199/218] [turner] Fix subtitles extraction --- youtube_dl/extractor/turner.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 108caa9d8..d69977b56 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -135,11 +135,12 @@ class TurnerBaseIE(InfoExtractor): subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): - source_url = source.get('url') - if not source_url: + track_url = track.get('url') + if not track_url: continue - subtitles.set_default(source.get('lang') or source.get('label') or 'en', []).append({ - 'url': source_url, + lang = track.get('lang') or track.get('label') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': track_url, 'ext': { 'scc': 'scc', 'webvtt': 'vtt', From cd10b3ea63fd167216234932aba4d63a34aec4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 29 Aug 2016 22:13:49 +0700 Subject: [PATCH 200/218] [turner] Extract all formats --- youtube_dl/extractor/turner.py | 46 ++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index d69977b56..6df22fd24 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( xpath_text, int_or_none, @@ -30,11 +31,11 @@ class 
TurnerBaseIE(InfoExtractor): tokens = {} urls = [] formats = [] - rex = re.compile(r'''(?x) - (?P<width>[0-9]+)x(?P<height>[0-9]+) - (?:_(?P<bitrate>[0-9]+))? - ''') - for video_file in video_data.findall('files/file'): + rex = re.compile( + r'(?P<width>[0-9]+)x(?P<height>[0-9]+)(?:_(?P<bitrate>[0-9]+))?') + # Possible formats locations: files/file, files/groupFiles/files + # and maybe others + for video_file in video_data.findall('.//file'): video_url = video_file.text.strip() if not video_url: continue @@ -84,12 +85,14 @@ class TurnerBaseIE(InfoExtractor): if video_url in urls: continue urls.append(video_url) - format_id = video_file.attrib['bitrate'] + format_id = video_file.get('bitrate') if ext == 'smil': - formats.extend(self._extract_smil_formats(video_url, video_id, fatal=False)) + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) elif ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) + video_url, video_id, 'mp4', m3u8_id=format_id or 'hls', + fatal=False) if m3u8_formats: # Sometimes final URLs inside m3u8 are unsigned, let's fix this # ourselves @@ -103,7 +106,7 @@ class TurnerBaseIE(InfoExtractor): elif ext == 'f4m': formats.extend(self._extract_f4m_formats( update_url_query(video_url, {'hdcore': '3.7.0'}), - video_id, f4m_id=format_id, fatal=False)) + video_id, f4m_id=format_id or 'hds', fatal=False)) else: f = { 'format_id': format_id, @@ -117,18 +120,19 @@ class TurnerBaseIE(InfoExtractor): 'height': int(mobj.group('height')), 'tbr': int_or_none(mobj.group('bitrate')), }) - elif format_id.isdigit(): - f['tbr'] = int(format_id) - else: - mobj = re.match(r'ios_(audio|[0-9]+)$', format_id) - if mobj: - if mobj.group(1) == 'audio': - f.update({ - 'vcodec': 'none', - 'ext': 'm4a', - }) - else: - f['tbr'] = int(mobj.group(1)) + elif isinstance(format_id, compat_str): + if format_id.isdigit(): + f['tbr'] = int(format_id) + else: + mobj = 
re.match(r'ios_(audio|[0-9]+)$', format_id) + if mobj: + if mobj.group(1) == 'audio': + f.update({ + 'vcodec': 'none', + 'ext': 'm4a', + }) + else: + f['tbr'] = int(mobj.group(1)) formats.append(f) self._sort_formats(formats) From 3fb2a23029934dcbf6fe2cd283d851506dcdff5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 29 Aug 2016 22:40:35 +0700 Subject: [PATCH 201/218] [adultswim] Extract video info from onlineOriginals (Closes #10492) --- youtube_dl/extractor/adultswim.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index ef3cc2a61..5d0bf5a68 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, +) class AdultSwimIE(TurnerBaseIE): @@ -144,7 +147,10 @@ class AdultSwimIE(TurnerBaseIE): if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] if not video_info: - video_info = bootstrapped_data.get('heroMetadata', {}).get('trailer').get('video') + video_info = bootstrapped_data.get( + 'heroMetadata', {}).get('trailer', {}).get('video') + if not video_info: + video_info = bootstrapped_data.get('onlineOriginals', [None])[0] if not video_info: raise ExtractorError('Unable to find video info') @@ -167,8 +173,9 @@ class AdultSwimIE(TurnerBaseIE): episode_id = video_info['id'] episode_title = video_info['title'] - episode_description = video_info['description'] - episode_duration = video_info.get('duration') + episode_description = video_info.get('description') + episode_duration = int_or_none(video_info.get('duration')) + view_count = int_or_none(video_info.get('views')) entries = [] for part_num, segment_id in enumerate(segment_ids): 
@@ -197,5 +204,6 @@ class AdultSwimIE(TurnerBaseIE): 'entries': entries, 'title': '%s - %s' % (show_title, episode_title), 'description': episode_description, - 'duration': episode_duration + 'duration': episode_duration, + 'view_count': view_count, } From 5a80e7b43a7abc83e104f1cd711d8fe7985c30eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 29 Aug 2016 22:44:15 +0700 Subject: [PATCH 202/218] [turner] Skip invalid subtitles' URLs --- youtube_dl/extractor/turner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 6df22fd24..f5736bd15 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -140,7 +140,7 @@ class TurnerBaseIE(InfoExtractor): for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): track_url = track.get('url') - if not track_url: + if not isinstance(track_url, compat_str) or track_url.endswith('/big'): continue lang = track.get('lang') or track.get('label') or 'en' subtitles.setdefault(lang, []).append({ From a06e1498aa7fc02e6db5c6ec8411e90f210ce2c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 29 Aug 2016 22:54:33 +0700 Subject: [PATCH 203/218] [kusi] Update test --- youtube_dl/extractor/kusi.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/kusi.py b/youtube_dl/extractor/kusi.py index 12cc56e44..2e66e8cf9 100644 --- a/youtube_dl/extractor/kusi.py +++ b/youtube_dl/extractor/kusi.py @@ -18,31 +18,20 @@ from ..utils import ( class KUSIIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?kusi\.com/(?P<path>story/.+|video\?clipId=(?P<clipId>\d+))' _TESTS = [{ - 'url': 'http://www.kusi.com/story/31183873/turko-files-case-closed-put-on-hold', - 'md5': 'f926e7684294cf8cb7bdf8858e1b3988', + 'url': 
'http://www.kusi.com/story/32849881/turko-files-refused-to-help-it-aint-right', + 'md5': '4e76ce8e53660ce9697d06c0ba6fc47d', 'info_dict': { - 'id': '12203019', + 'id': '12689020', 'ext': 'mp4', - 'title': 'Turko Files: Case Closed! & Put On Hold!', - 'duration': 231.0, - 'upload_date': '20160210', - 'timestamp': 1455087571, + 'title': "Turko Files: Refused to Help, It Ain't Right!", + 'duration': 223.586, + 'upload_date': '20160826', + 'timestamp': 1472233118, 'thumbnail': 're:^https?://.*\.jpg$' }, }, { 'url': 'http://kusi.com/video?clipId=12203019', - 'info_dict': { - 'id': '12203019', - 'ext': 'mp4', - 'title': 'Turko Files: Case Closed! & Put On Hold!', - 'duration': 231.0, - 'upload_date': '20160210', - 'timestamp': 1455087571, - 'thumbnail': 're:^https?://.*\.jpg$' - }, - 'params': { - 'skip_download': True, # Same as previous one - }, + 'only_matching': True, }] def _real_extract(self, url): From fe45b0e06081752ff3617cdfae701408a1d8256a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 29 Aug 2016 18:17:32 +0100 Subject: [PATCH 204/218] [9c9media] fix multiple stacks extraction and extract more metadata(#10016) --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/ninecninemedia.py | 126 +++++++++++++++++++------ 2 files changed, 103 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 20e85703f..21efa96b2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -554,7 +554,10 @@ from .nick import ( NickDeIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import NineCNineMediaIE +from .ninecninemedia import ( + NineCNineMediaStackIE, + NineCNineMediaIE, +) from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index d889245ad..ec4d675e2 100644 --- 
a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -4,40 +4,36 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_iso8601, - parse_duration, - ExtractorError + float_or_none, + ExtractorError, + int_or_none, ) -class NineCNineMediaIE(InfoExtractor): - _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)' +class NineCNineMediaBaseIE(InfoExtractor): + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' + + +class NineCNineMediaStackIE(NineCNineMediaBaseIE): + IE_NAME = '9c9media:stack' + _VALID_URL = r'9c9media:stack:(?P<destination_code>[^:]+):(?P<content_id>\d+):(?P<content_package>\d+):(?P<id>\d+)' def _real_extract(self, url): - destination_code, video_id = re.match(self._VALID_URL, url).groups() - api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) - content = self._download_json(api_base_url, video_id, query={ - '$include': '[contentpackages]', - }) - title = content['Name'] - if len(content['ContentPackages']) > 1: - raise ExtractorError('multiple content packages') - content_package = content['ContentPackages'][0] - stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] - stacks = self._download_json(stacks_base_url, video_id)['Items'] - if len(stacks) > 1: - raise ExtractorError('multiple stacks') - stack = stacks[0] - stack_base_url = '%s%s/manifest.' % (stacks_base_url, stack['Id']) + destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() + stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' 
+ stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) + formats = [] formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', video_id, 'mp4', + stack_base_url + 'm3u8', stack_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', video_id, + stack_base_url + 'f4m', stack_id, f4m_id='hds', fatal=False)) - mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) + mp4_url = self._download_webpage(stack_base_url + 'pd', stack_id, fatal=False) if mp4_url: formats.append({ 'url': mp4_url, @@ -46,10 +42,86 @@ class NineCNineMediaIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, - 'title': title, - 'description': content.get('Desc') or content.get('ShortDesc'), - 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), - 'duration': parse_duration(content.get('BroadcastTime')), + 'id': stack_id, 'formats': formats, } + + +class NineCNineMediaIE(NineCNineMediaBaseIE): + IE_NAME = '9c9media' + _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)' + + def _real_extract(self, url): + destination_code, content_id = re.match(self._VALID_URL, url).groups() + api_base_url = self._API_BASE_TEMPLATE % (destination_code, content_id) + content = self._download_json(api_base_url, content_id, query={ + '$include': '[Media,Season,ContentPackages]', + }) + title = content['Name'] + if len(content['ContentPackages']) > 1: + raise ExtractorError('multiple content packages') + content_package = content['ContentPackages'][0] + package_id = content_package['Id'] + content_package_url = api_base_url + 'contentpackages/%s/' % package_id + content_package = self._download_json(content_package_url, content_id) + + if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + raise ExtractorError('This video is DRM protected.', expected=True) + + stacks = 
self._download_json(content_package_url + 'stacks/', package_id)['Items'] + multistacks = len(stacks) > 1 + + thumbnails = [] + for image in content.get('Images', []): + image_url = image.get('Url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('Width')), + 'height': int_or_none(image.get('Height')), + }) + + tags, categories = [], [] + for source_name, container in (('Tags', tags), ('Genres', categories)): + for e in content.get(source_name, []): + e_name = e.get('Name') + if not e_name: + continue + container.append(e_name) + + description = content.get('Desc') or content.get('ShortDesc') + season = content.get('Season', {}) + base_info = { + 'description': description, + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'episode_number': int_or_none(content.get('Episode')), + 'season': season.get('Name'), + 'season_number': season.get('Number'), + 'season_id': season.get('Id'), + 'series': content.get('Media', {}).get('Name'), + 'tags': tags, + 'categories': categories, + } + + entries = [] + for stack in stacks: + stack_id = compat_str(stack['Id']) + entry = { + '_type': 'url_transparent', + 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), + 'id': stack_id, + 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, + 'duration': float_or_none(stack.get('Duration')), + 'ie_key': 'NineCNineMediaStack', + } + entry.update(base_info) + entries.append(entry) + + return { + '_type': 'multi_video', + 'id': content_id, + 'title': title, + 'description': description, + 'entries': entries, + } From 42e05be8671e149f79307145eda78892003279dc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 29 Aug 2016 18:20:58 +0100 Subject: [PATCH 205/218] [ctv] add support for (tsn,bnn,thecomedynetwork).ca websites(#10016) --- youtube_dl/extractor/ctv.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) 
diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py index 5807fbac9..a1fe86316 100644 --- a/youtube_dl/extractor/ctv.py +++ b/youtube_dl/extractor/ctv.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class CTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>ctv|tsn|bnn|thecomedynetwork)\.ca/.*?(?:\bvid=|-vid|~|%7E)(?P<id>[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctv.ca/video/player?vid=706966', 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -18,13 +20,27 @@ class CTVIE(InfoExtractor): 'timestamp': 1442624700, }, 'expected_warnings': ['HTTP Error 404'], + }, { + 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', + 'only_matching': True, + }, { + 'url': 'http://www.tsn.ca/video/expectations-high-for-milos-raonic-at-us-open~939549', + 'only_matching': True, + }, { + 'url': 'http://www.bnn.ca/video/berman-s-call-part-two-viewer-questions~939654', + 'only_matching': True, + }, { + 'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.match(self._VALID_URL, url).groups() + if domain == 'thecomedynetwork': + domain = 'comedy' return { '_type': 'url_transparent', 'id': video_id, - 'url': '9c9media:ctv_web:%s' % video_id, + 'url': '9c9media:%s_web:%s' % (domain, video_id), 'ie_key': 'NineCNineMedia', } From 1fe48afea5f203cbcb29c0d2984b7b850df8103f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 29 Aug 2016 18:23:21 +0100 Subject: [PATCH 206/218] [cnn] update _TEST for CNNBlogsIE and CNNArticleIE(closes #10489) --- youtube_dl/extractor/cnn.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 1bf87f6ea..bb42f35bd 100644 
--- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -102,12 +102,13 @@ class CNNBlogsIE(InfoExtractor): 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', 'info_dict': { - 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', + 'id': 'criminalizing-journalism', 'ext': 'mp4', 'title': 'Criminalizing journalism?', 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', 'upload_date': '20140209', }, + 'expected_warnings': ['Failed to download m3u8 information'], 'add_ie': ['CNN'], } @@ -127,12 +128,13 @@ class CNNArticleIE(InfoExtractor): 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', 'info_dict': { - 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', + 'id': 'ip-north-korea-obama', 'ext': 'mp4', 'title': 'Obama: Cyberattack not an act of war', - 'description': 'md5:51ce6750450603795cad0cdfbd7d05c5', + 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', 'upload_date': '20141221', }, + 'expected_warnings': ['Failed to download m3u8 information'], 'add_ie': ['CNN'], } From da30a20a4d8b0ece61c271a5d0f0c6de2817ef5f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 29 Aug 2016 19:26:53 +0100 Subject: [PATCH 207/218] [turner,cnn] move a check for wrong timestamp to CNNIE --- youtube_dl/extractor/cnn.py | 4 ++++ youtube_dl/extractor/turner.py | 9 ++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index bb42f35bd..a51b239cc 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -83,6 +83,10 @@ class CNNIE(TurnerBaseIE): }, } + def _extract_timestamp(self, video_data): + # TODO: fix timestamp extraction + return None + def _real_extract(self, url): sub_domain, path, 
page_title = re.match(self._VALID_URL, url).groups() if sub_domain not in ('money', 'edition'): diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index f5736bd15..64fdcc56e 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -17,6 +17,9 @@ from ..utils import ( class TurnerBaseIE(InfoExtractor): + def _extract_timestamp(self, video_data): + return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) + def _extract_cvp_info(self, data_src, video_id, path_data={}): video_data = self._download_xml(data_src, video_id) video_id = video_data.attrib['id'].split('/')[-1].split('.')[0] @@ -159,10 +162,6 @@ class TurnerBaseIE(InfoExtractor): 'height': int_or_none(image.get('height')), } for image in video_data.findall('images/image')] - timestamp = None - if 'cnn.com' not in data_src: - timestamp = int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - return { 'id': video_id, 'title': title, @@ -171,7 +170,7 @@ class TurnerBaseIE(InfoExtractor): 'thumbnails': thumbnails, 'description': xpath_text(video_data, 'description'), 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), - 'timestamp': timestamp, + 'timestamp': self._extract_timestamp(video_data), 'upload_date': xpath_attr(video_data, 'metas', 'version'), 'series': xpath_text(video_data, 'showTitle'), 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), From 3c77a54d5dfa1097d5e3a5eaa0c631b5b01e93ce Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 30 Aug 2016 10:46:48 +0100 Subject: [PATCH 208/218] [turner] keep video id intact --- youtube_dl/extractor/cnn.py | 12 ++++++------ youtube_dl/extractor/nba.py | 11 +++++++---- youtube_dl/extractor/turner.py | 2 +- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index a51b239cc..5fc311f53 100644 --- a/youtube_dl/extractor/cnn.py +++ 
b/youtube_dl/extractor/cnn.py @@ -15,7 +15,7 @@ class CNNIE(TurnerBaseIE): 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', 'md5': '3e6121ea48df7e2259fe73a0628605c4', 'info_dict': { - 'id': 'nadal-1-on-1', + 'id': 'sports/2013/06/09/nadal-1-on-1.cnn', 'ext': 'mp4', 'title': 'Nadal wins 8th French Open title', 'description': 'World Sport\'s Amanda Davies chats with 2013 French Open champion Rafael Nadal.', @@ -27,7 +27,7 @@ class CNNIE(TurnerBaseIE): 'url': 'http://edition.cnn.com/video/?/video/us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology&utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+rss%2Fcnn_topstories+%28RSS%3A+Top+Stories%29', 'md5': 'b5cc60c60a3477d185af8f19a2a26f4e', 'info_dict': { - 'id': 'sot-student-gives-epic-speech', + 'id': 'us/2013/08/21/sot-student-gives-epic-speech.georgia-institute-of-technology', 'ext': 'mp4', 'title': "Student's epic speech stuns new freshmen", 'description': "A Georgia Tech student welcomes the incoming freshmen with an epic speech backed by music from \"2001: A Space Odyssey.\"", @@ -38,7 +38,7 @@ class CNNIE(TurnerBaseIE): 'url': 'http://www.cnn.com/video/data/2.0/video/living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln.html', 'md5': 'f14d02ebd264df951feb2400e2c25a1b', 'info_dict': { - 'id': 'growing-america-nashville-salemtown-board-episode-1', + 'id': 'living/2014/12/22/growing-america-nashville-salemtown-board-episode-1.hln', 'ext': 'mp4', 'title': 'Nashville Ep. 
1: Hand crafted skateboards', 'description': 'md5:e7223a503315c9f150acac52e76de086', @@ -49,7 +49,7 @@ class CNNIE(TurnerBaseIE): 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', 'info_dict': { - 'id': 'netflix-stunning-stats', + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', 'ext': 'mp4', 'title': '5 stunning stats about Netflix', 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', @@ -106,7 +106,7 @@ class CNNBlogsIE(InfoExtractor): 'url': 'http://reliablesources.blogs.cnn.com/2014/02/09/criminalizing-journalism/', 'md5': '3e56f97b0b6ffb4b79f4ea0749551084', 'info_dict': { - 'id': 'criminalizing-journalism', + 'id': 'bestoftv/2014/02/09/criminalizing-journalism.cnn', 'ext': 'mp4', 'title': 'Criminalizing journalism?', 'description': 'Glenn Greenwald responds to comments made this week on Capitol Hill that journalists could be criminal accessories.', @@ -132,7 +132,7 @@ class CNNArticleIE(InfoExtractor): 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', 'info_dict': { - 'id': 'ip-north-korea-obama', + 'id': 'bestoftv/2014/12/21/ip-north-korea-obama.cnn', 'ext': 'mp4', 'title': 'Obama: Cyberattack not an act of war', 'description': 'md5:0a802a40d2376f60e6b04c8d5bcebc4b', diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index aabd5b670..53561961c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -39,18 +39,19 @@ class NBAIE(TurnerBaseIE): 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap', + 'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'ext': 'mp4', 
'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, 'timestamp': 1432134543, 'upload_date': '20150520', - } + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', 'info_dict': { - 'id': '1455672027478-Doc_Feb16_720', + 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', 'ext': 'mp4', 'title': 'Practice: Doc Rivers - 2/16/16', 'description': 'Head Coach Doc Rivers addresses the media following practice.', @@ -61,6 +62,7 @@ class NBAIE(TurnerBaseIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { @@ -75,7 +77,7 @@ class NBAIE(TurnerBaseIE): }, { 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', 'info_dict': { - 'id': 'Wigginsmp4-3462601', + 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', 'ext': 'mp4', 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', @@ -87,6 +89,7 @@ class NBAIE(TurnerBaseIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }] _PAGE_SIZE = 30 diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 64fdcc56e..b59dafda6 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -22,7 +22,7 @@ class TurnerBaseIE(InfoExtractor): def _extract_cvp_info(self, data_src, video_id, path_data={}): video_data = self._download_xml(data_src, video_id) - video_id = video_data.attrib['id'].split('/')[-1].split('.')[0] + video_id = video_data.attrib['id'] title = xpath_text(video_data, 'headline', fatal=True) # rtmp_src = xpath_text(video_data, 'akamai/src') # if rtmp_src: From 245023a86145f7074dacdab4c735dea268d766ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 30 Aug 2016 23:51:18 +0700 Subject: [PATCH 209/218] [pyvideo] Fix extraction (Closes #10468) --- youtube_dl/extractor/pyvideo.py | 94 +++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index cc0416cb8..08ec09183 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -1,59 +1,73 @@ from __future__ import unicode_literals import re -import os from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none class PyvideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/video/(?P<id>\d+)/(.*)' + _VALID_URL = r'https?://(?:www\.)?pyvideo\.org/(?P<category>[^/]+)/(?P<id>[^/?#&.]+)' - _TESTS = [ - { - 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - 'md5': '520915673e53a5c5d487c36e0c4d85b5', - 'info_dict': { - 'id': '24_4WWkSmNo', - 'ext': 'webm', - 'title': 'Become a 
logging expert in 30 minutes', - 'description': 'md5:9665350d466c67fb5b1598de379021f7', - 'upload_date': '20130320', - 'uploader': 'Next Day Video', - 'uploader_id': 'NextDayVideo', - }, - 'add_ie': ['Youtube'], + _TESTS = [{ + 'url': 'http://pyvideo.org/pycon-us-2013/become-a-logging-expert-in-30-minutes.html', + 'info_dict': { + 'id': 'become-a-logging-expert-in-30-minutes', }, - { - 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', - 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', - 'info_dict': { - 'id': '2542', - 'ext': 'm4v', - 'title': 'Gloriajw-SpotifyWithErikBernhardsson182', - }, + 'playlist_count': 2, + }, { + 'url': 'http://pyvideo.org/pygotham-2012/gloriajw-spotifywitherikbernhardsson182m4v.html', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182.m4v', }, - ] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + category = mobj.group('category') video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + entries = [] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') + data = self._download_json( + 'https://raw.githubusercontent.com/pyvideo/data/master/%s/videos/%s.json' + % (category, video_id), video_id, fatal=False) - title = self._html_search_regex( - r'<div class="section">\s*<h3(?:\s+class="[^"]*"[^>]*)?>([^>]+?)</h3>', - webpage, 'title', flags=re.DOTALL) - video_url = self._search_regex( - [r'<source src="(.*?)"', r'<dt>Download</dt>.*?<a href="(.+?)"'], - webpage, 'video url', flags=re.DOTALL) + if data: + print(data) + for video in data['videos']: + video_url = video.get('url') + if video_url: + if video.get('type') == 'youtube': + entries.append(self.url_result(video_url, 'Youtube')) + else: + entries.append({ + 'id': compat_str(data.get('id') or video_id), + 'url': 
video_url, + 'title': data['title'], + 'description': data.get('description') or data.get('summary'), + 'thumbnail': data.get('thumbnail_url'), + 'duration': int_or_none(data.get('duration')), + }) + else: + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + media_urls = self._search_regex( + r'(?s)Media URL:(.+?)</li>', webpage, 'media urls') + for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>http.+?)\1', media_urls): + media_url = m.group('url') + if re.match(r'https?://www\.youtube\.com/watch\?v=.*', media_url): + entries.append(self.url_result(media_url, 'Youtube')) + else: + entries.append({ + 'id': video_id, + 'url': media_url, + 'title': title, + }) - return { - 'id': video_id, - 'title': os.path.splitext(title)[0], - 'url': video_url, - } + return self.playlist_result(entries, video_id) From 64fc49aba018ebd51627ddcc92f8fa88f2c499cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 2016 00:29:49 +0700 Subject: [PATCH 210/218] [bandcamp:album] Fix title extraction (Closes #10455) --- youtube_dl/extractor/bandcamp.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 991ab0676..249c3d956 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -162,6 +162,15 @@ class BandcampAlbumIE(InfoExtractor): 'uploader_id': 'dotscale', }, 'playlist_mincount': 7, + }, { + # with escaped quote in title + 'url': 'https://jstrecords.bandcamp.com/album/entropy-ep', + 'info_dict': { + 'title': '"Entropy" EP', + 'uploader_id': 'jstrecords', + 'id': 'entropy-ep', + }, + 'playlist_mincount': 3, }] def _real_extract(self, url): @@ -176,8 +185,11 @@ class BandcampAlbumIE(InfoExtractor): entries = [ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) for t_path in tracks_paths] - title = self._search_regex( - 
r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False) + title = self._html_search_regex( + r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + webpage, 'title', fatal=False) + if title: + title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, From f7043ef39cb73f8501d18d2e1f93997357397ba2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 2016 01:56:15 +0700 Subject: [PATCH 211/218] [soundcloud] Fix _VALID_URL clashes with sets (Closes #10505) --- youtube_dl/extractor/soundcloud.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index aeae931a2..9635c2b49 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -32,7 +32,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? 
(?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -265,6 +265,9 @@ class SoundcloudSetIE(SoundcloudIE): 'title': 'The Royal Concept EP', }, 'playlist_mincount': 6, + }, { + 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep/token', + 'only_matching': True, }] def _real_extract(self, url): From a249ab83cb1d7765d787a7b1d050449736aaa789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 2016 01:56:58 +0700 Subject: [PATCH 212/218] [pyvideo] Remove debugging code --- youtube_dl/extractor/pyvideo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 08ec09183..b8ac93a62 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -38,7 +38,6 @@ class PyvideoIE(InfoExtractor): % (category, video_id), video_id, fatal=False) if data: - print(data) for video in data['videos']: video_url = video.get('url') if video_url: From 263fef43dea463ab4b897c8374dbb11c705f061c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 2016 02:37:40 +0700 Subject: [PATCH 213/218] [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index e055976c5..7e24b8c6b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,21 @@ version <unreleased> Extractors +* [soundcloud] Fix URL regular expression to avoid clashes with sets (#10505) +* [bandcamp:album] Fix title extraction (#10455) +* [pyvideo] Fix extraction (#10468) ++ [ctv] Add support for tsn.ca, bnn.ca and thecomedynetwork.ca (#10016) +* [9c9media] Extract more metadata +* [9c9media] Fix multiple stacks extraction (#10016) +* [adultswim] Improve video info extraction (#10492) +* [vodplatform] Improve embed regular expression +- [played] Remove extractor (#10470) ++ [tbs] Add extractor for tbs.com and tntdrama.com (#10222) ++ 
[cartoonnetwork] Add extractor for cartoonnetwork.com (#10110) +* [adultswim] Rework in terms of turner extractor +* [cnn] Rework in terms of turner extractor +* [nba] Rework in terms of turner extractor ++ [turner] Add base extractor for Turner Broadcasting System based sites * [bilibili] Fix extraction (#10375) * [openload] Fix extraction (#10408) From 4fd350611c71571733950ad2473d4148f7bb6a63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 2016 02:39:39 +0700 Subject: [PATCH 214/218] release 2016.08.31 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 8 +++++--- youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index a2fe59f80..2caca5115 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.28** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.31** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.28 +[debug] youtube-dl version 2016.08.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 7e24b8c6b..0f8076d96 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2016.08.31 Extractors * [soundcloud] Fix URL regular expression to avoid clashes with sets (#10505) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index bf08697be..42bf291e2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,6 +13,8 @@ - **5min** - **8tracks** - **91porn** + - **9c9media** + - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -89,7 +91,7 @@ - **Bet** - **Bigflix** - **Bild**: Bild.de - - **BiliBili** (Currently broken) + - **BiliBili** - **BioBioChileTV** - **BIQLE** - **BleacherReport** @@ -115,6 +117,7 @@ - **Canvas** - **CarambaTV** - **CarambaTVPage** + - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** - **cbc.ca:watch** @@ -459,7 +462,6 @@ - **nick.de** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - - **NineCNineMedia** - **Nintendo** - **njoy**: N-JOY - **njoy:embed** @@ -517,7 +519,6 @@ - **Pinkbike** - **Pladform** - **play.fm** - - **played.to** - **PlaysTV** - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - 
**Playvid** @@ -675,6 +676,7 @@ - **Tagesschau** - **tagesschau:player** - **Tass** + - **TBS** - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ee30ca2ad..fe442dd88 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.28' +__version__ = '2016.08.31' From 165620e320ecb9213ee9928466a9209e7608f83c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 30 Aug 2016 21:48:59 +0100 Subject: [PATCH 215/218] [yahoo] extract more and better formats --- youtube_dl/extractor/yahoo.py | 81 +++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index b0679dfb7..d7a81ab8c 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -8,7 +8,6 @@ import re from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_urllib_parse, - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -17,6 +16,7 @@ from ..utils import ( ExtractorError, int_or_none, mimetype2ext, + determine_ext, ) from .brightcove import BrightcoveNewIE @@ -39,7 +39,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'md5': 'c3466d2b6d5dd6b9f41ba9ed04c24b23', + 'md5': '251af144a19ebc4a033e8ba91ac726bb', 'info_dict': { 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', 'ext': 'mp4', @@ -50,7 +50,7 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', - 'md5': '75ffabdb87c16d4ffe8c036dc4d1c136', + 'md5': '7993e572fac98e044588d0b5260f4352', 'info_dict': { 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', 'ext': 'mp4', @@ -61,7 +61,7 @@ class 
YahooIE(InfoExtractor): }, { 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html', - 'md5': '9035d38f88b1782682a3e89f985be5bb', + 'md5': '45c024bad51e63e9b6f6fad7a43a8c23', 'info_dict': { 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f', 'ext': 'mp4', @@ -72,10 +72,10 @@ class YahooIE(InfoExtractor): }, { 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', - 'md5': '0b51660361f0e27c9789e7037ef76f4b', + 'md5': '71298482f7c64cbb7fa064e4553ff1c1', 'info_dict': { 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 'description': 'md5:f66c890e1490f4910a9953c941dee944', 'duration': 97, @@ -98,7 +98,7 @@ class YahooIE(InfoExtractor): 'id': '154609075', }, 'playlist': [{ - 'md5': 'f8e336c6b66f503282e5f719641d6565', + 'md5': '000887d0dc609bc3a47c974151a40fb8', 'info_dict': { 'id': 'e624c4bc-3389-34de-9dfc-025f74943409', 'ext': 'mp4', @@ -107,7 +107,7 @@ class YahooIE(InfoExtractor): 'duration': 30, }, }, { - 'md5': '958bcb90b4d6df71c56312137ee1cd5a', + 'md5': '81bc74faf10750fe36e4542f9a184c66', 'info_dict': { 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9', 'ext': 'mp4', @@ -139,7 +139,7 @@ class YahooIE(InfoExtractor): 'skip': 'Domain name in.lifestyle.yahoo.com gone', }, { 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', - 'md5': 'b17ac378b1134fa44370fb27db09a744', + 'md5': '2a9752f74cb898af5d1083ea9f661b58', 'info_dict': { 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', 'ext': 'mp4', @@ -168,7 +168,7 @@ class YahooIE(InfoExtractor): }, { # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', - 'md5': '1ddbf7c850777548438e5c4f147c7b8c', + 
'md5': '4fbafb9c9b6f07aa8f870629f6671b35', 'info_dict': { 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 'ext': 'mp4', @@ -196,6 +196,7 @@ class YahooIE(InfoExtractor): 'description': 'Galactic', 'title': 'Dolla Diva (feat. Maggie Koerner)', }, + 'skip': 'redirect to https://www.yahoo.com/music', }, ] @@ -213,15 +214,7 @@ class YahooIE(InfoExtractor): entries = [] iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage) for idx, iframe_url in enumerate(iframe_urls): - iframepage = self._download_webpage( - host + iframe_url, display_id, - note='Downloading iframe webpage for video #%d' % idx) - items_json = self._search_regex( - r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None) - if items_json: - items = json.loads(items_json) - video_id = items[0]['id'] - entries.append(self._get_info(video_id, display_id, webpage)) + entries.append(self.url_result(host + iframe_url, 'Yahoo')) if entries: return self.playlist_result(entries, page_id) @@ -246,7 +239,9 @@ class YahooIE(InfoExtractor): if config: sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') if sapi and 'query' in sapi: - return self._extract_info(display_id, sapi, webpage) + info = self._extract_info(display_id, sapi, webpage) + self._sort_formats(info['formats']) + return info items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, @@ -292,15 +287,17 @@ class YahooIE(InfoExtractor): formats = [] for s in info['streams']: + tbr = int_or_none(s.get('bitrate')) format_info = { 'width': int_or_none(s.get('width')), 'height': int_or_none(s.get('height')), - 'tbr': int_or_none(s.get('bitrate')), + 'tbr': tbr, } host = s['host'] path = s['path'] if host.startswith('rtmp'): + fmt = 'rtmp' format_info.update({ 'url': host, 'play_path': path, @@ -308,14 +305,18 @@ class YahooIE(InfoExtractor): }) else: if s.get('format') == 'm3u8_playlist': - format_info['protocol'] = 
'm3u8_native' - format_info['ext'] = 'mp4' + fmt = 'hls' + format_info.update({ + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + else: + fmt = format_info['ext'] = determine_ext(path) format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url + format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '') formats.append(format_info) - self._sort_formats(formats) - closed_captions = self._html_search_regex( r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', default='[]') @@ -346,17 +347,25 @@ class YahooIE(InfoExtractor): def _get_info(self, video_id, display_id, webpage): region = self._search_regex( r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse_urlencode({ - 'protocol': 'http', - 'region': region.upper(), - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - return self._extract_info(display_id, query_result, webpage) + webpage, 'region', fatal=False, default='US').upper() + formats = [] + info = {} + for fmt in ('webm', 'mp4'): + query_result = self._download_json( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id, + display_id, 'Downloading %s video info' % fmt, query={ + 'protocol': 'http', + 'region': region, + 'format': fmt, + }) + info = self._extract_info(display_id, query_result, webpage) + formats.extend(info['formats']) + formats.extend(self._extract_m3u8_formats( + 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + info['formats'] = formats + return info class YahooSearchIE(SearchInfoExtractor): From 196c6ba06792ec38238631d9173fc146822baa7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 
2016 22:12:37 +0700 Subject: [PATCH 216/218] [facebook] Extract timestamp (Closes #10508) --- youtube_dl/extractor/facebook.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 0fb781a73..228b0b6d7 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -15,6 +15,7 @@ from ..compat import ( from ..utils import ( error_to_compat_str, ExtractorError, + int_or_none, limit_length, sanitized_Request, urlencode_postdata, @@ -62,6 +63,8 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', 'uploader': 'Tennis on Facebook', + 'upload_date': '20140908', + 'timestamp': 1410199200, } }, { 'note': 'Video without discernible title', @@ -71,6 +74,8 @@ class FacebookIE(InfoExtractor): 'ext': 'mp4', 'title': 'Facebook video #274175099429670', 'uploader': 'Asif Nawab Butt', + 'upload_date': '20140506', + 'timestamp': 1399398998, }, 'expected_warnings': [ 'title' @@ -78,12 +83,14 @@ class FacebookIE(InfoExtractor): }, { 'note': 'Video with DASH manifest', 'url': 'https://www.facebook.com/video.php?v=957955867617029', - 'md5': '54706e4db4f5ad58fbad82dde1f1213f', + 'md5': 'b2c28d528273b323abe5c6ab59f0f030', 'info_dict': { 'id': '957955867617029', 'ext': 'mp4', 'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...', 'uploader': 'Demy de Zeeuw', + 'upload_date': '20160110', + 'timestamp': 1452431627, }, }, { 'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570', @@ -306,12 +313,16 @@ class FacebookIE(InfoExtractor): if not video_title: video_title = 'Facebook video #%s' % video_id uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + timestamp = int_or_none(self._search_regex( + r'<abbr[^>]+data-utime=["\'](\d+)', webpage, + 'timestamp', default=None)) info_dict = { 'id': video_id, 'title': 
video_title, 'formats': formats, 'uploader': uploader, + 'timestamp': timestamp, } return webpage, info_dict From 7a3e849f6eaf51b1d86b843a63664012ced2258c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 31 Aug 2016 22:23:55 +0700 Subject: [PATCH 217/218] [porncom] Extract categories and tags (Closes #10510) --- youtube_dl/extractor/porncom.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/porncom.py b/youtube_dl/extractor/porncom.py index 4baf79688..d85e0294d 100644 --- a/youtube_dl/extractor/porncom.py +++ b/youtube_dl/extractor/porncom.py @@ -26,6 +26,8 @@ class PornComIE(InfoExtractor): 'duration': 551, 'view_count': int, 'age_limit': 18, + 'categories': list, + 'tags': list, }, }, { 'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067', @@ -75,7 +77,14 @@ class PornComIE(InfoExtractor): self._sort_formats(formats) view_count = str_to_int(self._search_regex( - r'class=["\']views["\'][^>]*><p>([\d,.]+)', webpage, 'view count')) + r'class=["\']views["\'][^>]*><p>([\d,.]+)', webpage, + 'view count', fatal=False)) + + def extract_list(kind): + s = self._search_regex( + r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize(), + webpage, kind, fatal=False) + return re.findall(r'<a[^>]+>([^<]+)</a>', s or '') return { 'id': video_id, @@ -86,4 +95,6 @@ class PornComIE(InfoExtractor): 'view_count': view_count, 'formats': formats, 'age_limit': 18, + 'categories': extract_list('categories'), + 'tags': extract_list('tags'), } From f8fd510eb4b2733a5c083d767d45baa88b289298 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 31 Aug 2016 18:31:49 +0100 Subject: [PATCH 218/218] [limelight] skip ism manifests and reduce requests --- youtube_dl/extractor/limelight.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 
a425bafe3..6752ffee2 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -34,11 +34,12 @@ class LimelightBaseIE(InfoExtractor): def _extract_info(self, streams, mobile_urls, properties): video_id = properties['media_id'] formats = [] - + urls = [] for stream in streams: stream_url = stream.get('url') - if not stream_url or stream.get('drmProtected'): + if not stream_url or stream.get('drmProtected') or stream_url in urls: continue + urls.append(stream_url) ext = determine_ext(stream_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( @@ -58,9 +59,11 @@ class LimelightBaseIE(InfoExtractor): format_id = 'rtmp' if stream.get('videoBitRate'): format_id += '-%d' % int_or_none(stream['videoBitRate']) + http_url = 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]) + urls.append(http_url) http_fmt = fmt.copy() http_fmt.update({ - 'url': 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]), + 'url': http_url, 'format_id': format_id.replace('rtmp', 'http'), }) formats.append(http_fmt) @@ -76,8 +79,9 @@ class LimelightBaseIE(InfoExtractor): for mobile_url in mobile_urls: media_url = mobile_url.get('mobileUrl') format_id = mobile_url.get('targetMediaPlatform') - if not media_url or format_id == 'Widevine': + if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls: continue + urls.append(media_url) ext = determine_ext(media_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats(