From f11554092b419baa919875432fe6ebc1f22f5307 Mon Sep 17 00:00:00 2001 From: Tjark Saul Date: Fri, 17 Apr 2015 09:21:54 +0200 Subject: [PATCH 001/450] [Lecture2Go] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/lecture2go.py | 33 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/lecture2go.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bbf3be41d..3d6e981b2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -249,6 +249,7 @@ from .krasview import KrasViewIE from .ku6 import Ku6IE from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lecture2go import Lecture2GoIE from .letv import ( LetvIE, LetvTvIE, diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py new file mode 100644 index 000000000..9cf28e31c --- /dev/null +++ b/youtube_dl/extractor/lecture2go.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Lecture2GoIE(InfoExtractor): + _VALID_URL = r'https?://lecture2go.uni-hamburg.de/veranstaltungen/-/v/(?P[0-9]+)' + _TEST = { + 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', + 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', + 'info_dict': { + 'id': '17473', + 'ext': 'mp4', + 'url': 'https://fms1.rrz.uni-hamburg.de/abo/64.050_FrankHeitmann_2015-04-13_14-35.mp4', + 'title': '2 - Endliche Automaten und reguläre Sprachen' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'(.*?)', webpage, 'title') + video_url = self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') + creator = self._html_search_regex(r'
(.*)
', webpage, 'creator') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'creator': creator + } From a745475808e125a590afb14df48c565309d3f75c Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 02:50:46 +0200 Subject: [PATCH 002/450] Ir90Tv Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ir90tv.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/ir90tv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5dfa781f8..ee05a6958 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -229,6 +229,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .ir90tv import Ir90TvIE from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py new file mode 100644 index 000000000..5aa9d6ff4 --- /dev/null +++ b/youtube_dl/extractor/ir90tv.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Ir90TvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P[0-9]+)/.*' + _TEST = { + 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'md5': '411dbd94891381960cb9e13daa47a869', + 'info_dict': { + 'id': '95719', + 'ext': 'mp4', + 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... 
+ title = self._html_search_regex( + r'\n90tv.ir :: (.*?)', webpage, 'title') + + video_url = self._search_regex( + r']+src="([^"]+)"', webpage, 'video url') + + thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') + print thumbnail + + + return { + 'url': video_url, + 'id': video_id, + 'title': title, + 'video_url' : video_url, + 'thumbnail' : thumbnail, + } \ No newline at end of file From 54b31d149e7be08eb7be9981a9eec398d11f17ef Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 02:55:01 +0200 Subject: [PATCH 003/450] Ir90Tv Add new extractor --- youtube_dl/extractor/ir90tv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 5aa9d6ff4..3a3cb4887 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -38,4 +38,4 @@ class Ir90TvIE(InfoExtractor): 'title': title, 'video_url' : video_url, 'thumbnail' : thumbnail, - } \ No newline at end of file + } From a650110ba762b2658c64392317c1afd2a284dd3d Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 04:32:08 +0200 Subject: [PATCH 004/450] remove print --- youtube_dl/extractor/ir90tv.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 3a3cb4887..b79529b1b 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -21,7 +21,6 @@ class Ir90TvIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # TODO more code goes here, for example ... 
title = self._html_search_regex( r'\n90tv.ir :: (.*?)', webpage, 'title') @@ -29,8 +28,6 @@ class Ir90TvIE(InfoExtractor): r']+src="([^"]+)"', webpage, 'video url') thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') - print thumbnail - return { 'url': video_url, From 6800d3372f35e08dcc4d34d06601815bf0cb0a3d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 3 Jun 2015 23:10:18 +0800 Subject: [PATCH 005/450] [YoutubeDL] Support DASH manifest downloading --- youtube_dl/downloader/dash.py | 50 +++++++++++++++++++++++++++++++++ youtube_dl/downloader/http.py | 4 +++ youtube_dl/extractor/youtube.py | 6 ++++ 3 files changed, 60 insertions(+) create mode 100644 youtube_dl/downloader/dash.py diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py new file mode 100644 index 000000000..18eca2c04 --- /dev/null +++ b/youtube_dl/downloader/dash.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals +from .common import FileDownloader +from ..compat import compat_urllib_request + +import re + + +class DashSegmentsFD(FileDownloader): + """ + Download segments in a DASH manifest + """ + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + base_url = info_dict['url'] + segment_urls = info_dict['segment_urls'] + + self.byte_counter = 0 + + def append_url_to_file(outf, target_url, target_name): + self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) + req = compat_urllib_request.Request(target_url) + data = self.ydl.urlopen(req).read() + outf.write(data) + self.byte_counter += len(data) + + def combine_url(base_url, target_url): + if re.match(r'^https?://', target_url): + return target_url + return '%s/%s' % (base_url, target_url) + + with open(tmpfilename, 'wb') as outf: + append_url_to_file( + outf, combine_url(base_url, info_dict['initialization_url']), + 'initialization segment') + for i, segment_url in 
enumerate(segment_urls): + append_url_to_file( + outf, combine_url(base_url, segment_url), + 'segment %d / %d' % (i + 1, len(segment_urls))) + + self.try_rename(tmpfilename, filename) + + self._hook_progress({ + 'downloaded_bytes': self.byte_counter, + 'total_bytes': self.byte_counter, + 'filename': filename, + 'status': 'finished', + }) + + return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..ceacb8522 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -6,6 +6,7 @@ import socket import time from .common import FileDownloader +from .dash import DashSegmentsFD from ..compat import ( compat_urllib_request, compat_urllib_error, @@ -19,6 +20,9 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): + if info_dict.get('initialization_url') and list(filter(None, info_dict.get('segment_urls', []))): + return DashSegmentsFD(self.ydl, self.params).real_download(filename, info_dict) + url = info_dict['url'] tmpfilename = self.temp_name(filename) stream = None diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index aacb999ce..5d1297e0d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -802,6 +802,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # TODO implement WebVTT downloading pass elif mime_type.startswith('audio/') or mime_type.startswith('video/'): + segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList') format_id = r.attrib['id'] video_url = url_el.text filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) @@ -815,6 +816,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } + if segment_list: + f.update({ + 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], + 'segment_urls': [segment.attrib.get('media') for 
segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')] + }) try: existing_format = next( fo for fo in formats From b9258c61789388b49792ebdceb5d804217a36da5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 4 Jun 2015 22:05:33 +0800 Subject: [PATCH 006/450] [YoutubeDL] Change how DashSegmentsFD is selected --- youtube_dl/downloader/__init__.py | 2 ++ youtube_dl/downloader/http.py | 4 ---- youtube_dl/extractor/youtube.py | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index f110830c4..1b618ab54 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -8,6 +8,7 @@ from .hls import NativeHlsFD from .http import HttpFD from .rtsp import RtspFD from .rtmp import RtmpFD +from .dash import DashSegmentsFD from ..utils import ( determine_protocol, @@ -20,6 +21,7 @@ PROTOCOL_MAP = { 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, + 'dash_segments': DashSegmentsFD, } diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index ceacb8522..b7f144af9 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -6,7 +6,6 @@ import socket import time from .common import FileDownloader -from .dash import DashSegmentsFD from ..compat import ( compat_urllib_request, compat_urllib_error, @@ -20,9 +19,6 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): - if info_dict.get('initialization_url') and list(filter(None, info_dict.get('segment_urls', []))): - return DashSegmentsFD(self.ydl, self.params).real_download(filename, info_dict) - url = info_dict['url'] tmpfilename = self.temp_name(filename) stream = None diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5d1297e0d..692d4d8db 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -819,7 +819,8 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): if segment_list: f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], - 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')] + 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], + 'protocol': 'dash_segments', }) try: existing_format = next( From 453a1617aac6e8000ed947cad7d88817c5740ede Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 4 Jun 2015 22:12:05 +0800 Subject: [PATCH 007/450] [downloader/dash] Reorder imports --- youtube_dl/downloader/dash.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 18eca2c04..5f14658ba 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -from .common import FileDownloader -from ..compat import compat_urllib_request import re +from .common import FileDownloader +from ..compat import compat_urllib_request + class DashSegmentsFD(FileDownloader): """ From 423d2be5f8c5e70d202ddfa63f3e5365e6afe823 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 4 Jun 2015 22:27:29 +0800 Subject: [PATCH 008/450] [downloader/dash] Rename the protocol 'http_dash_segments' looks more like a protocol name than 'dash_segments' --- youtube_dl/downloader/__init__.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 1b618ab54..dccc59212 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -21,7 +21,7 @@ PROTOCOL_MAP = { 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, - 'dash_segments': DashSegmentsFD, + 'http_dash_segments': DashSegmentsFD, } diff --git 
a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 692d4d8db..6d288e848 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -820,7 +820,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], - 'protocol': 'dash_segments', + 'protocol': 'http_dash_segments', }) try: existing_format = next( From 4da31bd56629054497634d041035e4bd6fcfacbb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 6 Jun 2015 22:22:26 +0800 Subject: [PATCH 009/450] [youtube] Fix a FutureWarning from xml.etree.ElementTree --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6d288e848..2424ac2c0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -816,7 +816,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } - if segment_list: + if len(segment_list): f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], From d84f1d14b526c4a5359117a58f25691a3da4c97e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lio=20A=2E=20Heckert?= Date: Tue, 9 Jun 2015 22:08:16 -0300 Subject: [PATCH 010/450] Adds support for XviD output with extra parametrization As the "LG Time Machine" (a (not so) smart TV) has a limitation for video dimensions (as for codecs), I take to implement an extra parameter `--pp-params` where we can send extra parameterization for the video converter (post-processor). 
Example: ``` $ youtube-dl --recode-video=xvid --pp-params='-s 720x480' -c https://www.youtube.com/watch?v=BE7Qoe2ZiXE ``` That works fine on a 4yo LG Time Machine. Closes #5733 --- README.md | 3 ++- youtube_dl/__init__.py | 5 ++++- youtube_dl/options.py | 6 +++++- youtube_dl/postprocessor/ffmpeg.py | 14 ++++++++++---- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f3d83c89f..726ec9cf2 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,8 @@ which means you can modify it, redistribute it or use it however you like. --audio-format FORMAT Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) - --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv) + --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid) + --pp-params Extra parameters for video post-processor. The params will be splited on spaces. 
-k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ace17857c..5b28e4817 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -169,8 +169,10 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error('invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'xvid']: parser.error('invalid video recode format specified') + if opts.pp_params is not None: + opts.pp_params = opts.pp_params.split() if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: parser.error('invalid subtitle format specified') @@ -227,6 +229,7 @@ def _real_main(argv=None): postprocessors.append({ 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, + 'extra_params': opts.pp_params }) if opts.convertsubtitles: postprocessors.append({ diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 689fa7595..ceb4b5f38 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -686,7 +686,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, - help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)') + help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid)') + postproc.add_option( + '--pp-params', + dest='pp_params', default=None, + help='Extra parameters for video post-processor. 
The params will be splited on spaces.') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index cc65b34e7..a696b12b4 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -287,22 +287,28 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): class FFmpegVideoConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferedformat=None): + def __init__(self, downloader=None, preferedformat=None, extra_params=[]): super(FFmpegVideoConvertorPP, self).__init__(downloader) self._preferedformat = preferedformat + self._extra_params = extra_params def run(self, information): path = information['filepath'] prefix, sep, ext = path.rpartition('.') - outpath = prefix + sep + self._preferedformat + ext = self._preferedformat + options = self._extra_params + if self._preferedformat == 'xvid': + ext = 'avi' + options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) + outpath = prefix + sep + ext if information['ext'] == self._preferedformat: self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat)) return [], information self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) - self.run_ffmpeg(path, outpath, []) + self.run_ffmpeg(path, outpath, options) information['filepath'] = outpath information['format'] = self._preferedformat - information['ext'] = self._preferedformat + information['ext'] = ext return [path], information From 0c8662d2b6f033ad42f1cc97989d4975629b524b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 10 Jun 2015 13:40:41 +0800 Subject: [PATCH 011/450] [youtube] Fix a TypeError caused by 4da31bd56629054497634d041035e4bd6fcfacbb --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2424ac2c0..a1906eef6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -816,7 +816,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } - if len(segment_list): + if segment_list is not None: f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], From 93dfcb9357b400b4d7e353d0a9db0e0194135b19 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 10 Jun 2015 13:44:54 +0800 Subject: [PATCH 012/450] [downloader/dash] Do not pollute ```self``` --- youtube_dl/downloader/dash.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 5f14658ba..cd84e0b07 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -16,14 +16,14 @@ class DashSegmentsFD(FileDownloader): base_url = info_dict['url'] segment_urls = info_dict['segment_urls'] - self.byte_counter = 0 + byte_counter = 0 def append_url_to_file(outf, target_url, target_name): self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) req = compat_urllib_request.Request(target_url) data = self.ydl.urlopen(req).read() outf.write(data) - self.byte_counter += len(data) + return len(data) def combine_url(base_url, target_url): if re.match(r'^https?://', target_url): @@ -35,15 +35,16 @@ class DashSegmentsFD(FileDownloader): outf, combine_url(base_url, info_dict['initialization_url']), 'initialization segment') for i, segment_url in enumerate(segment_urls): - append_url_to_file( + segment_len = append_url_to_file( outf, combine_url(base_url, segment_url), 'segment %d / %d' % (i + 1, len(segment_urls))) + 
byte_counter += segment_len self.try_rename(tmpfilename, filename) self._hook_progress({ - 'downloaded_bytes': self.byte_counter, - 'total_bytes': self.byte_counter, + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, 'filename': filename, 'status': 'finished', }) From 5bf3276e8d6ee7d017c8be04414398752cd9cdf3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 10 Jun 2015 14:45:54 +0800 Subject: [PATCH 013/450] [downloader/dash] Add testing facility --- youtube_dl/downloader/dash.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index cd84e0b07..a4685d307 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -16,12 +16,21 @@ class DashSegmentsFD(FileDownloader): base_url = info_dict['url'] segment_urls = info_dict['segment_urls'] + is_test = self.params.get('test', False) + remaining_bytes = self._TEST_FILE_SIZE if is_test else None byte_counter = 0 - def append_url_to_file(outf, target_url, target_name): + def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) req = compat_urllib_request.Request(target_url) + if remaining_bytes is not None: + req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) + data = self.ydl.urlopen(req).read() + + if remaining_bytes is not None: + data = data[:remaining_bytes] + outf.write(data) return len(data) @@ -37,8 +46,13 @@ class DashSegmentsFD(FileDownloader): for i, segment_url in enumerate(segment_urls): segment_len = append_url_to_file( outf, combine_url(base_url, segment_url), - 'segment %d / %d' % (i + 1, len(segment_urls))) + 'segment %d / %d' % (i + 1, len(segment_urls)), + remaining_bytes) byte_counter += segment_len + if remaining_bytes is not None: + remaining_bytes -= segment_len + if remaining_bytes <= 0: + break self.try_rename(tmpfilename, filename) From 
8a1a26ce4c64d7a2c142718fc56f46d9a1c2c4f2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 10 Jun 2015 14:47:02 +0800 Subject: [PATCH 014/450] [youtube] Add a test for the DASH segment downloader --- youtube_dl/extractor/youtube.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a1906eef6..939f5e61f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -516,6 +516,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': 'requires avconv', } }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150510', + 'uploader': 'Airtek', + 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', + 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', + 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '135', # bestvideo + } + } ] def __init__(self, *args, **kwargs): From 14835de9fb41798c8e6e731a3f07ae871770666f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lio=20A=2E=20Heckert?= Date: Tue, 16 Jun 2015 18:10:31 -0300 Subject: [PATCH 015/450] Use shlex.split for --pp-params and update related docs. --- README.md | 2 +- youtube_dl/YoutubeDL.py | 1 + youtube_dl/__init__.py | 6 ++++-- youtube_dl/options.py | 4 ++-- youtube_dl/postprocessor/common.py | 3 ++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 726ec9cf2..813ac4a15 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ which means you can modify it, redistribute it or use it however you like. 
--audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid) - --pp-params Extra parameters for video post-processor. The params will be splited on spaces. + --pp-params Extra parameters for video post-processor. -k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b1f792d4e..3bfe30c76 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -261,6 +261,7 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. + pp_params: Extra parameters for external apps, like avconv. 
""" params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5b28e4817..8b54d4ae2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -171,8 +171,10 @@ def _real_main(argv=None): if opts.recodevideo is not None: if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'xvid']: parser.error('invalid video recode format specified') - if opts.pp_params is not None: - opts.pp_params = opts.pp_params.split() + if opts.pp_params is None: + opts.pp_params = [] + else: + opts.pp_params = shlex.split(opts.pp_params) if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: parser.error('invalid subtitle format specified') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index ceb4b5f38..fbba9b9d8 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -689,8 +689,8 @@ def parseOpts(overrideArguments=None): help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid)') postproc.add_option( '--pp-params', - dest='pp_params', default=None, - help='Extra parameters for video post-processor. The params will be splited on spaces.') + dest='pp_params', default=None, metavar='ARGS', + help='Extra parameters for video post-processor.') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index 3b0e8ddd8..d944d9367 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -22,7 +22,8 @@ class PostProcessor(object): of the chain is reached. PostProcessor objects follow a "mutual registration" process similar - to InfoExtractor objects. + to InfoExtractor objects. And it can receive parameters from CLI trough + --pp-params. 
""" _downloader = None From 0d0d5d37174ed611dd823c5be025c49e73d83d1d Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 13:59:37 +0800 Subject: [PATCH 016/450] [qqmusic] Add support for playlists --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/qqmusic.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..d03577cdf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -427,6 +427,7 @@ from .qqmusic import ( QQMusicSingerIE, QQMusicAlbumIE, QQMusicToplistIE, + QQMusicPlaylistIE, ) from .quickvid import QuickVidIE from .r7 import R7IE diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bafa81c21..f9aafcd28 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, + clean_html, ) from ..compat import compat_urllib_request @@ -243,3 +244,36 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_name = topinfo.get('ListName') list_description = topinfo.get('info') return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P[0-9]+)' + + _TEST = { + 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' + % list_id, list_id, 'Download list page', + transform_source=strip_jsonp)['cdlist'][0] + + entries = [ + self.url_result( + 
'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + ) for song in list_json['songlist'] + ] + + list_name = list_json['dissname'] + list_description = clean_html(unescapeHTML(list_json.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) From 8f73e89ca0ecde0a8bbd1f1463e9a06a53c6b573 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 14:15:54 +0800 Subject: [PATCH 017/450] [kuwo] New extractor for kuwo.cn --- youtube_dl/extractor/__init__.py | 9 + youtube_dl/extractor/kuwo.py | 326 +++++++++++++++++++++++++++++++ 2 files changed, 335 insertions(+) create mode 100644 youtube_dl/extractor/kuwo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..82cd85c44 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -257,6 +257,15 @@ from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE from .ku6 import Ku6IE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoSingerMusicIE, + KuwoCategoryIE, + KuwoMvIE, +) from .la7 import LA7IE from .laola1tv import Laola1TvIE from .letv import ( diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py new file mode 100644 index 000000000..6a96a1aa4 --- /dev/null +++ b/youtube_dl/extractor/kuwo.py @@ -0,0 +1,326 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import itertools + +from .common import InfoExtractor +from ..utils import ( + get_element_by_id, + clean_html, + ExtractorError, +) + + +class KuwoIE(InfoExtractor): + IE_NAME = 'kuwo:song' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/yinyue/635632/', + 'info_dict': { + 'id': '635632', + 'ext': 'ape', + 'title': '爱我别走', + 'creator': '张震岳', + 'upload_date': '20080122', + 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' + }, + }, { + 'url': 
'http://www.kuwo.cn/yinyue/6446136/', + 'info_dict': { + 'id': '6446136', + 'ext': 'mp3', + 'title': '心', + 'creator': 'IU', + 'upload_date': '20150518', + }, + 'params': { + 'format': 'mp3-320' + }, + }] + _FORMATS = [ + {'format': 'ape', 'ext': 'ape', 'preference': 100}, + {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, + {'format': 'mp3-192', 'ext': 'mp3', 'br': '192kmp3', 'abr': 192, 'preference': 70}, + {'format': 'mp3-128', 'ext': 'mp3', 'br': '128kmp3', 'abr': 128, 'preference': 60}, + {'format': 'wma', 'ext': 'wma', 'preference': 20}, + {'format': 'aac', 'ext': 'aac', 'abr': 48, 'preference': 10} + ] + + def _get_formats(self, song_id): + formats = [] + for file_format in self._FORMATS: + song_url = self._download_webpage( + "http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url" % + (file_format['ext'], file_format.get('br', ''), song_id), + song_id, note="Download %s url info" % file_format["format"], + ) + if song_url.startswith('http://') or song_url.startswith('https://'): + formats.append({ + 'url': song_url, + 'format_id': file_format['format'], + 'format': file_format['format'], + 'preference': file_format['preference'], + 'abr': file_format.get('abr'), + }) + self._sort_formats(formats) + return formats + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage = self._download_webpage( + url, song_id, note='Download song detail info', + errnote='Unable to get song detail info') + + song_name = self._html_search_regex( + r'

', webpage, 'song name') + singer_name = self._html_search_regex( + r'
.+?title="(.+?)".+?
', webpage, 'singer name', + flags=re.DOTALL, default=None) + lrc_content = clean_html(get_element_by_id("lrcContent", webpage)) + if lrc_content == '暂无': # indicates no lyrics + lrc_content = None + + formats = self._get_formats(song_id) + + album_id = self._html_search_regex( + r'

.+?[0-9]+?)/' + _TEST = { + 'url': 'http://www.kuwo.cn/album/502294/', + 'info_dict': { + 'id': '502294', + 'title': 'M', + 'description': 'md5:6a7235a84cc6400ec3b38a7bdaf1d60c', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + webpage = self._download_webpage( + url, album_id, note='Download album info', + errnote='Unable to get album info') + + album_name = self._html_search_regex( + r'

', webpage, + 'album name', flags=re.DOTALL) + album_intro = clean_html( + re.sub(r'^.+简介:', '', get_element_by_id("intro", webpage).strip())) + + entries = [ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'

', + webpage) + ] + return self.playlist_result(entries, album_id, album_name, album_intro) + + +class KuwoChartIE(InfoExtractor): + IE_NAME = 'kuwo:chart' + _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P.+?).htm' + _TEST = { + 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', + 'info_dict': { + 'id': '香港中文龙虎榜', + 'title': '香港中文龙虎榜', + 'description': 're:[0-9]{4}第[0-9]{2}期', + }, + 'playlist_mincount': 10, + } + + def _real_extract(self, url): + chart_id = self._match_id(url) + webpage = self._download_webpage( + url, chart_id, note='Download chart info', + errnote='Unable to get chart info') + + chart_name = self._html_search_regex( + r'

(.+?)

', webpage, 'chart name') + + chart_desc = self._html_search_regex( + r'

([0-9]{4}第[0-9]{2}期)

', webpage, 'chart desc') + + entries = [ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'.+?', webpage) + ] + return self.playlist_result(entries, chart_id, chart_name, chart_desc) + + +class KuwoSingerIE(InfoExtractor): + IE_NAME = 'kuwo:singer' + _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/$' + _TEST = { + 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', + 'info_dict': { + 'id': 'bruno+mars', + 'title': 'Bruno Mars', + }, + 'playlist_count': 10, + } + + def _real_extract(self, url): + singer_id = self._match_id(url) + webpage = self._download_webpage( + url, singer_id, note='Download singer info', + errnote='Unable to get singer info') + + singer_name = self._html_search_regex( + r'姓名:(.+?)', webpage, 'singer name') + + entries = [ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'.+?', + webpage, flags=re.DOTALL) + ] + return self.playlist_result(entries, singer_id, singer_name) + + +class KuwoSingerMusicIE(InfoExtractor): + IE_NAME = 'kuwo:singermusic' + _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/music(_[0-9]+)?.htm' + _TEST = { + 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', + 'info_dict': { + 'id': 'Ali', + 'title': 'Ali的热门歌曲', + }, + 'playlist_mincount': 95, + } + + def _real_extract(self, url): + singer_id = self._match_id(url) + + list_name = None + entries = [] + for page_num in itertools.count(1): + webpage = self._download_webpage( + 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), + singer_id, note='Download song list page #%d' % page_num, + errnote='Unable to get song list page #%d' % page_num) + + if list_name is None: + list_name = self._html_search_regex( + r'

([^<>]+)', webpage, 'list name') + + entries.extend([ + self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + for song_id in re.findall( + r'

下一页', webpage): + break + + return self.playlist_result(entries, singer_id, list_name) + + +class KuwoCategoryIE(InfoExtractor): + IE_NAME = 'kuwo:category' + _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P[0-9]+?).htm' + _TEST = { + 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', + 'info_dict': { + 'id': '86375', + 'title': '八十年代精选', + 'description': '这些都是属于八十年代的回忆!', + }, + 'playlist_count': 30, + } + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage( + url, category_id, note='Download category info', + errnote='Unable to get category info') + + category_name = self._html_search_regex( + r'

[^<>]+?

', webpage, 'category name') + + category_desc = re.sub( + r'^.+简介:', '', get_element_by_id("intro", webpage).strip()) + + jsonm = self._parse_json(self._html_search_regex( + r'var jsonm = (\{.+?\});', webpage, 'category songs'), category_id) + + entries = [ + self.url_result( + "http://www.kuwo.cn/yinyue/%s/" % song['musicrid'], + 'Kuwo', song['musicrid']) + for song in jsonm['musiclist'] + ] + return self.playlist_result(entries, category_id, category_name, category_desc) + + +class KuwoMvIE(KuwoIE): + IE_NAME = 'kuwo:mv' + _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P[0-9]+?)/' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/mv/6480076/', + 'info_dict': { + 'id': '6480076', + 'ext': 'mkv', + 'title': '我们家MV', + 'creator': '2PM', + }, + }] + _FORMATS = KuwoIE._FORMATS + [ + {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, + {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, + ] + + def _real_extract(self, url): + song_id = self._match_id(url) + webpage = self._download_webpage( + url, song_id, note='Download mv detail info: %s' % song_id, + errnote='Unable to get mv detail info: %s' % song_id) + + mobj = re.search( + r'

[^<>]+[^<>]+

', + webpage) + if mobj: + song_name = mobj.group('song') + singer_name = mobj.group('singer') + else: + raise ExtractorError("Unable to find song or singer names") + + formats = self._get_formats(song_id) + + return { + 'id': song_id, + 'title': song_name, + 'creator': singer_name, + 'formats': formats, + } From 29b809de68aeefb5e991c75929ed3d03fb40c1f1 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 15:52:04 +0800 Subject: [PATCH 018/450] [qqmusic] Fix album extraction --- youtube_dl/extractor/qqmusic.py | 37 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bafa81c21..d9a783f8a 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -163,31 +163,38 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P[0-9A-Za-z]+)' - _TEST = { - 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1&play=0', + _TESTS = [{ + 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', - 'description': 'md5:d216c55a2d4b3537fe4415b8767d74d6', + 'description': 'md5:712f0cdbfc7e776820d08150e6df593d', }, 'playlist_count': 4, - } + }, { + 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', + 'info_dict': { + 'id': '002Y5a3b3AlCu3', + 'title': '그리고...', + 'description': 'md5:b1d133b8c9bac8fed4e1a97df759f4cf', + }, + 'playlist_count': 8, + }] def _real_extract(self, url): mid = self._match_id(url) - album_page = self._download_webpage( - self.qq_static_url('album', mid), mid, 'Download album page') + album = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, + mid, 'Download album page')['data'] - entries = self.get_entries_from_page(album_page) - - album_name = self._html_search_regex( - r"albumname\s*:\s*'([^']+)',", album_page, 'album name', - 
default=None) - - album_detail = self._html_search_regex( - r'
\s*

((?:[^<>]+(?:
)?)+)

', - album_page, 'album details', default=None) + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + ) for song in album['list'] + ] + album_name = album['name'] + album_detail = album.get('desc') return self.playlist_result(entries, mid, album_name, album_detail) From 5e3915cbe3fabd2dbb633131b851b1158c0bba7b Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 21:06:25 +0800 Subject: [PATCH 019/450] [qqmusic] Fix song extraction when certain formats are unavailable --- youtube_dl/extractor/qqmusic.py | 55 +++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bafa81c21..7ddc4ca25 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,8 +9,13 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, + HEADRequest, + ExtractorError, +) +from ..compat import ( + compat_urllib_request, + compat_HTTPError, ) -from ..compat import compat_urllib_request class QQMusicIE(InfoExtractor): @@ -26,6 +31,20 @@ class QQMusicIE(InfoExtractor): 'upload_date': '20141227', 'creator': '林俊杰', 'description': 'md5:d327722d0361576fde558f1ac68a7065', + 'thumbnail': 'http://i.gtimg.cn/music/photo/mid_album_500/7/p/001IV22P1RDX7p.jpg', + } + }, { + 'note': 'There is no mp3-320 version of this song.', + 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'info_dict': { + 'id': '004MsGEo3DdNxV', + 'ext': 'mp3', + 'title': '如果', + 'upload_date': '20050626', + 'creator': '李季美', + 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', + 'thumbnail': 'http://i.gtimg.cn/music/photo/mid_album_500/r/Q/0042owYj46IxrQ.jpg', } }] @@ -68,6 +87,13 @@ class QQMusicIE(InfoExtractor): if lrc_content: lrc_content = lrc_content.replace('\\n', '\n') + thumbnail_url = None + albummid = self._search_regex( + 
[r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) + if albummid: + thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + % (albummid[-2:-1], albummid[-1], albummid) + guid = self.m_r_get_ruin() vkey = self._download_json( @@ -77,14 +103,24 @@ class QQMusicIE(InfoExtractor): formats = [] for format_id, details in self._FORMATS.items(): - formats.append({ - 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' - % (details['prefix'], mid, details['ext'], vkey, guid), - 'format': format_id, - 'format_id': format_id, - 'preference': details['preference'], - 'abr': details.get('abr'), - }) + video_url = 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' \ + % (details['prefix'], mid, details['ext'], vkey, guid) + req = HEADRequest(video_url) + try: + res = self._request_webpage( + req, mid, note='Testing %s video URL' % format_id, fatal=False) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in [400, 404]: + self.report_warning('Invalid %s video URL' % format_id, mid) + else: + if res: + formats.append({ + 'url': video_url, + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), + }) self._sort_formats(formats) return { @@ -94,6 +130,7 @@ class QQMusicIE(InfoExtractor): 'upload_date': publish_time, 'creator': singer, 'description': lrc_content, + 'thumbnail': thumbnail_url, } From 0392ac98d2c5c5a6fd2ab51c51096f82312a287c Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 21:13:03 +0800 Subject: [PATCH 020/450] [qqmusic] Fix code formatting --- youtube_dl/extractor/qqmusic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 7ddc4ca25..5a18191bc 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -89,7 +89,8 @@ class 
QQMusicIE(InfoExtractor): thumbnail_url = None albummid = self._search_regex( - [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) + [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], + detail_info_page, 'album mid', default=None) if albummid: thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ % (albummid[-2:-1], albummid[-1], albummid) From 4d58b24c15ea0efc699a7ad7ee468245029da4e3 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 23:09:04 +0800 Subject: [PATCH 021/450] [qqmusic] Use _check_formats instead --- youtube_dl/extractor/qqmusic.py | 34 ++++++++++----------------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 5a18191bc..7183c2bb1 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,13 +9,8 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, - HEADRequest, - ExtractorError, -) -from ..compat import ( - compat_urllib_request, - compat_HTTPError, ) +from ..compat import compat_urllib_request class QQMusicIE(InfoExtractor): @@ -104,24 +99,15 @@ class QQMusicIE(InfoExtractor): formats = [] for format_id, details in self._FORMATS.items(): - video_url = 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' \ - % (details['prefix'], mid, details['ext'], vkey, guid) - req = HEADRequest(video_url) - try: - res = self._request_webpage( - req, mid, note='Testing %s video URL' % format_id, fatal=False) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in [400, 404]: - self.report_warning('Invalid %s video URL' % format_id, mid) - else: - if res: - formats.append({ - 'url': video_url, - 'format': format_id, - 'format_id': format_id, - 'preference': details['preference'], - 'abr': details.get('abr'), - }) + formats.append({ + 'url': 
'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' + % (details['prefix'], mid, details['ext'], vkey, guid), + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), + }) + self._check_formats(formats, mid) self._sort_formats(formats) return { From 9e96dc8b3561c1e6e62ce6a34efba485e5e49054 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:36:59 -0500 Subject: [PATCH 022/450] Support BBC News (bbc.com/news) --- docs/supportedsites.md | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bbcnews.py | 162 +++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+) create mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 220e52b98..d4ccbbd3a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,6 +50,7 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer + - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..51d2d20e9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -36,6 +36,7 @@ from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bbcnews import BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py new file mode 100644 index 000000000..b10e30a81 --- /dev/null +++ b/youtube_dl/extractor/bbcnews.py @@ -0,0 +1,162 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) +from ..compat import compat_HTTPError +import re +from .bbccouk import BBCCoUkIE + +class BBCNewsIE(BBCCoUkIE): + 
IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _duration_str2int(self, str): + if not str: + return None + ret = re.match(r'^\d+$', str) + if ret: + return int(ret.group(0)) + ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) + if ret: + total=int(ret.group('s')) + if ret.group('m'): + total+=(int(ret.group('m'))*60) + if ret.group('h'): + total+=(int(ret.group('h'))*3600) + return total + return None + + def _download_media_selector(self, programme_id): + # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not + # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ + # Could add third urlspec arg to BBCCoUkIE._download_media_selector instead of duplicating it + + try: + media_selection = self._download_xml( + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, + programme_id, 'Downloading media selection XML') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + media_selection = 
xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) + else: + raise + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + formats = [] + subtitles = None + + for media in self._extract_medias(media_selection): + kind = media.get('kind') + if kind == 'audio': + formats.extend(self._extract_audio(media, programme_id)) + elif kind == 'video': + formats.extend(self._extract_video(media, programme_id)) + elif kind == 'captions': + subtitles = self.extract_subtitles(media, programme_id) + + return formats, subtitles + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = self._duration_str2int(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + 
thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + raise ExtractorError('No video found', expected=True) From a8b081a0523c412fd4e01d5cddec7ae382c4793e Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 01:52:25 -0500 Subject: [PATCH 023/450] BBCNewsIE: eliminate redundant function. BBCCoUkIE._download_media_selector: use class variable instead of hardcoded string for mediaselector_url template. 
--- youtube_dl/extractor/bbccouk.py | 4 +++- youtube_dl/extractor/bbcnews.py | 42 ++------------------------------- 2 files changed, 5 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 0305f88b5..dcc5fc2fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -15,6 +15,8 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -277,7 +279,7 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): try: media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id, + self.mediaselector_url % programme_id, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index b10e30a81..9bb8d42e6 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -14,6 +14,8 @@ class BBCNewsIE(BBCCoUkIE): IE_DESC = 'BBC news' _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _TESTS = [{ 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { @@ -59,46 +61,6 @@ class BBCNewsIE(BBCCoUkIE): return total return None - def _download_media_selector(self, programme_id): - # bbc news uses http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ not - # http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/ - # Could add third urlspec arg to 
BBCCoUkIE._download_media_selector instead of duplicating it - - try: - media_selection = self._download_xml( - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' % programme_id, - programme_id, 'Downloading media selection XML') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().encode('utf-8')) - else: - raise - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - formats = [] - subtitles = None - - for media in self._extract_medias(media_selection): - kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) - elif kind == 'captions': - subtitles = self.extract_subtitles(media, programme_id) - - return formats, subtitles - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) From d5552a3477a0970f4aaaa746ce07c816267bb9cf Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 19 Jun 2015 06:25:50 -0500 Subject: [PATCH 024/450] bbcnews: Switch to parse_duration, revert change to docs/supportedsites.md --- docs/supportedsites.md | 1 - youtube_dl/extractor/bbcnews.py | 19 ++----------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4ccbbd3a..220e52b98 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -50,7 +50,6 @@ - **Bandcamp** - **Bandcamp:album** - **bbc.co.uk**: BBC iPlayer - - **bbc.com**: BBC news videos - **BeatportPro** - **Beeg** - **BehindKink** diff 
--git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py index 9bb8d42e6..fd4a5e38f 100644 --- a/youtube_dl/extractor/bbcnews.py +++ b/youtube_dl/extractor/bbcnews.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError @@ -45,22 +46,6 @@ class BBCNewsIE(BBCCoUkIE): } }] - def _duration_str2int(self, str): - if not str: - return None - ret = re.match(r'^\d+$', str) - if ret: - return int(ret.group(0)) - ret = re.match(r'PT((?P\d+)H)?((?P\d+)M)?(?P\d+)S$', str) - if ret: - total=int(ret.group('s')) - if ret.group('m'): - total+=(int(ret.group('m'))*60) - if ret.group('h'): - total+=(int(ret.group('h'))*3600) - return total - return None - def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) @@ -88,7 +73,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href', None) title = jent['caption'] - duration = self._duration_str2int(jent.get('duration',None)) + duration = parse_duration(jent.get('duration',None)) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From 10273d6e0846cd8f3762e3777712d5cd2a0cafcd Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:22:13 -0500 Subject: [PATCH 025/450] toss new stuff into old file --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bbccouk.py | 101 ++++++++++++++++++++++++++++ youtube_dl/extractor/bbcnews.py | 109 ------------------------------- 3 files changed, 102 insertions(+), 111 deletions(-) delete mode 100644 youtube_dl/extractor/bbcnews.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 51d2d20e9..f9f7bdfaf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,8 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import 
BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE -from .bbcnews import BBCNewsIE +from .bbccouk import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index dcc5fc2fa..ea682fb6f 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -5,9 +5,11 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, + parse_duration, int_or_none, ) from ..compat import compat_HTTPError +import re class BBCCoUkIE(InfoExtractor): @@ -394,3 +396,102 @@ class BBCCoUkIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class BBCNewsIE(BBCCoUkIE): + IE_NAME = 'bbc.com' + IE_DESC = 'BBC news' + _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + + mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + + _TESTS = [{ + 'url': 'http://www.bbc.com/news/world-europe-32668511', + 'info_dict': { + 'id': 'world-europe-32668511', + 'title': 'Russia stages massive WW2 parade despite Western boycott', + }, + 'playlist_count': 2, + },{ + 'url': 'http://www.bbc.com/news/business-28299555', + 'info_dict': { + 'id': 'business-28299555', + 'title': 'Farnborough Airshow: Video highlights', + }, + 'playlist_count': 9, + },{ + 'url': 'http://www.bbc.com/news/world-europe-32041533', + 'note': 'Video', + 'info_dict': { + 'id': 'p02mprgb', + 'ext': 'mp4', + 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, 
list_id) + + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + + pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) + if pubdate: + pubdate = pubdate.replace('-','') + + ret = [] + # works with bbc.com/news/something-something-123456 articles + matches = re.findall(r"data-media-meta='({[^']+})'", webpage) + if not matches: + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + if not matches: + raise ExtractorError('No video found', expected=True) + + for ent in matches: + jent = self._parse_json(ent,list_id) + + programme_id = jent.get('externalId',None) + xml_url = jent.get('href', None) + + title = jent['caption'] + duration = parse_duration(jent.get('duration',None)) + description = list_title + ' - ' + jent.get('caption','') + thumbnail = None + if jent.has_key('image'): + thumbnail=jent['image'].get('href',None) + + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + elif xml_url: + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) + else: + raise ExtractorError('data-media-meta entry has no externalId or href value.') + + self._sort_formats(formats) + + ret.append( { + 'id': programme_id, + 'uploader': 'BBC News', + 'upload_date': pubdate, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } ) + + if len(ret) > 0: + return self.playlist_result(ret, list_id, list_title) + 
raise ExtractorError('No video found', expected=True) diff --git a/youtube_dl/extractor/bbcnews.py b/youtube_dl/extractor/bbcnews.py deleted file mode 100644 index fd4a5e38f..000000000 --- a/youtube_dl/extractor/bbcnews.py +++ /dev/null @@ -1,109 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - parse_duration, - int_or_none, -) -from ..compat import compat_HTTPError -import re -from .bbccouk import BBCCoUkIE - -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' - - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - - _TESTS = [{ - 'url': 'http://www.bbc.com/news/world-europe-32668511', - 'info_dict': { - 'id': 'world-europe-32668511', - 'title': 'Russia stages massive WW2 parade despite Western boycott', - }, - 'playlist_count': 2, - },{ - 'url': 'http://www.bbc.com/news/business-28299555', - 'info_dict': { - 'id': 'business-28299555', - 'title': 'Farnborough Airshow: Video highlights', - }, - 'playlist_count': 9, - },{ - 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', - 'info_dict': { - 'id': 'p02mprgb', - 'ext': 'mp4', - 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'duration': 47, - }, - 'params': { - 'skip_download': True, - } - }] - - def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') - - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-','') - - ret = [] - # works with 
bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: - raise ExtractorError('No video found', expected=True) - - for ent in matches: - jent = self._parse_json(ent,list_id) - - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) - - title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) - description = list_title + ' - ' + jent.get('caption','') - thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') - - self._sort_formats(formats) - - ret.append( { - 'id': programme_id, - 'uploader': 'BBC News', - 'upload_date': pubdate, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } ) - - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) From 75ab0ebcf593ec91a46d83e69854ffa313d33309 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:24:02 -0500 Subject: [PATCH 026/450] no .get('..',None) --- youtube_dl/extractor/bbccouk.py | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index ea682fb6f..de4d7f9c0 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -457,15 +457,15 @@ class BBCNewsIE(BBCCoUkIE): for ent in matches: jent = self._parse_json(ent,list_id) - programme_id = jent.get('externalId',None) - xml_url = jent.get('href', None) + programme_id = jent.get('externalId') + xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration',None)) + duration = parse_duration(jent.get('duration') description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): - thumbnail=jent['image'].get('href',None) + thumbnail=jent['image'].get('href') if programme_id: formats, subtitles = self._download_media_selector(programme_id) From 77c975f536befbe89bf718e86282958d391d9ffe Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 08:28:14 -0500 Subject: [PATCH 027/450] typofix --- youtube_dl/extractor/bbccouk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index de4d7f9c0..f9404f3fa 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -461,7 +461,7 @@ class BBCNewsIE(BBCCoUkIE): xml_url = jent.get('href') title = jent['caption'] - duration = parse_duration(jent.get('duration') + duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): From de939d89eb83c851c6db66933e5fc0c401a1a679 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:04:46 -0500 Subject: [PATCH 028/450] Support BBC news in other languages, non-mediaselector videos --- youtube_dl/extractor/bbccouk.py | 87 +++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py 
b/youtube_dl/extractor/bbccouk.py index f9404f3fa..72e20857b 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -401,7 +401,7 @@ class BBCCoUkIE(InfoExtractor): class BBCNewsIE(BBCCoUkIE): IE_NAME = 'bbc.com' IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?(?:bbc\.co\.uk|bbc\.com)/news/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' @@ -432,56 +432,115 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } + },{ + 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', + 'note': 'Video', + 'info_dict': { + 'id': 'NA', + 'ext': 'mp4', + 'title': 'YPG - Tel Abyad..n tamam. kontrol.m.zde', + 'duration': 47, + }, + 'params': { + 'skip_download': True, + } + },{ + 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', + 'note': 'Video', + 'info_dict': { + 'id': '39275083', + 'ext': 'mp4', + 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'duration': 87, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): list_id = self._match_id(url) webpage = self._download_webpage(url, list_id) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC News)?', webpage, 'list title') + list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: pubdate = pubdate.replace('-','') ret = [] + jsent = [] + # works with bbc.com/news/something-something-123456 articles - matches = re.findall(r"data-media-meta='({[^']+})'", webpage) - if not matches: + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) + ) + + if len(jsent) == 0: + # http://www.bbc.com/news/video_and_audio/international + # 
and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset,list_id) + for key, val in jmasset.get('videos',{}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) + + if len(jsent) == 0: # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} # in http://www.bbc.com/news/video_and_audio/international - matches = re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - if not matches: + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m,list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) + + if len(jsent) == 0: raise ExtractorError('No video found', expected=True) - for ent in matches: - jent = self._parse_json(ent,list_id) - + for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('href') + xml_url = jent.get('hxref') + + title = jent.get('caption',list_title) - title = jent['caption'] duration = parse_duration(jent.get('duration')) description = list_title + ' - ' + jent.get('caption','') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') + formats = [] + subtitles = [] + if programme_id: formats, subtitles = self._download_media_selector(programme_id) + elif jent.has_key('sourceFiles'): + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append( { + 'ext': val.get('encoding'), + 'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + } ) elif xml_url: # Cheap fallback # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId 
fallback)') formats, subtitles = self._download_media_selector(programme_id) - else: - raise ExtractorError('data-media-meta entry has no externalId or href value.') + + if len(formats) == 0: + raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') self._sort_formats(formats) ret.append( { - 'id': programme_id, + 'id': jent.get('programme_id',jent.get('id')), 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 7bb23aeca4e9076528e3d31d501a9a288dcd444c Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:08:13 -0500 Subject: [PATCH 029/450] rename bbccouk.py -> bbc.py --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{bbccouk.py => bbc.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename youtube_dl/extractor/{bbccouk.py => bbc.py} (100%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f9f7bdfaf..a48346e60 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -35,7 +35,7 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbccouk import BBCCoUkIE, BBCNewsIE +from .bbc import BBCCoUkIE, BBCNewsIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbc.py similarity index 100% rename from youtube_dl/extractor/bbccouk.py rename to youtube_dl/extractor/bbc.py From 2a282a3b5f366ba0569bae477d5060329ba254fb Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 20 Jun 2015 11:11:41 -0500 Subject: [PATCH 030/450] Unbreak breakage that was broken to test breakage --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 72e20857b..310db9d1d 100644 --- a/youtube_dl/extractor/bbc.py +++ 
b/youtube_dl/extractor/bbc.py @@ -502,7 +502,7 @@ class BBCNewsIE(BBCCoUkIE): for jent in jsent: programme_id = jent.get('externalId') - xml_url = jent.get('hxref') + xml_url = jent.get('href') title = jent.get('caption',list_title) From af1fa6234e9623a1a09e66c24c0dedc6996645c9 Mon Sep 17 00:00:00 2001 From: ping Date: Sun, 21 Jun 2015 11:12:31 +0800 Subject: [PATCH 031/450] [neteasemusic] Add new extractor for music.163.com --- youtube_dl/extractor/__init__.py | 9 + youtube_dl/extractor/neteasemusic.py | 434 +++++++++++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 youtube_dl/extractor/neteasemusic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6c548d8e9..c8692ffa9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -339,6 +339,15 @@ from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE from .nerdist import NerdistIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) from .newgrounds import NewgroundsIE from .newstube import NewstubeIE from .nextmedia import ( diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py new file mode 100644 index 000000000..ebed5d3b4 --- /dev/null +++ b/youtube_dl/extractor/neteasemusic.py @@ -0,0 +1,434 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from hashlib import md5 +from datetime import datetime +import itertools +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_request, + compat_urllib_parse, +) + + +class NetEaseMusicBaseIE(InfoExtractor): + _FORMATS = ['bMusic', 'mMusic', 'hMusic'] + _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' + _API_BASE = 'http://music.163.com/api/' + + @classmethod + def _encrypt(cls, dfsid): + salt_bytes = bytearray(str(cls._NETEASE_SALT)) + 
string_bytes = bytearray(str(dfsid)) + salt_len = len(salt_bytes) + for i in xrange(len(string_bytes)): + string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] + m = md5() + m.update(string_bytes) + result = m.digest().encode('base64')[:-1] + return result.replace('/', '_').replace('+', '-') + + @classmethod + def extract_formats(cls, info): + formats = [] + for song_format in cls._FORMATS: + details = info.get(song_format) + if not details: + continue + formats.append({ + 'url': 'http://m1.music.126.net/%s/%s.%s' % + (cls._encrypt(details['dfsId']), details['dfsId'], + details['extension']), + 'ext': details['extension'], + 'abr': details['bitrate'] / 1000, + 'preference': details['bitrate'], + 'format_id': song_format, + 'filesize': details['size'], + 'asr': details['sr'] + }) + return formats + + def query_api(self, endpoint, video_id, note): + req = compat_urllib_request.Request('%s%s' % (self._API_BASE, endpoint)) + req.add_header('Referer', self._API_BASE) + return self._download_json(req, video_id, note) + + +class NetEaseMusicIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:song' + _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://music.163.com/#/song?id=32102397', + 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', + 'info_dict': { + 'id': '32102397', + 'ext': 'mp3', + 'title': 'Bad Blood (feat. 
Kendrick Lamar)', + 'creator': 'Taylor Swift / Kendrick Lamar', + 'upload_date': '20150517', + 'timestamp': 1431878400, + 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', + }, + }, { + 'note': 'No lyrics translation.', + 'url': 'http://music.163.com/#/song?id=29822014', + 'info_dict': { + 'id': '29822014', + 'ext': 'mp3', + 'title': '听见下雨的声音', + 'creator': '周杰伦', + 'upload_date': '20141225', + 'timestamp': 1419523200, + 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', + }, + }, { + 'note': 'No lyrics.', + 'url': 'http://music.163.com/song?id=17241424', + 'info_dict': { + 'id': '17241424', + 'ext': 'mp3', + 'title': 'Opus 28', + 'creator': 'Dustin O\'Halloran', + 'upload_date': '20080211', + 'timestamp': 1202745600, + }, + }] + + def _process_lyrics(self, lyrics_info): + original = lyrics_info.get('lrc', {}).get('lyric') + translated = lyrics_info.get('tlyric', {}).get('lyric') + + if not translated: + return original + + lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' + original_ts_texts = re.findall(lyrics_expr, original) + translation_ts_dict = { + time_stamp: text for time_stamp, text in re.findall(lyrics_expr, translated) + } + + lyrics = '\n'.join([ + '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) + for time_stamp, text in original_ts_texts + ]) + return lyrics + + def _real_extract(self, url): + song_id = self._match_id(url) + + params = { + 'id': song_id, + 'ids': '[%s]' % song_id + } + info = self.query_api( + 'song/detail?' 
+ compat_urllib_parse.urlencode(params), + song_id, 'Downloading song info')['songs'][0] + + formats = self.extract_formats(info) + self._sort_formats(formats) + + lyrics_info = self.query_api( + 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, + song_id, 'Downloading lyrics data') + lyrics = self._process_lyrics(lyrics_info) + + alt_title = None + if info.get('alias'): + alt_title = '/'.join(info.get('alias')) + + return { + 'id': song_id, + 'title': info['name'], + 'alt_title': alt_title, + 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), + 'timestamp': int(info.get('album', {}).get('publishTime')/1000), + 'thumbnail': info.get('album', {}).get('picUrl'), + 'duration': int(info.get('duration', 0)/1000), + 'description': lyrics, + 'formats': formats, + } + + +class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:album' + _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P[0-9]+)' + _TEST = { + 'url': 'http://music.163.com/#/album?id=220780', + 'info_dict': { + 'id': '220780', + 'title': 'B\'day', + }, + 'playlist_count': 23, + } + + def _real_extract(self, url): + album_id = self._match_id(url) + + info = self.query_api( + 'album/%s?id=%s' % (album_id, album_id), + album_id, 'Downloading album data')['album'] + + name = info['name'] + desc = info.get('description') + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song['id'], + 'NetEaseMusic', song['id']) + for song in info['songs'] + ] + return self.playlist_result(entries, album_id, name, desc) + + +class NetEaseMusicSingerIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:singer' + _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P[0-9]+)' + _TESTS = [{ + 'note': 'Singer has aliases.', + 'url': 'http://music.163.com/#/artist?id=10559', + 'info_dict': { + 'id': '10559', + 'title': '张惠妹 - aMEI;阿密特', + }, + 'playlist_count': 50, + }, { + 'note': 'Singer has translated name.', + 'url': 'http://music.163.com/#/artist?id=124098', + 'info_dict': { + 
'id': '124098', + 'title': '李昇基 - 이승기', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + singer_id = self._match_id(url) + + info = self.query_api( + 'artist/%s?id=%s' % (singer_id, singer_id), + singer_id, 'Downloading singer data') + + name = info['artist']['name'] + if info['artist']['trans']: + name = '%s - %s' % (name, info['artist']['trans']) + if info['artist']['alias']: + name = '%s - %s' % (name, ";".join(info['artist']['alias'])) + + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song['id'], + 'NetEaseMusic', song['id']) + for song in info['hotSongs'] + ] + return self.playlist_result(entries, singer_id, name) + + +class NetEaseMusicListIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:playlist' + _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://music.163.com/#/playlist?id=79177352', + 'info_dict': { + 'id': '79177352', + 'title': 'Billboard 2007 Top 100', + 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' + }, + 'playlist_count': 99, + }, { + 'note': 'Toplist/Charts sample', + 'url': 'http://music.163.com/#/discover/toplist?id=3733003', + 'info_dict': { + 'id': '3733003', + 'title': 're:韩国Melon排行榜周榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'description': 'md5:73ec782a612711cadc7872d9c1e134fc', + }, + 'playlist_count': 50, + }] + + def _real_extract(self, url): + list_id = self._match_id(url) + + info = self.query_api( + 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, + list_id, 'Downloading playlist data')['result'] + + name = info['name'] + desc = info.get('description') + + if info.get('specialType') == 10: # is a chart/toplist + datestamp = datetime.fromtimestamp(info['updateTime']/1000).strftime('%Y-%m-%d') + name = '%s %s' % (name, datestamp) + + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song['id'], + 'NetEaseMusic', song['id']) + for song in info['tracks'] + ] + return self.playlist_result(entries, list_id, name, desc) + + 
+class NetEaseMusicMvIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:mv' + _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P[0-9]+)' + _TEST = { + 'url': 'http://music.163.com/#/mv?id=415350', + 'info_dict': { + 'id': '415350', + 'ext': 'mp4', + 'title': '이럴거면 그러지말지', + 'description': '白雅言自作曲唱甜蜜爱情', + 'creator': '白雅言', + 'upload_date': '20150520', + }, + } + + def _real_extract(self, url): + mv_id = self._match_id(url) + + info = self.query_api( + 'mv/detail?id=%s&type=mp4' % mv_id, + mv_id, 'Downloading mv info')['data'] + + formats = [ + {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'preference': int(brs)} + for brs, mv_url in info['brs'].items() + ] + self._sort_formats(formats) + + return { + 'id': mv_id, + 'title': info['name'], + 'description': info.get('desc') or info.get('briefDesc'), + 'creator': info['artistName'], + 'upload_date': info['publishTime'].replace('-', ''), + 'formats': formats, + 'thumbnail': info.get('cover'), + 'duration': int(info.get('duration', 0)/1000), + } + + +class NetEaseMusicProgramIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:program' + _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://music.163.com/#/program?id=10109055', + 'info_dict': { + 'id': '10109055', + 'ext': 'mp3', + 'title': '不丹足球背后的故事', + 'description': '喜马拉雅人的足球梦 ...', + 'creator': '大话西藏', + 'timestamp': 1434179341, + 'upload_date': '20150613', + 'duration': 900, + }, + }, { + 'note': 'This program has accompanying songs.', + 'url': 'http://music.163.com/#/program?id=10141022', + 'info_dict': { + 'id': '10141022', + 'title': '25岁,你是自在如风的少年<27°C>', + 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + }, + 'playlist_count': 4, + }, { + 'note': 'This program has accompanying songs.', + 'url': 'http://music.163.com/#/program?id=10141022', + 'info_dict': { + 'id': '10141022', + 'ext': 'mp3', + 'title': '25岁,你是自在如风的少年<27°C>', + 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + 'timestamp': 
1434450840, + 'upload_date': '20150616', + }, + 'params': { + 'noplaylist': True + } + }] + + def _real_extract(self, url): + program_id = self._match_id(url) + + info = self.query_api( + 'dj/program/detail?id=%s' % program_id, + program_id, 'Downloading program info')['program'] + + name = info['name'] + description = info['description'] + + if not info['songs'] or self._downloader.params.get('noplaylist'): + if info['songs']: + self.to_screen( + 'Downloading just the main audio %s because of --no-playlist' + % info['mainSong']['id']) + + formats = self.extract_formats(info['mainSong']) + self._sort_formats(formats) + + return { + 'id': program_id, + 'title': name, + 'description': description, + 'creator': info['dj']['brand'], + 'timestamp': int(info['createTime']/1000), + 'thumbnail': info['coverUrl'], + 'duration': int(info.get('duration', 0)/1000), + 'formats': formats, + } + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download the main audio %s' + % (program_id, info['mainSong']['id'])) + + song_ids = [info['mainSong']['id']] + song_ids.extend([song['id'] for song in info['songs']]) + entries = [ + self.url_result('http://music.163.com/#/song?id=%s' % song_id, + 'NetEaseMusic', song_id) + for song_id in song_ids + ] + return self.playlist_result(entries, program_id, name, description) + + +class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): + IE_NAME = 'netease:djradio' + _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P[0-9]+)' + _TEST = { + 'url': 'http://music.163.com/#/djradio?id=42', + 'info_dict': { + 'id': '42', + 'title': '声音蔓延', + 'description': 'md5:766220985cbd16fdd552f64c578a6b15' + }, + 'playlist_mincount': 40, + } + _PAGE_SIZE = 1000 + + def _real_extract(self, url): + dj_id = self._match_id(url) + + name = None + desc = None + entries = [] + for offset in itertools.count(start=0, step=self._PAGE_SIZE): + info = self.query_api( + 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' + % 
(self._PAGE_SIZE, dj_id, offset), + dj_id, 'Downloading dj programs - %d' % offset) + + entries.extend([ + self.url_result( + 'http://music.163.com/#/program?id=%s' % program['id'], + 'NetEaseMusicProgram', program['id']) + for program in info['programs'] + ]) + + if name is None: + radio = info['programs'][0]['radio'] + name = radio['name'] + desc = radio['desc'] + + if not info['more']: + break + + return self.playlist_result(entries, dj_id, name, desc) From 2da0cad6ae08e3c78f85a340260aef80a464cb20 Mon Sep 17 00:00:00 2001 From: ping Date: Sun, 21 Jun 2015 11:44:50 +0800 Subject: [PATCH 032/450] [neteasemusic] Do proper rounding conversion of millisecond timestamps/durations --- youtube_dl/extractor/neteasemusic.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index ebed5d3b4..a70c65ca5 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -50,6 +50,10 @@ class NetEaseMusicBaseIE(InfoExtractor): }) return formats + @classmethod + def convert_milliseconds(cls, ms): + return int(round(ms/1000.0)) + def query_api(self, endpoint, video_id, note): req = compat_urllib_request.Request('%s%s' % (self._API_BASE, endpoint)) req.add_header('Referer', self._API_BASE) @@ -143,9 +147,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': info['name'], 'alt_title': alt_title, 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), - 'timestamp': int(info.get('album', {}).get('publishTime')/1000), + 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), 'thumbnail': info.get('album', {}).get('picUrl'), - 'duration': int(info.get('duration', 0)/1000), + 'duration': self.convert_milliseconds(info.get('duration', 0)), 'description': lyrics, 'formats': formats, } @@ -255,7 +259,8 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): desc = info.get('description') if 
info.get('specialType') == 10: # is a chart/toplist - datestamp = datetime.fromtimestamp(info['updateTime']/1000).strftime('%Y-%m-%d') + datestamp = datetime.fromtimestamp( + self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') name = '%s %s' % (name, datestamp) entries = [ @@ -302,7 +307,7 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): 'upload_date': info['publishTime'].replace('-', ''), 'formats': formats, 'thumbnail': info.get('cover'), - 'duration': int(info.get('duration', 0)/1000), + 'duration': self.convert_milliseconds(info.get('duration', 0)), } @@ -317,7 +322,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'title': '不丹足球背后的故事', 'description': '喜马拉雅人的足球梦 ...', 'creator': '大话西藏', - 'timestamp': 1434179341, + 'timestamp': 1434179342, 'upload_date': '20150613', 'duration': 900, }, @@ -338,7 +343,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'ext': 'mp3', 'title': '25岁,你是自在如风的少年<27°C>', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'timestamp': 1434450840, + 'timestamp': 1434450841, 'upload_date': '20150616', }, 'params': { @@ -370,9 +375,9 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): 'title': name, 'description': description, 'creator': info['dj']['brand'], - 'timestamp': int(info['createTime']/1000), + 'timestamp': self.convert_milliseconds(info['createTime']), 'thumbnail': info['coverUrl'], - 'duration': int(info.get('duration', 0)/1000), + 'duration': self.convert_milliseconds(info.get('duration', 0)), 'formats': formats, } From a9dcf4a860214e37971ab05f27f74bbae65ff8ae Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 23 Jun 2015 01:08:07 -0500 Subject: [PATCH 033/450] Prefer externalId over non-mediaserver-specific hashkey for video id. 
--- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 310db9d1d..fed344ea0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -540,7 +540,7 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) ret.append( { - 'id': jent.get('programme_id',jent.get('id')), + 'id': jent.get('id') if programme_id == None else programme_id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From da92eeae42f556926cb676b3c14e270603b7e38e Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 25 Jun 2015 00:31:32 -0500 Subject: [PATCH 034/450] Fix tests, description formatting --- youtube_dl/extractor/bbc.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index fed344ea0..bb671d473 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -428,6 +428,8 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'upload_date': '20150324', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -438,8 +440,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': 'NA', 'ext': 'mp4', - 'title': 'YPG - Tel Abyad..n tamam. 
kontrol.m.zde', + 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', 'duration': 47, + 'upload_date': '20150615', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -450,8 +455,11 @@ class BBCNewsIE(BBCCoUkIE): 'info_dict': { 'id': '39275083', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc.ndalo de corrupci.n', + 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', 'duration': 87, + 'upload_date': '20150619', + 'uploader': 'BBC News', }, 'params': { 'skip_download': True, @@ -507,7 +515,9 @@ class BBCNewsIE(BBCCoUkIE): title = jent.get('caption',list_title) duration = parse_duration(jent.get('duration')) - description = list_title + ' - ' + jent.get('caption','') + description = list_title + if jent.get('caption'): + description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): thumbnail=jent['image'].get('href') @@ -539,8 +549,12 @@ class BBCNewsIE(BBCCoUkIE): self._sort_formats(formats) + id = jent.get('id') if programme_id == None else programme_id + if id == None: + id = 'NA' + ret.append( { - 'id': jent.get('id') if programme_id == None else programme_id, + 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, 'title': title, From 2028c6e03d7e254831350081bb4b4741b0b47ac4 Mon Sep 17 00:00:00 2001 From: Shadab Zafar Date: Fri, 26 Jun 2015 21:27:43 +0530 Subject: [PATCH 035/450] Added a Playlist Info Extractor for WebOfStories --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/webofstories.py | 25 +++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 46cc4cd06..c3f3a3e38 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -693,7 +693,10 @@ from .wdr import ( WDRMobileIE, WDRMausIE, ) -from .webofstories import WebOfStoriesIE +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index 73077a312..d70e30c00 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import int_or_none @@ -98,3 +100,26 @@ class WebOfStoriesIE(InfoExtractor): 'description': description, 'duration': duration, } + + +class WebOfStoriesPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P[^/]+)' + _TESTS = [] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories') + for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) + ] + + title = self._html_search_regex( + r'([^<]+)\s*-\s*Web\sof\sStories', webpage, 'title') + + description = self._html_search_meta( + 'description', webpage, 'description') + + return self.playlist_result(entries, playlist_id, title, description) From 67134eaba1a56cec4117000acb2fc9284c9cdd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Jun 2015 22:08:29 +0200 Subject: [PATCH 036/450] [YoutubeDL] rework how the format spec is processed The spec string is processed using 'tokenize.tokenize' to split it in words and operators, the filters are still processed using regular expressions. This should make easier to allow grouping operators with parens. 
--- test/test_YoutubeDL.py | 27 ++-- youtube_dl/YoutubeDL.py | 298 +++++++++++++++++++++++++--------------- youtube_dl/compat.py | 5 + 3 files changed, 209 insertions(+), 121 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a13c09ef4..8f7aef512 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -229,21 +229,30 @@ class TestFormatSelection(unittest.TestCase): '141', '172', '140', '171', '139', ] - for f1id, f2id in zip(order, order[1:]): - f1 = YoutubeIE._formats[f1id].copy() - f1['format_id'] = f1id - f1['url'] = 'url:' + f1id - f2 = YoutubeIE._formats[f2id].copy() - f2['format_id'] = f2id - f2['url'] = 'url:' + f2id + def format_info(f_id): + info = YoutubeIE._formats[f_id].copy() + info['format_id'] = f_id + info['url'] = 'url:' + f_id + return info + formats_order = [format_info(f_id) for f_id in order] + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '137+141') + self.assertEqual(downloaded['ext'], 'mp4') + + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) @@ -251,7 +260,7 @@ class TestFormatSelection(unittest.TestCase): yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], 
f1['format_id']) def test_format_filtering(self): formats = [ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ef0f71bad..17a5407b9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -21,6 +21,7 @@ import subprocess import socket import sys import time +import tokenize import traceback if os.name == 'nt': @@ -34,6 +35,7 @@ from .compat import ( compat_http_client, compat_kwargs, compat_str, + compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, ) @@ -851,8 +853,8 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) - def _apply_format_filter(self, format_spec, available_formats): - " Returns a tuple of the remaining format_spec and filtered formats " + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " OPERATORS = { '<': operator.lt, @@ -862,13 +864,13 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - operator_rex = re.compile(r'''(?x)\s*\[ + operator_rex = re.compile(r'''(?x)\s* (?Pwidth|height|tbr|abr|vbr|asr|filesize|fps) \s*(?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) - \]$ + $ ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(format_spec) + m = operator_rex.search(filter_spec) if m: try: comparison_value = int(m.group('value')) @@ -879,7 +881,7 @@ class YoutubeDL(object): if comparison_value is None: raise ValueError( 'Invalid value %r in format specification %r' % ( - m.group('value'), format_spec)) + m.group('value'), filter_spec)) op = OPERATORS[m.group('op')] if not m: @@ -887,85 +889,201 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - str_operator_rex = re.compile(r'''(?x)\s*\[ + str_operator_rex = re.compile(r'''(?x) \s*(?Pext|acodec|vcodec|container|protocol) \s*(?P%s)(?P\s*\?)? 
\s*(?P[a-zA-Z0-9_-]+) - \s*\]$ + \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(format_spec) + m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') op = STR_OPERATORS[m.group('op')] if not m: - raise ValueError('Invalid format specification %r' % format_spec) + raise ValueError('Invalid filter specification %r' % filter_spec) def _filter(f): actual_value = f.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) - new_formats = [f for f in available_formats if _filter(f)] + return _filter - new_format_spec = format_spec[:-len(m.group(0))] - if not new_format_spec: - new_format_spec = 'best' + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) - return (new_format_spec, new_formats) + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) - def select_format(self, format_spec, available_formats): - while format_spec.endswith(']'): - format_spec, available_formats = self._apply_format_filter( - format_spec, available_formats) - if not available_formats: - return None + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) - if format_spec in ['best', 'worst', None]: - format_idx = 0 if format_spec == 'worst' else -1 - audiovideo_formats = [ - f for f in available_formats - if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] - if audiovideo_formats: - return audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 
'none' for f in available_formats) or - all(f.get('vcodec') != 'none' for f in available_formats)): - return available_formats[format_idx] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[0] - else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, available_formats)) - if matches: - return matches[-1] - return None + def _parse_format_selection(tokens, endwith=[]): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string in endwith: + break + if string == ',': + selectors.append(current_selector) + current_selector = None + elif string == '/': + first_choice = current_selector + second_choice = _parse_format_selection(tokens, [',']) + current_selector = None + selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + 
current_selector.filters.append(format_filter) + elif string == '+': + video_selector = current_selector + audio_selector = _parse_format_selection(tokens, [',']) + current_selector = None + selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), [])) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(formats): + for f in fs: + for format in f(formats): + yield format + return selector_function + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(formats): + for f in fs: + picked_formats = list(f(formats)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(formats): + if format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield audiovideo_formats[format_idx] + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in formats) or + all(f.get('vcodec') != 'none' for f in formats)): + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if 
video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) + + def selector_function(formats): + formats = list(formats) + for pair in itertools.product(video_selector(formats), audio_selector(formats)): + yield _merge(pair) + + filters = [self._build_format_filter(f) for f in selector.filters] + + def 
final_selector(formats): + for _filter in filters: + formats = list(filter(_filter, formats)) + return selector_function(formats) + return final_selector + + stream = io.BytesIO(format_spec.encode('utf-8')) + tokens = compat_tokenize_tokenize(stream.readline) + parsed_selector = _parse_format_selection(tokens) + return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): res = std_headers.copy() @@ -1112,52 +1230,8 @@ class YoutubeDL(object): if req_format == 'all': formats_to_download = formats else: - for rfstr in req_format.split(','): - # We can accept formats requested in the format: 34/5/best, we pick - # the first that is available, starting from left - req_formats = rfstr.split('/') - for rf in req_formats: - if re.match(r'.+?\+.+?', rf) is not None: - # Two formats have been requested like '137+139' - format_1, format_2 = rf.split('+') - formats_info = (self.select_format(format_1, formats), - self.select_format(format_2, formats)) - if all(formats_info): - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - output_ext = ( - formats_info[0]['ext'] - if self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - selected_format = { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': formats_info[1].get('acodec'), - 'abr': 
formats_info[1].get('abr'), - 'ext': output_ext, - } - else: - selected_format = None - else: - selected_format = self.select_format(rf, formats) - if selected_format is not None: - formats_to_download.append(selected_format) - break + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f9529210d..bc218dd71 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -388,6 +388,10 @@ else: pass return _terminal_size(columns, lines) +if sys.version_info >= (3, 0): + from tokenize import tokenize as compat_tokenize_tokenize +else: + from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ 'compat_HTTPError', @@ -408,6 +412,7 @@ __all__ = [ 'compat_socket_create_connection', 'compat_str', 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', From 5acfa126c812c3ab7088af6c7df79697baee7831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Jun 2015 22:48:02 +0200 Subject: [PATCH 037/450] [YoutubeDL] format spec: treat 'all' like a normal specifier So you can use filters with it, for example 'all[width>=400][width<=600]'. 
--- test/test_YoutubeDL.py | 5 +++++ youtube_dl/YoutubeDL.py | 13 ++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8f7aef512..709e3100f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -317,6 +317,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + ydl = YDL({'format': 'all[width>=400][width<=600]'}) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 17a5407b9..258e612af 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -990,7 +990,10 @@ class YoutubeDL(object): format_spec = selector.selector def selector_function(formats): - if format_spec in ['best', 'worst', None]: + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: format_idx = 0 if format_spec == 'worst' else -1 audiovideo_formats = [ f for f in formats @@ -1226,12 +1229,8 @@ class YoutubeDL(object): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) - formats_to_download = [] - if req_format == 'all': - formats_to_download = formats - else: - format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) From bea41c7f3fa4f9072ad2f5354938ab1c8cef0a6d Mon Sep 17 00:00:00 2001 From: corone17 Date: Mon, 29 Jun 2015 00:59:18 +0200 Subject: [PATCH 038/450] Update rtlnl.py Better to extract 
'http://manifest.us.rtl.nl' from the json, I'd say. And I think it's better to use the default json-url to make it more futureproof. Successfully tested with tarball. --- youtube_dl/extractor/rtlnl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 41d202c28..e708e0093 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -51,7 +51,7 @@ class RtlNlIE(InfoExtractor): def _real_extract(self, url): uuid = self._match_id(url) info = self._download_json( - 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, uuid) material = info['material'][0] @@ -60,8 +60,8 @@ class RtlNlIE(InfoExtractor): description = material.get('synopsis') or info['episodes'][0]['synopsis'] # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - videopath = material['videopath'].replace('.f4m', '.m3u8') - m3u8_url = 'http://manifest.us.rtl.nl' + videopath + videopath = material['videopath'].replace('adaptive', 'flash') + m3u8_url = info['meta']['videohost'] + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') From 0130afb76e5cb6f470f39f127c8d09eea3e82d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 29 Jun 2015 12:42:02 +0200 Subject: [PATCH 039/450] [YoutubeDL] format spec: allow grouping specifiers with parentheses --- test/test_YoutubeDL.py | 24 ++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 709e3100f..6f374d7ea 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -245,6 +245,30 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], '137+141')
self.assertEqual(downloaded['ext'], 'mp4') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137+141', '248+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['136+141', '247+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['248+141']) + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 258e612af..e5b46f87e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -920,6 +920,7 @@ class YoutubeDL(object): PICKFIRST = 'PICKFIRST' MERGE = 'MERGE' SINGLE = 'SINGLE' + GROUP = 'GROUP' FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) def _parse_filter(tokens): @@ -942,6 +943,10 @@ class YoutubeDL(object): elif type == tokenize.OP: if string in endwith: break + elif string == ')': + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break if string == ',': 
selectors.append(current_selector) current_selector = None @@ -955,6 +960,10 @@ class YoutubeDL(object): current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + current_selector = FormatSelector(GROUP, _parse_format_selection(tokens, [')']), []) elif string == '+': video_selector = current_selector audio_selector = _parse_format_selection(tokens, [',']) @@ -977,6 +986,8 @@ class YoutubeDL(object): for format in f(formats): yield format return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] @@ -1084,8 +1095,32 @@ class YoutubeDL(object): return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) - tokens = compat_tokenize_tokenize(stream.readline) - parsed_selector = _parse_format_selection(tokens) + try: + tokens = list(compat_tokenize_tokenize(stream.readline)) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): From cf2ac6df6896dac4d23918867bb86fac1e1088d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 30 Jun 2015 19:45:42 +0200 Subject: [PATCH 040/450] [YoutubeDL] format 
spec: Fix handling of '+' with '/' 'bestvideo+bestaudio/best' was incorrectly interpreted as 'bestvideo+(bestaudio/best)', so it would fail if 'bestaudio' doesn't exist instead of falling back to 'best'. --- test/test_YoutubeDL.py | 8 ++++++++ youtube_dl/YoutubeDL.py | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6f374d7ea..1e4aaa559 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -245,6 +245,14 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], '137+141') self.assertEqual(downloaded['ext'], 'mp4') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo[height>=999999]+bestaudio/best'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '38') + info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) yie = YoutubeIE(ydl) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e5b46f87e..5deb4848e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -931,7 +931,7 @@ class YoutubeDL(object): else: filter_parts.append(string) - def _parse_format_selection(tokens, endwith=[]): + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None for type, string, start, _, _ in tokens: @@ -941,18 +941,23 @@ class YoutubeDL(object): elif type in [tokenize.NAME, tokenize.NUMBER]: current_selector = FormatSelector(SINGLE, string, []) elif type == tokenize.OP: - if string in endwith: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() break - elif string == ')': - # ')' will be 
handled by the parentheses group + elif inside_merge and string in ['/', ',']: tokens.restore_last_token() break - if string == ',': + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': selectors.append(current_selector) current_selector = None elif string == '/': first_choice = current_selector - second_choice = _parse_format_selection(tokens, [',']) + second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = None selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) elif string == '[': @@ -963,12 +968,12 @@ class YoutubeDL(object): elif string == '(': if current_selector: raise syntax_error('Unexpected "("', start) - current_selector = FormatSelector(GROUP, _parse_format_selection(tokens, [')']), []) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) elif string == '+': video_selector = current_selector - audio_selector = _parse_format_selection(tokens, [',']) - current_selector = None - selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), [])) + audio_selector = _parse_format_selection(tokens, inside_merge=True) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) else: raise syntax_error('Operator not recognized: "{0}"'.format(string), start) elif type == tokenize.ENDMARKER: From 1866432db74946c2b66263d38ed2c9d9d7e3177d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lio=20A=2E=20Heckert?= Date: Tue, 30 Jun 2015 16:22:09 -0300 Subject: [PATCH 041/450] Rename --pp-params to --postprocessor-args and access value as super class attribute --- README.md | 2 +- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/__init__.py | 6 +----- youtube_dl/options.py | 4 ++-- youtube_dl/postprocessor/common.py | 6 ++++-- youtube_dl/postprocessor/ffmpeg.py | 11 +++++------ 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 
813ac4a15..7eb17a163 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ which means you can modify it, redistribute it or use it however you like. --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid) - --pp-params Extra parameters for video post-processor. + --postprocessor-args Extra parameters for video post-processor. -k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 3bfe30c76..ff95add78 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -261,7 +261,7 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. - pp_params: Extra parameters for external apps, like avconv. + postprocessor_args: Extra parameters for external apps, like avconv. 
""" params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 8b54d4ae2..356697015 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -171,10 +171,6 @@ def _real_main(argv=None): if opts.recodevideo is not None: if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'xvid']: parser.error('invalid video recode format specified') - if opts.pp_params is None: - opts.pp_params = [] - else: - opts.pp_params = shlex.split(opts.pp_params) if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: parser.error('invalid subtitle format specified') @@ -231,7 +227,7 @@ def _real_main(argv=None): postprocessors.append({ 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, - 'extra_params': opts.pp_params + 'extra_cmd_args': opts.postprocessor_args, }) if opts.convertsubtitles: postprocessors.append({ diff --git a/youtube_dl/options.py b/youtube_dl/options.py index fbba9b9d8..3d88428c4 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -688,8 +688,8 @@ def parseOpts(overrideArguments=None): metavar='FORMAT', dest='recodevideo', default=None, help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid)') postproc.add_option( - '--pp-params', - dest='pp_params', default=None, metavar='ARGS', + '--postprocessor-args', + dest='postprocessor_args', default=None, metavar='ARGS', help='Extra parameters for video post-processor.') postproc.add_option( '-k', '--keep-video', diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index d944d9367..c44501b59 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import os +import shlex from ..utils import ( PostProcessingError, @@ -23,12 +24,13 @@ class PostProcessor(object): PostProcessor objects follow a "mutual registration" process similar to InfoExtractor objects. 
And it can receive parameters from CLI trough - --pp-params. + --postprocessor-args. """ _downloader = None - def __init__(self, downloader=None): + def __init__(self, downloader=None, extra_cmd_args=None): + self._extra_cmd_args = shlex.split(extra_cmd_args or '') self._downloader = downloader def set_downloader(self, downloader): diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index a696b12b4..891c72769 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -29,8 +29,8 @@ class FFmpegPostProcessorError(PostProcessingError): class FFmpegPostProcessor(PostProcessor): - def __init__(self, downloader=None): - PostProcessor.__init__(self, downloader) + def __init__(self, downloader=None, extra_cmd_args=None): + PostProcessor.__init__(self, downloader, extra_cmd_args) self._determine_executables() def check_version(self): @@ -287,16 +287,15 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): class FFmpegVideoConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferedformat=None, extra_params=[]): - super(FFmpegVideoConvertorPP, self).__init__(downloader) + def __init__(self, downloader=None, preferedformat=None, extra_cmd_args=None): + super(FFmpegVideoConvertorPP, self).__init__(downloader, extra_cmd_args) self._preferedformat = preferedformat - self._extra_params = extra_params def run(self, information): path = information['filepath'] prefix, sep, ext = path.rpartition('.') ext = self._preferedformat - options = self._extra_params + options = self._extra_cmd_args if self._preferedformat == 'xvid': ext = 'avi' options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) From aa5d9a79d6b5c354ee4a6bfbb43f94c2485ab9b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lio=20A=2E=20Heckert?= Date: Wed, 1 Jul 2015 20:12:26 -0300 Subject: [PATCH 042/450] Simplify `postprocessor_args` transmission to PP base class * Remove `extra_cmd_args` transmission from sub to super class. 
* Simplify params transmission through `downloader.params`. --- youtube_dl/__init__.py | 2 +- youtube_dl/postprocessor/common.py | 5 ++--- youtube_dl/postprocessor/ffmpeg.py | 8 ++++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 356697015..249f76365 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -227,7 +227,6 @@ def _real_main(argv=None): postprocessors.append({ 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, - 'extra_cmd_args': opts.postprocessor_args, }) if opts.convertsubtitles: postprocessors.append({ @@ -354,6 +353,7 @@ def _real_main(argv=None): 'extract_flat': opts.extract_flat, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, + 'postprocessor_args': shlex.split(opts.postprocessor_args or ''), 'fixup': opts.fixup, 'source_address': opts.source_address, 'call_home': opts.call_home, diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index c44501b59..bee64c457 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import os -import shlex from ..utils import ( PostProcessingError, @@ -29,8 +28,8 @@ class PostProcessor(object): _downloader = None - def __init__(self, downloader=None, extra_cmd_args=None): - self._extra_cmd_args = shlex.split(extra_cmd_args or '') + def __init__(self, downloader=None): + self._extra_cmd_args = downloader.params.get('postprocessor_args') self._downloader = downloader def set_downloader(self, downloader): diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 891c72769..de8c225da 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -29,8 +29,8 @@ class FFmpegPostProcessorError(PostProcessingError): class FFmpegPostProcessor(PostProcessor): - def __init__(self, downloader=None, 
extra_cmd_args=None): - PostProcessor.__init__(self, downloader, extra_cmd_args) + def __init__(self, downloader=None): + PostProcessor.__init__(self, downloader) self._determine_executables() def check_version(self): @@ -287,8 +287,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): class FFmpegVideoConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferedformat=None, extra_cmd_args=None): - super(FFmpegVideoConvertorPP, self).__init__(downloader, extra_cmd_args) + def __init__(self, downloader=None, preferedformat=None): + super(FFmpegVideoConvertorPP, self).__init__(downloader) self._preferedformat = preferedformat def run(self, information): From 35eb649e9d96027f4d3eec77841791524cc345e2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 4 Jul 2015 09:24:00 +0200 Subject: [PATCH 043/450] release 2015.07.04 --- README.md | 6 +++--- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e3452c9e1..93e7fb06f 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like. --playlist-reverse Download playlist videos in reverse order --xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental) --hls-prefer-native Use the native HLS downloader instead of ffmpeg (experimental) - --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,wget + --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,httpie,wget --external-downloader-args ARGS Give these arguments to the external downloader ## Filesystem Options: @@ -190,8 +190,8 @@ which means you can modify it, redistribute it or use it however you like. 
--all-formats Download all available video formats --prefer-free-formats Prefer free video formats unless a specific one is requested -F, --list-formats List all available formats - --youtube-skip-dash-manifest Do not download the DASH manifest on YouTube videos - --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv.Ignored if no + --youtube-skip-dash-manifest Do not download the DASH manifests and related data on YouTube videos + --merge-output-format FORMAT If a merge is required (e.g. bestvideo+bestaudio), output to given container format. One of mkv, mp4, ogg, webm, flv. Ignored if no merge is required ## Subtitle Options: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9a50fbd1c..687936103 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -283,6 +283,7 @@ - **Motherless** - **Motorsport**: motorsport.com - **MovieClips** + - **MovieFap** - **Moviezine** - **movshare**: MovShare - **MPORA** @@ -440,6 +441,8 @@ - **smotri:broadcast**: Smotri.com broadcasts - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos + - **SnagFilms** + - **SnagFilmsEmbed** - **Snotr** - **Sohu** - **soompi** @@ -502,6 +505,7 @@ - **TheOnion** - **ThePlatform** - **TheSixtyOne** + - **ThisAmericanLife** - **ThisAV** - **THVideo** - **THVideoPlaylist** @@ -542,6 +546,7 @@ - **twitch:stream** - **twitch:video** - **twitch:vod** + - **TwitterCard** - **Ubu** - **udemy** - **udemy:course** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a225e03a1..eff4aebeb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.06.25' +__version__ = '2015.07.04' From d7c9a3e976b00068e926e18d963e9fdb7c3cd678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 4 Jul 2015 17:22:11 +0600 Subject: [PATCH 044/450] Credit 
@remitamine for snagfilms (#6096) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 117b9c219..d5418dd37 100644 --- a/AUTHORS +++ b/AUTHORS @@ -129,3 +129,4 @@ Mister Hat Peter Ding jackyzy823 George Brighton +Remita Amine From f5f4a27a964b41646303921104f4d6d6fd2098e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 4 Jul 2015 21:30:26 +0200 Subject: [PATCH 045/450] [YoutubeDL] format spec: fix handling of '/' with ',' When using 'bestvideo/best,bestaudio', 'bestvideo/best' must be set as the current_selector (instead of appending it to the selectors), otherwise when it gets the ',' it would append 'None' to the selectors. --- test/test_YoutubeDL.py | 8 ++++++++ youtube_dl/YoutubeDL.py | 3 +-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 1e4aaa559..f103779d3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -253,6 +253,14 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '38') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo/best,bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137', '141']) + info_dict = _make_result(list(formats_order), extractor='youtube') ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) yie = YoutubeIE(ydl) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5deb4848e..5a79e5f1d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -958,8 +958,7 @@ class YoutubeDL(object): elif string == '/': first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) - 
current_selector = None - selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) elif string == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) From bb8e55366289e0c129ef85abb8c1ac1cbae86a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 4 Jul 2015 21:41:09 +0200 Subject: [PATCH 046/450] [YoutubeDL] format spec: Do not fail when a filter gives an empty result For example with 'best[height<40]' we ended getting a 'IndexError: list index out of range'. --- test/test_YoutubeDL.py | 9 ++++++++- youtube_dl/YoutubeDL.py | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f103779d3..bf2baae07 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -15,7 +15,7 @@ from youtube_dl import YoutubeDL from youtube_dl.compat import compat_str from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor -from youtube_dl.utils import match_filter_func +from youtube_dl.utils import ExtractorError, match_filter_func TEST_URL = 'http://localhost/sample.mp4' @@ -362,6 +362,13 @@ class TestFormatSelection(unittest.TestCase): downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + ydl = YDL({'format': 'best[height<40]'}) + try: + ydl.process_ie_result(info_dict) + except ExtractorError: + pass + self.assertEqual(ydl.downloaded_info_dicts, []) + class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5a79e5f1d..6478d05dc 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1005,6 +1005,9 @@ class YoutubeDL(object): format_spec = selector.selector def selector_function(formats): + formats = list(formats) + if not 
formats: + return if format_spec == 'all': for f in formats: yield f From ede21449c8d87c2d000d16e5102ee85d87a9b14e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Jul 2015 06:29:36 +0600 Subject: [PATCH 047/450] [crunchyroll] Fix extraction (Closes #5855, closes #5881) --- youtube_dl/extractor/crunchyroll.py | 31 ++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 41f0c736d..73f1e22ef 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -27,7 +27,7 @@ from ..aes import ( class CrunchyrollIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:[^/]*/[^/?&]*?|media/\?id=)(?P[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -45,6 +45,22 @@ class CrunchyrollIE(InfoExtractor): # rtmp 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', + 'info_dict': { + 'id': '589804', + 'ext': 'flv', + 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', + 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Danny Choo Network', + 'upload_date': '20120213', + }, + 'params': { + # rtmp + 'skip_download': True, + }, + }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -251,16 +267,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' - streamdata_req = 
compat_urllib_request.Request('http://www.crunchyroll.com/xml/') - # urlencode doesn't work! - streamdata_req.data = 'req=RpcApiVideoEncode%5FGetStreamInfo&video%5Fencode%5Fquality=' + stream_quality + '&media%5Fid=' + stream_id + '&video%5Fformat=' + stream_format + streamdata_req = compat_urllib_request.Request( + 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' + % (stream_id, stream_format, stream_quality), + compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8')) streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - streamdata_req.add_header('Content-Length', str(len(streamdata_req.data))) streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) - video_url = streamdata.find('./host').text - video_play_path = streamdata.find('./file').text + stream_info = streamdata.find('./{default}preload/stream_info') + video_url = stream_info.find('./host').text + video_play_path = stream_info.find('./file').text formats.append({ 'url': video_url, 'play_path': video_play_path, From 43232d5c143b1025671f70ff34f5c0f28ec56847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Jul 2015 19:01:07 +0600 Subject: [PATCH 048/450] [rtlnl] Improve --- youtube_dl/extractor/rtlnl.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e708e0093..d94861325 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -59,9 +59,11 @@ class RtlNlIE(InfoExtractor): subtitle = material['title'] or info['episodes'][0]['name'] description = material.get('synopsis') or info['episodes'][0]['synopsis'] + meta = info.get('meta', {}) + # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - videopath = material['videopath'].replace('adaptive', 'flash') - m3u8_url = 
info['meta']['videohost'] + videopath + videopath = material['videopath'].replace('/adaptive/', '/flash/') + m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') @@ -82,7 +84,7 @@ class RtlNlIE(InfoExtractor): self._sort_formats(formats) thumbnails = [] - meta = info.get('meta', {}) + for p in ('poster_base_url', '"thumb_base_url"'): if not meta.get(p): continue From 9dfc4fa1a1a922fb6c0212d09d28b4a8495c1031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 5 Jul 2015 19:07:07 +0600 Subject: [PATCH 049/450] [rtlnl] Add test with encrypted m3u8 streams for reference --- youtube_dl/extractor/rtlnl.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index d94861325..049deaf8d 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -43,6 +43,10 @@ class RtlNlIE(InfoExtractor): 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. 
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } + }, { + # encrypted m3u8 streams + 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', + 'only_matching': True, }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, From 89d42c2c758a3cb06b71145c24bfe8057419295a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 6 Jul 2015 02:58:02 +0600 Subject: [PATCH 050/450] [rtlnl] Clarify test --- youtube_dl/extractor/rtlnl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 049deaf8d..1228245ad 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -44,7 +44,7 @@ class RtlNlIE(InfoExtractor): 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. 
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } }, { - # encrypted m3u8 streams + # encrypted m3u8 streams, georestricted 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', 'only_matching': True, }, { From 0c20ee7d4b0c98221c71769482980a0a218dadb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 6 Jul 2015 04:16:56 +0600 Subject: [PATCH 051/450] [rtlnl] Clarify current adaptive -> flash workaround rationale --- youtube_dl/extractor/rtlnl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 1228245ad..a4d3d73ff 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -66,6 +66,9 @@ class RtlNlIE(InfoExtractor): meta = info.get('meta', {}) # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) + # NB: nowadays, recent ffmpeg and avconv can handle these encrypted streams, so + # this adaptive -> flash workaround is not required in general, but it also + # allows bypassing georestriction therefore is retained for now. 
videopath = material['videopath'].replace('/adaptive/', '/flash/') m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath From 13af92fdc443f753dcb3cb91736b09a4ae85c36a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 6 Jul 2015 08:39:38 +0800 Subject: [PATCH 052/450] [common] Add 'fatal' to _extract_m3u8_formats --- youtube_dl/extractor/common.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 81623bfe3..d859aea52 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -846,7 +846,8 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None, note=None, errnote=None): + m3u8_id=None, note=None, errnote=None, + fatal=True): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -866,7 +867,10 @@ class InfoExtractor(object): m3u8_doc = self._download_webpage( m3u8_url, video_id, note=note or 'Downloading m3u8 information', - errnote=errnote or 'Failed to download m3u8 information') + errnote=errnote or 'Failed to download m3u8 information', + fatal=fatal) + if m3u8_doc is False: + return m3u8_doc last_info = None last_media = None kv_rex = re.compile( From 59a83d3e5b9c09d0c6e8fb430ea99d71ef6e2eba Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 6 Jul 2015 08:40:38 +0800 Subject: [PATCH 053/450] [spiegeltv] Skip invalid m3u8 manifests (closes #6157) --- youtube_dl/extractor/spiegeltv.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 08a5c4314..27f4033c5 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -77,11 +77,13 @@ class SpiegeltvIE(InfoExtractor): 'rtmp_live': True, }) elif determine_ext(endpoint) == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats 
= self._extract_m3u8_formats( endpoint.replace('[video]', play_path), video_id, 'm4v', preference=1, # Prefer hls since it allows to workaround georestriction - m3u8_id='hls')) + m3u8_id='hls', fatal=False) + if m3u8_formats is not False: + formats.extend(m3u8_formats) else: formats.append({ 'url': endpoint, From 37c1e4025c6df834e93a64c1c13eebac23e90942 Mon Sep 17 00:00:00 2001 From: ping Date: Mon, 6 Jul 2015 15:26:49 +0800 Subject: [PATCH 054/450] [yinyuetai] New extractor for yinyuetai.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/yinyuetai.py | 47 +++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/yinyuetai.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index aba62db53..8665855eb 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -733,6 +733,7 @@ from .yandexmusic import ( YandexMusicPlaylistIE, ) from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import YoukuIE diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py new file mode 100644 index 000000000..661c34602 --- /dev/null +++ b/youtube_dl/extractor/yinyuetai.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class YinYueTaiIE(InfoExtractor): + IE_NAME = 'yinyuetai:video' + _VALID_URL = r'https?://v\.yinyuetai\.com/video(/h5)?/(?P[0-9]+)' + _TEST = { + 'url': 'http://v.yinyuetai.com/video/2322376', + 'md5': '6e3abe28d38e3a54b591f9f040595ce0', + 'info_dict': { + 'id': '2322376', + 'ext': 'mp4', + 'title': '少女时代_PARTY_Music Video Teaser', + 'creator': '少女时代', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._download_json( + 'http://ext.yinyuetai.com/main/get-h-mv-info?json=true&videoId=%s' % video_id, 
video_id, + 'Downloading mv info')['videoInfo']['coreVideoInfo'] + + if info['error']: + raise ExtractorError(info['errorMsg'], expected=True) + + formats = [ + {'url': format_info['videoUrl'], 'format_id': format_info['qualityLevel'], + 'format': format_info['qualityLevelName'], 'filesize': format_info['fileSize'], + 'ext': 'mp4', 'preference': format_info['bitrate']} + for format_info in info['videoUrlModels'] + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info['videoName'], + 'thumbnail': info['bigHeadImage'], + 'creator': info['artistNames'], + 'duration': info['duration'], + 'formats': formats, + } From e9d33454b552c7a5a2e048f439b90a58aa5528ad Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 6 Jul 2015 16:19:49 +0800 Subject: [PATCH 055/450] [qqmusic:playlist] Playlist names are optional --- youtube_dl/extractor/qqmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index f9aafcd28..d6bc05b7b 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -274,6 +274,6 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE): ) for song in list_json['songlist'] ] - list_name = list_json['dissname'] + list_name = list_json.get('dissname') list_description = clean_html(unescapeHTML(list_json.get('desc'))) return self.playlist_result(entries, list_id, list_name, list_description) From fc7ae675e26a98bc74918f311d22f515ec7e0477 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 6 Jul 2015 17:08:32 +0800 Subject: [PATCH 056/450] [qqmusic:album] Strip description --- youtube_dl/extractor/qqmusic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index e704640e5..6d85d58e3 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -169,7 +169,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): 'info_dict': { 'id': 
'000gXCTb2AhRR1', 'title': '我们都是这样长大的', - 'description': 'md5:712f0cdbfc7e776820d08150e6df593d', + 'description': 'md5:179c5dce203a5931970d306aa9607ea6', }, 'playlist_count': 4, }, { @@ -177,7 +177,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): 'info_dict': { 'id': '002Y5a3b3AlCu3', 'title': '그리고...', - 'description': 'md5:b1d133b8c9bac8fed4e1a97df759f4cf', + 'description': 'md5:a48823755615508a95080e81b51ba729', }, 'playlist_count': 8, }] @@ -196,6 +196,8 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): ] album_name = album['name'] album_detail = album.get('desc') + if album_detail is not None: + album_detail = album_detail.strip() return self.playlist_result(entries, mid, album_name, album_detail) From dfc4eca21f7d34b5e65f42b284b24077c8bbc109 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 6 Jul 2015 17:09:17 +0800 Subject: [PATCH 057/450] [qqmusic:album] Playlist names are optional --- youtube_dl/extractor/qqmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 6d85d58e3..03e6c688f 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -194,7 +194,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] ) for song in album['list'] ] - album_name = album['name'] + album_name = album.get('name') album_detail = album.get('desc') if album_detail is not None: album_detail = album_detail.strip() From 85a064861f5213abf2b56dd671e1f45188a02adf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 6 Jul 2015 17:54:41 +0800 Subject: [PATCH 058/450] [qqmusic] Use regex for thumbnails in test cases --- youtube_dl/extractor/qqmusic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 8a724ab51..476432330 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -27,7 
+27,7 @@ class QQMusicIE(InfoExtractor): 'upload_date': '20141227', 'creator': '林俊杰', 'description': 'md5:d327722d0361576fde558f1ac68a7065', - 'thumbnail': 'http://i.gtimg.cn/music/photo/mid_album_500/7/p/001IV22P1RDX7p.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', } }, { 'note': 'There is no mp3-320 version of this song.', @@ -40,7 +40,7 @@ class QQMusicIE(InfoExtractor): 'upload_date': '20050626', 'creator': '李季美', 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', - 'thumbnail': 'http://i.gtimg.cn/music/photo/mid_album_500/r/Q/0042owYj46IxrQ.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', } }] From 275c0423aa85691fc78dda253fa5d00ab471e7b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Jul 2015 00:02:34 +0600 Subject: [PATCH 059/450] [vk] Fix extraction (Closes #6153) --- youtube_dl/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f2ae109f9..4ae28d4ca 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -153,7 +153,7 @@ class VKIE(InfoExtractor): if not video_id: video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_url = 'http://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) ERRORS = { From 7f220b2facecde21ac3257d20938b63c1b6b01fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 7 Jul 2015 00:04:19 +0600 Subject: [PATCH 060/450] [vk] Catch ownership confirmation request --- youtube_dl/extractor/vk.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 4ae28d4ca..ccd897084 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -156,6 +156,11 @@ class VKIE(InfoExtractor): info_url = 
'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) + if re.search(r'/login\.php\?.*\bact=security_check', info_page): + raise ExtractorError( + 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.', + expected=True) + ERRORS = { r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<': 'Video %s has been removed from public access due to rightholder complaint.', From d7b4d5dd5072fc82a7f3d437e5990aa74a35a100 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 7 Jul 2015 14:16:56 +0800 Subject: [PATCH 061/450] [gfycat] Extract id correctly (fixes #6165) --- youtube_dl/extractor/gfycat.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index 397f1d42e..048ee31e2 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -10,8 +10,8 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?P[^/?#]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?gfycat\.com/(?:ifr/)?(?P[^/?#]+)' + _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { 'id': 'DeadlyDecisiveGermanpinscher', @@ -27,7 +27,23 @@ class GfycatIE(InfoExtractor): 'categories': list, 'age_limit': 0, } - } + }, { + 'url': 'http://gfycat.com/ifr/JauntyTimelyAmazontreeboa', + 'info_dict': { + 'id': 'JauntyTimelyAmazontreeboa', + 'ext': 'mp4', + 'title': 'JauntyTimelyAmazontreeboa', + 'timestamp': 1411720126, + 'upload_date': '20140926', + 'uploader': 'anonymous', + 'duration': 3.52, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'categories': list, + 'age_limit': 0, + } + }] def _real_extract(self, url): video_id = self._match_id(url) From 267dc07e6b71277b67abad08c515ca9c1bb09f61 Mon Sep 17 00:00:00 2001 
From: Yen Chi Hsuan Date: Tue, 7 Jul 2015 14:22:13 +0800 Subject: [PATCH 062/450] [gfycat] Catch errors --- youtube_dl/extractor/gfycat.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gfycat.py b/youtube_dl/extractor/gfycat.py index 048ee31e2..884700c52 100644 --- a/youtube_dl/extractor/gfycat.py +++ b/youtube_dl/extractor/gfycat.py @@ -6,6 +6,7 @@ from ..utils import ( int_or_none, float_or_none, qualities, + ExtractorError, ) @@ -50,7 +51,10 @@ class GfycatIE(InfoExtractor): gfy = self._download_json( 'http://gfycat.com/cajax/get/%s' % video_id, - video_id, 'Downloading video info')['gfyItem'] + video_id, 'Downloading video info') + if 'error' in gfy: + raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True) + gfy = gfy['gfyItem'] title = gfy.get('title') or gfy['gfyName'] description = gfy.get('description') From 4dd09c9addeaccad6ad6d33f01d6240f3594fc84 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 7 Jul 2015 10:36:07 +0200 Subject: [PATCH 063/450] release 2015.07.07 --- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 687936103..0ca06c71d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -384,6 +384,7 @@ - **Pyvideo** - **qqmusic** - **qqmusic:album** + - **qqmusic:playlist** - **qqmusic:singer** - **qqmusic:toplist** - **QuickVid** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index eff4aebeb..3364647ed 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.04' +__version__ = '2015.07.07' From cbc1fadd6f544619b711209b68ea15a912ca0fa1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 8 Jul 2015 13:40:21 +0800 Subject: [PATCH 064/450] [clipsyndicate] Support chic subdomain (fixes #6176) --- youtube_dl/extractor/clipsyndicate.py | 9 ++++++--- 1 
file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index d07d544ea..b1e45a677 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -10,9 +10,9 @@ from ..utils import ( class ClipsyndicateIE(InfoExtractor): - _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' + _VALID_URL = r'http://(?:chic|www)\.clipsyndicate\.com/video/play(list/\d+)?/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', 'md5': '4d7d549451bad625e0ff3d7bd56d776c', 'info_dict': { @@ -22,7 +22,10 @@ class ClipsyndicateIE(InfoExtractor): 'duration': 612, 'thumbnail': 're:^https?://.+\.jpg', }, - } + }, { + 'url': 'http://chic.clipsyndicate.com/video/play/5844117/shark_attack', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 1316b54956ac871b4644b6760c5bf88207b2e58f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 8 Jul 2015 13:43:23 +0800 Subject: [PATCH 065/450] [clipsyndicate] Use _match_id --- youtube_dl/extractor/clipsyndicate.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index b1e45a677..8306d6fb7 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( find_xpath_attr, @@ -28,8 +26,7 @@ class ClipsyndicateIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, video_id, 'Downlaoding player') From cf9cf7dd04d8e15635084dcf9f7a76acff5ac218 Mon Sep 17 00:00:00 2001 
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 8 Jul 2015 20:27:06 +0600 Subject: [PATCH 066/450] [vk] Extend _VALID_URL to handle biqle.ru (Closes #6179) --- youtube_dl/extractor/vk.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index ccd897084..0f7ce45ca 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -21,7 +21,17 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' - _VALID_URL = r'https?://(?:m\.)?vk\.com/(?:video_ext\.php\?.*?\boid=(?P-?\d+).*?\bid=(?P\d+)|(?:.+?\?.*?z=)?video(?P[^s].*?)(?:\?|%2F|$))' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P-?\d+).*?\bid=(?P\d+)| + (?: + (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| + (?:www\.)?biqle\.ru/watch/ + ) + (?P[^s].*?)(?:\?|%2F|$) + ) + ''' _NETRC_MACHINE = 'vk' _TESTS = [ @@ -114,6 +124,11 @@ class VKIE(InfoExtractor): 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 'only_matching': True, }, + { + # vk wrapper + 'url': 'http://www.biqle.ru/watch/847655_160197695', + 'only_matching': True, + } ] def _login(self): From 4647845679429739f72b0eba5b963e096ec5bf86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 8 Jul 2015 20:34:50 +0600 Subject: [PATCH 067/450] [vk] Fix youtube extraction --- youtube_dl/extractor/vk.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 0f7ce45ca..697064175 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -195,10 +195,11 @@ class VKIE(InfoExtractor): if re.search(error_re, info_page): raise ExtractorError(error_msg % video_id, expected=True) - m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) - if m_yt is not None: - self.to_screen('Youtube video detected') - return self.url_result(m_yt.group(1), 'Youtube') + youtube_url = 
self._search_regex( + r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', + info_page, 'youtube iframe', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube', video_id) m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) From 9281f6d25338c179e7bc2f48352da3311102158e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 8 Jul 2015 20:41:08 +0600 Subject: [PATCH 068/450] [vk] Add test for youtube embed --- youtube_dl/extractor/vk.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 697064175..23d153031 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -119,6 +119,21 @@ class VKIE(InfoExtractor): }, 'skip': 'Only works from Russia', }, + { + # youtube embed + 'url': 'https://vk.com/video276849682_170681728', + 'info_dict': { + 'id': 'V3K4mi0SYkc', + 'ext': 'mp4', + 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate", + 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'duration': 179, + 'upload_date': '20130116', + 'uploader': "Children's Joy Foundation", + 'uploader_id': 'thecjf', + 'view_count': int, + }, + }, { # removed video, just testing that we match the pattern 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', @@ -199,7 +214,7 @@ class VKIE(InfoExtractor): r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', info_page, 'youtube iframe', default=None) if youtube_url: - return self.url_result(youtube_url, 'Youtube', video_id) + return self.url_result(youtube_url, 'Youtube') m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) From e2082ea9422aadf7ae2580d9333008279cda51f0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 9 Jul 2015 00:50:32 +0800 Subject: [PATCH 069/450] [yinyuetai] Add test for h5/ part in _VALID_URL --- youtube_dl/extractor/yinyuetai.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py index 661c34602..41ee89da4 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/youtube_dl/extractor/yinyuetai.py @@ -8,7 +8,7 @@ from ..utils import ExtractorError class YinYueTaiIE(InfoExtractor): IE_NAME = 'yinyuetai:video' _VALID_URL = r'https?://v\.yinyuetai\.com/video(/h5)?/(?P[0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://v.yinyuetai.com/video/2322376', 'md5': '6e3abe28d38e3a54b591f9f040595ce0', 'info_dict': { @@ -17,7 +17,10 @@ class YinYueTaiIE(InfoExtractor): 'title': '少女时代_PARTY_Music Video Teaser', 'creator': '少女时代', }, - } + }, { + 'url': 'http://v.yinyuetai.com/video/h5/2322376', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From af0f9b0e95233862e758140b282497d04edfb885 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 9 Jul 2015 00:54:37 +0800 Subject: [PATCH 070/450] 
[yinyuetai] Style --- youtube_dl/extractor/yinyuetai.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py index 41ee89da4..a4ada4872 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/youtube_dl/extractor/yinyuetai.py @@ -32,12 +32,14 @@ class YinYueTaiIE(InfoExtractor): if info['error']: raise ExtractorError(info['errorMsg'], expected=True) - formats = [ - {'url': format_info['videoUrl'], 'format_id': format_info['qualityLevel'], - 'format': format_info['qualityLevelName'], 'filesize': format_info['fileSize'], - 'ext': 'mp4', 'preference': format_info['bitrate']} - for format_info in info['videoUrlModels'] - ] + formats = [{ + 'url': format_info['videoUrl'], + 'format_id': format_info['qualityLevel'], + 'format': format_info['qualityLevelName'], + 'filesize': format_info['fileSize'], + 'ext': 'mp4', + 'preference': format_info['bitrate'], + } for format_info in info['videoUrlModels']] self._sort_formats(formats) return { From d76dea001b5365b7646986cba12f9908cd321f6a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 9 Jul 2015 01:07:45 +0800 Subject: [PATCH 071/450] [yinyuetai] Miscellaneous improvements 1. Include all fields in _TEST 2. Use .get() for optional fields 3. 
Clarify the intention of 'ext' in formats --- youtube_dl/extractor/yinyuetai.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py index a4ada4872..003df9233 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/youtube_dl/extractor/yinyuetai.py @@ -16,6 +16,8 @@ class YinYueTaiIE(InfoExtractor): 'ext': 'mp4', 'title': '少女时代_PARTY_Music Video Teaser', 'creator': '少女时代', + 'duration': 25, + 'thumbnail': 're:^https?://.*\.jpg$', }, }, { 'url': 'http://v.yinyuetai.com/video/h5/2322376', @@ -35,18 +37,19 @@ class YinYueTaiIE(InfoExtractor): formats = [{ 'url': format_info['videoUrl'], 'format_id': format_info['qualityLevel'], - 'format': format_info['qualityLevelName'], - 'filesize': format_info['fileSize'], + 'format': format_info.get('qualityLevelName'), + 'filesize': format_info.get('fileSize'), + # though URLs ends with .flv, the downloaded files are in fact mp4 'ext': 'mp4', - 'preference': format_info['bitrate'], + 'tbr': format_info.get('bitrate'), } for format_info in info['videoUrlModels']] self._sort_formats(formats) return { 'id': video_id, 'title': info['videoName'], - 'thumbnail': info['bigHeadImage'], - 'creator': info['artistNames'], - 'duration': info['duration'], + 'thumbnail': info.get('bigHeadImage'), + 'creator': info.get('artistNames'), + 'duration': info.get('duration'), 'formats': formats, } From 082a0140eff05dfdac1020548a49541fcddd6ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 8 Jul 2015 23:40:19 +0600 Subject: [PATCH 072/450] [yinyuetai] Do not capture unused group --- youtube_dl/extractor/yinyuetai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py index 003df9233..fa6b40816 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/youtube_dl/extractor/yinyuetai.py @@ -7,7 +7,7 @@ from ..utils import ExtractorError class 
YinYueTaiIE(InfoExtractor): IE_NAME = 'yinyuetai:video' - _VALID_URL = r'https?://v\.yinyuetai\.com/video(/h5)?/(?P[0-9]+)' + _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P[0-9]+)' _TESTS = [{ 'url': 'http://v.yinyuetai.com/video/2322376', 'md5': '6e3abe28d38e3a54b591f9f040595ce0', From 77c6fb5b24c477c9de022f252dc9958a4b3b8b42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Jul 2015 20:48:38 +0600 Subject: [PATCH 073/450] [youtube] Make further DASH manifests not fatal after succeeded one --- youtube_dl/extractor/youtube.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6769a009d..3c629d38a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -798,7 +798,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') def _parse_dash_manifest( - self, video_id, dash_manifest_url, player_url, age_gate): + self, video_id, dash_manifest_url, player_url, age_gate, fatal=True): def decrypt_sig(mobj): s = mobj.group(1) dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) @@ -807,7 +807,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): dash_doc = self._download_xml( dash_manifest_url, video_id, note='Downloading DASH manifest', - errnote='Could not download DASH manifest') + errnote='Could not download DASH manifest', + fatal=fatal) + + if dash_doc is False: + return [] formats = [] for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'): @@ -1161,14 +1165,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): + dash_mpd_fatal = True for dash_manifest_url in dash_mpds: dash_formats = {} try: for df in self._parse_dash_manifest( - video_id, 
dash_manifest_url, player_url, age_gate): + video_id, dash_manifest_url, player_url, age_gate, dash_mpd_fatal): # Do not overwrite DASH format found in some previous DASH manifest if df['format_id'] not in dash_formats: dash_formats[df['format_id']] = df + # Additional DASH manifests may end up in HTTP Error 403 therefore + # allow them to fail without bug report message if we already have + # some DASH manifest succeeded. This is temporary workaround to reduce + # burst of bug reports until we figure out the reason and whether it + # can be fixed at all. + dash_mpd_fatal = False except (ExtractorError, KeyError) as e: self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) From 675e9f22ea3f43fbade9bbd13ee5de3eb45c538f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Jul 2015 23:52:03 +0600 Subject: [PATCH 074/450] [vimple] Extract spruto player based extractor class --- youtube_dl/extractor/vimple.py | 46 +++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index aa3d6ddfd..92321d66e 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -4,7 +4,29 @@ from .common import InfoExtractor from ..utils import int_or_none -class VimpleIE(InfoExtractor): +class SprutoBaseIE(InfoExtractor): + def _extract_spruto(self, spruto, video_id): + playlist = spruto['playlist'][0] + title = playlist['title'] + video_id = playlist.get('videoId') or video_id + thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl') + duration = int_or_none(playlist.get('duration')) + + formats = [{ + 'url': f['url'], + } for f in playlist['video']] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class VimpleIE(SprutoBaseIE): IE_DESC = 'Vimple - one-click video hosting' _VALID_URL = 
r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P[\da-f-]{32,36})' _TESTS = [ @@ -30,25 +52,9 @@ class VimpleIE(InfoExtractor): webpage = self._download_webpage( 'http://player.vimple.ru/iframe/%s' % video_id, video_id) - playlist = self._parse_json( + spruto = self._parse_json( self._search_regex( r'sprutoData\s*:\s*({.+?}),\r\n', webpage, 'spruto data'), - video_id)['playlist'][0] + video_id) - title = playlist['title'] - video_id = playlist.get('videoId') or video_id - thumbnail = playlist.get('posterUrl') or playlist.get('thumbnailUrl') - duration = int_or_none(playlist.get('duration')) - - formats = [{ - 'url': f['url'], - } for f in playlist['video']] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + return self._extract_spruto(spruto, video_id) From 1c20ddc966a69a241027c2d9a132b9caf3d0ebde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 9 Jul 2015 23:53:50 +0600 Subject: [PATCH 075/450] [myvi:embed] Add extractor (Closes #6167) --- youtube_dl/extractor/myvi.py | 44 ++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 youtube_dl/extractor/myvi.py diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py new file mode 100644 index 000000000..9f4330f50 --- /dev/null +++ b/youtube_dl/extractor/myvi.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .vimple import SprutoBaseIE + + +class MyviEmbedIE(SprutoBaseIE): + _VALID_URL = r'''(?x) + https?:// + myvi\.ru/player/ + (?: + (?: + embed/html| + api/Video/Get + )/| + content/preloader\.swf\?.*\bid= + ) + (?P[\da-zA-Z_]+) + ''' + _TESTS = [{ + 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', + 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf', + 'info_dict': { + 'id': 'f16b2bbd-cde8-481c-a981-7cd48605df43', + 'ext': 'mp4', + 'title': 'хозяин 
жизни', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 25, + }, + }, { + 'url': 'http://myvi.ru/player/content/preloader.swf?id=oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wOYf1WFpPfc_bWTKGVf_Zafr0', + 'only_matching': True, + }, { + 'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + spruto = self._download_json( + 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData'] + + return self._extract_spruto(spruto, video_id) From 83423254ccdf60cb8756aaf7900c929c1cf1a3ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 00:07:26 +0600 Subject: [PATCH 076/450] [myvi:embed] Extend _VALID_URL --- youtube_dl/extractor/myvi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py index 9f4330f50..a14a5365b 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -7,7 +7,7 @@ from .vimple import SprutoBaseIE class MyviEmbedIE(SprutoBaseIE): _VALID_URL = r'''(?x) https?:// - myvi\.ru/player/ + myvi\.(?:ru/player|tv)/ (?: (?: embed/html| @@ -33,6 +33,9 @@ class MyviEmbedIE(SprutoBaseIE): }, { 'url': 'http://myvi.ru/player/api/Video/Get/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', 'only_matching': True, + }, { + 'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0', + 'only_matching': True, }] def _real_extract(self, url): From e6c2d9ad29bcc4eaa0eed03d3852588b6c7a10c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 00:25:36 +0600 Subject: [PATCH 077/450] [extractor/generic:myvi] Add support for myvi embeds --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/myvi.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git 
a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ea60d4a96..f8d6a8c76 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -37,6 +37,7 @@ from .rutv import RUTVIE from .tvc import TVCIE from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE +from .myvi import MyviEmbedIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE @@ -1425,6 +1426,11 @@ class GenericIE(InfoExtractor): if smotri_url: return self.url_result(smotri_url, 'Smotri') + # Look for embedded Myvi.ru player + myvi_url = MyviEmbedIE._extract_url(webpage) + if myvi_url: + return self.url_result(myvi_url) + # Look for embeded soundcloud player mobj = re.search( r'https?://(?:w\.)?soundcloud\.com/player[^"]+)"', diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py index a14a5365b..896080c1e 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .vimple import SprutoBaseIE @@ -38,6 +40,13 @@ class MyviEmbedIE(SprutoBaseIE): 'only_matching': True, }] + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//myvi\.(?:ru/player|tv)/embed/html/[^"]+)\1', webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): video_id = self._match_id(url) From f2f89c762a13392e8876c5e9dff8b418c3912ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 00:27:02 +0600 Subject: [PATCH 078/450] [myvi:embed] Improve _VALID_URL --- youtube_dl/extractor/myvi.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py index 896080c1e..cd80cc177 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -13,11 +13,12 @@ class MyviEmbedIE(SprutoBaseIE): (?: (?: embed/html| + flash| 
api/Video/Get )/| content/preloader\.swf\?.*\bid= ) - (?P[\da-zA-Z_]+) + (?P[\da-zA-Z_-]+) ''' _TESTS = [{ 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', @@ -38,12 +39,15 @@ class MyviEmbedIE(SprutoBaseIE): }, { 'url': 'http://myvi.tv/embed/html/oTGTNWdyz4Zwy_u1nraolwZ1odenTd9WkTnRfIL9y8VOgHYqOHApE575x4_xxS9Vn0?ap=0', 'only_matching': True, + }, { + 'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30', + 'only_matching': True, }] @classmethod def _extract_url(cls, webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//myvi\.(?:ru/player|tv)/embed/html/[^"]+)\1', webpage) + r']+?src=(["\'])(?P(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage) if mobj: return mobj.group('url') From 6dd94d3a79353f8e694efaf2fa27f4bb40227aff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 00:27:44 +0600 Subject: [PATCH 079/450] [myvi:embed] Rename to myvi --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/extractor/myvi.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a0e3b333d..a64b457bd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -323,6 +323,7 @@ from .musicvault import MusicVaultIE from .muzu import MuzuTVIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE +from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import NationalGeographicIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f8d6a8c76..7c604c554 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -37,7 +37,7 @@ from .rutv import RUTVIE from .tvc import TVCIE from .sportbox import SportBoxEmbedIE from 
.smotri import SmotriIE -from .myvi import MyviEmbedIE +from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE @@ -1427,7 +1427,7 @@ class GenericIE(InfoExtractor): return self.url_result(smotri_url, 'Smotri') # Look for embedded Myvi.ru player - myvi_url = MyviEmbedIE._extract_url(webpage) + myvi_url = MyviIE._extract_url(webpage) if myvi_url: return self.url_result(myvi_url) diff --git a/youtube_dl/extractor/myvi.py b/youtube_dl/extractor/myvi.py index cd80cc177..4c65be122 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -6,7 +6,7 @@ import re from .vimple import SprutoBaseIE -class MyviEmbedIE(SprutoBaseIE): +class MyviIE(SprutoBaseIE): _VALID_URL = r'''(?x) https?:// myvi\.(?:ru/player|tv)/ From 06a12933f3621c4b5b84346edc0d5bd570f1aac3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 00:58:01 +0600 Subject: [PATCH 080/450] [pbs] Add support for subtitles (Closes #6184) --- youtube_dl/extractor/pbs.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 1e2b965f9..fec5d65ad 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -224,6 +224,14 @@ class PBSIE(InfoExtractor): rating_str = rating_str.rpartition('-')[2] age_limit = US_RATINGS.get(rating_str) + subtitles = {} + closed_captions_url = info.get('closed_captions_url') + if closed_captions_url: + subtitles['en'] = [{ + 'ext': 'ttml', + 'url': closed_captions_url, + }] + return { 'id': video_id, 'display_id': display_id, @@ -234,4 +242,5 @@ class PBSIE(InfoExtractor): 'age_limit': age_limit, 'upload_date': upload_date, 'formats': formats, + 'subtitles': subtitles, } From bf20b9c5405b276e6de29b1b28b2e3ad2182e891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 01:15:55 +0600 Subject: [PATCH 081/450] [extractor/generic] Add test for myvi embed 
--- youtube_dl/extractor/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7c604c554..392ad3648 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -339,6 +339,17 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # Myvi.ru embed + { + 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', + 'info_dict': { + 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', + 'ext': 'mp4', + 'title': 'Ужастики, русский трейлер (2015)', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 153, + } + }, # XHamster embed { 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', From 02b386f80abc16f5334e980c2fb729b0cc69a0d6 Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 10 Jul 2015 13:29:57 +0800 Subject: [PATCH 082/450] [neteasemusic] Changes after review --- youtube_dl/extractor/neteasemusic.py | 42 ++++++++++++++++++---------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index a70c65ca5..419f3ecc9 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from hashlib import md5 +from base64 import b64encode from datetime import datetime import itertools import re @@ -20,14 +21,14 @@ class NetEaseMusicBaseIE(InfoExtractor): @classmethod def _encrypt(cls, dfsid): - salt_bytes = bytearray(str(cls._NETEASE_SALT)) + salt_bytes = bytearray(cls._NETEASE_SALT, 'utf-8') string_bytes = bytearray(str(dfsid)) salt_len = len(salt_bytes) - for i in xrange(len(string_bytes)): + for i in range(len(string_bytes)): string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] m = md5() m.update(string_bytes) - result = 
m.digest().encode('base64')[:-1] + result = b64encode(m.digest()) return result.replace('/', '_').replace('+', '-') @classmethod @@ -41,12 +42,11 @@ class NetEaseMusicBaseIE(InfoExtractor): 'url': 'http://m1.music.126.net/%s/%s.%s' % (cls._encrypt(details['dfsId']), details['dfsId'], details['extension']), - 'ext': details['extension'], - 'abr': details['bitrate'] / 1000, - 'preference': details['bitrate'], + 'ext': details.get('extension'), + 'abr': details.get('bitrate', 0) / 1000, 'format_id': song_format, - 'filesize': details['size'], - 'asr': details['sr'] + 'filesize': details.get('size'), + 'asr': details.get('sr') }) return formats @@ -98,6 +98,19 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'upload_date': '20080211', 'timestamp': 1202745600, }, + }, { + 'note': 'Has translated name.', + 'url': 'http://music.163.com/#/song?id=22735043', + 'info_dict': { + 'id': '22735043', + 'ext': 'mp3', + 'title': '소원을 말해봐 (Genie)', + 'creator': '少女时代', + 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', + 'upload_date': '20100127', + 'timestamp': 1264608000, + 'alt_title': '说出愿望吧(Genie)', + } }] def _process_lyrics(self, lyrics_info): @@ -109,10 +122,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = { - time_stamp: text for time_stamp, text in re.findall(lyrics_expr, translated) - } - + translation_ts_dict = dict( + (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) + ) lyrics = '\n'.join([ '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) for time_stamp, text in original_ts_texts @@ -139,8 +151,8 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): lyrics = self._process_lyrics(lyrics_info) alt_title = None - if info.get('alias'): - alt_title = '/'.join(info.get('alias')) + if info.get('transNames'): + alt_title = '/'.join(info.get('transNames')) return { 'id': song_id, @@ -294,7 +306,7 
@@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): mv_id, 'Downloading mv info')['data'] formats = [ - {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'preference': int(brs)} + {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} for brs, mv_url in info['brs'].items() ] self._sort_formats(formats) From 2b0fa1f7dd8d158b69eec4d17b254b99e976bc5c Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 10 Jul 2015 15:09:12 +0800 Subject: [PATCH 083/450] [kuwo] Merge KuwoSingerMusicIE into KuwoSingerIE --- youtube_dl/extractor/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 82cd85c44..a348b3077 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -262,7 +262,6 @@ from .kuwo import ( KuwoAlbumIE, KuwoChartIE, KuwoSingerIE, - KuwoSingerMusicIE, KuwoCategoryIE, KuwoMvIE, ) From 1633491bff3e393d7d095b7303f954dacce4f4da Mon Sep 17 00:00:00 2001 From: ping Date: Fri, 10 Jul 2015 15:19:07 +0800 Subject: [PATCH 084/450] [kuwo] Merge KuwoSingerMusicIE into KuwoSingerIE (missed kuwo.py) --- youtube_dl/extractor/kuwo.py | 53 ++++++++++++------------------------ 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 6a96a1aa4..82d5f3f95 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -180,15 +180,22 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/$' - _TEST = { + _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+)' + _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { 'id': 'bruno+mars', 'title': 'Bruno Mars', }, 'playlist_count': 10, - } + }, { + 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', + 'info_dict': { + 'id': 'Ali', + 'title': 'Ali', + }, + 'playlist_mincount': 95, + }] def _real_extract(self, 
url): singer_id = self._match_id(url) @@ -197,54 +204,28 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'姓名:(.+?)', webpage, 'singer name') + r'
[\n\s\t]*?

(.+?).+?', - webpage, flags=re.DOTALL) - ] - return self.playlist_result(entries, singer_id, singer_name) - - -class KuwoSingerMusicIE(InfoExtractor): - IE_NAME = 'kuwo:singermusic' - _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+?)/music(_[0-9]+)?.htm' - _TEST = { - 'url': 'http://www.kuwo.cn/mingxing/Ali/music.htm', - 'info_dict': { - 'id': 'Ali', - 'title': 'Ali的热门歌曲', - }, - 'playlist_mincount': 95, - } - - def _real_extract(self, url): - singer_id = self._match_id(url) - - list_name = None entries = [] + first_page_only = False if re.match(r'.+/music(?:_[0-9]+)?\.htm', url) else True for page_num in itertools.count(1): webpage = self._download_webpage( 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), singer_id, note='Download song list page #%d' % page_num, errnote='Unable to get song list page #%d' % page_num) - if list_name is None: - list_name = self._html_search_regex( - r'

([^<>]+)', webpage, 'list name') - entries.extend([ self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) for song_id in re.findall( r'

下一页', webpage): + ][:10 if first_page_only else None]) + + if first_page_only or not re.search(r'下一页', webpage): break - return self.playlist_result(entries, singer_id, list_name) + return self.playlist_result(entries, singer_id, singer_name) class KuwoCategoryIE(InfoExtractor): From 15830339eff9076b1c4ca2cb6322fc1ec55233a8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 18:30:23 +0800 Subject: [PATCH 085/450] [neteasemusic] PEP8 --- youtube_dl/extractor/neteasemusic.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 419f3ecc9..e73c77f89 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -52,7 +52,7 @@ class NetEaseMusicBaseIE(InfoExtractor): @classmethod def convert_milliseconds(cls, ms): - return int(round(ms/1000.0)) + return int(round(ms / 1000.0)) def query_api(self, endpoint, video_id, note): req = compat_urllib_request.Request('%s%s' % (self._API_BASE, endpoint)) @@ -146,7 +146,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): self._sort_formats(formats) lyrics_info = self.query_api( - 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, + 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, song_id, 'Downloading lyrics data') lyrics = self._process_lyrics(lyrics_info) @@ -183,7 +183,7 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): album_id = self._match_id(url) info = self.query_api( - 'album/%s?id=%s' % (album_id, album_id), + 'album/%s?id=%s' % (album_id, album_id), album_id, 'Downloading album data')['album'] name = info['name'] @@ -221,7 +221,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): singer_id = self._match_id(url) info = self.query_api( - 'artist/%s?id=%s' % (singer_id, singer_id), + 'artist/%s?id=%s' % (singer_id, singer_id), singer_id, 'Downloading singer data') name = info['artist']['name'] @@ -264,7 +264,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): list_id = 
self._match_id(url) info = self.query_api( - 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, + 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, list_id, 'Downloading playlist data')['result'] name = info['name'] @@ -378,7 +378,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): self.to_screen( 'Downloading just the main audio %s because of --no-playlist' % info['mainSong']['id']) - + formats = self.extract_formats(info['mainSong']) self._sort_formats(formats) @@ -429,7 +429,7 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): entries = [] for offset in itertools.count(start=0, step=self._PAGE_SIZE): info = self.query_api( - 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' + 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' % (self._PAGE_SIZE, dj_id, offset), dj_id, 'Downloading dj programs - %d' % offset) From 397a8ea96e228f918f127313f837f7de7dd78aed Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 18:43:38 +0800 Subject: [PATCH 086/450] [neteasemusic] Encoding fixes for Python 2.6 and 3.x --- youtube_dl/extractor/neteasemusic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index e73c77f89..9e99ddbda 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -11,6 +11,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, + compat_str, ) @@ -21,14 +22,14 @@ class NetEaseMusicBaseIE(InfoExtractor): @classmethod def _encrypt(cls, dfsid): - salt_bytes = bytearray(cls._NETEASE_SALT, 'utf-8') - string_bytes = bytearray(str(dfsid)) + salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) + string_bytes = bytearray(compat_str(dfsid).encode('ascii')) salt_len = len(salt_bytes) for i in range(len(string_bytes)): string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] m = md5() - m.update(string_bytes) - result = b64encode(m.digest()) 
+ m.update(bytes(string_bytes)) + result = b64encode(m.digest()).decode('ascii') return result.replace('/', '_').replace('+', '-') @classmethod From a0e060ac1e9871974c02fe7cb3e812a3cbe79e73 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 18:58:12 +0800 Subject: [PATCH 087/450] [compat] Add compat_itertools_count 'step' parameter is added in Python 2.7 --- youtube_dl/compat.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f9529210d..c3783337a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -9,6 +9,7 @@ import shutil import socket import subprocess import sys +import itertools try: @@ -388,6 +389,15 @@ else: pass return _terminal_size(columns, lines) +try: + itertools.count(start=0, step=1) + compat_itertools_count = itertools.count +except TypeError: # Python 2.6 + def compat_itertools_count(start=0, step=1): + n = start + while True: + yield n + n += step __all__ = [ 'compat_HTTPError', @@ -401,6 +411,7 @@ __all__ = [ 'compat_html_entities', 'compat_http_client', 'compat_http_server', + 'compat_itertools_count', 'compat_kwargs', 'compat_ord', 'compat_parse_qs', From 4eab60cbd27ea8ce31ae9c6bf6b3f6e0e469d3af Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 18:59:12 +0800 Subject: [PATCH 088/450] [netease:djradio] Use compat_itertools_count --- youtube_dl/extractor/neteasemusic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 9e99ddbda..bdfe7e63f 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals from hashlib import md5 from base64 import b64encode from datetime import datetime -import itertools import re from .common import InfoExtractor @@ -12,6 +11,7 @@ from ..compat import ( compat_urllib_request, compat_urllib_parse, compat_str, + 
compat_itertools_count, ) @@ -428,7 +428,7 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): name = None desc = None entries = [] - for offset in itertools.count(start=0, step=self._PAGE_SIZE): + for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE): info = self.query_api( 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' % (self._PAGE_SIZE, dj_id, offset), From a34af8d0667d8f4ceba3380f808a6d563ca01d77 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 19:13:52 +0800 Subject: [PATCH 089/450] [kuwo] PEP8 --- youtube_dl/extractor/kuwo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 82d5f3f95..9c62191b5 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -249,10 +249,10 @@ class KuwoCategoryIE(InfoExtractor): category_name = self._html_search_regex( r'

[^<>]+?

', webpage, 'category name') - + category_desc = re.sub( r'^.+简介:', '', get_element_by_id("intro", webpage).strip()) - + jsonm = self._parse_json(self._html_search_regex( r'var jsonm = (\{.+?\});', webpage, 'category songs'), category_id) From 9296e92e1cbb5788c18c9a09590ccbc0e3faec68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 20:55:53 +0600 Subject: [PATCH 090/450] [twitch] Fix login (Closes #6186) --- youtube_dl/extractor/twitch.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index b56ee2959..b7a72a7bd 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -22,8 +22,8 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'http://usher.twitch.tv' - _LOGIN_URL = 'https://secure.twitch.tv/user/login' - _LOGIN_POST_URL = 'https://secure-login.twitch.tv/login' + _LOGIN_URL = 'https://secure.twitch.tv/login' + _LOGIN_POST_URL = 'https://passport.twitch.tv/authorize' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -59,20 +59,14 @@ class TwitchBaseIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') - authenticity_token = self._search_regex( - r' Date: Fri, 10 Jul 2015 21:15:09 +0600 Subject: [PATCH 091/450] [twitch] Fix error message regex --- youtube_dl/extractor/twitch.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index b7a72a7bd..f912d3825 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -74,11 +74,12 @@ class TwitchBaseIE(InfoExtractor): response = self._download_webpage( request, None, 'Logging in as %s' % username) - m = re.search( - r"id=([\"'])login_error_message\1[^>]*>(?P[^<]+)", response) - if m: + error_message = self._search_regex( + 
r']+class="subwindow_notice"[^>]*>([^<]+)

', + response, 'error message', default=None) + if error_message: raise ExtractorError( - 'Unable to login: %s' % m.group('msg').strip(), expected=True) + 'Unable to login. Twitch said: %s' % error_message, expected=True) def _prefer_source(self, formats): try: From 17b41a3337b5326fd2c8dec8aa1ab285225e1daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:16:42 +0600 Subject: [PATCH 092/450] [twitch] Show reset password request --- youtube_dl/extractor/twitch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index f912d3825..df809ecfe 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -81,6 +81,9 @@ class TwitchBaseIE(InfoExtractor): raise ExtractorError( 'Unable to login. Twitch said: %s' % error_message, expected=True) + if '>Reset your password<' in response: + self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit') + def _prefer_source(self, formats): try: source = next(f for f in formats if f['format_id'] == 'Source') From a31e3e7dcb9d0471d90ec8562934a144d25d7132 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:23:07 +0800 Subject: [PATCH 093/450] [kuwo] Regular expression improvements 1. Prevent .+ and .* 2. Use [^>]+ instead of spaces for HTML tags 3. Remove unnecessary trailing parts --- youtube_dl/extractor/kuwo.py | 47 +++++++++++++++------------- youtube_dl/extractor/neteasemusic.py | 2 +- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 9c62191b5..1095a26e2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -9,6 +9,7 @@ from ..utils import ( get_element_by_id, clean_html, ExtractorError, + remove_start, ) @@ -73,10 +74,10 @@ class KuwoIE(InfoExtractor): errnote='Unable to get song detail info') song_name = self._html_search_regex( - r'

', webpage, 'song name') + r']+title="([^"]+)">', webpage, 'song name') singer_name = self._html_search_regex( - r'
.+?title="(.+?)".+?
', webpage, 'singer name', - flags=re.DOTALL, default=None) + r']+class="s_img">\s*]+title="([^>]+)"', + webpage, 'singer name', default=None) lrc_content = clean_html(get_element_by_id("lrcContent", webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -84,7 +85,7 @@ class KuwoIE(InfoExtractor): formats = self._get_formats(song_id) album_id = self._html_search_regex( - r'

.+?]+class="album"[^<]+]+href="http://www\.kuwo\.cn/album/(\d+)/"', webpage, 'album id', default=None, fatal=False) publish_time = None @@ -131,15 +132,16 @@ class KuwoAlbumIE(InfoExtractor): errnote='Unable to get album info') album_name = self._html_search_regex( - r'

', webpage, - 'album name', flags=re.DOTALL) - album_intro = clean_html( - re.sub(r'^.+简介:', '', get_element_by_id("intro", webpage).strip())) + r']+class="comm"[^<]+]+title="([^"]+)"', webpage, + 'album name') + album_intro = remove_start( + clean_html(get_element_by_id("intro", webpage)), + '%s简介:' % album_name) entries = [ self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) for song_id in re.findall( - r'

', + r']+class="listen">]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) ] return self.playlist_result(entries, album_id, album_name, album_intro) @@ -147,7 +149,7 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' - _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P.+?).htm' + _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', 'info_dict': { @@ -165,15 +167,15 @@ class KuwoChartIE(InfoExtractor): errnote='Unable to get chart info') chart_name = self._html_search_regex( - r'

(.+?)

', webpage, 'chart name') + r']+class="unDis">([^<]+)

', webpage, 'chart name') chart_desc = self._html_search_regex( - r'

([0-9]{4}第[0-9]{2}期)

', webpage, 'chart desc') + r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') entries = [ self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) for song_id in re.findall( - r'.+?', webpage) + r']+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) ] return self.playlist_result(entries, chart_id, chart_name, chart_desc) @@ -204,11 +206,11 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get singer info') singer_name = self._html_search_regex( - r'
[\n\s\t]*?

(.+?)\s*

([^<]+)]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/', webpage) ][:10 if first_page_only else None]) - if first_page_only or not re.search(r'下一页', webpage): + if first_page_only or not re.search(r']+href="[^"]+">下一页', webpage): break return self.playlist_result(entries, singer_id, singer_name) @@ -248,13 +250,14 @@ class KuwoCategoryIE(InfoExtractor): errnote='Unable to get category info') category_name = self._html_search_regex( - r'

[^<>]+?

', webpage, 'category name') + r']+title="([^<>]+?)">[^<>]+?

', webpage, 'category name') - category_desc = re.sub( - r'^.+简介:', '', get_element_by_id("intro", webpage).strip()) + category_desc = remove_start( + get_element_by_id("intro", webpage).strip(), + '%s简介:' % category_name) jsonm = self._parse_json(self._html_search_regex( - r'var jsonm = (\{.+?\});', webpage, 'category songs'), category_id) + r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) entries = [ self.url_result( @@ -289,7 +292,7 @@ class KuwoMvIE(KuwoIE): errnote='Unable to get mv detail info: %s' % song_id) mobj = re.search( - r'

[^<>]+[^<>]+

', + r']+title="(?P[^"]+)">[^<]+]+title="(?P[^"]+)"', webpage) if mobj: song_name = mobj.group('song') diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index bdfe7e63f..ee52efaee 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -229,7 +229,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): if info['artist']['trans']: name = '%s - %s' % (name, info['artist']['trans']) if info['artist']['alias']: - name = '%s - %s' % (name, ";".join(info['artist']['alias'])) + name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) entries = [ self.url_result('http://music.163.com/#/song?id=%s' % song['id'], From c0bf5e1c4dc0a236402c946b0b1fd358f9e39a74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:40:21 +0600 Subject: [PATCH 094/450] [twitch] Fix non-ASCII logins/passwords on python 2 --- youtube_dl/extractor/twitch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index df809ecfe..49535cd80 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -64,8 +64,8 @@ class TwitchBaseIE(InfoExtractor): login_page)) login_form.update({ - 'login': username, - 'password': password, + 'login': username.encode('utf-8'), + 'password': password.encode('utf-8'), }) request = compat_urllib_request.Request( From a9684c0dbf3879478fd223ce7594d58be7dffa4f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:46:44 +0800 Subject: [PATCH 095/450] [kuwo] Add KuwoBaseIE --- youtube_dl/extractor/kuwo.py | 63 +++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 1095a26e2..928f7f62d 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -13,32 +13,7 @@ from ..utils import ( ) -class KuwoIE(InfoExtractor): - 
IE_NAME = 'kuwo:song' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' - _TESTS = [{ - 'url': 'http://www.kuwo.cn/yinyue/635632/', - 'info_dict': { - 'id': '635632', - 'ext': 'ape', - 'title': '爱我别走', - 'creator': '张震岳', - 'upload_date': '20080122', - 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' - }, - }, { - 'url': 'http://www.kuwo.cn/yinyue/6446136/', - 'info_dict': { - 'id': '6446136', - 'ext': 'mp3', - 'title': '心', - 'creator': 'IU', - 'upload_date': '20150518', - }, - 'params': { - 'format': 'mp3-320' - }, - }] +class KuwoBaseIE(InfoExtractor): _FORMATS = [ {'format': 'ape', 'ext': 'ape', 'preference': 100}, {'format': 'mp3-320', 'ext': 'mp3', 'br': '320kmp3', 'abr': 320, 'preference': 80}, @@ -67,6 +42,34 @@ class KuwoIE(InfoExtractor): self._sort_formats(formats) return formats + +class KuwoIE(KuwoBaseIE): + IE_NAME = 'kuwo:song' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' + _TESTS = [{ + 'url': 'http://www.kuwo.cn/yinyue/635632/', + 'info_dict': { + 'id': '635632', + 'ext': 'ape', + 'title': '爱我别走', + 'creator': '张震岳', + 'upload_date': '20080122', + 'description': 'md5:ed13f58e3c3bf3f7fd9fbc4e5a7aa75c' + }, + }, { + 'url': 'http://www.kuwo.cn/yinyue/6446136/', + 'info_dict': { + 'id': '6446136', + 'ext': 'mp3', + 'title': '心', + 'creator': 'IU', + 'upload_date': '20150518', + }, + 'params': { + 'format': 'mp3-320' + }, + }] + def _real_extract(self, url): song_id = self._match_id(url) webpage = self._download_webpage( @@ -268,10 +271,10 @@ class KuwoCategoryIE(InfoExtractor): return self.playlist_result(entries, category_id, category_name, category_desc) -class KuwoMvIE(KuwoIE): +class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P[0-9]+?)/' - _TESTS = [{ + _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', @@ -279,8 +282,8 @@ class KuwoMvIE(KuwoIE): 'title': '我们家MV', 'creator': '2PM', }, - }] - _FORMATS = KuwoIE._FORMATS + [ + } + _FORMATS = 
KuwoBaseIE._FORMATS + [ {'format': 'mkv', 'ext': 'mkv', 'preference': 250}, {'format': 'mp4', 'ext': 'mp4', 'preference': 200}, ] From cf2c5fda4f3e753cc64098e6a751cf1a220efae7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 10 Jul 2015 23:48:48 +0800 Subject: [PATCH 096/450] [kuwo] Use single quotes --- youtube_dl/extractor/kuwo.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 928f7f62d..2b5321cc2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -27,9 +27,9 @@ class KuwoBaseIE(InfoExtractor): formats = [] for file_format in self._FORMATS: song_url = self._download_webpage( - "http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url" % + 'http://antiserver.kuwo.cn/anti.s?format=%s&br=%s&rid=MUSIC_%s&type=convert_url&response=url' % (file_format['ext'], file_format.get('br', ''), song_id), - song_id, note="Download %s url info" % file_format["format"], + song_id, note='Download %s url info' % file_format['format'], ) if song_url.startswith('http://') or song_url.startswith('https://'): formats.append({ @@ -81,7 +81,7 @@ class KuwoIE(KuwoBaseIE): singer_name = self._html_search_regex( r']+class="s_img">\s*]+title="([^>]+)"', webpage, 'singer name', default=None) - lrc_content = clean_html(get_element_by_id("lrcContent", webpage)) + lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -94,7 +94,7 @@ class KuwoIE(KuwoBaseIE): publish_time = None if album_id is not None: album_info_page = self._download_webpage( - "http://www.kuwo.cn/album/%s/" % album_id, song_id, + 'http://www.kuwo.cn/album/%s/' % album_id, song_id, note='Download album detail info', errnote='Unable to get album detail info') @@ -138,11 +138,11 @@ class KuwoAlbumIE(InfoExtractor): r']+class="comm"[^<]+]+title="([^"]+)"', webpage, 'album 
name') album_intro = remove_start( - clean_html(get_element_by_id("intro", webpage)), + clean_html(get_element_by_id('intro', webpage)), '%s简介:' % album_name) entries = [ - self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) for song_id in re.findall( r']+class="listen">]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) @@ -176,7 +176,7 @@ class KuwoChartIE(InfoExtractor): r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') entries = [ - self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) for song_id in re.findall( r']+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) ] @@ -221,7 +221,7 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get song list page #%d' % page_num) entries.extend([ - self.url_result("http://www.kuwo.cn/yinyue/%s/" % song_id, 'Kuwo', song_id) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) for song_id in re.findall( r']+class="m_name">]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/', webpage) @@ -256,7 +256,7 @@ class KuwoCategoryIE(InfoExtractor): r']+title="([^<>]+?)">[^<>]+?

', webpage, 'category name') category_desc = remove_start( - get_element_by_id("intro", webpage).strip(), + get_element_by_id('intro', webpage).strip(), '%s简介:' % category_name) jsonm = self._parse_json(self._html_search_regex( @@ -264,7 +264,7 @@ class KuwoCategoryIE(InfoExtractor): entries = [ self.url_result( - "http://www.kuwo.cn/yinyue/%s/" % song['musicrid'], + 'http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo', song['musicrid']) for song in jsonm['musiclist'] ] @@ -301,7 +301,7 @@ class KuwoMvIE(KuwoBaseIE): song_name = mobj.group('song') singer_name = mobj.group('singer') else: - raise ExtractorError("Unable to find song or singer names") + raise ExtractorError('Unable to find song or singer names') formats = self._get_formats(song_id) From 27713812a093ddfb329cd9cead878f3d0bf629cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:49:09 +0600 Subject: [PATCH 097/450] [extractor/common] Add method for extracting form hidden input fields as dict --- youtube_dl/extractor/common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d859aea52..82f5de2d8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -705,6 +705,12 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') + @staticmethod + def _form_hidden_inputs(html): + return dict(re.findall( + r' Date: Fri, 10 Jul 2015 21:49:39 +0600 Subject: [PATCH 098/450] [twitch] Use `_form_hidden_inputs` when logging in --- youtube_dl/extractor/twitch.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 49535cd80..af2b798fb 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -59,9 +59,7 @@ class TwitchBaseIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading 
login page') - login_form = dict(re.findall( - r' Date: Fri, 10 Jul 2015 21:52:03 +0600 Subject: [PATCH 099/450] [vk] Use `_form_hidden_inputs` when logging in --- youtube_dl/extractor/vk.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 23d153031..c0292095b 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -154,9 +154,7 @@ class VKIE(InfoExtractor): login_page = self._download_webpage( 'https://vk.com', None, 'Downloading login page') - login_form = dict(re.findall( - r' Date: Fri, 10 Jul 2015 21:53:38 +0600 Subject: [PATCH 100/450] [vodlocker] Use `_form_hidden_inputs` --- youtube_dl/extractor/vodlocker.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 1c0966a79..431f4e2e3 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -28,12 +28,7 @@ class VodlockerIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - fields = dict(re.findall(r'''(?x) Date: Fri, 10 Jul 2015 23:53:48 +0800 Subject: [PATCH 101/450] [kuwo] Simpler calls to url_result() --- youtube_dl/extractor/kuwo.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 2b5321cc2..69afacac9 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -142,9 +142,8 @@ class KuwoAlbumIE(InfoExtractor): '%s简介:' % album_name) entries = [ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) - for song_id in re.findall( - r']+class="listen">]+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r']+class="listen">]+href="(http://www\.kuwo\.cn/yinyue/\d+/)"', webpage) ] return self.playlist_result(entries, album_id, album_name, 
album_intro) @@ -176,9 +175,8 @@ class KuwoChartIE(InfoExtractor): r']+class="tabDef">(\d{4}第\d{2}期)

', webpage, 'chart desc') entries = [ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) - for song_id in re.findall( - r']+href="http://www\.kuwo\.cn/yinyue/(\d+)/"', webpage) + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r']+href="(http://www\.kuwo\.cn/yinyue/\d+)/"', webpage) ] return self.playlist_result(entries, chart_id, chart_name, chart_desc) @@ -221,9 +219,8 @@ class KuwoSingerIE(InfoExtractor): errnote='Unable to get song list page #%d' % page_num) entries.extend([ - self.url_result('http://www.kuwo.cn/yinyue/%s/' % song_id, 'Kuwo', song_id) - for song_id in re.findall( - r']+class="m_name">]+href="http://www\.kuwo\.cn/yinyue/([0-9]+)/', + self.url_result(song_url, 'Kuwo') for song_url in re.findall( + r']+class="m_name">]+href="(http://www\.kuwo\.cn/yinyue/\d+)/', webpage) ][:10 if first_page_only else None]) @@ -263,9 +260,7 @@ class KuwoCategoryIE(InfoExtractor): r'var\s+jsonm\s*=\s*([^;]+);', webpage, 'category songs'), category_id) entries = [ - self.url_result( - 'http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], - 'Kuwo', song['musicrid']) + self.url_result('http://www.kuwo.cn/yinyue/%s/' % song['musicrid'], 'Kuwo') for song in jsonm['musiclist'] ] return self.playlist_result(entries, category_id, category_name, category_desc) From de195c23a6837d1526198320561feba74886b9df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:54:39 +0600 Subject: [PATCH 102/450] [vimeo] Use `_form_hidden_inputs` --- youtube_dl/extractor/vimeo.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cae90205d..d63c03183 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -452,11 +452,7 @@ class VimeoChannelIE(InfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This album is protected by a password, use 
the --video-password option', expected=True) - fields = dict(re.findall(r'''(?x) Date: Fri, 10 Jul 2015 21:55:22 +0600 Subject: [PATCH 103/450] [shared] Use `_form_hidden_inputs` --- youtube_dl/extractor/shared.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 9f3e944e7..7fb68bc2d 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -35,8 +35,7 @@ class SharedIE(InfoExtractor): raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) - download_form = dict(re.findall( - r' Date: Fri, 10 Jul 2015 21:56:14 +0600 Subject: [PATCH 104/450] [promptfile] Use `_form_hidden_inputs` --- youtube_dl/extractor/promptfile.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index f536e6e6c..81a63c7fc 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -35,10 +35,7 @@ class PromptFileIE(InfoExtractor): raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = dict(re.findall(r'''(?x)type="hidden"\s+ - name="(.+?)"\s+ - value="(.*?)" - ''', webpage)) + fields = self._form_hidden_inputs(webpage) post = compat_urllib_parse.urlencode(fields) req = compat_urllib_request.Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') From 46f0f500163f64ed8276466667d6733e036bfa2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:56:41 +0600 Subject: [PATCH 105/450] [primesharetv] Use `_form_hidden_inputs` --- youtube_dl/extractor/primesharetv.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py index 01cc3d9ea..94c9fb2cb 100644 --- a/youtube_dl/extractor/primesharetv.py +++ b/youtube_dl/extractor/primesharetv.py @@ -31,12 +31,7 @@ class 
PrimeShareTVIE(InfoExtractor): if '>File not exist<' in webpage: raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = dict(re.findall(r'''(?x) Date: Fri, 10 Jul 2015 23:56:51 +0800 Subject: [PATCH 106/450] [kuwo] Use \d instead of [0-9] --- youtube_dl/extractor/kuwo.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 69afacac9..18bf66404 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -45,7 +45,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' - _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P[0-9]+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P\d+?)/' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -116,7 +116,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' - _VALID_URL = r'http://www\.kuwo\.cn/album/(?P[0-9]+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/album/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -157,7 +157,7 @@ class KuwoChartIE(InfoExtractor): 'info_dict': { 'id': '香港中文龙虎榜', 'title': '香港中文龙虎榜', - 'description': 're:[0-9]{4}第[0-9]{2}期', + 'description': 're:\d{4}第\d{2}期', }, 'playlist_mincount': 10, } @@ -211,7 +211,7 @@ class KuwoSingerIE(InfoExtractor): ) entries = [] - first_page_only = False if re.search(r'/music(?:_[0-9]+)?\.htm', url) else True + first_page_only = False if re.search(r'/music(?:_\d+)?\.htm', url) else True for page_num in itertools.count(1): webpage = self._download_webpage( 'http://www.kuwo.cn/mingxing/%s/music_%d.htm' % (singer_id, page_num), @@ -232,7 +232,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' - _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P[0-9]+?).htm' + _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P\d+?).htm' _TEST = { 'url': 
'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', 'info_dict': { @@ -268,7 +268,7 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' - _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P[0-9]+?)/' + _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { From 01b89d5682adaecfd319897683b2043cfc7344c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:57:29 +0600 Subject: [PATCH 107/450] [played] Use `_form_hidden_inputs` --- youtube_dl/extractor/played.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 45716c75d..9fe1524f2 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -38,9 +38,7 @@ class PlayedIE(InfoExtractor): if m_error: raise ExtractorError(m_error.group('msg'), expected=True) - fields = re.findall( - r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) - data = dict(fields) + data = self._form_hidden_inputs(orig_webpage) self._sleep(2, video_id) From 8fa7e5817a37756e4135fcb9daba47fc543be36a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 10 Jul 2015 21:58:24 +0600 Subject: [PATCH 108/450] [hostingbulk] Use `_form_hidden_inputs` --- youtube_dl/extractor/hostingbulk.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py index 704d0285d..63f579592 100644 --- a/youtube_dl/extractor/hostingbulk.py +++ b/youtube_dl/extractor/hostingbulk.py @@ -58,11 +58,7 @@ class HostingBulkIE(InfoExtractor): r' Date: Fri, 10 Jul 2015 21:58:49 +0600 Subject: [PATCH 109/450] [gorillavid] Use `_form_hidden_inputs` --- youtube_dl/extractor/gorillavid.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index 
6147596e4..aabf07a20 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -78,12 +78,7 @@ class GorillaVidIE(InfoExtractor): if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = dict(re.findall(r'''(?x) Date: Sat, 11 Jul 2015 00:03:49 +0800 Subject: [PATCH 110/450] [kuwo:song] Give warnings for unavailable optional fields --- youtube_dl/extractor/kuwo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 18bf66404..a021f3cdf 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -80,7 +80,7 @@ class KuwoIE(KuwoBaseIE): r']+title="([^"]+)">', webpage, 'song name') singer_name = self._html_search_regex( r']+class="s_img">\s*]+title="([^>]+)"', - webpage, 'singer name', default=None) + webpage, 'singer name', fatal=False) lrc_content = clean_html(get_element_by_id('lrcContent', webpage)) if lrc_content == '暂无': # indicates no lyrics lrc_content = None @@ -89,7 +89,7 @@ class KuwoIE(KuwoBaseIE): album_id = self._html_search_regex( r']+class="album"[^<]+]+href="http://www\.kuwo\.cn/album/(\d+)/"', - webpage, 'album id', default=None, fatal=False) + webpage, 'album id', fatal=False) publish_time = None if album_id is not None: @@ -100,7 +100,7 @@ class KuwoIE(KuwoBaseIE): publish_time = self._html_search_regex( r'发行时间:(\d{4}-\d{2}-\d{2})', album_info_page, - 'publish time', default=None) + 'publish time', fatal=False) if publish_time: publish_time = publish_time.replace('-', '') From 446e7645001d276e6036b9a81e7996edc93b360b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:49:59 +0800 Subject: [PATCH 111/450] [baidu] Add localized name --- youtube_dl/extractor/baidu.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index 906895c1e..e37ee4440 
100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -8,6 +8,7 @@ from ..compat import compat_urlparse class BaiduVideoIE(InfoExtractor): + IE_DESC = '百度视频' _VALID_URL = r'http://v\.baidu\.com/(?P[a-z]+)/(?P\d+)\.htm' _TESTS = [{ 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', From e1ba152352e95fb4cb6f3a5025166627d9df9787 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:50:31 +0800 Subject: [PATCH 112/450] [ctsnews] Add localized name --- youtube_dl/extractor/ctsnews.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 0226f8036..45049bf37 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -6,6 +6,7 @@ from ..utils import parse_iso8601, ExtractorError class CtsNewsIE(InfoExtractor): + IE_DESC = '華視新聞' # https connection failed (Connection reset) _VALID_URL = r'http://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P\d+)\.html' _TESTS = [{ From 513cbdda935b9f319cb61bffcdb0107249eb6b78 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:50:45 +0800 Subject: [PATCH 113/450] [douyutv] Add localized name --- youtube_dl/extractor/douyutv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 479430c51..373b3b4b4 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -9,6 +9,7 @@ from ..compat import (compat_str, compat_basestring) class DouyuTVIE(InfoExtractor): + IE_DESC = '斗鱼' _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', From 44c514eb9c03ddf495feeaf94495bee39cffd842 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:51:03 +0800 Subject: [PATCH 114/450] [iqiyi] Add localized name --- youtube_dl/extractor/iqiyi.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 9106dd074..0f6707d7c 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -20,6 +20,7 @@ from ..utils import ( class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' + IE_DESC = '爱奇艺' _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' From 0f08d7f851d8e41e09326d5e55dd93038d3ec752 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:51:14 +0800 Subject: [PATCH 115/450] [kuwo] Add localized name --- youtube_dl/extractor/kuwo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index a021f3cdf..43f616a5d 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -45,6 +45,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' + IE_DESC = '酷我音乐' _VALID_URL = r'http://www\.kuwo\.cn/yinyue/(?P\d+?)/' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', From 963d0ce7e342841378945a8cb58d3c30bca02c60 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:51:26 +0800 Subject: [PATCH 116/450] [letv] Add localized name --- youtube_dl/extractor/letv.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index da896caf1..ba2ae8085 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -19,6 +19,7 @@ from ..utils import ( class LetvIE(InfoExtractor): + IE_DESC = '乐视网' _VALID_URL = r'http://www\.letv\.com/ptv/vplay/(?P\d+).html' _TESTS = [{ From 6ce89aecc330fe4b812526da4e119e7587548e27 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:51:35 +0800 Subject: [PATCH 117/450] [neteasemusic] Add localized name --- youtube_dl/extractor/neteasemusic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index ee52efaee..8758a6f0b 100644 --- 
a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -63,6 +63,7 @@ class NetEaseMusicBaseIE(InfoExtractor): class NetEaseMusicIE(NetEaseMusicBaseIE): IE_NAME = 'netease:song' + IE_DESC = '网易云音乐' _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/song?id=32102397', From 9d16788ad928800f4b6c5682dd08d0814a98754b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:51:55 +0800 Subject: [PATCH 118/450] [nextmedia] Add localized name --- youtube_dl/extractor/nextmedia.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index d1b7cff4c..c10784f6b 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -6,6 +6,7 @@ from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): + IE_DESC = '蘋果日報' _VALID_URL = r'http://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P\d+)/(?P\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', @@ -66,6 +67,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): + IE_DESC = '蘋果日報 - 動新聞' _VALID_URL = r'http://hk.dv.nextmedia.com/actionnews/[^/]+/(?P\d+)/(?P\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', @@ -90,6 +92,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): + IE_DESC = '臺灣蘋果日報' _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' 
_TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', From a7ada46bd90666b9c351bb54bd63b7225307afed Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:52:07 +0800 Subject: [PATCH 119/450] [qqmusic] Add localized name --- youtube_dl/extractor/qqmusic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 476432330..4c05fded4 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -16,6 +16,7 @@ from ..compat import compat_urllib_request class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' + IE_DESC = 'QQ音乐' _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', From 9b15be97aa0e094cccaf13e181b51b4399347d4d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:52:19 +0800 Subject: [PATCH 120/450] [udn] Add localized name --- youtube_dl/extractor/udn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/udn.py b/youtube_dl/extractor/udn.py index c08428acf..2151f8338 100644 --- a/youtube_dl/extractor/udn.py +++ b/youtube_dl/extractor/udn.py @@ -11,6 +11,7 @@ from ..compat import compat_urlparse class UDNEmbedIE(InfoExtractor): + IE_DESC = '聯合影音' _VALID_URL = r'https?://video\.udn\.com/(?:embed|play)/news/(?P\d+)' _TESTS = [{ 'url': 'http://video.udn.com/embed/news/300040', From 4fa5f402321c71d6b2d8a281b60b8a23041c0e8f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:52:56 +0800 Subject: [PATCH 121/450] [xuite] Add localized name --- youtube_dl/extractor/xuite.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 81d885fdc..5aac8adb3 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -13,6 +13,7 @@ from ..utils import ( class XuiteIE(InfoExtractor): + IE_DESC = '隨意窩Xuite影音' 
_REGEX_BASE64 = r'(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?' _VALID_URL = r'https?://vlog\.xuite\.net/(?:play|embed)/(?P%s)' % _REGEX_BASE64 _TESTS = [{ From e014ff015d5240bac9371e5ffe5342ce025a574f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:53:08 +0800 Subject: [PATCH 122/450] [yam] Add localized name --- youtube_dl/extractor/yam.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 9d851bae3..001ee17b6 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -14,6 +14,7 @@ from ..utils import ( class YamIE(InfoExtractor): + IE_DESC = '蕃薯藤yam天空部落' _VALID_URL = r'http://mymedia.yam.com/m/(?P\d+)' _TESTS = [{ From b931fbe5aba8e68a822f88d91d4ec424bc6509d3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:57:59 +0800 Subject: [PATCH 123/450] [yinyuetai] Add localized name --- youtube_dl/extractor/yinyuetai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/yinyuetai.py b/youtube_dl/extractor/yinyuetai.py index fa6b40816..834d860af 100644 --- a/youtube_dl/extractor/yinyuetai.py +++ b/youtube_dl/extractor/yinyuetai.py @@ -7,6 +7,7 @@ from ..utils import ExtractorError class YinYueTaiIE(InfoExtractor): IE_NAME = 'yinyuetai:video' + IE_DESC = '音悦Tai' _VALID_URL = r'https?://v\.yinyuetai\.com/video(?:/h5)?/(?P[0-9]+)' _TESTS = [{ 'url': 'http://v.yinyuetai.com/video/2322376', From 246995dbc8ac0eb61355966eba7c7559a9ab7b24 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 00:58:13 +0800 Subject: [PATCH 124/450] [youku] Add localized name --- youtube_dl/extractor/youku.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index ced3a10cd..78caeb8b3 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -15,6 +15,7 @@ from ..compat import ( class YoukuIE(InfoExtractor): IE_NAME = 'youku' + IE_DESC = 
'优酷' _VALID_URL = r'''(?x) (?: http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| From edd66be5be584dfb04d740cdd663fbd6203e3730 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 01:21:04 +0800 Subject: [PATCH 125/450] [kuwo] Add more localized names --- youtube_dl/extractor/kuwo.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 43f616a5d..1077846f2 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -117,6 +117,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' + IE_DESC = '酷我音乐 - 专辑' _VALID_URL = r'http://www\.kuwo\.cn/album/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', @@ -152,6 +153,7 @@ class KuwoAlbumIE(InfoExtractor): class KuwoChartIE(InfoExtractor): IE_NAME = 'kuwo:chart' + IE_DESC = '酷我音乐 - 排行榜' _VALID_URL = r'http://yinyue\.kuwo\.cn/billboard_(?P[^.]+).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/billboard_香港中文龙虎榜.htm', @@ -184,6 +186,7 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' + IE_DESC = '酷我音乐 - 歌手' _VALID_URL = r'http://www\.kuwo\.cn/mingxing/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', @@ -233,6 +236,7 @@ class KuwoSingerIE(InfoExtractor): class KuwoCategoryIE(InfoExtractor): IE_NAME = 'kuwo:category' + IE_DESC = '酷我音乐 - 分类' _VALID_URL = r'http://yinyue\.kuwo\.cn/yy/cinfo_(?P\d+?).htm' _TEST = { 'url': 'http://yinyue.kuwo.cn/yy/cinfo_86375.htm', @@ -269,6 +273,7 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' + IE_DESC = '酷我音乐 - MV' _VALID_URL = r'http://www\.kuwo\.cn/mv/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', From ed848087d53d09797eec932e84c8c36c010d04a6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 01:21:18 +0800 Subject: [PATCH 126/450] [neteasemusic] Add more localized names --- 
youtube_dl/extractor/neteasemusic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/neteasemusic.py b/youtube_dl/extractor/neteasemusic.py index 8758a6f0b..a8e0a64ed 100644 --- a/youtube_dl/extractor/neteasemusic.py +++ b/youtube_dl/extractor/neteasemusic.py @@ -171,6 +171,7 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): IE_NAME = 'netease:album' + IE_DESC = '网易云音乐 - 专辑' _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P[0-9]+)' _TEST = { 'url': 'http://music.163.com/#/album?id=220780', @@ -200,6 +201,7 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): class NetEaseMusicSingerIE(NetEaseMusicBaseIE): IE_NAME = 'netease:singer' + IE_DESC = '网易云音乐 - 歌手' _VALID_URL = r'https?://music\.163\.com/(#/)?artist\?id=(?P[0-9]+)' _TESTS = [{ 'note': 'Singer has aliases.', @@ -242,6 +244,7 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): class NetEaseMusicListIE(NetEaseMusicBaseIE): IE_NAME = 'netease:playlist' + IE_DESC = '网易云音乐 - 歌单' _VALID_URL = r'https?://music\.163\.com/(#/)?(playlist|discover/toplist)\?id=(?P[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/playlist?id=79177352', @@ -287,6 +290,7 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): class NetEaseMusicMvIE(NetEaseMusicBaseIE): IE_NAME = 'netease:mv' + IE_DESC = '网易云音乐 - MV' _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P[0-9]+)' _TEST = { 'url': 'http://music.163.com/#/mv?id=415350', @@ -327,6 +331,7 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): class NetEaseMusicProgramIE(NetEaseMusicBaseIE): IE_NAME = 'netease:program' + IE_DESC = '网易云音乐 - 电台节目' _VALID_URL = r'https?://music\.163\.com/(#/?)program\?id=(?P[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/program?id=10109055', @@ -411,6 +416,7 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): IE_NAME = 'netease:djradio' + IE_DESC = '网易云音乐 - 电台' _VALID_URL = r'https?://music\.163\.com/(#/)?djradio\?id=(?P[0-9]+)' 
_TEST = { 'url': 'http://music.163.com/#/djradio?id=42', From 181c4ccaaa186685d4a5fce6f338486182079b9e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 11 Jul 2015 01:21:36 +0800 Subject: [PATCH 127/450] [qqmusic] Add more localized names --- youtube_dl/extractor/qqmusic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 4c05fded4..1654a641f 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -143,6 +143,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' + IE_DESC = 'QQ音乐 - 歌手' _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', @@ -187,6 +188,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' + IE_DESC = 'QQ音乐 - 专辑' _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P[0-9A-Za-z]+)' _TESTS = [{ @@ -229,6 +231,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:toplist' + IE_DESC = 'QQ音乐 - 排行榜' _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P(top|global)_[0-9]+)' _TESTS = [{ @@ -282,6 +285,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): class QQMusicPlaylistIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:playlist' + IE_DESC = 'QQ音乐 - 歌单' _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P[0-9]+)' _TEST = { From 86f2541695ba0280982e47e52c5cf26946d5d7c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 10 Jul 2015 22:22:33 +0200 Subject: [PATCH 128/450] Remove unused 're' imports --- youtube_dl/extractor/primesharetv.py | 2 -- youtube_dl/extractor/shared.py | 1 - youtube_dl/extractor/vodlocker.py | 2 -- 3 files changed, 5 deletions(-) diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py index 94c9fb2cb..9aa0c862a 100644 --- 
a/youtube_dl/extractor/primesharetv.py +++ b/youtube_dl/extractor/primesharetv.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_urllib_parse, diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 7fb68bc2d..6e2b94e7d 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import re import base64 from .common import InfoExtractor diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 431f4e2e3..4804692bf 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -1,8 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_urllib_parse, From 0a31a350981931ab9403d58258f5058ed98142a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 10 Jul 2015 22:46:25 +0200 Subject: [PATCH 129/450] [YoutubeDL] format spec: add additional checks for invalid syntax --- test/test_YoutubeDL.py | 10 ++++++++++ youtube_dl/YoutubeDL.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index bf2baae07..20f45f439 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -302,6 +302,16 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1['format_id']) + def test_invalid_format_specs(self): + def assert_syntax_error(format_spec): + ydl = YDL({'format': format_spec}) + info_dict = _make_result([{'format_id': 'foo', 'url': TEST_URL}]) + self.assertRaises(SyntaxError, ydl.process_ie_result, info_dict) + + assert_syntax_error('bestvideo,,best') + assert_syntax_error('+bestaudio') + assert_syntax_error('bestvideo+') + def test_format_filtering(self): formats = 
[ {'format_id': 'A', 'filesize': 500, 'width': 1000}, diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6478d05dc..da7c51008 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -953,6 +953,8 @@ class YoutubeDL(object): tokens.restore_last_token() break elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None elif string == '/': @@ -972,6 +974,8 @@ class YoutubeDL(object): elif string == '+': video_selector = current_selector audio_selector = _parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) else: raise syntax_error('Operator not recognized: "{0}"'.format(string), start) From dc48695ab9c84cda56ee723aee11e5da44c2258a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 10 Jul 2015 22:59:45 +0200 Subject: [PATCH 130/450] Document how to group format selectors --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e3452c9e1..4e6cb3fc7 100644 --- a/README.md +++ b/README.md @@ -268,7 +268,7 @@ youtube-dl_test_video_.mp4 # A simple file name By default youtube-dl tries to download the best quality, but sometimes you may want to download other format. The simplest case is requesting a specific format, for example `-f 22`. You can get the list of available formats using `--list-formats`, you can also use a file extension (currently it supports aac, m4a, mp3, mp4, ogg, wav, webm) or the special names `best`, `bestvideo`, `bestaudio` and `worst`. -If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. 
You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. +If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. 
Format selectors can also be grouped using parentheses, for example if you want to download the best mp4 and webm formats with a height lower than 480 you can use `-f '(mp4,webm)[height<480]'`. Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. 
From fac54cb426d85572c6928ce0d9e5bf1efb459548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 04:43:29 +0600 Subject: [PATCH 131/450] [webofstories:playlist] Improve and add test --- youtube_dl/extractor/webofstories.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/webofstories.py b/youtube_dl/extractor/webofstories.py index d70e30c00..2037d9b3d 100644 --- a/youtube_dl/extractor/webofstories.py +++ b/youtube_dl/extractor/webofstories.py @@ -104,7 +104,14 @@ class WebOfStoriesIE(InfoExtractor): class WebOfStoriesPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?webofstories\.com/playAll/(?P[^/]+)' - _TESTS = [] + _TEST = { + 'url': 'http://www.webofstories.com/playAll/donald.knuth', + 'info_dict': { + 'id': 'donald.knuth', + 'title': 'Donald Knuth (Scientist)', + }, + 'playlist_mincount': 97, + } def _real_extract(self, url): playlist_id = self._match_id(url) @@ -116,10 +123,19 @@ class WebOfStoriesPlaylistIE(InfoExtractor): for video_number in set(re.findall('href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage)) ] - title = self._html_search_regex( - r'([^<]+)\s*-\s*Web\sof\sStories', webpage, 'title') + title = self._search_regex( + r'
\s*([^<]+)', + webpage, 'speaker', default=None) + if title: + field = self._search_regex( + r'([^<]+)', + webpage, 'field', default=None) + if field: + title += ' (%s)' % field - description = self._html_search_meta( - 'description', webpage, 'description') + if not title: + title = self._search_regex( + r'Play\s+all\s+stories\s*-\s*([^<]+)\s*-\s*Web\s+of\s+Stories', + webpage, 'title') - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result(entries, playlist_id, title) From 79913fde35ead6247d7b764f3a95059fc4f17dbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 21:23:49 +0600 Subject: [PATCH 132/450] [vk] Add list id to info_url --- youtube_dl/extractor/vk.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index c0292095b..333c2a634 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -29,7 +29,7 @@ class VKIE(InfoExtractor): (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| (?:www\.)?biqle\.ru/watch/ ) - (?P[^s].*?)(?:\?|%2F|$) + (?P[^s].*?)(?:\?(?:.*\blist=(?P[\da-f]+))?|%2F|$) ) ''' _NETRC_MACHINE = 'vk' @@ -119,6 +119,20 @@ class VKIE(InfoExtractor): }, 'skip': 'Only works from Russia', }, + { + # video (removed?) only available with list id + 'url': 'https://vk.com/video30481095_171201961?list=8764ae2d21f14088d4', + 'md5': '091287af5402239a1051c37ec7b92913', + 'info_dict': { + 'id': '171201961', + 'ext': 'mp4', + 'title': 'ТюменцевВВ_09.07.2015', + 'uploader': 'Anton Ivanov', + 'duration': 109, + 'upload_date': '20150709', + 'view_count': int, + }, + }, { # youtube embed 'url': 'https://vk.com/video276849682_170681728', @@ -182,6 +196,12 @@ class VKIE(InfoExtractor): video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + + # Some videos (removed?) 
can only be downloaded with list id specified + list_id = mobj.group('list_id') + if list_id: + info_url += '&list=%s' % list_id + info_page = self._download_webpage(info_url, video_id) if re.search(r'/login\.php\?.*\bact=security_check', info_page): From d919fa3344463f94a152a10ba65981cd290d9ec8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 21:26:03 +0600 Subject: [PATCH 133/450] [vk] Handle access denied error --- youtube_dl/extractor/vk.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 333c2a634..8ac3aeac0 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -222,6 +222,9 @@ class VKIE(InfoExtractor): r'Видео временно недоступно': 'Video %s is temporarily unavailable.', + + r'Access denied': + 'Access denied to video %s.', } for error_re, error_msg in ERRORS.items(): From f72b0a603270dff0fdd72dd5218126790232199a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 22:15:16 +0600 Subject: [PATCH 134/450] Revert xvid to avi and make docs to be similar to existing external downloader option --- README.md | 4 ++-- youtube_dl/YoutubeDL.py | 3 ++- youtube_dl/__init__.py | 2 +- youtube_dl/options.py | 6 +++--- youtube_dl/postprocessor/ffmpeg.py | 8 +++----- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index d8926d2b7..9779c2058 100644 --- a/README.md +++ b/README.md @@ -214,8 +214,8 @@ which means you can modify it, redistribute it or use it however you like. 
--audio-format FORMAT Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) - --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid) - --postprocessor-args Extra parameters for video post-processor. + --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi) + --postprocessor-args ARGS Give these arguments to the postprocessor -k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8580f99a7..00af78e06 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -262,7 +262,8 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. - postprocessor_args: Extra parameters for external apps, like avconv. + postprocessor_args: A list of additional command-line arguments for the + postprocessor. 
""" params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index fb31d1569..2d416943f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -169,7 +169,7 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error('invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'xvid']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'avi']: parser.error('invalid video recode format specified') if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index c15dadb21..85365d769 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -691,11 +691,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, - help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid)') + help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|avi)') postproc.add_option( '--postprocessor-args', - dest='postprocessor_args', default=None, metavar='ARGS', - help='Extra parameters for video post-processor.') + dest='postprocessor_args', metavar='ARGS', + help='Give these arguments to the postprocessor') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index d4ba3572b..62d13a567 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -295,12 +295,10 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): def run(self, information): path = information['filepath'] prefix, sep, ext = path.rpartition('.') - ext = self._preferedformat + outpath = prefix + sep + self._preferedformat options = self._extra_cmd_args - if 
self._preferedformat == 'xvid': - ext = 'avi' + if self._preferedformat == 'avi': options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) - outpath = prefix + sep + ext if information['ext'] == self._preferedformat: self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat)) return [], information @@ -308,7 +306,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): self.run_ffmpeg(path, outpath, options) information['filepath'] = outpath information['format'] = self._preferedformat - information['ext'] = ext + information['ext'] = self._preferedformat return [path], information From e35b23f54df83886dd14428b618aa9d0221968ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 22:41:33 +0600 Subject: [PATCH 135/450] [postprocessor/common] Improve postprocessor args fetching and clarify doc --- youtube_dl/postprocessor/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index bee64c457..4191d040b 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -22,14 +22,15 @@ class PostProcessor(object): of the chain is reached. PostProcessor objects follow a "mutual registration" process similar - to InfoExtractor objects. And it can receive parameters from CLI trough - --postprocessor-args. + to InfoExtractor objects. + + Optionally PostProcessor can use a list of additional command-line arguments + with self._configuration_args. 
""" _downloader = None def __init__(self, downloader=None): - self._extra_cmd_args = downloader.params.get('postprocessor_args') self._downloader = downloader def set_downloader(self, downloader): @@ -59,6 +60,13 @@ class PostProcessor(object): except Exception: self._downloader.report_warning(errnote) + def _configuration_args(self, default=[]): + pp_args = self._downloader.params.get('postprocessor_args') + if pp_args is None: + return default + assert isinstance(pp_args, list) + return pp_args + class AudioConversionError(PostProcessingError): pass From 15006fedb9511599c0e948a982fa282cde74f2ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 22:42:03 +0600 Subject: [PATCH 136/450] [postprocessor/ffmpeg] Spread postprocessor args usage on all ffmpeg extractors --- youtube_dl/postprocessor/ffmpeg.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 62d13a567..1ecce22e7 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -131,6 +131,8 @@ class FFmpegPostProcessor(PostProcessor): oldest_mtime = min( os.stat(encodeFilename(path)).st_mtime for path in input_paths) + opts += self._configuration_args() + files_cmd = [] for path in input_paths: files_cmd.extend([encodeArgument('-i'), encodeFilename(path, True)]) @@ -294,14 +296,14 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): def run(self, information): path = information['filepath'] - prefix, sep, ext = path.rpartition('.') - outpath = prefix + sep + self._preferedformat - options = self._extra_cmd_args - if self._preferedformat == 'avi': - options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) if information['ext'] == self._preferedformat: self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat)) return [], information + options = [] + if self._preferedformat 
== 'avi': + options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) + prefix, sep, ext = path.rpartition('.') + outpath = prefix + sep + self._preferedformat self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) self.run_ffmpeg(path, outpath, options) information['filepath'] = outpath From 369e195a44530b43bfd0e95087c9e1a238d91184 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 22:43:02 +0600 Subject: [PATCH 137/450] Handle postprocessor_args similarly to external_downloader_args --- youtube_dl/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2d416943f..55b22c889 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -263,6 +263,9 @@ def _real_main(argv=None): external_downloader_args = None if opts.external_downloader_args: external_downloader_args = shlex.split(opts.external_downloader_args) + postprocessor_args = None + if opts.postprocessor_args: + postprocessor_args = shlex.split(opts.postprocessor_args) match_filter = ( None if opts.match_filter is None else match_filter_func(opts.match_filter)) @@ -354,7 +357,6 @@ def _real_main(argv=None): 'extract_flat': opts.extract_flat, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, - 'postprocessor_args': shlex.split(opts.postprocessor_args or ''), 'fixup': opts.fixup, 'source_address': opts.source_address, 'call_home': opts.call_home, @@ -368,6 +370,7 @@ def _real_main(argv=None): 'ffmpeg_location': opts.ffmpeg_location, 'hls_prefer_native': opts.hls_prefer_native, 'external_downloader_args': external_downloader_args, + 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, } From 41c0d2f8cb22fe34d957bc9b5f9032a9160685ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 11 Jul 2015 23:00:19 
+0600 Subject: [PATCH 138/450] Credit @aurium for avi recode and postprocessort args (#5942) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index d5418dd37..c0201d875 100644 --- a/AUTHORS +++ b/AUTHORS @@ -130,3 +130,4 @@ Peter Ding jackyzy823 George Brighton Remita Amine +Aurélio A. Heckert From ac8f97f2b37115bf81a653360819b5d30eebda18 Mon Sep 17 00:00:00 2001 From: felix Date: Sat, 11 Jul 2015 18:56:07 +0200 Subject: [PATCH 139/450] [rdsca] New extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/rdsca.py | 50 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 youtube_dl/extractor/rdsca.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cbaa07391..73fa0e4ed 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -460,6 +460,7 @@ from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE +from .rdsca import RDScaIE from .redtube import RedTubeIE from .restudy import RestudyIE from .reverbnation import ReverbNationIE diff --git a/youtube_dl/extractor/rdsca.py b/youtube_dl/extractor/rdsca.py new file mode 100644 index 000000000..a8df927e4 --- /dev/null +++ b/youtube_dl/extractor/rdsca.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + url_basename, +) + + +class RDScaIE(InfoExtractor): + IE_NAME = 'RDS.ca' + _VALID_URL = r'http://(?:www\.)?rds\.ca/videos/(?P.*)' + + _TESTS = [{ + 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + 'info_dict': { + "ext": "mp4", + "title": "Fowler Jr. prend la direction de Jacksonville", + "description": "Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. 
", + "timestamp": 1430397346, + } + }] + + def _real_extract(self, url): + video_id = url_basename(url) + + webpage = self._download_webpage(url, video_id) + + title = self._search_regex( + r']*>([^\n]*)', webpage, 'video title', default=None) + video_url = self._search_regex( + r']*>([^\n]*)', webpage, 'description', default=None) + thumbnail = self._search_regex( + r' Date: Sun, 12 Jul 2015 12:53:15 +0600 Subject: [PATCH 140/450] [onionstudios] Fix extraction --- youtube_dl/extractor/onionstudios.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index 8fa507dec..0f1f448fe 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -49,19 +49,21 @@ class OnionStudiosIE(InfoExtractor): self._sort_formats(formats) title = self._search_regex( - r'share_title\s*=\s*"([^"]+)"', webpage, 'title') + r'share_title\s*=\s*(["\'])(?P[^\1]+?)\1', + webpage, 'title', group='title') description = self._search_regex( - r'share_description\s*=\s*"([^"]+)"', webpage, - 'description', default=None) + r'share_description\s*=\s*(["\'])(?P<description>[^\1]+?)\1', + webpage, 'description', default=None, group='description') thumbnail = self._search_regex( - r'poster="([^"]+)"', webpage, 'thumbnail', default=False) + r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1', + webpage, 'thumbnail', default=False, group='thumbnail') uploader_id = self._search_regex( - r'twitter_handle\s*=\s*"([^"]+)"', - webpage, 'uploader id', fatal=False) + r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1', + webpage, 'uploader id', fatal=False, group='uploader_id') uploader = self._search_regex( - r'window\.channelName\s*=\s*"Embedded:([^"]+)"', - webpage, 'uploader', default=False) + r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1', + webpage, 'uploader', default=False, group='uploader') return { 'id': video_id, From 
667170e2c7dee6bcf5e357dd53d461807434c5b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 12 Jul 2015 20:40:00 +0600 Subject: [PATCH 141/450] [nrk:tv] Add support for radio URLs (Closes #6200) --- youtube_dl/extractor/nrk.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 9e4581cf9..eb49f292e 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -116,7 +116,8 @@ class NRKPlaylistIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'(?P<baseurl>https?://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' + IE_DESC = 'NRK TV and NRK Radio' + _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' _TESTS = [ { @@ -188,6 +189,10 @@ class NRKTVIE(InfoExtractor): 'duration': 6947.5199999999995, }, 'skip': 'Only works from Norway', + }, + { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, } ] From c4f1fde75bf69e05be29283c9211089e6ab269ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 12 Jul 2015 20:42:18 +0600 Subject: [PATCH 142/450] [nrk:tv] Add format id prefixes --- youtube_dl/extractor/nrk.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index eb49f292e..d066a96db 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -211,7 +211,8 @@ class NRKTVIE(InfoExtractor): ]} def _extract_f4m(self, manifest_url, video_id): - return self._extract_f4m_formats(manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id) + return self._extract_f4m_formats( + manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, 
f4m_id='hds') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -273,7 +274,7 @@ class NRKTVIE(InfoExtractor): m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) if m3u8_url: - formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4')) + formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls')) self._sort_formats(formats) subtitles_url = self._html_search_regex( From b6ea9ef21ad3a43bddf24f5769bdf21a372381fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 12 Jul 2015 23:35:56 +0600 Subject: [PATCH 143/450] [rds] Improve --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/rds.py | 70 ++++++++++++++++++++++++++++++++ youtube_dl/extractor/rdsca.py | 50 ----------------------- 3 files changed, 71 insertions(+), 51 deletions(-) create mode 100644 youtube_dl/extractor/rds.py delete mode 100644 youtube_dl/extractor/rdsca.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 73fa0e4ed..3f4f23521 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -460,7 +460,7 @@ from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import RaiIE from .rbmaradio import RBMARadioIE -from .rdsca import RDScaIE +from .rds import RDSIE from .redtube import RedTubeIE from .restudy import RestudyIE from .reverbnation import ReverbNationIE diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py new file mode 100644 index 000000000..fdd3f36a6 --- /dev/null +++ b/youtube_dl/extractor/rds.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_duration, + parse_iso8601, +) + + +class RDSIE(InfoExtractor): + IE_DESC = 'RDS.ca' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/videos/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + + _TEST = { + 
'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', + 'info_dict': { + 'id': '3.1132799', + 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', + 'ext': 'mp4', + 'title': 'Fowler Jr. prend la direction de Jacksonville', + 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', + 'timestamp': 1430397346, + 'upload_date': '20150430', + 'duration': 154.354, + 'age_limit': 0, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + # TODO: extract f4m from 9c9media.com + video_url = self._search_regex( + r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', + webpage, 'video url') + + title = self._og_search_title(webpage) or self._html_search_meta( + 'title', webpage, 'title', fatal=True) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') + thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', + r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], + webpage, 'thumbnail', fatal=False) + timestamp = parse_iso8601(self._search_regex( + r'<span[^>]+itemprop="uploadDate"[^>]+content="([^"]+)"', + webpage, 'upload date', fatal=False)) + duration = parse_duration(self._search_regex( + r'<span[^>]+itemprop="duration"[^>]+content="([^"]+)"', + webpage, 'duration', fatal=False)) + age_limit = self._family_friendly_search(webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'age_limit': age_limit, + } diff --git a/youtube_dl/extractor/rdsca.py b/youtube_dl/extractor/rdsca.py deleted file mode 100644 index 
a8df927e4..000000000 --- a/youtube_dl/extractor/rdsca.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - url_basename, -) - - -class RDScaIE(InfoExtractor): - IE_NAME = 'RDS.ca' - _VALID_URL = r'http://(?:www\.)?rds\.ca/videos/(?P<id>.*)' - - _TESTS = [{ - 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', - 'info_dict': { - "ext": "mp4", - "title": "Fowler Jr. prend la direction de Jacksonville", - "description": "Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ", - "timestamp": 1430397346, - } - }] - - def _real_extract(self, url): - video_id = url_basename(url) - - webpage = self._download_webpage(url, video_id) - - title = self._search_regex( - r'<span itemprop="name"[^>]*>([^\n]*)</span>', webpage, 'video title', default=None) - video_url = self._search_regex( - r'<span itemprop="contentURL" content="([^"]+)"', webpage, 'video URL') - upload_date = parse_iso8601(self._search_regex( - r'<span itemprop="uploadDate" content="([^"]+)"', webpage, 'upload date', default=None)) - description = self._search_regex( - r'<span itemprop="description"[^>]*>([^\n]*)</span>', webpage, 'description', default=None) - thumbnail = self._search_regex( - r'<span itemprop="thumbnailUrl" content="([^"]+)"', webpage, 'upload date', default=None) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': upload_date, - 'formats': [{ - 'url': video_url, - }], - } From 28fb109ed0276e85207966ac925e9885c2ed0e7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 12 Jul 2015 23:45:47 +0600 Subject: [PATCH 144/450] [rds] Improve _VALID_URL --- youtube_dl/extractor/rds.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py 
index fdd3f36a6..796adfdf9 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -12,9 +12,9 @@ from ..utils import ( class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' - _VALID_URL = r'https?://(?:www\.)?rds\.ca/videos/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', 'info_dict': { 'id': '3.1132799', @@ -27,7 +27,10 @@ class RDSIE(InfoExtractor): 'duration': 154.354, 'age_limit': 0, } - } + }, { + 'url': 'http://www.rds.ca/vid%C3%A9os/un-voyage-positif-3.877934', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 75a40b22511b4d4344cd6903e28429fa6c3991f1 Mon Sep 17 00:00:00 2001 From: Seamus Phelan <SeamusPhelan@gmail.com> Date: Mon, 13 Jul 2015 22:35:45 +1000 Subject: [PATCH 145/450] [SBS] fixes due to website changes --- youtube_dl/extractor/sbs.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index d4bd1a0d7..ab4d1c884 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -1,12 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import re from .common import InfoExtractor -from ..utils import ( - js_to_json, - remove_end, -) +from ..utils import remove_end class SBSIE(InfoExtractor): @@ -34,18 +30,22 @@ class SBSIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + # the video is in the following iframe + iframe_url = 'http://www.sbs.com.au/ondemand/video/single/' + video_id + '?context=web' + webpage = self._download_webpage(iframe_url, video_id) - player = self._search_regex( - 
r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n', - webpage, 'player') - player = re.sub(r"'\s*\+\s*[\da-zA-Z_]+\s*\+\s*'", '', player) + player_params = self._search_regex( + r'(?s)(playerParams.+?releaseUrls.+?\n)', + webpage, 'playerParams') + player_params_js = self._search_regex( + r'({.*})', + player_params, 'player_param_js') - release_urls = self._parse_json(js_to_json(player), video_id) + player_params_json = self._parse_json(player_params_js, video_id) - theplatform_url = release_urls.get('progressive') or release_urls['standard'] + theplatform_url = player_params_json.get('releaseUrls')['progressive'] or player_params_json.get('releaseUrls')['standard'] - title = remove_end(self._og_search_title(webpage), ' (The Feed)') + title = remove_end(self._og_search_title(webpage, default=video_id, fatal=False), ' (The Feed)') description = self._html_search_meta('description', webpage) thumbnail = self._og_search_thumbnail(webpage) From 6dfa0602f059397d9a41ef382d9dfa8b432ff0fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 13 Jul 2015 22:11:05 +0600 Subject: [PATCH 146/450] [nowtv] Fix extraction (Closes #6169) --- youtube_dl/extractor/nowtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 173e46cd8..0b5ff4760 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -133,7 +133,7 @@ class NowTVIE(InfoExtractor): station = mobj.group('station') info = self._download_json( - 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id, + 'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, display_id) video_id = compat_str(info['id']) From 7869eb3fc44826533d859d3e7e32df5cd4be7ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 13 Jul 2015 22:21:35 
+0600 Subject: [PATCH 147/450] Credit Bernhard Minks for nowtv patch --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index c0201d875..531ec5767 100644 --- a/AUTHORS +++ b/AUTHORS @@ -131,3 +131,4 @@ jackyzy823 George Brighton Remita Amine Aurélio A. Heckert +Bernhard Minks From cfe5537ee5ca5156f3969f513a1257be83015b1d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 14 Jul 2015 00:21:11 +0800 Subject: [PATCH 148/450] [myspass] Fix extraction (closes #6206) --- youtube_dl/extractor/myspass.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 5b9b9fbcd..4557a2b13 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -35,7 +35,8 @@ class MySpassIE(InfoExtractor): # get metadata metadata_url = META_DATA_URL_TEMPLATE % video_id - metadata = self._download_xml(metadata_url, video_id) + metadata = self._download_xml( + metadata_url, video_id, transform_source=lambda s: s.strip()) # extract values from metadata url_flv_el = metadata.find('url_flv') From 1d1dd597ed96790b3d041274044742baec13132e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 01:35:44 +0600 Subject: [PATCH 149/450] [dramafever] Extract srt (Closes #6207) --- youtube_dl/extractor/dramafever.py | 49 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ca41a3abf..38e6597c8 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -23,8 +23,23 @@ class DramaFeverBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' + _CONSUMER_SECRET = 'DA59dtVXYLxajktV' + + _consumer_secret = None + + def _get_consumer_secret(self): + mainjs = self._download_webpage( + 
'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js', + None, 'Downloading main.js', fatal=False) + if not mainjs: + return self._CONSUMER_SECRET + return self._search_regex( + r"var\s+cs\s*=\s*'([^']+)'", mainjs, + 'consumer secret', default=self._CONSUMER_SECRET) + def _real_initialize(self): self._login() + self._consumer_secret = self._get_consumer_secret() def _login(self): (username, password) = self._get_login_info() @@ -119,6 +134,23 @@ class DramaFeverIE(DramaFeverBaseIE): 'url': href, }] + series_id, episode_number = video_id.split('.') + episode_info = self._download_json( + # We only need a single episode info, so restricting page size to one episode + # and dealing with page number as with episode number + r'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_number=%s&page_size=1' + % (self._consumer_secret, series_id, episode_number), + video_id, 'Downloading episode info JSON', fatal=False) + if episode_info: + value = episode_info.get('value') + if value: + subfile = value[0].get('subfile') or value[0].get('new_subfile') + if subfile and subfile != 'http://www.dramafever.com/st/': + subtitles.setdefault('English', []).append({ + 'ext': 'srt', + 'url': subfile, + }) + return { 'id': video_id, 'title': title, @@ -152,27 +184,14 @@ class DramaFeverSeriesIE(DramaFeverBaseIE): 'playlist_count': 20, }] - _CONSUMER_SECRET = 'DA59dtVXYLxajktV' _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-) - def _get_consumer_secret(self, video_id): - mainjs = self._download_webpage( - 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js', - video_id, 'Downloading main.js', fatal=False) - if not mainjs: - return self._CONSUMER_SECRET - return self._search_regex( - r"var\s+cs\s*=\s*'([^']+)'", mainjs, - 'consumer secret', default=self._CONSUMER_SECRET) - def _real_extract(self, url): series_id = self._match_id(url) - consumer_secret = self._get_consumer_secret(series_id) - series = 
self._download_json( 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s' - % (consumer_secret, series_id), + % (self._consumer_secret, series_id), series_id, 'Downloading series JSON')['series'][series_id] title = clean_html(series['name']) @@ -182,7 +201,7 @@ class DramaFeverSeriesIE(DramaFeverBaseIE): for page_num in itertools.count(1): episodes = self._download_json( 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d' - % (consumer_secret, series_id, self._PAGE_SIZE, page_num), + % (self._consumer_secret, series_id, self._PAGE_SIZE, page_num), series_id, 'Downloading episodes JSON page #%d' % page_num) for episode in episodes.get('value', []): episode_url = episode.get('episode_url') From 50aa2bb6b94805d901f00d4da7a1b4ff1fcd5169 Mon Sep 17 00:00:00 2001 From: cazulu <jvlarapeinado@gmail.com> Date: Tue, 14 Jul 2015 15:33:55 +0900 Subject: [PATCH 150/450] [dailymotion] Extract duration (closes #6221) --- youtube_dl/extractor/dailymotion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8852f0add..1a41c0db1 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -53,6 +53,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'IGN', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', 'upload_date': '20150306', + 'duration': 74, } }, # Vevo video @@ -164,6 +165,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, 'view_count': view_count, + 'duration': info['duration'] } def _get_subtitles(self, video_id, webpage): From 9750e7d70eed92a6b05637465698cdd30e87a44c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 14 Jul 2015 12:56:32 +0200 Subject: [PATCH 151/450] [postprocessor/ffmpeg] Don't use '[youtube] 
...' in messages Because it can be used for other extractors. --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 1ecce22e7..1f723908b 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -265,7 +265,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. if (new_path == path or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): - self._downloader.to_screen('[youtube] Post-process file %s exists, skipping' % new_path) + self._downloader.to_screen('[ffmpeg] Post-process file %s exists, skipping' % new_path) return [], information try: From f8da79f828637757889f3f35d7adfa9aabbfc721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 22:36:30 +0600 Subject: [PATCH 152/450] [extractor/common] Improve _form_hidden_inputs and rename to _hidden_inputs --- youtube_dl/extractor/common.py | 15 +++++++++++---- youtube_dl/extractor/gorillavid.py | 2 +- youtube_dl/extractor/hostingbulk.py | 2 +- youtube_dl/extractor/played.py | 2 +- youtube_dl/extractor/primesharetv.py | 2 +- youtube_dl/extractor/promptfile.py | 2 +- youtube_dl/extractor/shared.py | 2 +- youtube_dl/extractor/twitch.py | 2 +- youtube_dl/extractor/vimeo.py | 2 +- youtube_dl/extractor/vk.py | 2 +- youtube_dl/extractor/vodlocker.py | 2 +- 11 files changed, 21 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 82f5de2d8..315fe4a72 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -706,10 +706,17 @@ class InfoExtractor(object): 'twitter card player') @staticmethod - def _form_hidden_inputs(html): - return dict(re.findall( - 
r'<input\s+type="hidden"\s+name="([^"]+)"\s+(?:id="[^"]+"\s+)?value="([^"]*)"', - html)) + def _hidden_inputs(html): + return dict([ + (input.group('name'), input.group('value')) for input in re.finditer( + r'''(?x) + <input\s+ + type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+ + name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+ + (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)? + value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value) + ''', html) + ]) def _sort_formats(self, formats, field_preference=None): if not formats: diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py index aabf07a20..f006f0cb1 100644 --- a/youtube_dl/extractor/gorillavid.py +++ b/youtube_dl/extractor/gorillavid.py @@ -78,7 +78,7 @@ class GorillaVidIE(InfoExtractor): if re.search(self._FILE_NOT_FOUND_REGEX, webpage) is not None: raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = self._form_hidden_inputs(webpage) + fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': countdown = int_or_none(self._search_regex( diff --git a/youtube_dl/extractor/hostingbulk.py b/youtube_dl/extractor/hostingbulk.py index 63f579592..a3154cfde 100644 --- a/youtube_dl/extractor/hostingbulk.py +++ b/youtube_dl/extractor/hostingbulk.py @@ -58,7 +58,7 @@ class HostingBulkIE(InfoExtractor): r'<img src="([^"]+)".+?class="pic"', webpage, 'thumbnail', fatal=False) - fields = self._form_hidden_inputs(webpage) + fields = self._hidden_inputs(webpage) request = compat_urllib_request.Request(url, urlencode_postdata(fields)) request.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index 9fe1524f2..8a1c296dd 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -38,7 +38,7 @@ class PlayedIE(InfoExtractor): if m_error: raise ExtractorError(m_error.group('msg'), expected=True) - data = self._form_hidden_inputs(orig_webpage) + data = 
self._hidden_inputs(orig_webpage) self._sleep(2, video_id) diff --git a/youtube_dl/extractor/primesharetv.py b/youtube_dl/extractor/primesharetv.py index 9aa0c862a..304359dc5 100644 --- a/youtube_dl/extractor/primesharetv.py +++ b/youtube_dl/extractor/primesharetv.py @@ -29,7 +29,7 @@ class PrimeShareTVIE(InfoExtractor): if '>File not exist<' in webpage: raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = self._form_hidden_inputs(webpage) + fields = self._hidden_inputs(webpage) headers = { 'Referer': url, diff --git a/youtube_dl/extractor/promptfile.py b/youtube_dl/extractor/promptfile.py index 81a63c7fc..8190ed676 100644 --- a/youtube_dl/extractor/promptfile.py +++ b/youtube_dl/extractor/promptfile.py @@ -35,7 +35,7 @@ class PromptFileIE(InfoExtractor): raise ExtractorError('Video %s does not exist' % video_id, expected=True) - fields = self._form_hidden_inputs(webpage) + fields = self._hidden_inputs(webpage) post = compat_urllib_parse.urlencode(fields) req = compat_urllib_request.Request(url, post) req.add_header('Content-type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 6e2b94e7d..a07677686 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -34,7 +34,7 @@ class SharedIE(InfoExtractor): raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) - download_form = self._form_hidden_inputs(webpage) + download_form = self._hidden_inputs(webpage) request = compat_urllib_request.Request( url, compat_urllib_parse.urlencode(download_form)) request.add_header('Content-Type', 'application/x-www-form-urlencoded') diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index af2b798fb..92b6dc1b8 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -59,7 +59,7 @@ class TwitchBaseIE(InfoExtractor): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading 
login page') - login_form = self._form_hidden_inputs(login_page) + login_form = self._hidden_inputs(login_page) login_form.update({ 'login': username.encode('utf-8'), diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d63c03183..10d6745af 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -452,7 +452,7 @@ class VimeoChannelIE(InfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True) - fields = self._form_hidden_inputs(login_form) + fields = self._hidden_inputs(login_form) token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token') fields['token'] = token fields['password'] = password diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 8ac3aeac0..8f677cae3 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -168,7 +168,7 @@ class VKIE(InfoExtractor): login_page = self._download_webpage( 'https://vk.com', None, 'Downloading login page') - login_form = self._form_hidden_inputs(login_page) + login_form = self._hidden_inputs(login_page) login_form.update({ 'email': username.encode('cp1251'), diff --git a/youtube_dl/extractor/vodlocker.py b/youtube_dl/extractor/vodlocker.py index 4804692bf..ccf1928b5 100644 --- a/youtube_dl/extractor/vodlocker.py +++ b/youtube_dl/extractor/vodlocker.py @@ -26,7 +26,7 @@ class VodlockerIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - fields = self._form_hidden_inputs(webpage) + fields = self._hidden_inputs(webpage) if fields['op'] == 'download1': self._sleep(3, video_id) # they do detect when requests happen too fast! 
From cf61d96df0984e28d8c34328177504a2d1424bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 22:38:10 +0600 Subject: [PATCH 153/450] [extractor/common] Add _form_hidden_inputs --- youtube_dl/extractor/common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 315fe4a72..3a396c0b0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -718,6 +718,12 @@ class InfoExtractor(object): ''', html) ]) + def _form_hidden_inputs(self, form_id, html): + form = self._search_regex( + r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, + html, '%s form' % form_id, group='form') + return self._hidden_inputs(form) + def _sort_formats(self, formats, field_preference=None): if not formats: raise ExtractorError('No video formats found') From dcd4d95c8e9acec6dde0706de75bf866a43c9aa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 22:39:41 +0600 Subject: [PATCH 154/450] [udemy] Fix authentication (Closes #6224) --- youtube_dl/extractor/udemy.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4667ed83b..192606077 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -15,7 +15,8 @@ from ..utils import ( class UdemyIE(InfoExtractor): IE_NAME = 'udemy' _VALID_URL = r'https?://www\.udemy\.com/(?:[^#]+#/lecture/|lecture/view/?\?lectureId=)(?P<id>\d+)' - _LOGIN_URL = 'https://www.udemy.com/join/login-submit/' + _LOGIN_URL = 'https://www.udemy.com/join/login-popup/?displayType=ajax&showSkipButton=1' + _ORIGIN_URL = 'https://www.udemy.com' _NETRC_MACHINE = 'udemy' _TESTS = [{ @@ -74,29 +75,34 @@ class UdemyIE(InfoExtractor): expected=True) login_popup = self._download_webpage( - 
'https://www.udemy.com/join/login-popup?displayType=ajax&showSkipButton=1', None, - 'Downloading login popup') + self._LOGIN_URL, None, 'Downloading login popup') if login_popup == '<div class="run-command close-popup redirect" data-url="https://www.udemy.com/"></div>': return - csrf = self._html_search_regex( - r'<input type="hidden" name="csrf" value="(.+?)"', - login_popup, 'csrf token') + login_form = self._form_hidden_inputs('login-form', login_popup) - login_form = { - 'email': username, - 'password': password, - 'csrf': csrf, + login_form.update({ 'displayType': 'json', - 'isSubmitted': '1', - } + 'email': username.encode('utf-8'), + 'password': password.encode('utf-8'), + }) + request = compat_urllib_request.Request( self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) - response = self._download_json( + request.add_header('Referer', self._ORIGIN_URL) + request.add_header('Origin', self._ORIGIN_URL) + + response = self._download_webpage( request, None, 'Logging in as %s' % username) - if 'returnUrl' not in response: + if all(logout_pattern not in response + for logout_pattern in ['href="https://www.udemy.com/user/logout/', '>Logout<']): + error = self._html_search_regex( + r'(?s)<div[^>]+class="form-errors[^"]*">(.+?)</div>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') def _real_extract(self, url): From 79057965a87a9179b3c3c2249df5495684c5e4ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 22:46:26 +0600 Subject: [PATCH 155/450] [udemy] Remove superfluous field --- youtube_dl/extractor/udemy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 192606077..e2bab52fe 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -83,7 +83,6 @@ class UdemyIE(InfoExtractor): 
login_form = self._form_hidden_inputs('login-form', login_popup) login_form.update({ - 'displayType': 'json', 'email': username.encode('utf-8'), 'password': password.encode('utf-8'), }) From 01d115b06b5b752324a9346f128d659a98b58f31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 14 Jul 2015 19:07:40 +0200 Subject: [PATCH 156/450] [jeuxvideo] Relax _VALID_URL (fixes #6230) --- youtube_dl/extractor/jeuxvideo.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index d0720ff56..1df084d87 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -8,9 +8,9 @@ from .common import InfoExtractor class JeuxVideoIE(InfoExtractor): - _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)-\d+\.htm' + _VALID_URL = r'http://.*?\.jeuxvideo\.com/.*/(.*?)\.htm' - _TEST = { + _TESTS = [{ 'url': 'http://www.jeuxvideo.com/reportages-videos-jeux/0004/00046170/tearaway-playstation-vita-gc-2013-tearaway-nous-presente-ses-papiers-d-identite-00115182.htm', 'md5': '046e491afb32a8aaac1f44dd4ddd54ee', 'info_dict': { @@ -19,7 +19,10 @@ class JeuxVideoIE(InfoExtractor): 'title': 'Tearaway : GC 2013 : Tearaway nous présente ses papiers d\'identité', 'description': 'Lorsque les développeurs de LittleBigPlanet proposent un nouveau titre, on ne peut que s\'attendre à un résultat original et fort attrayant.', }, - } + }, { + 'url': 'http://www.jeuxvideo.com/videos/chroniques/434220/l-histoire-du-jeu-video-la-saturn.htm', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From eae89f92e61d8227a6463c50750310a56c68cf09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 23:54:13 +0600 Subject: [PATCH 157/450] [dfb] Update test --- youtube_dl/extractor/dfb.py | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index 8049779b0..6732e1751 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -10,13 +10,13 @@ class DFBIE(InfoExtractor): _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)' _TEST = { - 'url': 'http://tv.dfb.de/video/highlights-des-empfangs-in-berlin/9070/', + 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', # The md5 is different each time 'info_dict': { - 'id': '9070', + 'id': '11633', 'ext': 'flv', - 'title': 'Highlights des Empfangs in Berlin', - 'upload_date': '20140716', + 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', + 'upload_date': '20150714', }, } From ddcdc684e20af96a4fc57da49c27b31c75462aae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 14 Jul 2015 23:59:21 +0600 Subject: [PATCH 158/450] [dfb] Extract display_id and modernize --- youtube_dl/extractor/dfb.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index 6732e1751..f1df9b9a0 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -3,17 +3,19 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import unified_strdate class DFBIE(InfoExtractor): IE_NAME = 'tv.dfb.de' - _VALID_URL = r'https?://tv\.dfb\.de/video/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://tv\.dfb\.de/video/(?P<display_id>[^/]+)/(?P<id>\d+)' _TEST = { 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', # The md5 is different each time 'info_dict': { 'id': '11633', + 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', 'ext': 'flv', 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', 'upload_date': '20150714', @@ -23,22 +25,25 @@ class DFBIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, 
url) video_id = mobj.group('id') + display_id = mobj.group('display_id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) player_info = self._download_xml( 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, - video_id) + display_id) video_info = player_info.find('video') - f4m_info = self._download_xml(self._proto_relative_url(video_info.find('url').text.strip()), video_id) + f4m_info = self._download_xml( + self._proto_relative_url(video_info.find('url').text.strip()), display_id) token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' return { 'id': video_id, + 'display_id': display_id, 'title': video_info.find('title').text, 'url': manifest_url, 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': ''.join(video_info.find('time_date').text.split('.')[::-1]), + 'upload_date': unified_strdate(video_info.find('time_date').text), } From 6c1b0c0ed2678198e3196e2acf32f63806d4ccb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 15 Jul 2015 00:01:41 +0600 Subject: [PATCH 159/450] [dfb] Extract formats --- youtube_dl/extractor/dfb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index f1df9b9a0..263532cc6 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -37,13 +37,13 @@ class DFBIE(InfoExtractor): self._proto_relative_url(video_info.find('url').text.strip()), display_id) token_el = f4m_info.find('token') manifest_url = token_el.attrib['url'] + '?' 
+ 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' + formats = self._extract_f4m_formats(manifest_url, display_id) return { 'id': video_id, 'display_id': display_id, 'title': video_info.find('title').text, - 'url': manifest_url, - 'ext': 'flv', 'thumbnail': self._og_search_thumbnail(webpage), 'upload_date': unified_strdate(video_info.find('time_date').text), + 'formats': formats, } From b062d94eefcc9327af5e60f3280cedaff9e40ac8 Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Wed, 15 Jul 2015 03:08:36 +0600 Subject: [PATCH 160/450] [README.md] Clarify authentication with .netrc file --- README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/README.md b/README.md index 9779c2058..a2cc89cdb 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,26 @@ which means you can modify it, redistribute it or use it however you like. You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`. +### Authentication with `.netrc` file ### + +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in shell command history. You can achieve this using [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on per extractor basis. 
For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by you only: +``` +touch $HOME/.netrc +chmod a-rwx,u+rw $HOME/.netrc +``` +After that you can add credentials for an extractor in the following format, where *extractor* is the name of the extractor in lowercase: +``` +machine <extractor> login <login> password <password> +``` +For example: +``` +machine youtube login myaccount@gmail.com password my_youtube_password +machine twitch login my_twitch_account_name password my_twitch_password +``` +To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). + +On Windows you may also need to set up the `%HOME%` environment variable manually. + # OUTPUT TEMPLATE The `-o` option allows users to indicate a template for the output file names. The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences have the format `%(NAME)s`. To clarify, that is a percent symbol followed by a name in parenthesis, followed by a lowercase S. 
Allowed names are: From 2af0f87c8b56567e0254aae7a1ccbedb04413b1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 15 Jul 2015 23:32:52 +0600 Subject: [PATCH 161/450] [prosiebensat1] Fix extraction (Closes #6215) --- youtube_dl/extractor/prosiebensat1.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 536a42dc8..22efa903f 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -9,8 +9,9 @@ from ..compat import ( compat_urllib_parse, ) from ..utils import ( - unified_strdate, + fix_xml_ampersands, int_or_none, + unified_strdate, ) @@ -208,7 +209,7 @@ class ProSiebenSat1IE(InfoExtractor): clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'prosieben' - client_name = 'kolibri-1.12.6' + client_name = 'kolibri-2.0.19-splec4' client_location = url videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ @@ -275,8 +276,9 @@ class ProSiebenSat1IE(InfoExtractor): for source in urls_sources: protocol = source['protocol'] + source_url = source['url'] if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source['url']) + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) if not mobj: continue path = mobj.group('path') @@ -293,9 +295,18 @@ class ProSiebenSat1IE(InfoExtractor): 'ext': 'mp4', 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), }) + elif 'f4mgenerator' in source_url: + manifest = self._download_xml( + source_url, clip_id, 'Downloading generated f4m manifest', + transform_source=lambda s: fix_xml_ampersands(s).strip()) + for media in manifest.findall('./{http://ns.adobe.com/f4m/2.0}media'): + manifest_url = media.get('href') + if manifest_url: + formats.extend(self._extract_f4m_formats( + 
manifest_url, clip_id, f4m_id='hds')) else: formats.append({ - 'url': source['url'], + 'url': source_url, 'vbr': fix_bitrate(source['bitrate']), }) From 97f4aecfc1c5fd446e1d3edd37e49aafe246fe0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 01:14:08 +0600 Subject: [PATCH 162/450] [extractor/common] Handle malformed f4m manifests --- youtube_dl/extractor/common.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3a396c0b0..f8a5ecced 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -28,6 +28,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + fix_xml_ampersands, float_or_none, int_or_none, RegexNotFoundError, @@ -837,7 +838,10 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest') + 'Unable to download f4m manifest', + # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) + transform_source=lambda s: fix_xml_ampersands(s).strip()) formats = [] manifest_version = '1.0' From cc357c4db8112ff6736a227b47fb9527d327797f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 01:14:52 +0600 Subject: [PATCH 163/450] [extractor/common] Properly handle full URLs --- youtube_dl/extractor/common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f8a5ecced..78e5cf8d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -851,8 +851,10 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + - (media_el.attrib.get('href') or media_el.attrib.get('url'))) + media_url = media_el.attrib.get('href') or media_el.attrib['url'] + manifest_url = ( + media_url if media_url.startswith('http://') or media_url.startswith('https://') + else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), From 70f0f5a8ca53d4426fc079b3ab46e9d4a8e81ea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 01:15:15 +0600 Subject: [PATCH 164/450] [extractor/common] Recursively extract child f4m manifests --- youtube_dl/extractor/common.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78e5cf8d0..e3c610aa4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..utils import ( 
bug_reports_message, clean_html, compiled_regex_type, + determine_ext, ExtractorError, fix_xml_ampersands, float_or_none, @@ -855,6 +856,13 @@ class InfoExtractor(object): manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + # If media_url is itself a f4m manifest do the recursive extraction + # since bitrates in parent manifest (this one) and media_url manifest + # may differ leading to inability to resolve the format by requested + # bitrate in f4m downloader + if determine_ext(manifest_url) == 'f4m': + formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), From f01f731107010e0c10fc94782daa7a3ba543e92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 01:15:47 +0600 Subject: [PATCH 165/450] [prosiebensat1] Use generic f4m manifest extraction --- youtube_dl/extractor/prosiebensat1.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 22efa903f..2f9d95800 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -9,7 +9,7 @@ from ..compat import ( compat_urllib_parse, ) from ..utils import ( - fix_xml_ampersands, + determine_ext, int_or_none, unified_strdate, ) @@ -295,15 +295,8 @@ class ProSiebenSat1IE(InfoExtractor): 'ext': 'mp4', 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), }) - elif 'f4mgenerator' in source_url: - manifest = self._download_xml( - source_url, clip_id, 'Downloading generated f4m manifest', - transform_source=lambda s: fix_xml_ampersands(s).strip()) - for media in manifest.findall('./{http://ns.adobe.com/f4m/2.0}media'): - 
manifest_url = media.get('href') - if manifest_url: - formats.extend(self._extract_f4m_formats( - manifest_url, clip_id, f4m_id='hds')) + elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats(source_url, clip_id)) else: formats.append({ 'url': source_url, From 31c746e5dc46491f997eca757c5e35842f04cb59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 01:25:33 +0600 Subject: [PATCH 166/450] [extractor/common] Keep going in some media_url is missing --- youtube_dl/extractor/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e3c610aa4..271bf8596 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -852,7 +852,9 @@ class InfoExtractor(object): media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': - media_url = media_el.attrib.get('href') or media_el.attrib['url'] + media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + if not media_url: + continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) From ab9b890b524a49a9ffa4c8ac7243cd8afc15d270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 02:23:07 +0600 Subject: [PATCH 167/450] [prosiebensat1] Clarify test purpose --- youtube_dl/extractor/prosiebensat1.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 2f9d95800..fec008ce7 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -22,6 +22,11 @@ class ProSiebenSat1IE(InfoExtractor): _TESTS = [ { + # Tests changes introduced in 
https://github.com/rg3/youtube-dl/pull/6242 + # in response to fixing https://github.com/rg3/youtube-dl/issues/6215: + # - malformed f4m manifest support + # - proper handling of URLs starting with `https?://` in 2.0 manifests + # - recursive child f4m manifests extraction 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', From e37c932fca29d93af77b7a47cccc9bb8578e3163 Mon Sep 17 00:00:00 2001 From: fnord <fnord@fnord.mobi> Date: Wed, 15 Jul 2015 15:13:56 -0500 Subject: [PATCH 168/450] compat_urllib_parse_unquote: crash fix: only decode valid hex on python 2 the following has a { "crash_rate": "100%" } of the time as it tries to parse '" ' as hex. --- youtube_dl/compat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index c3783337a..1f4ccf443 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -94,6 +94,8 @@ except ImportError: try: if not item: raise ValueError + if not re.match('[0-9a-fA-F][0-9a-fA-F]',item[:2]): + raise ValueError pct_sequence += item[:2].decode('hex') rest = item[2:] if not rest: From 45eedbe58c8ab6344f11f1e1376d01648c1967ee Mon Sep 17 00:00:00 2001 From: fnord <fnord@fnord.mobi> Date: Wed, 15 Jul 2015 15:30:47 -0500 Subject: [PATCH 169/450] Generic: use compat_urllib_parse_unquote to prevent utf8 mangling of the entire page in python 2. -requires- fixed compat_urllib_parse_unquote example - the following will save with a mangled playlist title, instead of the kanji for 'tsunami'. 
This affects all utf8encoded urls as well youtube-dl -f18 -o '%(playlist_title)s-%(title)s.%(ext)s' \ https://gist.githubusercontent.com/atomicdryad/fcb97465e6060fc519e1/raw/61c14c1e3a4985471dcf56c281d24d7e781a4e0e/tsunami.html --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 392ad3648..fc1bf2b6e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1115,7 +1115,7 @@ class GenericIE(InfoExtractor): # Sometimes embedded video player is hidden behind percent encoding # (e.g. https://github.com/rg3/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way - webpage = compat_urllib_parse.unquote(webpage) + webpage = compat_urllib_parse_unquote(webpage) # it's tempting to parse this further, but you would # have to take into account all the variations like From e118031ef827e851e537daa5b439cf5c249ca88d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 23:21:04 +0600 Subject: [PATCH 170/450] [npo] Extend _VALID_URL to support ntr.nl (Closes #6248) --- youtube_dl/extractor/npo.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 62d12b7a6..1c823ec7f 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -37,8 +37,9 @@ class NPOBaseIE(InfoExtractor): class NPOIE(NPOBaseIE): - IE_NAME = 'npo.nl' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/(?!live|radio)[^/]+/[^/]+/(?P<id>[^/?]+)' + IE_NAME = 'npo' + IE_DESC = 'npo.nl and ntr.nl' + _VALID_URL = r'https?://(?:www\.)?(?:npo|ntr)\.nl/(?!live|radio)(?:[^/]+/){2,}(?P<id>[^/?#]+)' _TESTS = [ { @@ -100,6 +101,18 @@ class NPOIE(NPOBaseIE): 'title': 'Hoe gaat Europa verder na Parijs?', }, }, + { + 'url': 
'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', + 'md5': '01c6a2841675995da1f0cf776f03a9c3', + 'info_dict': { + 'id': 'VPWON_1233944', + 'ext': 'm4v', + 'title': 'Aap, poot, pies', + 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', + 'upload_date': '20150508', + 'duration': 599, + }, + } ] def _real_extract(self, url): From 525daedd5a092b0f5329952eee99a7dac5537433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 23:54:43 +0600 Subject: [PATCH 171/450] [npo] Add support for omroepwnl fragments --- youtube_dl/extractor/npo.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 1c823ec7f..a5162c0c6 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,6 +1,12 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_urllib_request, + compat_urllib_parse, +) from ..utils import ( fix_xml_ampersands, parse_duration, @@ -39,7 +45,16 @@ class NPOBaseIE(InfoExtractor): class NPOIE(NPOBaseIE): IE_NAME = 'npo' IE_DESC = 'npo.nl and ntr.nl' - _VALID_URL = r'https?://(?:www\.)?(?:npo|ntr)\.nl/(?!live|radio)(?:[^/]+/){2,}(?P<id>[^/?#]+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?: + npo\.nl/(?!live|radio)(?:[^/]+/){2}| + ntr\.nl/(?:[^/]+/){2,}| + omroepwnl\.nl/video/fragment/[^/]+__ + ) + (?P<id>[^/?#]+) + ''' _TESTS = [ { @@ -112,6 +127,18 @@ class NPOIE(NPOBaseIE): 'upload_date': '20150508', 'duration': 599, }, + }, + { + 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', + 'md5': 'd30cd8417b8b9bca1fdff27428860d08', + 'info_dict': { + 'id': 'POW_00996502', + 'ext': 'm4v', + 'title': '''"Dit is wel een 'landslide'..."''', + 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', + 'upload_date': '20150508', + 'duration': 462, + }, } ] @@ -127,6 +154,11 @@ class NPOIE(NPOBaseIE): transform_source=strip_jsonp, ) + # For some videos actual video id (prid) is different (e.g. for + # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 + # video id is POMS_WNL_853698 but prid is POW_00996502) + video_id = metadata.get('prid') or video_id + token = self._get_token(video_id) formats = [] From 50ea2bb20d3a3e219910e87b8b30fc79ce534595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 16 Jul 2015 23:56:57 +0600 Subject: [PATCH 172/450] [npo] Update test --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index a5162c0c6..cf6a388e5 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -87,7 +87,7 @@ class NPOIE(NPOBaseIE): 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', 'duration': 3000, }, From 03f32a7eadf9d832aef55673edf38023a8daff95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 00:14:38 +0600 Subject: [PATCH 173/450] [wnl] Add extractor for omroepwnl playlists --- 
youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/npo.py | 45 +++++++++++++++++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3f4f23521..1d55275dc 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -390,6 +390,7 @@ from .npo import ( NPORadioIE, NPORadioFragmentIE, TegenlichtVproIE, + WNLIE ) from .nrk import ( NRKIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index cf6a388e5..c6bf7619d 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -46,12 +46,15 @@ class NPOIE(NPOBaseIE): IE_NAME = 'npo' IE_DESC = 'npo.nl and ntr.nl' _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?: - npo\.nl/(?!live|radio)(?:[^/]+/){2}| - ntr\.nl/(?:[^/]+/){2,}| - omroepwnl\.nl/video/fragment/[^/]+__ + (?: + npo:| + https?:// + (?:www\.)? + (?: + npo\.nl/(?!live|radio)(?:[^/]+/){2}| + ntr\.nl/(?:[^/]+/){2,}| + omroepwnl\.nl/video/fragment/[^/]+__ + ) ) (?P<id>[^/?#]+) ''' @@ -426,3 +429,33 @@ class TegenlichtVproIE(NPOIE): info_page = self._download_json( 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) return self._get_info(info_page['mid']) + + +class WNLIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?omroepwnl\.nl/video/detail/(?P<id>[^/]+)__\d+' + + _TEST = { + 'url': 'http://www.omroepwnl.nl/video/detail/vandaag-de-dag-6-mei__060515', + 'info_dict': { + 'id': 'vandaag-de-dag-6-mei', + 'title': 'Vandaag de Dag 6 mei', + }, + 'playlist_count': 4, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id, 'NPO') + for video_id, part in re.findall( + r'<a[^>]+href="([^"]+)"[^>]+class="js-mid"[^>]*>(Deel \d+)', webpage) + ] + + playlist_title = self._html_search_regex( + r'(?s)<h1[^>]+class="subject"[^>]*>(.+?)</h1>', + webpage, 'playlist title') + + 
return self.playlist_result(entries, playlist_id, playlist_title) From 611ac379bb466267aded6726f9c85e79b08168c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 00:34:24 +0600 Subject: [PATCH 174/450] [vpro] Fix extraction and add support for vpro playlists --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/npo.py | 35 +++++++++++++++++++++++--------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1d55275dc..06f21064b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -389,7 +389,7 @@ from .npo import ( NPOLiveIE, NPORadioIE, NPORadioFragmentIE, - TegenlichtVproIE, + VPROIE, WNLIE ) from .nrk import ( diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c6bf7619d..28d5c90b3 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -404,9 +404,8 @@ class NPORadioFragmentIE(InfoExtractor): } -class TegenlichtVproIE(NPOIE): - IE_NAME = 'tegenlicht.vpro.nl' - _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?' 
+class VPROIE(NPOIE): + _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' _TESTS = [ { @@ -416,19 +415,35 @@ class TegenlichtVproIE(NPOIE): 'id': 'VPWON_1169289', 'ext': 'm4v', 'title': 'Tegenlicht', - 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, }, + { + 'url': 'http://www.vpro.nl/programmas/2doc/2015/sergio-herman.html', + 'info_dict': { + 'id': 'sergio-herman', + 'title': 'Sergio Herman: Fucking perfect', + }, + 'playlist_count': 2, + } ] def _real_extract(self, url): - name = url_basename(url) - webpage = self._download_webpage(url, name) - urn = self._html_search_meta('mediaurn', webpage) - info_page = self._download_json( - 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) - return self._get_info(info_page['mid']) + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('npo:%s' % video_id, 'NPO') + for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) + ] + + playlist_title = self._search_regex( + r'<title>\s*([^>]+?)\s*-\s*Teledoc\s*-\s*VPRO\s*', + webpage, 'playlist title', default=None) or self._og_search_title(webpage) + + return self.playlist_result(entries, playlist_id, playlist_title) class WNLIE(InfoExtractor): From 5ba761eb854d6e415b3ab542293cb31c073dc0f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:39:22 +0600 Subject: [PATCH 175/450] [npo] Prefer aflevering_titel over titel --- youtube_dl/extractor/npo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 28d5c90b3..91adb23f0 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -234,7 +234,9 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - 'title': metadata['titel'], + # prefer aflevering_titel if any since titel may be 
too generic, e.g. + # http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html + 'title': metadata.get('aflevering_titel') or metadata['titel'], 'description': metadata['info'], 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), @@ -414,7 +416,7 @@ class VPROIE(NPOIE): 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', - 'title': 'Tegenlicht', + 'title': 'De toekomst komt uit Afrika', 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', }, From 536b0700b03f0b29a1025be0b7753253bd627d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:40:04 +0600 Subject: [PATCH 176/450] [npo] Allow missing description --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 91adb23f0..e733d96f6 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -237,7 +237,7 @@ class NPOIE(NPOBaseIE): # prefer aflevering_titel if any since titel may be too generic, e.g. 
# http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html 'title': metadata.get('aflevering_titel') or metadata['titel'], - 'description': metadata['info'], + 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), 'duration': parse_duration(metadata.get('tijdsduur')), From 574f42d79a8596ceda681b205e19e766e7bab046 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:53:12 +0600 Subject: [PATCH 177/450] [vpro] Improve playlist extraction --- youtube_dl/extractor/npo.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index e733d96f6..583ed3e14 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -428,6 +428,15 @@ class VPROIE(NPOIE): 'title': 'Sergio Herman: Fucking perfect', }, 'playlist_count': 2, + }, + { + # playlist with youtube embed + 'url': 'http://www.vpro.nl/programmas/2doc/2015/education-education.html', + 'info_dict': { + 'id': 'education-education', + 'title': '2Doc', + }, + 'playlist_count': 2, } ] @@ -437,7 +446,7 @@ class VPROIE(NPOIE): webpage = self._download_webpage(url, playlist_id) entries = [ - self.url_result('npo:%s' % video_id, 'NPO') + self.url_result('npo:%s' % video_id if not video_id.startswith('http') else video_id) for video_id in re.findall(r'data-media-id="([^"]+)"', webpage) ] From 1540119723e7e2195a47d659993a6a3bcc02d3e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 00:54:05 +0600 Subject: [PATCH 178/450] [npo] Remove unused imports --- youtube_dl/extractor/npo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 583ed3e14..f5ffe1231 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,17 +3,12 @@ from __future__ import 
unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_request, - compat_urllib_parse, -) from ..utils import ( fix_xml_ampersands, parse_duration, qualities, strip_jsonp, unified_strdate, - url_basename, ) From 23fc384f2c3cf9afd41fd7e033fe0823d0fc5fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 01:28:52 +0600 Subject: [PATCH 179/450] [npo] Compound title --- youtube_dl/extractor/npo.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index f5ffe1231..0c2d02c10 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -72,7 +72,7 @@ class NPOIE(NPOBaseIE): 'info_dict': { 'id': 'VARA_101191800', 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show', + 'title': 'De Mega Mike & Mega Thomas show: The best of.', 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', 'upload_date': '20090227', 'duration': 2400, @@ -84,7 +84,7 @@ class NPOIE(NPOBaseIE): 'info_dict': { 'id': 'VPWON_1169289', 'ext': 'm4v', - 'title': 'Tegenlicht', + 'title': 'Tegenlicht: De toekomst komt uit Afrika', 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', 'upload_date': '20130225', 'duration': 3000, @@ -157,6 +157,13 @@ class NPOIE(NPOBaseIE): # video id is POMS_WNL_853698 but prid is POW_00996502) video_id = metadata.get('prid') or video_id + # titel is too generic in some cases so utilize aflevering_titel as well + # when available (e.g. http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html) + title = metadata['titel'] + sub_title = metadata.get('aflevering_titel') + if sub_title and sub_title != title: + title += ': %s' % sub_title + token = self._get_token(video_id) formats = [] @@ -229,9 +236,7 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - # prefer aflevering_titel if any since titel may be too generic, e.g. 
- # http://tegenlicht.vpro.nl/afleveringen/2014-2015/access-to-africa.html - 'title': metadata.get('aflevering_titel') or metadata['titel'], + 'title': title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), From a38436e8898b6eca29d9279346e6e2136ec0bc8f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 17 Jul 2015 12:02:49 +0800 Subject: [PATCH 180/450] [extractor/common] Add 'transform_source' parameter to _extract_f4m_formats() --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 271bf8596..5a2d0d995 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -836,13 +836,14 @@ class InfoExtractor(object): self.to_screen(msg) time.sleep(timeout) - def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip()): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. 
prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=lambda s: fix_xml_ampersands(s).strip()) + transform_source=transform_source) formats = [] manifest_version = '1.0' From c9c854cea7fa5992356dee5eab0d3615b4d40dc6 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 01:31:29 -0500 Subject: [PATCH 181/450] replace old compat_urllib_parse_unquote with backport from python3's function * required unquote_to_bytes function ported as well (uses .decode('hex') instead of dynamically populated _hextobyte global) * required implicit conversion to bytes and/or unicode in places due to differing type assumptions in p3 --- youtube_dl/compat.py | 75 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 1f4ccf443..2fd2278aa 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -74,10 +74,81 @@ try: except ImportError: import BaseHTTPServer as compat_http_server +from pprint import (pprint, pformat) + + +def dprint(fmt): + sys.stderr.write(pformat(fmt) + "\n") + try: from urllib.parse import unquote as compat_urllib_parse_unquote except ImportError: - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + def compat_urllib_parse_unquote_to_bytes(string): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + if not string: + # Is it a string-like object? 
+ string.split + return b'' + if isinstance(string, str): + string = string.encode('utf-8') + # string = encode('utf-8') + + # python3 -> 2: must implicitly convert to bits + bits = bytes(string).split(b'%') + + if len(bits) == 1: + return string + res = [bits[0]] + append = res.append + + for item in bits[1:]: + try: + append(item[:2].decode('hex')) + append(item[2:]) + except: + append(b'%') + append(item) + return b''.join(res) + + compat_urllib_parse_asciire = re.compile('([\x00-\x7f]+)') + + def new_compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences are replaced by a placeholder character. + + unquote('abc%20def') -> 'abc def'. + """ + + if '%' not in string: + string.split + return string + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'replace' + + bits = compat_urllib_parse_asciire.split(string) + res = [bits[0]] + append = res.append + for i in range(1, len(bits), 2): + foo = compat_urllib_parse_unquote_to_bytes(bits[i]) + foo = foo.decode(encoding, errors) + append(foo) + + if bits[i + 1]: + bar = bits[i + 1] + if not isinstance(bar, unicode): + bar = bar.decode('utf-8') + append(bar) + return ''.join(res) + + def old_compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): if string == '': return string res = string.split('%') @@ -114,6 +185,8 @@ except ImportError: string += pct_sequence.decode(encoding, errors) return string + compat_urllib_parse_unquote = new_compat_urllib_parse_unquote + try: compat_str = unicode # Python 2 except NameError: From 851229a01f34129286a57d46f8a27b9bb5fd9a6b Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 01:49:55 -0500 Subject: [PATCH 
182/450] remove debugprint --- youtube_dl/compat.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2fd2278aa..554e3d5db 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -74,12 +74,6 @@ try: except ImportError: import BaseHTTPServer as compat_http_server -from pprint import (pprint, pformat) - - -def dprint(fmt): - sys.stderr.write(pformat(fmt) + "\n") - try: from urllib.parse import unquote as compat_urllib_parse_unquote except ImportError: From a0f28f90fa277d9c00f0305624dea36a20b8066e Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 01:50:43 -0500 Subject: [PATCH 183/450] remove kebab --- youtube_dl/compat.py | 41 +---------------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 554e3d5db..de9ba2c14 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -108,7 +108,7 @@ except ImportError: compat_urllib_parse_asciire = re.compile('([\x00-\x7f]+)') - def new_compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): + def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): """Replace %xx escapes by their single-character equivalent. 
The optional encoding and errors parameters specify how to decode percent-encoded sequences into Unicode characters, as accepted by the bytes.decode() @@ -142,45 +142,6 @@ except ImportError: append(bar) return ''.join(res) - def old_compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): - if string == '': - return string - res = string.split('%') - if len(res) == 1: - return string - if encoding is None: - encoding = 'utf-8' - if errors is None: - errors = 'replace' - # pct_sequence: contiguous sequence of percent-encoded bytes, decoded - pct_sequence = b'' - string = res[0] - for item in res[1:]: - try: - if not item: - raise ValueError - if not re.match('[0-9a-fA-F][0-9a-fA-F]',item[:2]): - raise ValueError - pct_sequence += item[:2].decode('hex') - rest = item[2:] - if not rest: - # This segment was just a single percent-encoded character. - # May be part of a sequence of code units, so delay decoding. - # (Stored in pct_sequence). - continue - except ValueError: - rest = '%' + item - # Encountered non-percent-encoded characters. Flush the current - # pct_sequence. 
- string += pct_sequence.decode(encoding, errors) + rest - pct_sequence = b'' - if pct_sequence: - # Flush the final pct_sequence - string += pct_sequence.decode(encoding, errors) - return string - - compat_urllib_parse_unquote = new_compat_urllib_parse_unquote - try: compat_str = unicode # Python 2 except NameError: From 36da48798a28b8261d2f39f73f2522651d58a364 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:27:50 -0500 Subject: [PATCH 184/450] handle titles and captions set to '' --- youtube_dl/extractor/bbc.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 471d865d2..c910eb55a 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -497,11 +497,13 @@ class BBCNewsIE(BBCCoUkIE): programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption',list_title) + title = jent.get('caption','') + if title == '': + title = list_title duration = parse_duration(jent.get('duration')) description = list_title - if jent.get('caption'): + if jent.get('caption', '') != '': description += ' - ' + jent.get('caption') thumbnail = None if jent.has_key('image'): From a3bfddfa5ee33cf085b959536f1025c0aa53cc77 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:47:02 -0500 Subject: [PATCH 185/450] bbc.py: correct syntax --- youtube_dl/extractor/bbc.py | 106 ++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c910eb55a..c8f285165 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -397,14 +397,14 @@ class BBCNewsIE(BBCCoUkIE): 'title': 'Russia stages massive WW2 parade despite Western boycott', }, 'playlist_count': 2, - },{ + }, { 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', }, 
'playlist_count': 9, - },{ + }, { 'url': 'http://www.bbc.com/news/world-europe-32041533', 'note': 'Video', 'info_dict': { @@ -419,7 +419,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', 'note': 'Video', 'info_dict': { @@ -434,7 +434,7 @@ class BBCNewsIE(BBCCoUkIE): 'params': { 'skip_download': True, } - },{ + }, { 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', 'note': 'Video', 'info_dict': { @@ -459,88 +459,88 @@ class BBCNewsIE(BBCCoUkIE): pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) if pubdate: - pubdate = pubdate.replace('-','') + pubdate = pubdate.replace('-', '') ret = [] jsent = [] # works with bbc.com/news/something-something-123456 articles jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) + lambda m: self._parse_json(m, list_id), + re.findall(r"data-media-meta='({[^']+})'", webpage) ) if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) - if masset: - jmasset = self._parse_json(masset,list_id) - for key, val in jmasset.get('videos',{}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) + # http://www.bbc.com/news/video_and_audio/international + # and single-video articles + masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', default=None) + if masset: + jmasset = self._parse_json(masset, list_id) + for key, val in jmasset.get('videos', {}).items(): + for skey, sval in val.items(): + sval['id'] = skey + jsent.append(sval) if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # 
in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m,list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) + # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} + # in http://www.bbc.com/news/video_and_audio/international + # prone to breaking if entries have sourceFiles list + jsent = map( + lambda m: self._parse_json(m, list_id), + re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) + ) if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) + raise ExtractorError('No video found', expected=True) for jent in jsent: programme_id = jent.get('externalId') xml_url = jent.get('href') - title = jent.get('caption','') + title = jent.get('caption', '') if title == '': - title = list_title + title = list_title duration = parse_duration(jent.get('duration')) description = list_title if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') + description += ' - ' + jent.get('caption') thumbnail = None - if jent.has_key('image'): - thumbnail=jent['image'].get('href') + if jent.get('image') is not None: + thumbnail = jent['image'].get('href') formats = [] subtitles = [] if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.has_key('sourceFiles'): - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append( { - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - } ) + formats, subtitles = self._download_media_selector(programme_id) + elif jent.get('sourceFiles') is not None: + # mediaselector not used at + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu + for key, val in jent['sourceFiles'].items(): + formats.append({ + 'ext': val.get('encoding'), + 
'url': val.get('url'), + 'filesize': int(val.get('filesize')), + 'format_id': key + }) elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) + # Cheap fallback + # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml + xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') + programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') + formats, subtitles = self._download_media_selector(programme_id) if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n '+str(jent)+'\n') - + raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + self._sort_formats(formats) - id = jent.get('id') if programme_id == None else programme_id - if id == None: - id = 'NA' + id = jent.get('id') if programme_id is None else programme_id + if id is None: + id = 'NA' - ret.append( { + ret.append({ 'id': id, 'uploader': 'BBC News', 'upload_date': pubdate, @@ -550,8 +550,8 @@ class BBCNewsIE(BBCCoUkIE): 'duration': duration, 'formats': formats, 'subtitles': subtitles, - } ) + }) if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) + return self.playlist_result(ret, list_id, list_title) raise ExtractorError('No video found', expected=True) From 9fefc88656eceac13604fd86dfb25dc736ed239a Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 07:24:07 -0500 Subject: [PATCH 186/450] fix TestCompat test_all_present --- youtube_dl/compat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index de9ba2c14..8b4d0287c 100644 --- a/youtube_dl/compat.py +++ 
b/youtube_dl/compat.py @@ -451,7 +451,9 @@ __all__ = [ 'compat_subprocess_get_DEVNULL', 'compat_urllib_error', 'compat_urllib_parse', + 'compat_urllib_parse_asciire', 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', 'compat_urlparse', From 593b77064c51c411071e310578b542017b9b2ec8 Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 09:45:49 -0500 Subject: [PATCH 187/450] Don't forget trailing '%' --- youtube_dl/compat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 8b4d0287c..9e506352f 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -98,6 +98,9 @@ except ImportError: append = res.append for item in bits[1:]: + if item == '': + append(b'%') + continue try: append(item[:2].decode('hex')) append(item[2:]) From 4a632911443f0dbc2384fb82ade85382aeecc8dc Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 09:46:08 -0500 Subject: [PATCH 188/450] Add tests for compat_urllib_parse_unquote --- test/test_compat.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/test_compat.py b/test/test_compat.py index 1eb454e06..431e6bdf1 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -14,6 +14,7 @@ from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, compat_expanduser, + compat_urllib_parse_unquote, ) @@ -42,5 +43,24 @@ class TestCompat(unittest.TestCase): dir(youtube_dl.compat))) - set(['unicode_literals']) self.assertEqual(all_names, sorted(present_names)) + def test_compat_urllib_parse_unquote(self): + test_strings = [ + ['''''', ''''''], + ['''津波''', '''%E6%B4%A5%E6%B3%A2'''], + ['''津波''', str('%E6%B4%A5%E6%B3%A2')], + [''' +%%a''', + ''' +%%a'''], + ['''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''', + '''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 
%E2%86%B6%I%Break%25Things%'''] + ] + for test in test_strings: + strutf = test[0] + strurlenc = test[1] + strurldec = compat_urllib_parse_unquote(strurlenc) + self.assertEqual(strutf, strurldec) + self.assertEqual(strutf, compat_urllib_parse_unquote(strurlenc)) + if __name__ == '__main__': unittest.main() From cabe001590914338524dff0c1edffbde4a1447ab Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 17 Jul 2015 16:18:33 +0100 Subject: [PATCH 189/450] [howstuffwoks] fix _VALID_URL regex --- youtube_dl/extractor/howstuffworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index e97339121..f59393150 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -10,7 +10,7 @@ from ..utils import ( class HowStuffWorksIE(InfoExtractor): - _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*\d+-(?P.+?)-video\.htm' + _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(\d+-)*(?P.+?)-video\.htm' _TESTS = [ { 'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm', From f354385bf50e04b0e7e1defcc14264d1a66e0a07 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 17 Jul 2015 16:43:27 +0100 Subject: [PATCH 190/450] Add test for urls without a number --- youtube_dl/extractor/howstuffworks.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index f59393150..c9b6579a6 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -46,6 +46,17 @@ class HowStuffWorksIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, + { + 'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm', + 'info_dict': { + 'id': '855410', + 'ext': 'mp4', + 'title': 'Stuff to Blow Your Mind', + 'description': 'When it comes to optical illusions 
it’s pretty easy to jailbreak the operating system of your mind and load a new program onto it. Find out why your visual system willingly makes errors in interpretation.', + 'display_id': 'optical-illusions', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } ] def _real_extract(self, url): From 1186e3f91ab940017be263c65c27b821992e58cc Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 17 Jul 2015 16:45:53 +0100 Subject: [PATCH 191/450] do not capture a group --- youtube_dl/extractor/howstuffworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index c9b6579a6..cc931d571 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -10,7 +10,7 @@ from ..utils import ( class HowStuffWorksIE(InfoExtractor): - _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(\d+-)*(?P.+?)-video\.htm' + _VALID_URL = r'https?://[\da-z-]+\.howstuffworks\.com/(?:[^/]+/)*(?:\d+-)?(?P.+?)-video\.htm' _TESTS = [ { 'url': 'http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm', From c4fe07c7afcc59dfcd5cfa4626ad37deb19e1033 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 17 Jul 2015 17:07:55 +0100 Subject: [PATCH 192/450] match only the test url --- youtube_dl/extractor/howstuffworks.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/youtube_dl/extractor/howstuffworks.py b/youtube_dl/extractor/howstuffworks.py index cc931d571..663e6632a 100644 --- a/youtube_dl/extractor/howstuffworks.py +++ b/youtube_dl/extractor/howstuffworks.py @@ -48,14 +48,7 @@ class HowStuffWorksIE(InfoExtractor): }, { 'url': 'http://shows.howstuffworks.com/stuff-to-blow-your-mind/optical-illusions-video.htm', - 'info_dict': { - 'id': '855410', - 'ext': 'mp4', - 'title': 'Stuff to Blow Your Mind', - 'description': 'When it comes to optical illusions it’s pretty easy to jailbreak the operating system of your mind and load 
a new program onto it. Find out why your visual system willingly makes errors in interpretation.', - 'display_id': 'optical-illusions', - 'thumbnail': 're:^https?://.*\.jpg$', - }, + 'only_matching': True, } ] From 55139679261f8c2409ca150906a2693731452a13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 22:58:13 +0600 Subject: [PATCH 193/450] [compat] Simplify and use latest cpython 3 code --- youtube_dl/compat.py | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 9e506352f..54ccf1d28 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -75,8 +75,13 @@ except ImportError: import BaseHTTPServer as compat_http_server try: + from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote -except ImportError: +except ImportError: # Python 2 + # HACK: The following are the correct unquote_to_bytes and unquote + # implementations from cpython 3.4.3's stdlib. Python 2's version + # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) + def compat_urllib_parse_unquote_to_bytes(string): """unquote_to_bytes('abc%20def') -> b'abc def'.""" # Note: strings are encoded as UTF-8. This is only an issue if it contains @@ -85,32 +90,22 @@ except ImportError: # Is it a string-like object? 
string.split return b'' - if isinstance(string, str): + if isinstance(string, unicode): string = string.encode('utf-8') - # string = encode('utf-8') - - # python3 -> 2: must implicitly convert to bits - bits = bytes(string).split(b'%') - + bits = string.split(b'%') if len(bits) == 1: return string res = [bits[0]] append = res.append - for item in bits[1:]: - if item == '': - append(b'%') - continue try: - append(item[:2].decode('hex')) + append(compat_urllib_parse._hextochr[item[:2]]) append(item[2:]) - except: + except KeyError: append(b'%') append(item) return b''.join(res) - compat_urllib_parse_asciire = re.compile('([\x00-\x7f]+)') - def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'): """Replace %xx escapes by their single-character equivalent. The optional encoding and errors parameters specify how to decode percent-encoded @@ -121,7 +116,6 @@ except ImportError: unquote('abc%20def') -> 'abc def'. """ - if '%' not in string: string.split return string @@ -129,20 +123,12 @@ except ImportError: encoding = 'utf-8' if errors is None: errors = 'replace' - - bits = compat_urllib_parse_asciire.split(string) + bits = compat_urllib_parse._asciire.split(string) res = [bits[0]] append = res.append for i in range(1, len(bits), 2): - foo = compat_urllib_parse_unquote_to_bytes(bits[i]) - foo = foo.decode(encoding, errors) - append(foo) - - if bits[i + 1]: - bar = bits[i + 1] - if not isinstance(bar, unicode): - bar = bar.decode('utf-8') - append(bar) + append(compat_urllib_parse_unquote_to_bytes(bits[i]).decode(encoding, errors)) + append(bits[i + 1]) return ''.join(res) try: @@ -454,7 +440,6 @@ __all__ = [ 'compat_subprocess_get_DEVNULL', 'compat_urllib_error', 'compat_urllib_parse', - 'compat_urllib_parse_asciire', 'compat_urllib_parse_unquote', 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', From 14309e1ddc476a7e2fc444a0443b2fc23186a385 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 
22:58:39 +0600 Subject: [PATCH 194/450] [test_compat] Make tests more idiomatic --- test/test_compat.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 431e6bdf1..2ffbc2c48 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -44,23 +44,22 @@ class TestCompat(unittest.TestCase): self.assertEqual(all_names, sorted(present_names)) def test_compat_urllib_parse_unquote(self): - test_strings = [ - ['''''', ''''''], - ['''津波''', '''%E6%B4%A5%E6%B3%A2'''], - ['''津波''', str('%E6%B4%A5%E6%B3%A2')], - [''' -%%a''', - ''' -%%a'''], - ['''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''', - '''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''] - ] - for test in test_strings: - strutf = test[0] - strurlenc = test[1] - strurldec = compat_urllib_parse_unquote(strurlenc) - self.assertEqual(strutf, strurldec) - self.assertEqual(strutf, compat_urllib_parse_unquote(strurlenc)) + self.assertEqual(compat_urllib_parse_unquote(''), '') + self.assertEqual(compat_urllib_parse_unquote('%'), '%') + self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') + self.assertEqual(compat_urllib_parse_unquote('%%%'), '%%%') + self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') + self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') + self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') + self.assertEqual(compat_urllib_parse_unquote(str('%E6%B4%A5%E6%B3%A2')), '津波') + self.assertEqual( + compat_urllib_parse_unquote(''' +%%a'''), + ''' +%%a''') + self.assertEqual( + compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), + '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') if __name__ == '__main__': unittest.main() From 
d79febcd0619608aa100c17b0a2aefe9f4836d9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:09:56 +0600 Subject: [PATCH 195/450] [test_compat] Remove redundant test --- test/test_compat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index 2ffbc2c48..d816a9236 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -51,7 +51,6 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_urllib_parse_unquote('%2F'), '/') self.assertEqual(compat_urllib_parse_unquote('%2f'), '/') self.assertEqual(compat_urllib_parse_unquote('%E6%B4%A5%E6%B3%A2'), '津波') - self.assertEqual(compat_urllib_parse_unquote(str('%E6%B4%A5%E6%B3%A2')), '津波') self.assertEqual( compat_urllib_parse_unquote(''' %%a'''), From aa99aa4e853966d4f01e280313e3aadf2ddf7ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:28:34 +0600 Subject: [PATCH 196/450] [compat] Add compat_urllib_parse_unquote_plus --- youtube_dl/compat.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 54ccf1d28..a3a2aef53 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,7 @@ except ImportError: try: from urllib.parse import unquote_to_bytes as compat_urllib_parse_unquote_to_bytes from urllib.parse import unquote as compat_urllib_parse_unquote + from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus except ImportError: # Python 2 # HACK: The following are the correct unquote_to_bytes and unquote # implementations from cpython 3.4.3's stdlib. Python 2's version @@ -131,6 +132,15 @@ except ImportError: # Python 2 append(bits[i + 1]) return ''.join(res) + def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. 
+ + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + string = string.replace('+', ' ') + return compat_urllib_parse_unquote(string, encoding, errors) + try: compat_str = unicode # Python 2 except NameError: @@ -441,6 +451,7 @@ __all__ = [ 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', + 'compat_urllib_parse_unquote_plus', 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', From 8954e481404746c65ae1e82a6c5edec69114bfb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:31:23 +0600 Subject: [PATCH 197/450] [test_compat] Add tests for compat_urllib_parse_unquote_plus --- test/test_compat.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_compat.py b/test/test_compat.py index d816a9236..c3ba8ad2e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -15,6 +15,7 @@ from youtube_dl.compat import ( compat_getenv, compat_expanduser, compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, ) @@ -44,6 +45,8 @@ class TestCompat(unittest.TestCase): self.assertEqual(all_names, sorted(present_names)) def test_compat_urllib_parse_unquote(self): + self.assertEqual(compat_urllib_parse_unquote('abc%20def'), 'abc def') + self.assertEqual(compat_urllib_parse_unquote('%7e/abc+def'), '~/abc+def') self.assertEqual(compat_urllib_parse_unquote(''), '') self.assertEqual(compat_urllib_parse_unquote('%'), '%') self.assertEqual(compat_urllib_parse_unquote('%%'), '%%') @@ -60,5 +63,9 @@ class TestCompat(unittest.TestCase): compat_urllib_parse_unquote('''%28%5E%E2%97%A3_%E2%97%A2%5E%29%E3%81%A3%EF%B8%BB%E3%83%87%E2%95%90%E4%B8%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%87%80 %E2%86%B6%I%Break%25Things%'''), '''(^◣_◢^)っ︻デ═一 ⇀ ⇀ ⇀ ⇀ ⇀ ↶%I%Break%Things%''') + def test_compat_urllib_parse_unquote_plus(self): + self.assertEqual(compat_urllib_parse_unquote_plus('abc%20def'), 'abc def') + 
self.assertEqual(compat_urllib_parse_unquote_plus('%7e/abc+def'), '~/abc def') + if __name__ == '__main__': unittest.main() From 4d08161ac23cfdf563b5d79b9f75cde6e6e57178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:32:43 +0600 Subject: [PATCH 198/450] [compat] Mention unquote_plus --- youtube_dl/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index a3a2aef53..e950a4688 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -79,7 +79,7 @@ try: from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus except ImportError: # Python 2 - # HACK: The following are the correct unquote_to_bytes and unquote + # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus # implementations from cpython 3.4.3's stdlib. Python 2's version # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) From b94b78971ce70a999584acab57d0a620a91bc2aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:35:14 +0600 Subject: [PATCH 199/450] [bet] Use compat_urllib_parse_unquote --- youtube_dl/extractor/bet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 26b934543..03dad4636 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_unquote from ..utils import ( xpath_text, xpath_with_ns, @@ -57,7 +57,7 @@ class BetIE(InfoExtractor): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - media_url = compat_urllib_parse.unquote(self._search_regex( + media_url = compat_urllib_parse_unquote(self._search_regex( 
[r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], webpage, 'media URL')) From 3e72f5f10e8dbac71e267ce10e791251a54e61ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:36:11 +0600 Subject: [PATCH 200/450] [ceskatelevize] Use compat_urllib_parse_unquote --- youtube_dl/extractor/ceskatelevize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 65f6be623..dda583680 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -88,7 +89,7 @@ class CeskaTelevizeIE(InfoExtractor): if playlist_url == 'error_region': raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlist_url)) + req = compat_urllib_request.Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) playlist = self._download_json(req, video_id) From a60cccbf9f76e7e42509bf1ed4e054643ecc348f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:36:58 +0600 Subject: [PATCH 201/450] [crunchyroll] Use compat_urllib_parse_unquote --- youtube_dl/extractor/crunchyroll.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 73f1e22ef..d1b6d7366 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -12,6 +12,7 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -254,7 +255,7 @@ Format: Layer, Start, End, Style, Name, 
MarginL, MarginR, MarginV, Effect, Text video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex(r'
\s*Publisher:(.+?)
', webpage, 'video_uploader', fatal=False, flags=re.DOTALL) - playerdata_url = compat_urllib_parse.unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) + playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) playerdata_req = compat_urllib_request.Request(playerdata_url) playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url}) playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') From d3671b344f663c3aa82aea86de51a70f24511792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:37:23 +0600 Subject: [PATCH 202/450] [ehow] Use compat_urllib_parse_unquote --- youtube_dl/extractor/ehow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index 9cb1bf301..ba9685991 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, ) from .common import InfoExtractor @@ -26,7 +26,7 @@ class EHowIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( r'(?:file|source)=(http[^\'"&]*)', webpage, 'video URL') - final_url = compat_urllib_parse.unquote(video_url) + final_url = compat_urllib_parse_unquote(video_url) uploader = self._html_search_meta('uploader', webpage) title = self._og_search_title(webpage).replace(' | eHow', '') From d7011316d0bce88796a6d33e895fdafa231fd8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:37:56 +0600 Subject: [PATCH 203/450] [facebook] Use compat_urllib_parse_unquote --- youtube_dl/extractor/facebook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 
82dc27bc6..e17bb9aea 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -9,7 +9,7 @@ from ..compat import ( compat_http_client, compat_str, compat_urllib_error, - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -136,7 +136,7 @@ class FacebookIE(InfoExtractor): else: raise ExtractorError('Cannot parse data') data = dict(json.loads(m.group(1))) - params_raw = compat_urllib_parse.unquote(data['params']) + params_raw = compat_urllib_parse_unquote(data['params']) params = json.loads(params_raw) video_data = params['video_data'][0] From 1f80e360fce9fc057847379ec06d0e635f697198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:38:30 +0600 Subject: [PATCH 204/450] [gamespot] Use compat_urllib_parse_unquote --- youtube_dl/extractor/gamespot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 2d33fa7f5..b3f1bafcc 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -5,7 +5,7 @@ import json from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( @@ -75,7 +75,7 @@ class GameSpotIE(InfoExtractor): return { 'id': data_video['guid'], 'display_id': page_id, - 'title': compat_urllib_parse.unquote(data_video['title']), + 'title': compat_urllib_parse_unquote(data_video['title']), 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), From f7e6f7fa2374864ba28fbe6840a0462b7c1033db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:39:32 +0600 Subject: [PATCH 205/450] [extractor/generic] Use compat_urllib_parse_unquote --- youtube_dl/extractor/generic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index fc1bf2b6e..a62287e50 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -8,7 +8,6 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( - compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@ -1369,7 +1368,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P[^&]+)', webpage) if mobj is not None: - return self.url_result(compat_urllib_parse.unquote(mobj.group('url'))) + return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) # Look for funnyordie embed matches = re.findall(r']+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) @@ -1682,7 +1681,7 @@ class GenericIE(InfoExtractor): entries = [] for video_url in found: video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) + video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) # Sometimes, jwplayer extraction will result in a YouTube URL if YoutubeIE.suitable(video_url): From 8ee4ecb48dd7824f451c11c4cebb05b603a2da27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:39:54 +0600 Subject: [PATCH 206/450] [infoq] Use compat_urllib_parse_unquote --- youtube_dl/extractor/infoq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 91a1b3ccb..71cfd12c5 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,7 +4,7 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urlparse, ) @@ -39,7 +39,7 @@ class InfoQIE(InfoExtractor): # Extract video URL encoded_id = self._search_regex( 
r"jsclassref\s*=\s*'([^']*)'", webpage, 'encoded id') - real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) + real_id = compat_urllib_parse_unquote(base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')) playpath = 'mp4:' + real_id video_filename = playpath.split('/')[-1] From 899a3e2f139a7b774d1a9483bb975e480e642c75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:40:35 +0600 Subject: [PATCH 207/450] [karaoketv] Use compat_urllib_parse_unquote_plus --- youtube_dl/extractor/karaoketv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index e3b43ff8d..06daf5a89 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_unquote_plus from ..utils import ( js_to_json, ) @@ -24,7 +24,7 @@ class KaraoketvIE(InfoExtractor): webpage = self._download_webpage(url, video_id) page_video_url = self._og_search_video_url(webpage, video_id) - config_json = compat_urllib_parse.unquote_plus(self._search_regex( + config_json = compat_urllib_parse_unquote_plus(self._search_regex( r'config=(.*)', page_video_url, 'configuration')) urls_info_json = self._download_json( From 977a247a0639728146dd5c0d369b542e0a1af69b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:41:14 +0600 Subject: [PATCH 208/450] [malemotion] Use compat_urllib_parse_unquote --- youtube_dl/extractor/malemotion.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py index 0b85a59d1..92511a671 100644 --- a/youtube_dl/extractor/malemotion.py +++ b/youtube_dl/extractor/malemotion.py @@ -2,9 +2,7 @@ from __future__ 
import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_unquote class MalemotionIE(InfoExtractor): @@ -24,7 +22,7 @@ class MalemotionIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = compat_urllib_parse.unquote(self._search_regex( + video_url = compat_urllib_parse_unquote(self._search_regex( r'(.*?) Date: Fri, 17 Jul 2015 23:41:47 +0600 Subject: [PATCH 209/450] [metacafe] Use compat_urllib_parse_unquote --- youtube_dl/extractor/metacafe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 8bc333b02..6e2e73a51 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import ( compat_parse_qs, compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -155,7 +156,7 @@ class MetacafeIE(InfoExtractor): video_url = None mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) if mobj is not None: - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) + mediaURL = compat_urllib_parse_unquote(mobj.group(1)) video_ext = mediaURL[-3:] # Extract gdaKey if available From 09b718c4398b2ce9964ec3ae5e329278d6211668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:42:11 +0600 Subject: [PATCH 210/450] [mitele] Use compat_urllib_parse_unquote --- youtube_dl/extractor/mitele.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 7091f3335..852d72266 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -5,6 +5,7 @@ import json from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, compat_urlparse, ) from 
..utils import ( @@ -48,7 +49,7 @@ class MiTeleIE(InfoExtractor): domain = 'http://' + domain info_url = compat_urlparse.urljoin( domain, - compat_urllib_parse.unquote(embed_data['flashvars']['host']) + compat_urllib_parse_unquote(embed_data['flashvars']['host']) ) info_el = self._download_xml(info_url, episode).find('./video/info') From c2daf8dfa44b07179b272874c336d2435a2ff653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:42:43 +0600 Subject: [PATCH 211/450] [mixcloud] Use compat_urllib_parse_unquote --- youtube_dl/extractor/mixcloud.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 425a4ccf1..d47aeceda 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -3,9 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, HEADRequest, @@ -60,7 +58,7 @@ class MixcloudIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) uploader = mobj.group(1) cloudcast_name = mobj.group(2) - track_id = compat_urllib_parse.unquote('-'.join((uploader, cloudcast_name))) + track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name))) webpage = self._download_webpage(url, track_id) From e97bb3de838219a8c2800f3eb646313977503af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:43:36 +0600 Subject: [PATCH 212/450] [mofosex] Use compat_urllib_parse_unquote --- youtube_dl/extractor/mofosex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index 2cec12d35..9bf99a54a 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -5,9 +5,9 @@ import re from .common import InfoExtractor 
from ..compat import ( + compat_urllib_parse_unquote, compat_urllib_parse_urlparse, compat_urllib_request, - compat_urllib_parse, ) @@ -34,7 +34,7 @@ class MofosexIE(InfoExtractor): webpage = self._download_webpage(req, video_id) video_title = self._html_search_regex(r'

(.+?)<', webpage, 'title') - video_url = compat_urllib_parse.unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url')) + video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url')) path = compat_urllib_parse_urlparse(video_url).path extension = os.path.splitext(path)[1][1:] format = path.split('/')[5].split('_')[:2] From 9fd3bf04b7b160c1f24f9818689075f2570b8a2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:44:38 +0600 Subject: [PATCH 213/450] [myvideo] Use compat_urllib_parse_unquote --- youtube_dl/extractor/myvideo.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py index 5e754fcff..c96f472a3 100644 --- a/youtube_dl/extractor/myvideo.py +++ b/youtube_dl/extractor/myvideo.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from ..compat import ( compat_ord, compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -107,7 +108,7 @@ class MyVideoIE(InfoExtractor): if not a == '_encxml': params[a] = b else: - encxml = compat_urllib_parse.unquote(b) + encxml = compat_urllib_parse_unquote(b) if not params.get('domain'): params['domain'] = 'www.myvideo.de' xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params)) @@ -135,7 +136,7 @@ class MyVideoIE(InfoExtractor): video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) if mobj: - video_url = compat_urllib_parse.unquote(mobj.group(1)) + video_url = compat_urllib_parse_unquote(mobj.group(1)) if 'myvideo2flash' in video_url: self.report_warning( 'Rewriting URL to use unencrypted rtmp:// ...', @@ -147,10 +148,10 @@ class MyVideoIE(InfoExtractor): mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError('unable to extract url') - video_url = 
compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) + video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2)) video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file') - video_file = compat_urllib_parse.unquote(video_file) + video_file = compat_urllib_parse_unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') @@ -159,7 +160,7 @@ class MyVideoIE(InfoExtractor): video_playpath = '' video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, 'swfobj') - video_swfobj = compat_urllib_parse.unquote(video_swfobj) + video_swfobj = compat_urllib_parse_unquote(video_swfobj) video_title = self._html_search_regex("(.*?)

", webpage, 'title') From b78f5ec4c338e5e9379c1d7a2b0c99451e06897a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:45:00 +0600 Subject: [PATCH 214/450] [odnoklassniki] Use compat_urllib_parse_unquote --- youtube_dl/extractor/odnoklassniki.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 6c7149fe3..215ffe87b 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_unquote from ..utils import ( unified_strdate, int_or_none, @@ -62,7 +62,7 @@ class OdnoklassnikiIE(InfoExtractor): metadata = self._parse_json(metadata, video_id) else: metadata = self._download_json( - compat_urllib_parse.unquote(flashvars['metadataUrl']), + compat_urllib_parse_unquote(flashvars['metadataUrl']), video_id, 'Downloading metadata JSON') movie = metadata['movie'] From d54f1c74776a8d6c7cf07ae16ae67bd39ee51f93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:45:26 +0600 Subject: [PATCH 215/450] [openfilm] Use compat_urllib_parse_unquote_plus --- youtube_dl/extractor/openfilm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py index 2249657eb..7e10bdf2c 100644 --- a/youtube_dl/extractor/openfilm.py +++ b/youtube_dl/extractor/openfilm.py @@ -5,7 +5,7 @@ import json from .common import InfoExtractor from ..utils import ( parse_iso8601, - compat_urllib_parse, + compat_urllib_parse_unquote_plus, parse_age_limit, int_or_none, ) @@ -37,7 +37,7 @@ class OpenFilmIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - player = compat_urllib_parse.unquote_plus( + player = compat_urllib_parse_unquote_plus( 
self._og_search_video_url(webpage)) video = json.loads(self._search_regex( From 2ebbb6f1f771e5c81b103b6216073341e85cc1aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:45:55 +0600 Subject: [PATCH 216/450] [photobucket] Use compat_urllib_parse_unquote --- youtube_dl/extractor/photobucket.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py index c66db3cdc..788411ccc 100644 --- a/youtube_dl/extractor/photobucket.py +++ b/youtube_dl/extractor/photobucket.py @@ -4,7 +4,7 @@ import json import re from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_unquote class PhotobucketIE(InfoExtractor): @@ -34,7 +34,7 @@ class PhotobucketIE(InfoExtractor): info_json = self._search_regex(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (.*?)\);', webpage, 'info json') info = json.loads(info_json) - url = compat_urllib_parse.unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) + url = compat_urllib_parse_unquote(self._html_search_regex(r'file=(.+\.mp4)', info['linkcodes']['html'], 'url')) return { 'id': video_id, 'url': url, From 388ad0c05c976a769ae9cf9d1f0e59526921b68c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:46:33 +0600 Subject: [PATCH 217/450] [playvid] Use compat_urllib_parse_unquote and compat_urllib_parse_unquote_plus --- youtube_dl/extractor/playvid.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py index c3e667e9e..2eb4fd96d 100644 --- a/youtube_dl/extractor/playvid.py +++ b/youtube_dl/extractor/playvid.py @@ -4,7 +4,8 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, ) from ..utils import ( 
clean_html, @@ -44,7 +45,7 @@ class PlayvidIE(InfoExtractor): flashvars = self._html_search_regex( r'flashvars="(.+?)"', webpage, 'flashvars') - infos = compat_urllib_parse.unquote(flashvars).split(r'&') + infos = compat_urllib_parse_unquote(flashvars).split(r'&') for info in infos: videovars_match = re.match(r'^video_vars\[(.+?)\]=(.+?)$', info) if videovars_match: @@ -52,7 +53,7 @@ class PlayvidIE(InfoExtractor): val = videovars_match.group(2) if key == 'title': - video_title = compat_urllib_parse.unquote_plus(val) + video_title = compat_urllib_parse_unquote_plus(val) if key == 'duration': try: duration = int(val) From 605cbef65326b52ba9fdb994fc06798f15a3a1ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:47:23 +0600 Subject: [PATCH 218/450] [pornhub] Use compat_urllib_parse_unquote and compat_urllib_parse_unquote_plus --- youtube_dl/extractor/pornhub.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 8172bc997..0b7886840 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -5,7 +5,8 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, compat_urllib_parse_urlparse, compat_urllib_request, ) @@ -69,7 +70,7 @@ class PornHubIE(InfoExtractor): webpage, 'uploader', fatal=False) thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False) if thumbnail: - thumbnail = compat_urllib_parse.unquote(thumbnail) + thumbnail = compat_urllib_parse_unquote(thumbnail) view_count = self._extract_count( r'([\d,\.]+) views', webpage, 'view') @@ -80,9 +81,9 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', 
webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse.unquote_plus( + password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) From 47af21e8f15be1ddb6195327719569d8723a669e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:47:51 +0600 Subject: [PATCH 219/450] [spankwire] Use compat_urllib_parse_unquote --- youtube_dl/extractor/spankwire.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index bff75d6b2..5fa6faf18 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_parse_urlparse, compat_urllib_request, ) @@ -68,7 +68,7 @@ class SpankwireIE(InfoExtractor): webpage, 'comment count', fatal=False)) video_urls = list(map( - compat_urllib_parse.unquote, + compat_urllib_parse_unquote, re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: password = self._search_regex( From 736f003f2e96b37a9554d77a26fdef6c0a38fd5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:49:14 +0600 Subject: [PATCH 220/450] [xbef] Use compat_urllib_parse_unquote --- youtube_dl/extractor/xbef.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/xbef.py b/youtube_dl/extractor/xbef.py index 80c48c37d..4ff99e5ca 100644 --- a/youtube_dl/extractor/xbef.py +++ b/youtube_dl/extractor/xbef.py @@ -1,9 +1,7 @@ from 
__future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_unquote class XBefIE(InfoExtractor): @@ -30,7 +28,7 @@ class XBefIE(InfoExtractor): config_url_enc = self._download_webpage( 'http://xbef.com/Main/GetVideoURLEncoded/%s' % video_id, video_id, note='Retrieving config URL') - config_url = compat_urllib_parse.unquote(config_url_enc) + config_url = compat_urllib_parse_unquote(config_url_enc) config = self._download_xml( config_url, video_id, note='Retrieving config') From 7dde5f6a8d10ba98b43f920c73248a8a7e73dc38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 17 Jul 2015 23:49:38 +0600 Subject: [PATCH 221/450] [xnxx] Use compat_urllib_parse_unquote --- youtube_dl/extractor/xnxx.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index 79ed6c744..5a41f8ffa 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -2,9 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse_unquote class XNXXIE(InfoExtractor): @@ -26,7 +24,7 @@ class XNXXIE(InfoExtractor): video_url = self._search_regex(r'flv_url=(.*?)&', webpage, 'video URL') - video_url = compat_urllib_parse.unquote(video_url) + video_url = compat_urllib_parse_unquote(video_url) video_title = self._html_search_regex(r'(.*?)\s+-\s+XNXX.COM', webpage, 'title') From ee8de13e14f0dee555c340ef8b9f9c15a6bac6dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 23:50:02 +0600 Subject: [PATCH 222/450] [xtube] Use compat_urllib_parse_unquote --- youtube_dl/extractor/xtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 
1644f53c8..779e4f46a 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_request, - compat_urllib_parse, + compat_urllib_parse_unquote, ) from ..utils import ( parse_duration, @@ -59,7 +59,7 @@ class XTubeIE(InfoExtractor): for format_id, video_url in re.findall( r'flashvars\.quality_(.+?)\s*=\s*"([^"]+)"', webpage): fmt = { - 'url': compat_urllib_parse.unquote(video_url), + 'url': compat_urllib_parse_unquote(video_url), 'format_id': format_id, } m = re.search(r'^(?P<height>\d+)[pP]', format_id) @@ -68,7 +68,7 @@ class XTubeIE(InfoExtractor): formats.append(fmt) if not formats: - video_url = compat_urllib_parse.unquote(self._search_regex( + video_url = compat_urllib_parse_unquote(self._search_regex( r'flashvars\.video_url\s*=\s*"([^"]+)"', webpage, 'video URL')) formats.append({'url': video_url}) From aa4789d632b71abb79561ecf6769258a66e158f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 23:50:25 +0600 Subject: [PATCH 223/450] [xvideos] Use compat_urllib_parse_unquote --- youtube_dl/extractor/xvideos.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index d8415bed4..5dcf2fdd1 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -4,7 +4,7 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse, + compat_urllib_parse_unquote, compat_urllib_request, ) from ..utils import ( @@ -37,7 +37,7 @@ class XVideosIE(InfoExtractor): if mobj: raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) - video_url = compat_urllib_parse.unquote( + video_url = compat_urllib_parse_unquote( self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) video_title = self._html_search_regex( r'<title>(.*?)\s+-\s+XVID', 
webpage, 'title') From db6c50f109e732f287cc0c61216cdd865a031ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 23:50:52 +0600 Subject: [PATCH 224/450] [ynet] Use compat_urllib_parse_unquote_plus --- youtube_dl/extractor/ynet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 894678a23..869f3e819 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -5,7 +5,7 @@ import re import json from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): @@ -34,7 +34,7 @@ class YnetIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) + content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage)) config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config')) f4m_url = config['clip']['url'] title = self._og_search_title(webpage) From 7fd002c0061f9f96eca2224553a2551748970e7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 23:51:57 +0600 Subject: [PATCH 225/450] [youtube] Use compat_urllib_parse_unquote and compat_urllib_parse_unquote_plus --- youtube_dl/extractor/youtube.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3c629d38a..e7f5c7861 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -17,6 +17,8 @@ from ..compat import ( compat_chr, compat_parse_qs, compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, compat_str, @@ -865,7 +867,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # 
Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: - url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') + url = proto + '://www.youtube.com/' + compat_urllib_parse_unquote(mobj.group(1)).lstrip('/') video_id = self.extract_id(url) # Get video webpage @@ -973,7 +975,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # uploader if 'author' not in video_info: raise ExtractorError('Unable to extract uploader name') - video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) + video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0]) # uploader_id video_uploader_id = None @@ -1000,7 +1002,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning('unable to extract video thumbnail') video_thumbnail = None else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) + video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0]) # upload date upload_date = self._html_search_meta( @@ -1062,7 +1064,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning('unable to extract video duration') video_duration = None else: - video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) + video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0])) # annotations video_annotations = None @@ -1609,7 +1611,7 @@ class YoutubeSearchURLIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - query = compat_urllib_parse.unquote_plus(mobj.group('query')) + query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) result_code = self._search_regex( From c60e8cfaf7405e8b8075f06a62e7547adad8beba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= 
<dstftw@gmail.com> Date: Fri, 17 Jul 2015 23:54:38 +0600 Subject: [PATCH 226/450] [ehow] Simplify --- youtube_dl/extractor/ehow.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py index ba9685991..b1cd4f5d4 100644 --- a/youtube_dl/extractor/ehow.py +++ b/youtube_dl/extractor/ehow.py @@ -1,9 +1,7 @@ from __future__ import unicode_literals -from ..compat import ( - compat_urllib_parse_unquote, -) from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote class EHowIE(InfoExtractor): From 7bd42d0d963b543aae1f4b4e8c157dc3421189fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 17 Jul 2015 23:56:27 +0600 Subject: [PATCH 227/450] [openfilm] Fix compat_urllib_parse_unquote_plus import --- youtube_dl/extractor/openfilm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py index 7e10bdf2c..d2ceedd01 100644 --- a/youtube_dl/extractor/openfilm.py +++ b/youtube_dl/extractor/openfilm.py @@ -3,9 +3,9 @@ from __future__ import unicode_literals import json from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote_plus from ..utils import ( parse_iso8601, - compat_urllib_parse_unquote_plus, parse_age_limit, int_or_none, ) From 6b19647d5706b75b5e2d7da836495fe5174919f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 00:04:25 +0600 Subject: [PATCH 228/450] [veehd] Use compat_urllib_parse_unquote --- youtube_dl/extractor/veehd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/veehd.py b/youtube_dl/extractor/veehd.py index 346edf485..0d8d832cc 100644 --- a/youtube_dl/extractor/veehd.py +++ b/youtube_dl/extractor/veehd.py @@ -5,6 +5,7 @@ import json from .common import InfoExtractor from ..compat import ( + compat_urllib_parse_unquote, 
compat_urlparse, ) from ..utils import ( @@ -76,7 +77,7 @@ class VeeHDIE(InfoExtractor): if config_json: config = json.loads(config_json) - video_url = compat_urlparse.unquote(config['clip']['url']) + video_url = compat_urllib_parse_unquote(config['clip']['url']) if not video_url: video_url = self._html_search_regex( From 3cc8b4c327585ff4dbb045d17d6f5c160df6fdb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 00:24:39 +0600 Subject: [PATCH 229/450] [compat] Fix missing _asciire on python 2.6 --- youtube_dl/compat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e950a4688..db0da5828 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -79,6 +79,8 @@ try: from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus except ImportError: # Python 2 + _asciire = re.compile('([\x00-\x7f]+)') if sys.version_info < (2, 7) else compat_urllib_parse._asciire + # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus # implementations from cpython 3.4.3's stdlib. 
Python 2's version # is apparently broken (see https://github.com/rg3/youtube-dl/pull/6244) @@ -124,7 +126,7 @@ except ImportError: # Python 2 encoding = 'utf-8' if errors is None: errors = 'replace' - bits = compat_urllib_parse._asciire.split(string) + bits = _asciire.split(string) res = [bits[0]] append = res.append for i in range(1, len(bits), 2): From 9441f77faaf078aca2947b7613f61f46b6d0aea2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 01:45:36 +0600 Subject: [PATCH 230/450] [bliptv] Use xpath for extraction and fix test --- youtube_dl/extractor/bliptv.py | 51 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index fb56cd78d..cd06dbcea 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -14,6 +14,8 @@ from ..utils import ( int_or_none, parse_iso8601, unescapeHTML, + xpath_text, + xpath_with_ns, ) @@ -23,10 +25,10 @@ class BlipTVIE(InfoExtractor): _TESTS = [ { 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': 'c6934ad0b6acf2bd920720ec888eb812', + 'md5': '80baf1ec5c3d2019037c1c707d676b9f', 'info_dict': { 'id': '5779306', - 'ext': 'mov', + 'ext': 'm4v', 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', 'timestamp': 1323138843, @@ -128,35 +130,34 @@ class BlipTVIE(InfoExtractor): rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - def blip(s): - return '{http://blip.tv/dtd/blip/1.0}%s' % s - - def media(s): - return '{http://search.yahoo.com/mrss/}%s' % s - - def itunes(s): - return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s + def _x(p): + return xpath_with_ns(p, { + 'blip': 'http://blip.tv/dtd/blip/1.0', + 'media': 'http://search.yahoo.com/mrss/', + 'itunes': 
'http://www.itunes.com/dtds/podcast-1.0.dtd', + }) item = rss.find('channel/item') - video_id = item.find(blip('item_id')).text - title = item.find('./title').text - description = clean_html(compat_str(item.find(blip('puredescription')).text)) - timestamp = parse_iso8601(item.find(blip('datestamp')).text) - uploader = item.find(blip('user')).text - uploader_id = item.find(blip('userid')).text - duration = int(item.find(blip('runtime')).text) - media_thumbnail = item.find(media('thumbnail')) - thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text - categories = [category.text for category in item.findall('category')] + video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id + title = xpath_text(item, 'title', 'title', fatal=True) + description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) + timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) + uploader = xpath_text(item, _x('blip:user'), 'uploader') + uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') + duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) + media_thumbnail = item.find(_x('media:thumbnail')) + thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None + else xpath_text(item, 'image', 'thumbnail')) + categories = [category.text for category in item.findall('category') if category is not None] formats = [] subtitles_urls = {} - media_group = item.find(media('group')) - for media_content in media_group.findall(media('content')): + media_group = item.find(_x('media:group')) + for media_content in media_group.findall(_x('media:content')): url = media_content.get('url') - role = media_content.get(blip('role')) + role = media_content.get(_x('blip:role')) msg = self._download_webpage( url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', video_id, 'Resolving URL for %s' % role) @@ -175,8 +176,8 @@ class 
BlipTVIE(InfoExtractor): 'url': real_url, 'format_id': role, 'format_note': media_type, - 'vcodec': media_content.get(blip('vcodec')) or 'none', - 'acodec': media_content.get(blip('acodec')), + 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', + 'acodec': media_content.get(_x('blip:acodec')), 'filesize': media_content.get('filesize'), 'width': int_or_none(media_content.get('width')), 'height': int_or_none(media_content.get('height')), From 530857182df72db028f959c0a58a9daf219d6b22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 01:45:54 +0600 Subject: [PATCH 231/450] [bliptv] Add test with missing duration --- youtube_dl/extractor/bliptv.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index cd06dbcea..a69ee482b 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -102,6 +102,20 @@ class BlipTVIE(InfoExtractor): 'vcodec': 'none', } }, + { + # missing duration + 'url': 'http://blip.tv/rss/flash/6700880', + 'info_dict': { + 'id': '6684191', + 'ext': 'm4v', + 'title': 'Cowboy Bebop: Gateway Shuffle Review', + 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', + 'timestamp': 1386639757, + 'upload_date': '20131210', + 'uploader': 'sfdebris', + 'uploader_id': '706520', + } + } ] @staticmethod From 3c283a381e4f7a69bf57c3ea85aab3c85ce0e309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 02:43:18 +0600 Subject: [PATCH 232/450] [sbs] Simplify --- youtube_dl/extractor/sbs.py | 40 +++++++++++++++---------------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index ab4d1c884..d6ee2d9e2 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -2,12 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils 
import remove_end class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -17,43 +16,36 @@ class SBSIE(InfoExtractor): 'info_dict': { 'id': '320403011771', 'ext': 'mp4', - 'title': 'Dingo Conservation', - 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. This family run a dingo conservation park to prevent their extinction', + 'title': 'Dingo Conservation (The Feed)', + 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', 'thumbnail': 're:http://.*\.jpg', + 'duration': 308, }, - 'add_ies': ['generic'], }, { 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', 'only_matching': True, + }, { + 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - # the video is in the following iframe - iframe_url = 'http://www.sbs.com.au/ondemand/video/single/' + video_id + '?context=web' - webpage = self._download_webpage(iframe_url, video_id) + webpage = self._download_webpage( + 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) - player_params = self._search_regex( - r'(?s)(playerParams.+?releaseUrls.+?\n)', - webpage, 'playerParams') - player_params_js = self._search_regex( - r'({.*})', - player_params, 'player_param_js') + player_params = self._parse_json( + self._search_regex( + r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), + video_id) - player_params_json = self._parse_json(player_params_js, video_id) - - theplatform_url = player_params_json.get('releaseUrls')['progressive'] or player_params_json.get('releaseUrls')['standard'] - - 
title = remove_end(self._og_search_title(webpage, default=video_id, fatal=False), ' (The Feed)') - description = self._html_search_meta('description', webpage) - thumbnail = self._og_search_thumbnail(webpage) + urls = player_params['releaseUrls'] + theplatform_url = (urls.get('progressive') or urls.get('standard') or + urls.get('html') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'id': video_id, 'url': theplatform_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, } From 4e1ad6e9a86e03a4309ed63914a08484aaff2784 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 04:13:45 +0600 Subject: [PATCH 233/450] [videomega] Extend _VALID_URL and improve extraction (Closes #6260) --- youtube_dl/extractor/videomega.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index eb309a7cd..d45f88646 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -8,17 +8,14 @@ from ..compat import compat_urllib_request class VideoMegaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?:www\.)?videomega\.tv/ - (?:iframe\.php|cdn\.php)?\?ref=(?P<id>[A-Za-z0-9]+) - ''' + _VALID_URL = r'https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=(?P<id>[A-Za-z0-9]+)' _TEST = { - 'url': 'http://videomega.tv/?ref=4GNA688SU99US886ANG4', - 'md5': 'bf5c2f95c4c917536e80936af7bc51e1', + 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600', + 'md5': 'cc1920a58add3f05c6a93285b84fb3aa', 'info_dict': { - 'id': '4GNA688SU99US886ANG4', + 'id': 'AOSQBJYKIDDIKYJBQSOA', 'ext': 'mp4', - 'title': 'BigBuckBunny_320x180', + 'title': '1254207', 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -26,15 +23,15 @@ class VideoMegaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - iframe_url = 
'http://videomega.tv/cdn.php?ref=%s' % video_id - req = compat_urllib_request.Request(iframe_url) + req = compat_urllib_request.Request(url) req.add_header('Referer', url) + req.add_header('Cookie', 'noadvtday=0') webpage = self._download_webpage(req, video_id) title = self._html_search_regex( - r'<title>(.*?)', webpage, 'title') + r'(.+?)', webpage, 'title') title = re.sub( - r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s?|\s?-\svideomega\.tv$)', '', title) + r'(?:^[Vv]ideo[Mm]ega\.tv\s-\s*|\s*-\svideomega\.tv$)', '', title) thumbnail = self._search_regex( r']+?poster="([^"]+)"', webpage, 'thumbnail', fatal=False) video_url = self._search_regex( @@ -46,6 +43,6 @@ class VideoMegaIE(InfoExtractor): 'url': video_url, 'thumbnail': thumbnail, 'http_headers': { - 'Referer': iframe_url, + 'Referer': url, }, } From 8a37f53685c3a9227aad139c9faf13df8576abb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 04:25:10 +0600 Subject: [PATCH 234/450] [videomega] Revert iframe URL --- youtube_dl/extractor/videomega.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index d45f88646..42d42adb0 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -10,7 +10,7 @@ from ..compat import compat_urllib_request class VideoMegaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=(?P[A-Za-z0-9]+)' _TEST = { - 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600', + 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', 'md5': 'cc1920a58add3f05c6a93285b84fb3aa', 'info_dict': { 'id': 'AOSQBJYKIDDIKYJBQSOA', @@ -23,7 +23,8 @@ class VideoMegaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = compat_urllib_request.Request(url) + iframe_url = 'http://videomega.tv/cdn.php?ref=%s' % video_id + req = 
compat_urllib_request.Request(iframe_url) req.add_header('Referer', url) req.add_header('Cookie', 'noadvtday=0') webpage = self._download_webpage(req, video_id) @@ -43,6 +44,6 @@ class VideoMegaIE(InfoExtractor): 'url': video_url, 'thumbnail': thumbnail, 'http_headers': { - 'Referer': url, + 'Referer': iframe_url, }, } From 4bdfef5a1874d99457cf5758c5d8e1cc0c487f08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 04:25:30 +0600 Subject: [PATCH 235/450] [videomega] Add tests --- youtube_dl/extractor/videomega.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 42d42adb0..b2ef637fa 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -9,7 +9,7 @@ from ..compat import compat_urllib_request class VideoMegaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=(?P[A-Za-z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', 'md5': 'cc1920a58add3f05c6a93285b84fb3aa', 'info_dict': { @@ -18,7 +18,13 @@ class VideoMegaIE(InfoExtractor): 'title': '1254207', 'thumbnail': 're:^https?://.*\.jpg$', } - } + }, { + 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA&width=1070&height=600', + 'only_matching': True, + }, { + 'url': 'http://videomega.tv/view.php?ref=090051111052065112106089103052052103089106112065052111051090', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 4211e1941b39cb153505c64ab8fcb31a762674d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 04:27:09 +0600 Subject: [PATCH 236/450] [videomega] Add shortcut to _VALID_URL --- youtube_dl/extractor/videomega.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 
b2ef637fa..78ff6310a 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -8,7 +8,7 @@ from ..compat import compat_urllib_request class VideoMegaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=(?P[A-Za-z0-9]+)' + _VALID_URL = r'(?:videomega:|https?://(?:www\.)?videomega\.tv/(?:(?:view|iframe|cdn)\.php)?\?ref=)(?P[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://videomega.tv/cdn.php?ref=AOSQBJYKIDDIKYJBQSOA', 'md5': 'cc1920a58add3f05c6a93285b84fb3aa', From 1793d71db61acc9f6cceefbc63d7de2b558b7582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 06:18:03 +0600 Subject: [PATCH 237/450] [twitch:stream] Fix channel_id in different case (Closes #6263) --- youtube_dl/extractor/twitch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 92b6dc1b8..ab557a1a5 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -346,6 +346,8 @@ class TwitchStreamIE(TwitchBaseIE): 'http://www.twitch.tv/%s/profile' % channel_id, 'TwitchProfile', channel_id) + channel_id = stream.get('channel', {}).get('name') or channel_id.lower() + access_token = self._download_json( '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id, 'Downloading channel access token') From 06966677343c4bbb51e8da3f0ed4d4985bdf54f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 06:27:45 +0600 Subject: [PATCH 238/450] [twitch:stream] Clarify channel_id reassignment rationale (#6263) --- youtube_dl/extractor/twitch.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index ab557a1a5..948c8ce39 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -346,6 +346,10 @@ class TwitchStreamIE(TwitchBaseIE): 'http://www.twitch.tv/%s/profile' % channel_id, 
'TwitchProfile', channel_id) + # Channel name may be typed if different case than the original channel name + # (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing + # an invalid m3u8 URL. Working around by use of original channel name from stream + # JSON and fallback to lowercase if it's not available. channel_id = stream.get('channel', {}).get('name') or channel_id.lower() access_token = self._download_json( From f29ac588ddb5b36ebfe38e3a809a70cf0cb543a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 06:33:50 +0600 Subject: [PATCH 239/450] [nationalgeographics] Fix extraction (Closes #6262) --- youtube_dl/extractor/nationalgeographic.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index c18640c5a..f793b72f5 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -25,8 +25,11 @@ class NationalGeographicIE(InfoExtractor): name = url_basename(url) webpage = self._download_webpage(url, name) - feed_url = self._search_regex(r'data-feed-url="([^"]+)"', webpage, 'feed url') - guid = self._search_regex(r'data-video-guid="([^"]+)"', webpage, 'guid') + feed_url = self._search_regex( + r'data-feed-url="([^"]+)"', webpage, 'feed url') + guid = self._search_regex( + r'id="(?:videoPlayer|player-container)"[^>]+data-guid="([^"]+)"', + webpage, 'guid') feed = self._download_xml('%s?byGuid=%s' % (feed_url, guid), name) content = feed.find('.//{http://search.yahoo.com/mrss/}content') From 1b541d8d6ee5e3543253501d725af7ce4a55cad8 Mon Sep 17 00:00:00 2001 From: sceext Date: Sat, 18 Jul 2015 12:29:35 +0800 Subject: [PATCH 240/450] [iqiyi] fix iqiyi (2015-07-17), update the md5 salt (enc_key) to iqiyi latest (2015-07-17) flash player. 
--- youtube_dl/extractor/iqiyi.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 0f6707d7c..ac5416953 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -212,20 +212,7 @@ class IqiyiIE(InfoExtractor): return raw_data def get_enc_key(self, swf_url, video_id): - filename, _ = os.path.splitext(url_basename(swf_url)) - enc_key_json = self._downloader.cache.load('iqiyi-enc-key', filename) - if enc_key_json is not None: - return enc_key_json[0] - - req = self._request_webpage( - swf_url, video_id, note='download swf content') - cn = req.read() - cn = zlib.decompress(cn[8:]) - pt = re.compile(b'MixerRemote\x08(?P.+?)\$&vv') - enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8') - - self._downloader.cache.store('iqiyi-enc-key', filename, [enc_key]) - + enc_key = '8e29ab5666d041c3a1ea76e06dabdffb' return enc_key def _real_extract(self, url): From b0bff54b08a44966d321e8de0ed08dc1dde6b96e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 18 Jul 2015 11:59:45 +0200 Subject: [PATCH 241/450] release 2015.07.18 --- docs/supportedsites.md | 58 +++++++++++++++++++++++++++--------------- youtube_dl/version.py | 2 +- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0ca06c71d..a84878026 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,7 +28,7 @@ - **anitube.se** - **AnySex** - **Aparat** - - **AppleDaily** + - **AppleDaily**: 臺灣蘋果日報 - **AppleTrailers** - **archive.org**: archive.org videos - **ARD** @@ -45,7 +45,7 @@ - **audiomack** - **audiomack:album** - **Azubu** - - **BaiduVideo** + - **BaiduVideo**: 百度视频 - **bambuser** - **bambuser:channel** - **Bandcamp** @@ -106,7 +106,7 @@ - **Crunchyroll** - **crunchyroll:playlist** - **CSpan**: C-SPAN - - **CtsNews** + - **CtsNews**: 華視新聞 - **culturebox.francetvinfo.fr** - **dailymotion** - 
**dailymotion:playlist** @@ -121,7 +121,7 @@ - **Discovery** - **divxstage**: DivxStage - **Dotsub** - - **DouyuTV** + - **DouyuTV**: 斗鱼 - **dramafever** - **dramafever:series** - **DRBonanza** @@ -222,7 +222,7 @@ - **instagram:user**: Instagram user profile - **InternetVideoArchive** - **IPrima** - - **iqiyi** + - **iqiyi**: 爱奇艺 - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **Izlesene** @@ -243,9 +243,15 @@ - **kontrtube**: KontrTube.ru - Труба зовёт - **KrasView**: Красвью - **Ku6** + - **kuwo:album**: 酷我音乐 - 专辑 + - **kuwo:category**: 酷我音乐 - 分类 + - **kuwo:chart**: 酷我音乐 - 排行榜 + - **kuwo:mv**: 酷我音乐 - MV + - **kuwo:singer**: 酷我音乐 - 歌手 + - **kuwo:song**: 酷我音乐 - **la7.tv** - **Laola1Tv** - - **Letv** + - **Letv**: 乐视网 - **LetvPlaylist** - **LetvTv** - **Libsyn** @@ -297,6 +303,7 @@ - **MySpace** - **MySpace:album** - **MySpass** + - **Myvi** - **myvideo** - **MyVidster** - **N-JOY** @@ -312,11 +319,18 @@ - **NDTV** - **NerdCubedFeed** - **Nerdist** + - **netease:album**: 网易云音乐 - 专辑 + - **netease:djradio**: 网易云音乐 - 电台 + - **netease:mv**: 网易云音乐 - MV + - **netease:playlist**: 网易云音乐 - 歌单 + - **netease:program**: 网易云音乐 - 电台节目 + - **netease:singer**: 网易云音乐 - 歌手 + - **netease:song**: 网易云音乐 - **Netzkino** - **Newgrounds** - **Newstube** - - **NextMedia** - - **NextMediaActionNews** + - **NextMedia**: 蘋果日報 + - **NextMediaActionNews**: 蘋果日報 - 動新聞 - **nfb**: National Film Board of Canada - **nfl.com** - **nhl.com** @@ -332,13 +346,14 @@ - **Nowness** - **NowTV** - **nowvideo**: NowVideo - - **npo.nl** + - **npo**: npo.nl and ntr.nl + - **npo**: npo.nl and ntr.nl - **npo.nl:live** - **npo.nl:radio** - **npo.nl:radio:fragment** - **NRK** - **NRKPlaylist** - - **NRKTV** + - **NRKTV**: NRK TV and NRK Radio - **ntv.ru** - **Nuvid** - **NYTimes** @@ -382,11 +397,11 @@ - **prosiebensat1**: ProSiebenSat.1 Digital - **Puls4** - **Pyvideo** - - **qqmusic** - - **qqmusic:album** - - **qqmusic:playlist** - - **qqmusic:singer** - - **qqmusic:toplist** + - **qqmusic**: QQ音乐 + 
- **qqmusic:album**: QQ音乐 - 专辑 + - **qqmusic:playlist**: QQ音乐 - 歌单 + - **qqmusic:singer**: QQ音乐 - 歌手 + - **qqmusic:toplist**: QQ音乐 - 排行榜 - **QuickVid** - **R7** - **radio.de** @@ -395,6 +410,7 @@ - **RadioJavan** - **Rai** - **RBMARadio** + - **RDS**: RDS.ca - **RedTube** - **Restudy** - **ReverbNation** @@ -495,7 +511,6 @@ - **TechTalks** - **techtv.mit.edu** - **ted** - - **tegenlicht.vpro.nl** - **TeleBruxelles** - **telecinco.es** - **TeleMB** @@ -551,7 +566,7 @@ - **Ubu** - **udemy** - **udemy:course** - - **UDNEmbed** + - **UDNEmbed**: 聯合影音 - **Ultimedia** - **Unistra** - **Urort**: NRK P3 Urørt @@ -613,9 +628,11 @@ - **wdr:mobile** - **WDRMaus**: Sendung mit der Maus - **WebOfStories** + - **WebOfStoriesPlaylist** - **Weibo** - **Wimp** - **Wistia** + - **WNL** - **WorldStarHipHop** - **wrzuta.pl** - **WSJ**: Wall Street Journal @@ -628,18 +645,19 @@ - **Xstream** - **XTube** - **XTubeUser**: XTube user profile - - **Xuite** + - **Xuite**: 隨意窩Xuite影音 - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies - - **Yam** + - **Yam**: 蕃薯藤yam天空部落 - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек - **YesJapan** + - **yinyuetai:video**: 音悦Tai - **Ynet** - **YouJizz** - - **youku** + - **youku**: 优酷 - **YouPorn** - **YourUpload** - **youtube**: YouTube.com diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3364647ed..3ad7a2bc0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.07' +__version__ = '2015.07.18' From 74fe23ec35483c552f9c253be4c565546f78f001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 16:35:28 +0600 Subject: [PATCH 242/450] [extractor/common] Style --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py 
b/youtube_dl/extractor/common.py index 5a2d0d995..b9014fc23 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -996,7 +996,7 @@ class InfoExtractor(object): def _parse_smil_video(self, video, video_id, base, rtmp_count): src = video.get('src') if not src: - return ([], rtmp_count) + return [], rtmp_count bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1009,7 +1009,7 @@ class InfoExtractor(object): proto = 'http' ext = video.get('ext') if proto == 'm3u8': - return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count) + return self._extract_m3u8_formats(src, video_id, ext), rtmp_count elif proto == 'rtmp': rtmp_count += 1 streamer = video.get('streamer') or base From dc786d3db59a611cbcb716a476602719e74a34e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 18 Jul 2015 17:22:25 +0600 Subject: [PATCH 243/450] [vk:uservideos] Improve extraction --- youtube_dl/extractor/vk.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 8f677cae3..93c5b18c9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -291,23 +291,32 @@ class VKIE(InfoExtractor): class VKUserVideosIE(InfoExtractor): IE_NAME = 'vk.com:user-videos' IE_DESC = 'vk.com:All of a user\'s videos' - _VALID_URL = r'https?://vk\.com/videos(?P[0-9]+)(?:m\?.*)?' 
+ _VALID_URL = r'https?://vk\.com/videos(?P-?[0-9]+)$' _TEMPLATE_URL = 'https://vk.com/videos' - _TEST = { + _TESTS = [{ 'url': 'http://vk.com/videos205387401', 'info_dict': { 'id': '205387401', + 'title': "Tom Cruise's Videos", }, 'playlist_mincount': 4, - } + }, { + 'url': 'http://vk.com/videos-77521', + 'only_matching': True, + }] def _real_extract(self, url): page_id = self._match_id(url) - page = self._download_webpage(url, page_id) - video_ids = orderedSet( - m.group(1) for m in re.finditer(r'href="/video([0-9_]+)"', page)) - url_entries = [ + + webpage = self._download_webpage(url, page_id) + + entries = [ self.url_result( 'http://vk.com/video' + video_id, 'VK', video_id=video_id) - for video_id in video_ids] - return self.playlist_result(url_entries, page_id) + for video_id in set(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] + + title = unescapeHTML(self._search_regex( + r'\s*([^<]+?)\s+\|\s+\d+\s+videos', + webpage, 'title', default=page_id)) + + return self.playlist_result(entries, page_id, title) From 1ecb5d1d83970d6295c655a37c99e390fb5d7dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 17:23:33 +0600 Subject: [PATCH 244/450] [vk] Clarify extractor names --- youtube_dl/extractor/vk.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 93c5b18c9..ef94b6894 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -20,7 +20,8 @@ from ..utils import ( class VKIE(InfoExtractor): - IE_NAME = 'vk.com' + IE_NAME = 'vk' + IE_DESC = 'VK' _VALID_URL = r'''(?x) https?:// (?: @@ -289,8 +290,8 @@ class VKIE(InfoExtractor): class VKUserVideosIE(InfoExtractor): - IE_NAME = 'vk.com:user-videos' - IE_DESC = 'vk.com:All of a user\'s videos' + IE_NAME = 'vk:uservideos' + IE_DESC = "VK - User's Videos" _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)$' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 
From c6b68648f45498bcacc71cc8f29696fa93259a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 17:36:46 +0600 Subject: [PATCH 245/450] [bilibili] Show georestriction error --- youtube_dl/extractor/bilibili.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index bf60450c2..ecc17ebeb 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -41,8 +41,15 @@ class BiliBiliIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - if self._search_regex(r'(此视频不存在或被删除)', webpage, 'error message', default=None): - raise ExtractorError('The video does not exist or was deleted', expected=True) + if '(此视频不存在或被删除)' in webpage: + raise ExtractorError( + 'The video does not exist or was deleted', expected=True) + + if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: + raise ExtractorError( + 'The video is not available in your region due to copyright reasons', + expected=True) + video_code = self._search_regex( r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code') From fec73daaa30d006bc6c5d0483d255b7bc1c256b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 18:05:08 +0600 Subject: [PATCH 246/450] [vk:uservideos] Revert orderedSet --- youtube_dl/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index ef94b6894..98bd3a141 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -314,7 +314,7 @@ class VKUserVideosIE(InfoExtractor): entries = [ self.url_result( 'http://vk.com/video' + video_id, 'VK', video_id=video_id) - for video_id in set(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] + for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))] title = 
unescapeHTML(self._search_regex( r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos', From 22603348aa0b3e02c520589dea092507a04ab06a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 18:32:52 +0600 Subject: [PATCH 247/450] [compat] Fix _asciire --- youtube_dl/compat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index db0da5828..0c57c7aeb 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -79,7 +79,8 @@ try: from urllib.parse import unquote as compat_urllib_parse_unquote from urllib.parse import unquote_plus as compat_urllib_parse_unquote_plus except ImportError: # Python 2 - _asciire = re.compile('([\x00-\x7f]+)') if sys.version_info < (2, 7) else compat_urllib_parse._asciire + _asciire = (compat_urllib_parse._asciire if hasattr(compat_urllib_parse, '_asciire') + else re.compile('([\x00-\x7f]+)')) # HACK: The following are the correct unquote_to_bytes, unquote and unquote_plus # implementations from cpython 3.4.3's stdlib. 
Python 2's version From 9ac09ed4deafc77cfd94e6a8827e4b0c47517b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 19:13:10 +0600 Subject: [PATCH 248/450] [bliptv] Remove unused import --- youtube_dl/extractor/bliptv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index a69ee482b..c3296283d 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_request, compat_urlparse, ) From ee48b6a88f4b8f076699f539ee4c92389fa37643 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 19:15:20 +0600 Subject: [PATCH 249/450] [vk] Capture error message --- youtube_dl/extractor/vk.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 98bd3a141..9487fc9f5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -205,6 +205,12 @@ class VKIE(InfoExtractor): info_page = self._download_webpage(info_url, video_id) + error_message = self._html_search_regex( + r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', + info_page, 'error message', default=None) + if error_message: + raise ExtractorError(error_message, expected=True) + if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page): raise ExtractorError( 'You are trying to log in from an unusual location. 
You should confirm ownership at vk.com to log in with this IP.', From e58066e244bf9a04c2853e8829f7939bdefc9084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 19:25:06 +0600 Subject: [PATCH 250/450] [vk] Add age restricted video test for reference --- youtube_dl/extractor/vk.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 9487fc9f5..c30c5a8e5 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -154,6 +154,11 @@ class VKIE(InfoExtractor): 'url': 'http://vk.com/feed?z=video-43215063_166094326%2Fbb50cacd3177146d7a', 'only_matching': True, }, + { + # age restricted video, requires vk account credentials + 'url': 'https://vk.com/video205387401_164765225', + 'only_matching': True, + }, { # vk wrapper 'url': 'http://www.biqle.ru/watch/847655_160197695', From 3f5c6d0c1b955f9312b83fb0e1f3ae59d1230c82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 22:41:34 +0600 Subject: [PATCH 251/450] [francetv] Add support for embeds and clean up _VALID_URL --- youtube_dl/extractor/francetv.py | 44 ++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index b2c984bf2..827400172 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -160,11 +160,19 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): class FranceTVIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetv' IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x)https?://www\.france[2345o]\.fr/ - (?: - emissions/.*?/(videos|emissions)/(?P<id>[^/?]+) - | (emissions?|jt)/(?P<key>[^/?]+) - )''' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?france[2345o]\.fr/ + (?: + emissions/[^/]+/(?:videos|diffusions)?| + videos + ) + /| + embed\.francetv\.fr/\?ue= + ) + (?P<id>[^/?]+) + ''' 
_TESTS = [ # france2 @@ -232,13 +240,33 @@ class FranceTVIE(FranceTVBaseInfoExtractor): 'timestamp': 1410822000, }, }, + { + # francetv embed + 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', + 'info_dict': { + 'id': 'EV_30231', + 'ext': 'flv', + 'title': 'Alcaline, le concert avec Calogero', + 'description': 'md5:', + 'upload_date': '20150226', + 'timestamp': 1424989860, + }, + }, + { + 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', + 'only_matching': True, + }, + { + 'url': 'http://www.franceo.fr/videos/125377617', + 'only_matching': True, + } ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - webpage = self._download_webpage(url, mobj.group('key') or mobj.group('id')) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) video_id, catalogue = self._html_search_regex( - r'href="http://videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'href="http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video ID').split('@') return self._extract_video(video_id, catalogue) From 5705ee6ef8d49175bf4afac36af4b3b9a4e72596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 22:43:42 +0600 Subject: [PATCH 252/450] [francetv] Fix duration extraction --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 827400172..d4f98ca16 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -87,7 +87,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): 'title': info['titre'], 'description': clean_html(info['synopsis']), 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': float_or_none(info.get('real_duration'), 1000) or parse_duration(info['duree']), + 'duration': int_or_none(info.get('real_duration')) or 
parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), 'formats': formats, } @@ -250,6 +250,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): 'description': 'md5:', 'upload_date': '20150226', 'timestamp': 1424989860, + 'duration': 5400, }, }, { From cbd55ade680c5a3ffe5f7b5d36dc01af2c6dc48d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 18 Jul 2015 22:56:00 +0600 Subject: [PATCH 253/450] [extractor/generic] Add support for francetv embeds --- youtube_dl/extractor/generic.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a62287e50..dc24a8a8b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -407,6 +407,26 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, + # francetv embed + { + 'url': 'http://www.tsprod.com/replay-du-concert-alcaline-de-calogero', + 'info_dict': { + 'id': 'EV_30231', + 'ext': 'mp4', + 'title': 'Alcaline, le concert avec Calogero', + 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', + 'upload_date': '20150226', + 'timestamp': 1424989860, + 'duration': 5400, + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'expected_warnings': [ + 'Forbidden' + ] + }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -1431,6 +1451,13 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'ArteTVEmbed') + # Look for embedded francetv player + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url')) + # Look for embedded smotri.com player smotri_url = SmotriIE._extract_url(webpage) if smotri_url: From 826a7da808448932d17c09a13a106d75ba0c10d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= 
<dstftw@gmail.com> Date: Sun, 19 Jul 2015 00:27:23 +0600 Subject: [PATCH 254/450] [rtlnl] Avoid episodes completely (Closes #6275) --- youtube_dl/extractor/rtlnl.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index a4d3d73ff..e0c530d64 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -43,6 +43,21 @@ class RtlNlIE(InfoExtractor): 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } + }, { + # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', + 'info_dict': { + 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', + 'ext': 'mp4', + 'title': 'RTL Nieuws - Meer beelden van overval juwelier', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'timestamp': 1437233400, + 'upload_date': '20150718', + 'duration': 30.474, + }, + 'params': { + 'skip_download': True, + }, }, { # encrypted m3u8 streams, georestricted 'url': 'http://www.rtlxl.nl/#!/afl-2-257632/52a74543-c504-4cde-8aa8-ec66fe8d68a7', @@ -59,9 +74,11 @@ class RtlNlIE(InfoExtractor): uuid) material = info['material'][0] - progname = info['abstracts'][0]['name'] - subtitle = material['title'] or info['episodes'][0]['name'] - description = material.get('synopsis') or info['episodes'][0]['synopsis'] + title = info['abstracts'][0]['name'] + subtitle = material.get('title') + if subtitle: + title += ' - %s' % subtitle + description = material.get('synopsis') meta = info.get('meta', {}) @@ -107,7 +124,7 @@ class 
RtlNlIE(InfoExtractor): return { 'id': uuid, - 'title': '%s - %s' % (progname, subtitle), + 'title': title, 'formats': formats, 'timestamp': material['original_date'], 'description': description, From 761ee0d827d17eefff0538a99c453e050b6e933e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 19 Jul 2015 02:28:43 +0800 Subject: [PATCH 255/450] [iqiyi] Remove unused imports --- youtube_dl/extractor/iqiyi.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ac5416953..7caaa1183 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -3,19 +3,13 @@ from __future__ import unicode_literals import hashlib import math -import os.path import random -import re import time import uuid -import zlib from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import ( - ExtractorError, - url_basename, -) +from ..utils import ExtractorError class IqiyiIE(InfoExtractor): From 3af1fac7b0f43778e44b3b86e0c74bf25fb6f489 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 18 Jul 2015 09:51:59 +0100 Subject: [PATCH 256/450] [dcn] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/dcn.py | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 youtube_dl/extractor/dcn.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f21064b..cc0da81d1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -110,6 +110,7 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE +from .dcn import DcnIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py new file mode 100644 index 000000000..5263def4c --- /dev/null +++ b/youtube_dl/extractor/dcn.py 
@@ -0,0 +1,46 @@ +from .common import InfoExtractor + +class DcnIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' + _TEST = { + 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', + 'info_dict': + { + 'id': '17375', + 'ext': 'm3u8', + 'title': 'رحلة العمر : الحلقة 1', + 'description': '"في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة1"', + 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', + 'duration': '2041' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id='+video_id, + video_id + ) + title = json_data['title_ar']; + thumbnail = 'http://admin.mangomolo.com/analytics/'+json_data['img']; + duration = json_data['duration']; + description = json_data['description_ar']; + webpage = self._download_webpage( + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id='+json_data['id']+'&user_id='+json_data['user_id']+'&countries=Q0M=&w=100%&h=100%&filter=DENY&signature='+json_data['signature'], + video_id + ) + m3u8_url = self._html_search_regex( + r'file: "(?P<m3u8_url>.*?)"', + webpage, + 'm3u8_url', + group='m3u8_url' + ) + formats = self._extract_m3u8_formats(m3u8_url, video_id) + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'description': description, + 'formats': formats, + } From 9d681c2bb3b75a666b76d8e346ffab66b65f9132 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 18 Jul 2015 10:00:24 +0100 Subject: [PATCH 257/450] remove unnecessary group name --- youtube_dl/extractor/dcn.py | 5 ++--- 1 file 
changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 5263def4c..f76ebda9e 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -30,10 +30,9 @@ class DcnIE(InfoExtractor): video_id ) m3u8_url = self._html_search_regex( - r'file: "(?P<m3u8_url>.*?)"', + r'file:\s*"([^"]+)', webpage, - 'm3u8_url', - group='m3u8_url' + 'm3u8_url' ) formats = self._extract_m3u8_formats(m3u8_url, video_id) return { From 36068ae019138710a3023334db2241ec815e2fe6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 19 Jul 2015 03:54:59 +0800 Subject: [PATCH 258/450] [iqiyi] Comment out some MD5 sums The value is different on Travis CI server --- youtube_dl/extractor/iqiyi.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 7caaa1183..d93b0867d 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -33,56 +33,48 @@ class IqiyiIE(InfoExtractor): 'title': '名侦探柯南第752集', }, 'playlist': [{ - 'md5': '7e49376fecaffa115d951634917fe105', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { - 'md5': '41b75ba13bb7ac0e411131f92bc4f6ca', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { - 'md5': '0cee1dd0a3d46a83e71e2badeae2aab0', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { - 'md5': '4f8ad72373b0c491b582e7c196b0b1f9', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { - 'md5': 'd89ad028bcfad282918e8098e811711d', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { - 'md5': '9cb1e5c95da25dff0660c32ae50903b7', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', 'ext': 'f4v', 'title': '名侦探柯南第752集', 
}, }, { - 'md5': '155116e0ff1867bbc9b98df294faabc9', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', 'ext': 'f4v', 'title': '名侦探柯南第752集', }, }, { - 'md5': '53f5db77622ae14fa493ed2a278a082b', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', 'ext': 'f4v', From e89d7e30294d3c5a36e6af5dd730ed543934d40e Mon Sep 17 00:00:00 2001 From: Roland Hieber <rohieb@rohieb.name> Date: Sun, 19 Jul 2015 03:34:22 +0200 Subject: [PATCH 259/450] [tagesschau] add support for more video types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I found that currently only tagesschau videos are played. There are some more shows hosted on tagesschau.de (see [0] for example) which are easily playable by adjusting the regex. So this patch adds support for: * tagesthemen * tagesschau vor 20 Jahren * tagesschau (mit Gebärdensprache) * nachtmagazin Note that some videos don't provide a description, so in order for the tests to succeed, an ExtractorError needs to get caught. 
[0]: http://www.tagesschau.de/multimedia/video/videoarchiv2~_date-20150714.html --- youtube_dl/extractor/tagesschau.py | 55 +++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index bfe07b024..2a2aafca0 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize +from ..utils import parse_filesize, ExtractorError class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/(ts|tsg|tt|nm)|video/video|tsvorzwanzig)(?P<id>-?[0-9]+)\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', @@ -30,6 +30,46 @@ class TagesschauIE(InfoExtractor): 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'thumbnail': 're:^http:.*\.jpg$', } + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', + 'md5': '90757268b49ef56deae90c7b48928d58', + 'info_dict': { + 'id': '3771', + 'ext': 'mp4', + 'description': '', + 'title': 'Sendung: tagesschau (mit Gebärdensprache) \t14.07.2015 20:00 Uhr', + 'thumbnail': 're:^http:.*\.jpg$', + } + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', + 'md5': '6e3ebdc75e8d67da966a8d06721eda71', + 'info_dict': { + 'id': '3827', + 'ext': 'mp4', + 'description': 'md5:d511d0e278b0ad341a95ad9ab992ce66', + 'title': 'Sendung: tagesthemen \t14.07.2015 22:15 Uhr', + 'thumbnail': 're:^http:.*\.jpg$', + } + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', + 'md5': '8a8875a568f0a5ae5ceef93c501a225f', + 'info_dict': { + 'id': '3475', + 'ext': 'mp4', + 'description': 'md5:ed149f5649cda3dac86813a9d777e131', + 'title': 'Sendung: 
nachtmagazin \t15.07.2015 00:15 Uhr', + 'thumbnail': 're:^http:.*\.jpg$', + } + }, { + 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', + 'md5': 'be4d6f0421f2acd8abe25ea29f6f015b', + 'info_dict': { + 'id': '959', + 'ext': 'mp4', + 'description': '', + 'title': 'Sendung: tagesschau vor 20 Jahren \t14.07.2015 22:45 Uhr', + 'thumbnail': 're:^http:.*\.jpg$', + } }] _FORMATS = { @@ -102,9 +142,14 @@ class TagesschauIE(InfoExtractor): thumbnail_fn = self._search_regex( r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"', webpage, 'thumbnail', fatal=False) - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', fatal=False) + # there are some videos without description + description = "" + try: + description = self._html_search_regex( + r'(?s)<p class="teasertext">(.*?)</p>', + webpage, 'description', fatal=False) + except ExtractorError: + pass title = self._html_search_regex( r'<span class="headline".*?>(.*?)</span>', webpage, 'title') From c51bc70e0feec42664c57228cc4b395a3a629420 Mon Sep 17 00:00:00 2001 From: Roland Hieber <rohieb@rohieb.name> Date: Sun, 19 Jul 2015 03:52:54 +0200 Subject: [PATCH 260/450] [tagesschau] fix test which links to nonexisting video --- youtube_dl/extractor/tagesschau.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index bfe07b024..cf1b37a75 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -11,13 +11,13 @@ class TagesschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html' _TESTS = [{ - 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', - 'md5': 'bcdeac2194fb296d599ce7929dfa4009', + 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', + 'md5': '917a228bc7df7850783bc47979673a09', 'info_dict': { - 'id': '1399128', + 'id': 
'102143', 'ext': 'mp4', - 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen', - 'description': 'md5:69da3c61275b426426d711bde96463ab', + 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', + 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', 'thumbnail': 're:^http:.*\.jpg$', }, }, { From 3c6ae8b59ed2517891ff19507e4cef2322ef9eb4 Mon Sep 17 00:00:00 2001 From: Roland Hieber <rohieb@rohieb.name> Date: Sun, 19 Jul 2015 04:27:58 +0200 Subject: [PATCH 261/450] [tagesschau] add support for Bericht aus Berlin videos --- youtube_dl/extractor/tagesschau.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 2a2aafca0..682f8df8f 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -8,7 +8,7 @@ from ..utils import parse_filesize, ExtractorError class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/(ts|tsg|tt|nm)|video/video|tsvorzwanzig)(?P<id>-?[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/(ts|tsg|tt|nm|bab/bab)|video/video|tsvorzwanzig)(?P<id>-?[0-9]+)(?:~[-_a-zA-Z0-9]*)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html', @@ -70,6 +70,16 @@ class TagesschauIE(InfoExtractor): 'title': 'Sendung: tagesschau vor 20 Jahren \t14.07.2015 22:45 Uhr', 'thumbnail': 're:^http:.*\.jpg$', } + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', + 'md5': '42e3757018d9908581481a80cc1806da', + 'info_dict': { + 'id': '3299', + 'ext': 'mp4', + 'description': '', + 'title': 'Nach dem Referendum: Schaltgespräch nach Athen', + 'thumbnail': 're:^http:.*\.jpg$', + } }] _FORMATS = { From 726adc43ec184b6d7e05cdbc197aa02c53b32347 Mon Sep 17 00:00:00 2001 From: Roland Hieber <rohieb@rohieb.name> Date: Sun, 19 Jul 2015 05:09:29 +0200 
Subject: [PATCH 262/450] [tagesschau] set description=None for empty descriptions --- youtube_dl/extractor/tagesschau.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 682f8df8f..4a755c657 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -36,7 +36,7 @@ class TagesschauIE(InfoExtractor): 'info_dict': { 'id': '3771', 'ext': 'mp4', - 'description': '', + 'description': None, 'title': 'Sendung: tagesschau (mit Gebärdensprache) \t14.07.2015 20:00 Uhr', 'thumbnail': 're:^http:.*\.jpg$', } @@ -66,7 +66,7 @@ class TagesschauIE(InfoExtractor): 'info_dict': { 'id': '959', 'ext': 'mp4', - 'description': '', + 'description': None, 'title': 'Sendung: tagesschau vor 20 Jahren \t14.07.2015 22:45 Uhr', 'thumbnail': 're:^http:.*\.jpg$', } @@ -76,7 +76,7 @@ class TagesschauIE(InfoExtractor): 'info_dict': { 'id': '3299', 'ext': 'mp4', - 'description': '', + 'description': None, 'title': 'Nach dem Referendum: Schaltgespräch nach Athen', 'thumbnail': 're:^http:.*\.jpg$', } @@ -154,12 +154,9 @@ class TagesschauIE(InfoExtractor): webpage, 'thumbnail', fatal=False) # there are some videos without description description = "" - try: - description = self._html_search_regex( - r'(?s)<p class="teasertext">(.*?)</p>', - webpage, 'description', fatal=False) - except ExtractorError: - pass + description = self._html_search_regex( + r'(?s)<p class="teasertext">(.*?)</p>', + webpage, 'description', fatal=False, default=None) title = self._html_search_regex( r'<span class="headline".*?>(.*?)</span>', webpage, 'title') From 4951c9f8214153d84033a77a4ef798ae29e1466a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 19 Jul 2015 11:16:29 +0800 Subject: [PATCH 263/450] Credit @sceext2 for fixing iQiyi extractor (#6266) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 531ec5767..4fd65f46f 
100644 --- a/AUTHORS +++ b/AUTHORS @@ -132,3 +132,4 @@ George Brighton Remita Amine Aurélio A. Heckert Bernhard Minks +sceext From c2d1be8981c0f6b91635c32a3163e01b91d5db54 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 19 Jul 2015 11:20:05 +0800 Subject: [PATCH 264/450] [iqiyi] Add skip_download to reduce network traffic MD5 checksums are commented out in 36068ae019138710a3023334db2241ec815e2fe6, and actual downloads are not necessary anymore. --- youtube_dl/extractor/iqiyi.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index d93b0867d..afb7f4e61 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -81,6 +81,9 @@ class IqiyiIE(InfoExtractor): 'title': '名侦探柯南第752集', }, }], + 'params': { + 'skip_download': True, + }, }] _FORMATS_MAP = [ From 0215103e92c7a40aa0cea319e72dcf49f02053fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 19 Jul 2015 11:13:27 +0200 Subject: [PATCH 265/450] [francetv] Remove unused import --- youtube_dl/extractor/francetv.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index d4f98ca16..477e22793 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -14,7 +14,6 @@ from ..utils import ( clean_html, ExtractorError, int_or_none, - float_or_none, parse_duration, determine_ext, ) From 8a7a2089050957c942bde0b9ab70963b2dea8f35 Mon Sep 17 00:00:00 2001 From: slangangular <slangangular@infrablue.org> Date: Sun, 12 Jul 2015 10:30:51 +0200 Subject: [PATCH 266/450] [sportschau] Add support for sportschau.de Closes #6199. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sportschau.py | 43 ++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 youtube_dl/extractor/sportschau.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 06f21064b..d70de7690 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -552,6 +552,7 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE +from .sportschau import SportschauIE from .srf import SrfIE from .srmediathek import SRMediathekIE from .ssa import SSAIE diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py new file mode 100644 index 000000000..ff30e1f0e --- /dev/null +++ b/youtube_dl/extractor/sportschau.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SportschauIE(InfoExtractor): + IE_NAME = 'Sportschau' + _VALID_URL = r'https?://(?:www\.)?sportschau\.de/\w+(?:/\w+)?/video(?P<id>\w+)\.html' + _TEST = { + 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', + 'md5': 'a6ef460ab9f4089b079832e06d554cec', + 'info_dict': { + 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', + 'ext': 'mp4', + 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun" - Tour de France - sportschau.de', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + ext = '-mc_defaultQuality-h.json' + json_url = url[:-5] + ext + + json = self._download_json(json_url, video_id) + thumb_url = json['_previewImage'] + + m3u8_url = json['_mediaArray'][1]['_mediaStreamArray'][0]['_stream'][0] + m3u8_formats = 
self._extract_m3u8_formats(m3u8_url, video_id, ext="mp4") + + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'<title>(.*?)', webpage, 'title') + desc = self._html_search_meta('description', webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': m3u8_formats, + 'description': desc, + 'thumbnail': thumb_url, + } From 8b61bfd6389b62f054cdf9dcb3436395c82a8e28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 19 Jul 2015 11:21:18 +0200 Subject: [PATCH 267/450] [sportschau] skip download in test --- youtube_dl/extractor/sportschau.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py index ff30e1f0e..28797266f 100644 --- a/youtube_dl/extractor/sportschau.py +++ b/youtube_dl/extractor/sportschau.py @@ -9,14 +9,17 @@ class SportschauIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?sportschau\.de/\w+(?:/\w+)?/video(?P\w+)\.html' _TEST = { 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', - 'md5': 'a6ef460ab9f4089b079832e06d554cec', 'info_dict': { 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', 'ext': 'mp4', 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun" - Tour de France - sportschau.de', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): From 32470bf619d31605dc9c51ad107839a097f829f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 19 Jul 2015 11:24:19 +0200 Subject: [PATCH 268/450] [sportschau] Improve title extraction The html '' ends with '- sportschau.de', which shouldn't be part of the 
title. --- youtube_dl/extractor/sportschau.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py index 28797266f..bf9b075db 100644 --- a/youtube_dl/extractor/sportschau.py +++ b/youtube_dl/extractor/sportschau.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import get_element_by_attribute class SportschauIE(InfoExtractor): @@ -12,7 +13,7 @@ class SportschauIE(InfoExtractor): 'info_dict': { 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', 'ext': 'mp4', - 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun" - Tour de France - sportschau.de', + 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', 'thumbnail': 're:^https?://.*\.jpg$', 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', }, @@ -34,7 +35,7 @@ class SportschauIE(InfoExtractor): m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, ext="mp4") webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<title>(.*?)', webpage, 'title') + title = get_element_by_attribute('class', 'headline', webpage) desc = self._html_search_meta('description', webpage) return { From 1dc31c2786b34f833acc5fc646afcf992f71444d Mon Sep 17 00:00:00 2001 From: Zach Bruggeman Date: Fri, 10 Jul 2015 19:27:48 -0700 Subject: [PATCH 269/450] [appleconnect] Add new extractor (fixes #6189) Closes #6190. 
--- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/appleconnect.py | 50 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 youtube_dl/extractor/appleconnect.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d70de7690..5033d67ed 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -19,6 +19,7 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE +from .appleconnect import AppleConnectIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE from .ard import ARDIE, ARDMediathekIE diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py new file mode 100644 index 000000000..ea7a70393 --- /dev/null +++ b/youtube_dl/extractor/appleconnect.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + str_to_int, + ExtractorError +) + + +class AppleConnectIE(InfoExtractor): + _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P[\w-]+)' + _TEST = { + 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'md5': '10d0f2799111df4cb1c924520ca78f98', + 'info_dict': { + 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', + 'ext': 'm4v', + 'title': 'Energy', + 'uploader': 'Drake', + 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', + 'upload_date': '20150710', + 'timestamp': 1436545535, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + try: + video_json = self._html_search_regex( + r'class="auc-video-data">(\{.*?\})', webpage, 'json') + except ExtractorError: + raise ExtractorError('This post doesn\'t contain a video', expected=True) + + video_data = 
self._parse_json(video_json, video_id) + timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp')) + like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count')) + + return { + 'id': video_id, + 'url': video_data['sslSrc'], + 'title': video_data['title'], + 'description': video_data['description'], + 'uploader': video_data['artistName'], + 'thumbnail': video_data['artworkUrl'], + 'timestamp': timestamp, + 'like_count': like_count, + } From ecdbe09e1017fdd771720f9954f8427b9084bbf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 21:45:49 +0600 Subject: [PATCH 270/450] [francetv] Fix f4m extraction --- youtube_dl/extractor/francetv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 477e22793..624895ff9 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -58,9 +58,8 @@ class FranceTVBaseInfoExtractor(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/3963 # m3u8 urls work fine continue - video_url_parsed = compat_urllib_parse_urlparse(video_url) f4m_url = self._download_webpage( - 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url_parsed.path, + 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url, video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id)) From 789a12aaafe7036a155435d9c93d062309f547b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 21:50:25 +0600 Subject: [PATCH 271/450] [francetv] Restore support for jt videos --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 624895ff9..11db6d921 100644 --- a/youtube_dl/extractor/francetv.py +++ 
b/youtube_dl/extractor/francetv.py @@ -164,7 +164,8 @@ class FranceTVIE(FranceTVBaseInfoExtractor): (?:www\.)?france[2345o]\.fr/ (?: emissions/[^/]+/(?:videos|diffusions)?| - videos + videos| + jt ) /| embed\.francetv\.fr/\?ue= From 632cbb8efa32ef704219a769972fdeae04577ab8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 21:51:06 +0600 Subject: [PATCH 272/450] [francetv] Fix f4m extraction completely --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 11db6d921..746ab9d84 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -62,7 +62,8 @@ class FranceTVBaseInfoExtractor(InfoExtractor): 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url, video_id, 'Downloading f4m manifest token', fatal=False) if f4m_url: - formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id)) + formats.extend(self._extract_f4m_formats( + f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, 1, format_id)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4', m3u8_id=format_id)) elif video_url.startswith('rtmp'): From 3bc9fb58892d0ca3dfc044b6e0ef65426ecd8c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 21:53:09 +0600 Subject: [PATCH 273/450] [francetv] Update jt test --- youtube_dl/extractor/francetv.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 746ab9d84..f5aeab110 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -229,15 +229,16 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, # franceo { - 'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013', - 'md5': '52f0bfe202848b15915a2f39aaa8981b', + 'url': 
'http://www.franceo.fr/jt/info-soir/18-07-2015', + 'md5': '47d5816d3b24351cdce512ad7ab31da8', 'info_dict': { - 'id': '108634970', + 'id': '125377621', 'ext': 'flv', - 'title': 'Infô Afrique', - 'description': 'md5:ebf346da789428841bee0fd2a935ea55', - 'upload_date': '20140915', - 'timestamp': 1410822000, + 'title': 'Infô soir', + 'description': 'md5:01b8c6915a3d93d8bbbd692651714309', + 'upload_date': '20150718', + 'timestamp': 1437241200, + 'duration': 414, }, }, { From ac4b8df5e47a113da5bb66fc975f938bc176b58c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 21:53:54 +0600 Subject: [PATCH 274/450] [francetv] Fix embed test --- youtube_dl/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index f5aeab110..08b124805 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -248,7 +248,7 @@ class FranceTVIE(FranceTVBaseInfoExtractor): 'id': 'EV_30231', 'ext': 'flv', 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:', + 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', 'upload_date': '20150226', 'timestamp': 1424989860, 'duration': 5400, From 1d18e26ecad636cb88ee9710807e9583eec98cd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 21:54:12 +0600 Subject: [PATCH 275/450] [francetv] Remove unused import --- youtube_dl/extractor/francetv.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 08b124805..706ed9c99 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -6,10 +6,7 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( clean_html, ExtractorError, From 
7a4a945f138078b95f02e8cec1e38123d66d7f09 Mon Sep 17 00:00:00 2001 From: fnord Date: Sun, 19 Jul 2015 11:31:38 -0500 Subject: [PATCH 276/450] fix/support news.vice.com --- youtube_dl/extractor/vice.py | 40 ++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 04e2b0ba7..dfb621468 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -7,20 +7,34 @@ from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'http://www\.vice\.com/.*?/(?P.+)' + _VALID_URL = r'https?://(.+?\.)?vice\.com/.*?/(?P.+)' - _TEST = { - 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', - 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', - 'ext': 'mp4', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, - }, - } + _TESTS = [ + { + 'url': 'http://www.vice.com/Fringes/cowboy-capitalists-part-1', + 'info_dict': { + 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'ext': 'mp4', + 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + }, + 'params': { + # Requires ffmpeg (m3u8 manifest) + 'skip_download': True, + }, + }, { + 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'info_dict': { + 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', + 'ext': 'mp4', + 'title': 'VICE News - Inside the Monkey Lab', + 'description': 'md5:1f660d467d3515f29d11e5ef742a4b82', + }, + 'params': { + # Requires ffmpeg (m3u8 manifest) + 'skip_download': True, + }, + } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From a5dd9a0c5dcf6aace1b7af87acb982480d606753 Mon Sep 17 00:00:00 2001 From: fnord Date: Sun, 19 Jul 2015 11:47:58 -0500 Subject: [PATCH 277/450] pbs: fix vague 'Full Episode' titles; prepend name of show Many videos are given the title 'Full Episode' or 'Episode x', etc; in the info 
json 'title' is the episode title (for oneshots this is 'Full Episode') while the program name is 'program':{'title'. This changes the title from '(episode title)' to '(program name) - (episode title)' The following urls demonstrate useless titles http://video.pbs.org/video/2365367186/ http://video.pbs.org/video/2365519307/ http://video.pbs.org/video/2365527039/ http://video.pbs.org/video/2365530605/ Before change Full Episode Episode 5 | Preview Season 3 | Episode 4 Preview July 17, 2015 After change To Catch A Comet - Full Episode The Crimson Field - Episode 5 | Preview Last Tango in Halifax - Season 3 | Episode 4 Preview Charlie Rose The Week - July 17, 2015 --- youtube_dl/extractor/pbs.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index fec5d65ad..cccff270b 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -32,7 +32,7 @@ class PBSIE(InfoExtractor): 'info_dict': { 'id': '2365006249', 'ext': 'mp4', - 'title': 'A More Perfect Union', + 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', 'description': 'md5:ba0c207295339c8d6eced00b7c363c6a', 'duration': 3190, }, @@ -46,7 +46,7 @@ class PBSIE(InfoExtractor): 'info_dict': { 'id': '2365297690', 'ext': 'mp4', - 'title': 'Losing Iraq', + 'title': 'FRONTLINE - Losing Iraq', 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', 'duration': 5050, }, @@ -60,7 +60,7 @@ class PBSIE(InfoExtractor): 'info_dict': { 'id': '2201174722', 'ext': 'mp4', - 'title': 'Cyber Schools Gain Popularity, but Quality Questions Persist', + 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', 'description': 'md5:5871c15cba347c1b3d28ac47a73c7c28', 'duration': 801, }, @@ -72,7 +72,7 @@ class PBSIE(InfoExtractor): 'id': '2365297708', 'ext': 'mp4', 'description': 'md5:68d87ef760660eb564455eb30ca464fe', - 'title': 'Dudamel Conducts Verdi Requiem at the 
Hollywood Bowl - Full', + 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -88,7 +88,7 @@ class PBSIE(InfoExtractor): 'display_id': 'killer-typhoon', 'ext': 'mp4', 'description': 'md5:c741d14e979fc53228c575894094f157', - 'title': 'Killer Typhoon', + 'title': 'NOVA - Killer Typhoon', 'duration': 3172, 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140122', @@ -110,7 +110,7 @@ class PBSIE(InfoExtractor): 'id': '2280706814', 'display_id': 'player', 'ext': 'mp4', - 'title': 'Death and the Civil War', + 'title': 'American Experience - Death and the Civil War', 'description': 'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.', 'duration': 6705, 'thumbnail': 're:^https?://.*\.jpg$', @@ -118,6 +118,21 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, + { + 'url': 'http://video.pbs.org/video/2365367186/', + 'info_dict': { + 'id': '2365367186', + 'display_id': '2365367186', + 'ext': 'mp4', + 'title': 'To Catch A Comet - Full Episode', + 'description': 'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.', + 'duration': 3342, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, } ] @@ -232,6 +247,12 @@ class PBSIE(InfoExtractor): 'url': closed_captions_url, }] + # video.pbs.org video.pbs.org/videoInfo/... frequently provides an obscure 'title' value, like + # 'Full Episode', 'Episode 5', etc. 
prepend program->title + alt_title = info.get('program', {}).get('title', '') + if alt_title != '': + info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-\:]+', '', info['title']) + return { 'id': video_id, 'display_id': display_id, From e37c92ec6d137cd2d09c5f13e9a261c4fae552dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 19 Jul 2015 23:59:50 +0600 Subject: [PATCH 278/450] [ard] Extract all formats --- youtube_dl/extractor/ard.py | 125 ++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6a35ea463..55f940d57 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -35,6 +35,87 @@ class ARDMediathekIE(InfoExtractor): 'skip': 'Blocked outside of Germany', }] + def _extract_media_info(self, media_info_url, webpage, video_id): + media_info = self._download_json( + media_info_url, video_id, 'Downloading media JSON') + + formats = self._extract_formats(media_info, video_id) + + if not formats: + if '"fsk"' in webpage: + raise ExtractorError( + 'This video is only available after 20:00', expected=True) + elif media_info.get('_geoblocked'): + raise ExtractorError('This video is not available due to geo restriction', expected=True) + + self._sort_formats(formats) + + duration = int_or_none(media_info.get('_duration')) + thumbnail = media_info.get('_previewImage') + + subtitles = {} + subtitle_url = media_info.get('_subtitleUrl') + if subtitle_url: + subtitles['de'] = [{ + 'ext': 'srt', + 'url': subtitle_url, + }] + + return { + 'id': video_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + 'subtitles': subtitles, + } + + def _extract_formats(self, media_info, video_id): + type_ = media_info.get('_type') + media_array = media_info.get('_mediaArray', []) + formats = [] + for num, media in enumerate(media_array): + for stream in media.get('_mediaStreamArray', []): + 
stream_urls = stream.get('_stream') + if not stream_urls: + continue + if not isinstance(stream_urls, list): + stream_urls = [stream_urls] + quality = stream.get('_quality') + server = stream.get('_server') + for stream_url in stream_urls: + ext = determine_ext(stream_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', + video_id, preference=-1, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', preference=1, m3u8_id='hls')) + else: + if server and server.startswith('rtmp'): + f = { + 'url': server, + 'play_path': stream_url, + 'format_id': 'a%s-rtmp-%s' % (num, quality), + } + elif stream_url.startswith('http'): + f = { + 'url': stream_url, + 'format_id': 'a%s-%s-%s' % (num, ext, quality) + } + else: + continue + m = re.search(r'_(?P\d+)x(?P\d+)\.mp4$', stream_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + if type_ == 'audio': + f['vcodec'] = 'none' + formats.append(f) + return formats + def _real_extract(self, url): # determine video id from url m = re.match(self._VALID_URL, url) @@ -92,46 +173,22 @@ class ARDMediathekIE(InfoExtractor): 'format_id': fid, 'url': furl, }) + self._sort_formats(formats) + info = { + 'formats': formats, + } else: # request JSON file - media_info = self._download_json( - 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) - # The second element of the _mediaArray contains the standard http urls - streams = media_info['_mediaArray'][1]['_mediaStreamArray'] - if not streams: - if '"fsk"' in webpage: - raise ExtractorError('This video is only available after 20:00') + info = self._extract_media_info( + 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) - formats = [] - for s in streams: - if type(s['_stream']) == list: - for index, url in enumerate(s['_stream'][::-1]): - quality = s['_quality'] + index - 
formats.append({ - 'quality': quality, - 'url': url, - 'format_id': '%s-%s' % (determine_ext(url), quality) - }) - continue - - format = { - 'quality': s['_quality'], - 'url': s['_stream'], - } - - format['format_id'] = '%s-%s' % ( - determine_ext(format['url']), format['quality']) - - formats.append(format) - - self._sort_formats(formats) - - return { + info.update({ 'id': video_id, 'title': title, 'description': description, - 'formats': formats, 'thumbnail': thumbnail, - } + }) + + return info class ARDIE(InfoExtractor): From 3bf8c316a663741ab806f48cf9121209aba88b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:01:22 +0600 Subject: [PATCH 279/450] [sportschau] Reimplement in terms of ard extractor --- youtube_dl/extractor/__init__.py | 7 +++-- youtube_dl/extractor/ard.py | 39 +++++++++++++++++++++++++ youtube_dl/extractor/sportschau.py | 47 ------------------------------ 3 files changed, 44 insertions(+), 49 deletions(-) delete mode 100644 youtube_dl/extractor/sportschau.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5033d67ed..50da08830 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -22,7 +22,11 @@ from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE -from .ard import ARDIE, ARDMediathekIE +from .ard import ( + ARDIE, + ARDMediathekIE, + SportschauIE, +) from .arte import ( ArteTvIE, ArteTVPlus7IE, @@ -553,7 +557,6 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE -from .sportschau import SportschauIE from .srf import SrfIE from .srmediathek import SRMediathekIE from .ssa import SSAIE diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 55f940d57..2c368d833 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,6 +8,7 @@ from .generic import 
GenericIE from ..utils import ( determine_ext, ExtractorError, + get_element_by_attribute, qualities, int_or_none, parse_duration, @@ -246,3 +247,41 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } + + +class SportschauIE(ARDMediathekIE): + IE_NAME = 'Sportschau' + _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' + _TEST = { + 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', + 'info_dict': { + 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', + 'ext': 'mp4', + 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + base_url = mobj.group('baseurl') + + webpage = self._download_webpage(url, video_id) + title = get_element_by_attribute('class', 'headline', webpage) + description = self._html_search_meta('description', webpage, 'description') + + info = self._extract_media_info( + base_url + '-mc_defaultQuality-h.json', webpage, video_id) + + info.update({ + 'title': title, + 'description': description, + }) + + return info diff --git a/youtube_dl/extractor/sportschau.py b/youtube_dl/extractor/sportschau.py deleted file mode 100644 index bf9b075db..000000000 --- a/youtube_dl/extractor/sportschau.py +++ /dev/null @@ -1,47 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import get_element_by_attribute - - -class SportschauIE(InfoExtractor): - IE_NAME = 'Sportschau' - _VALID_URL = r'https?://(?:www\.)?sportschau\.de/\w+(?:/\w+)?/video(?P\w+)\.html' - _TEST = { - 'url': 
'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', - 'info_dict': { - 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', - 'ext': 'mp4', - 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - ext = '-mc_defaultQuality-h.json' - json_url = url[:-5] + ext - - json = self._download_json(json_url, video_id) - thumb_url = json['_previewImage'] - - m3u8_url = json['_mediaArray'][1]['_mediaStreamArray'][0]['_stream'][0] - m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, ext="mp4") - - webpage = self._download_webpage(url, video_id) - title = get_element_by_attribute('class', 'headline', webpage) - desc = self._html_search_meta('description', webpage) - - return { - 'id': video_id, - 'title': title, - 'formats': m3u8_formats, - 'description': desc, - 'thumbnail': thumb_url, - } From 86b4e98ac6f57c3703cd2082a69351ece797111c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:05:10 +0600 Subject: [PATCH 280/450] [ard:mediathek] Add audio test --- youtube_dl/extractor/ard.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2c368d833..fdb94af6b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -34,6 +34,17 @@ class ARDMediathekIE(InfoExtractor): 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. 
Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', }, 'skip': 'Blocked outside of Germany', + }, { + # audio + 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', + 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', + 'info_dict': { + 'id': '28488308', + 'ext': 'mp3', + 'title': 'Tod eines Fußballers', + 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', + 'duration': 3240, + }, }] def _extract_media_info(self, media_info_url, webpage, video_id): @@ -252,7 +263,7 @@ class ARDIE(InfoExtractor): class SportschauIE(ARDMediathekIE): IE_NAME = 'Sportschau' _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', 'info_dict': { 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', @@ -265,7 +276,7 @@ class SportschauIE(ARDMediathekIE): # m3u8 download 'skip_download': True, }, - } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 769efa16afc1c9a2871543f69081f1bced361dc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:08:21 +0600 Subject: [PATCH 281/450] [ard:mediathek] Remove dead test --- youtube_dl/extractor/ard.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index fdb94af6b..c78475748 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -23,17 +23,18 @@ class ARDMediathekIE(InfoExtractor): _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
_TESTS = [{ - 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', - 'only_matching': True, - }, { - 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916', + 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', 'info_dict': { - 'id': '22490580', + 'id': '29582122', 'ext': 'mp4', - 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)', - 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.', + 'title': 'Ich liebe das Leben trotzdem', + 'description': 'md5:45e4c225c72b27993314b31a84a5261c', + 'duration': 4557, + }, + 'params': { + # m3u8 download + 'skip_download': True, }, - 'skip': 'Blocked outside of Germany', }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -45,6 +46,9 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', 'duration': 3240, }, + }, { + 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', + 'only_matching': True, }] def _extract_media_info(self, media_info_url, webpage, video_id): From d719c6a5abf746c31f301ed1a1d8934715eaeb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:13:39 +0600 Subject: [PATCH 282/450] [ard:mediathek] Add test for direct mp4 --- youtube_dl/extractor/ard.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 
c78475748..6f465789b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -35,6 +35,16 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', + 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', + 'info_dict': { + 'id': '29522730', + 'ext': 'mp4', + 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)', + 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', + 'duration': 5252, + }, }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', From 772acaf31f4614d390a445fe4c0d4e533c165774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:39:55 +0600 Subject: [PATCH 283/450] [vice] Do not capture unused groups in _VALID_URL --- youtube_dl/extractor/vice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index dfb621468..e3ad4ef54 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -7,7 +7,7 @@ from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'https?://(.+?\.)?vice\.com/.*?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/.*?/(?P.+)' _TESTS = [ { From 65c2b21df15053581cac21690605d1f7ac834083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:40:43 +0600 Subject: [PATCH 284/450] [vice] Make test only matching --- youtube_dl/extractor/vice.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index e3ad4ef54..ae1faca1c 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -23,16 +23,7 @@ class ViceIE(InfoExtractor): }, }, { 'url': 
'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'info_dict': { - 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', - 'ext': 'mp4', - 'title': 'VICE News - Inside the Monkey Lab', - 'description': 'md5:1f660d467d3515f29d11e5ef742a4b82', - }, - 'params': { - # Requires ffmpeg (m3u8 manifest) - 'skip_download': True, - }, + 'only_matching': True, } ] From 18ae46ad4bcf4d427984e85a9feaf647c3f80737 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 00:42:25 +0600 Subject: [PATCH 285/450] [vice] Modernize --- youtube_dl/extractor/vice.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index ae1faca1c..01af7a995 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -1,5 +1,4 @@ from __future__ import unicode_literals -import re from .common import InfoExtractor from .ooyala import OoyalaIE @@ -7,7 +6,7 @@ from ..utils import ExtractorError class ViceIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/.*?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)+(?P.+)' _TESTS = [ { @@ -28,9 +27,8 @@ class ViceIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - webpage = self._download_webpage(url, name) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) try: embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, From 0eacd2aaae7d4150d9cf4e1dd8ffc2ebc7ab030b Mon Sep 17 00:00:00 2001 From: fnord Date: Sun, 19 Jul 2015 13:59:12 -0500 Subject: [PATCH 286/450] less clunky if statement --- youtube_dl/extractor/pbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index cccff270b..ccbe2a9f3 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -249,8 +249,8 @@ class 
PBSIE(InfoExtractor): # video.pbs.org video.pbs.org/videoInfo/... frequently provides an obscure 'title' value, like # 'Full Episode', 'Episode 5', etc. prepend program->title - alt_title = info.get('program', {}).get('title', '') - if alt_title != '': + alt_title = info.get('program', {}).get('title') + if alt_title: info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-\:]+', '', info['title']) return { From 308c505c3d66b2a11df8c1a5cd7e0fdaa694b585 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 01:03:43 +0600 Subject: [PATCH 287/450] [francetv] Improve _VALID_URL --- youtube_dl/extractor/francetv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 706ed9c99..75723c00d 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -161,7 +161,8 @@ class FranceTVIE(FranceTVBaseInfoExtractor): (?: (?:www\.)?france[2345o]\.fr/ (?: - emissions/[^/]+/(?:videos|diffusions)?| + emissions/[^/]+/(?:videos|diffusions)| + emission/[^/]+| videos| jt ) From faa1f83ab481a1f456db13d3ead7f0f22908aab1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 20 Jul 2015 02:16:07 +0600 Subject: [PATCH 288/450] [twitch:stream] Improve _VALID_URL (Closes #6294) --- youtube_dl/extractor/twitch.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 948c8ce39..73ce335b7 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -310,9 +310,9 @@ class TwitchBookmarksIE(TwitchPlaylistBaseIE): class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' - _VALID_URL = r'%s/(?P[^/]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/(?P[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _TEST = { + _TESTS = [{ 'url': 'http://www.twitch.tv/shroomztv', 'info_dict': { 'id': 
'12772022048', @@ -331,7 +331,10 @@ class TwitchStreamIE(TwitchBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.twitch.tv/miracle_doto#profile-0', + 'only_matching': True, + }] def _real_extract(self, url): channel_id = self._match_id(url) From a1b85269a49404cab1f549d7c59352fe48dfbf19 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 20 Jul 2015 18:49:53 +0800 Subject: [PATCH 289/450] [extractor/generic] Support vid.me embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index dc24a8a8b..cd133a10c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1196,6 +1196,12 @@ class GenericIE(InfoExtractor): if vimeo_url is not None: return self.url_result(vimeo_url) + vid_me_embed_url = self._search_regex( + r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', + webpage, 'vid.me embed', default=None) + if vid_me_embed_url is not None: + return self.url_result(vid_me_embed_url, 'Vidme') + # Look for embedded YouTube player matches = re.findall(r'''(?x) (?: From c909e5820e94ecfefd9a2eb41b1beaa684c530c4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 20 Jul 2015 18:51:25 +0800 Subject: [PATCH 290/450] [tumblr] Delegate to GenericIE for non-tumblr videos Fixes #6162 --- youtube_dl/extractor/tumblr.py | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 9ead13a91..772f94f7d 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,8 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from .pornhub import PornHubIE -from .vimeo import VimeoIE class TumblrIE(InfoExtractor): @@ -60,25 +58,14 @@ class TumblrIE(InfoExtractor): blog = m_url.group('blog_name') url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) - webpage = 
self._download_webpage(url, video_id) - - vid_me_embed_url = self._search_regex( - r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', - webpage, 'vid.me embed', default=None) - if vid_me_embed_url is not None: - return self.url_result(vid_me_embed_url, 'Vidme') - - pornhub_url = PornHubIE._extract_url(webpage) - if pornhub_url: - return self.url_result(pornhub_url, 'PornHub') - - vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) - if vimeo_url: - return self.url_result(vimeo_url, 'Vimeo') + webpage, urlh = self._download_webpage_handle(url, video_id) iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', - webpage, 'iframe url') + webpage, 'iframe url', default=None) + if iframe_url is None: + return self.url_result(urlh.geturl(), 'Generic') + iframe = self._download_webpage(iframe_url, video_id) video_url = self._search_regex(r' Date: Mon, 20 Jul 2015 18:54:47 +0800 Subject: [PATCH 291/450] [tumblr] Improve downloading notes --- youtube_dl/extractor/tumblr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 772f94f7d..3d3b635e4 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -66,7 +66,8 @@ class TumblrIE(InfoExtractor): if iframe_url is None: return self.url_result(urlh.geturl(), 'Generic') - iframe = self._download_webpage(iframe_url, video_id) + iframe = self._download_webpage(iframe_url, video_id, + 'Downloading iframe page') video_url = self._search_regex(r' Date: Mon, 20 Jul 2015 19:48:50 +0200 Subject: [PATCH 292/450] [youtube] Fix upload_date in test --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9a08924ef..f73032abd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -543,7 +543,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 
'CsmdDsKjzN8', 'ext': 'mp4', - 'upload_date': '20150510', + 'upload_date': '20150501', # According to ' Date: Mon, 20 Jul 2015 20:14:20 +0200 Subject: [PATCH 293/450] [youtube] Set 'is_live' --- youtube_dl/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f73032abd..3d8b31f98 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -915,6 +915,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Get video info embed_webpage = None + is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -947,6 +948,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags @@ -1251,6 +1254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'dislike_count': dislike_count, 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), 'formats': formats, + 'is_live': is_live, } From 8250c32f49fed0986bd514d1435e00a0182e8314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 20 Jul 2015 20:25:53 +0200 Subject: [PATCH 294/450] [YoutubeDL] don't default to 'bestvideo+bestaudio/best' for live videos Doesn't work currently. 
--- youtube_dl/YoutubeDL.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 00af78e06..702a6ad50 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1104,7 +1104,8 @@ class YoutubeDL(object): if req_format is None: req_format_list = [] if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and - info_dict['extractor'] in ['youtube', 'ted']): + info_dict['extractor'] in ['youtube', 'ted'] and + not info_dict.get('is_live')): merger = FFmpegMergerPP(self) if merger.available and merger.can_merge(): req_format_list.append('bestvideo+bestaudio') From 7c80519cbf7e3daa029239fbbd147652817877f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 20 Jul 2015 21:10:28 +0200 Subject: [PATCH 295/450] [youtube] Extract start_time From the 't=*' in the url. Currently youtube-dl doesn't use the value, but it was requested for the mpv plugin. --- youtube_dl/extractor/common.py | 2 ++ youtube_dl/extractor/youtube.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b9014fc23..9e8751877 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -183,6 +183,8 @@ class InfoExtractor(object): ["Sports", "Berlin"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the url. Unless mentioned otherwise, the fields should be Unicode strings. 
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3d8b31f98..afbd34f4a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -19,6 +19,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, + compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, compat_str, @@ -31,6 +32,7 @@ from ..utils import ( get_element_by_id, int_or_none, orderedSet, + parse_duration, str_to_int, unescapeHTML, unified_strdate, @@ -317,7 +319,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', @@ -329,6 +331,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'like_count': int, 'dislike_count': int, + 'start_time': 1, } }, { @@ -889,6 +892,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'http' if self._downloader.params.get('prefer_insecure', False) else 'https') + start_time = None + parsed_url = compat_urllib_parse_urlparse(url) + for component in [parsed_url.fragment, parsed_url.query]: + query = compat_parse_qs(component) + if 't' in query: + start_time = parse_duration(query['t'][0]) + break + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: @@ -1255,6 +1266,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), 'formats': formats, 'is_live': is_live, + 'start_time': start_time, } From f993afb26d20ac79891621494922887b3c8df02b Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Mon, 20 Jul 2015 23:10:29 +0200 Subject: [PATCH 296/450] README: fix a typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
a2cc89cdb..ac54d7b67 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ which means you can modify it, redistribute it or use it however you like. ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) --playlist-end NUMBER Playlist video to end at (default is last) - --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" + --playlist-items ITEM_SPEC Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13. --match-title REGEX Download only matching titles (regex or caseless sub-string) From 4eb59a6b1cd1c4ca5f7dd284f0c26a329bb8e973 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 21 Jul 2015 09:11:23 +0600 Subject: [PATCH 297/450] [options] Fix a typo (#6307) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 85365d769..9016e3498 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -219,7 +219,7 @@ def parseOpts(overrideArguments=None): selection.add_option( '--playlist-items', dest='playlist_items', metavar='ITEM_SPEC', default=None, - help='Playlist video items to download. Specify indices of the videos in the playlist seperated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') + help='Playlist video items to download. 
Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13.') selection.add_option( '--match-title', dest='matchtitle', metavar='REGEX', From ce9512b78bbd5f180f44670f6b78784a0be3b815 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 21 Jul 2015 17:20:54 +0200 Subject: [PATCH 298/450] release 2015.07.21 --- docs/supportedsites.md | 6 ++++-- youtube_dl/version.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a84878026..73445137f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -28,6 +28,7 @@ - **anitube.se** - **AnySex** - **Aparat** + - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 - **AppleTrailers** - **archive.org**: archive.org videos @@ -486,6 +487,7 @@ - **SportBox** - **SportBoxEmbed** - **SportDeutschland** + - **Sportschau** - **Srf** - **SRMediathek**: Saarländischer Rundfunk - **SSA** @@ -611,8 +613,8 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** - - **vk.com** - - **vk.com:user-videos**: vk.com:All of a user's videos + - **vk**: VK + - **vk:uservideos**: VK - User's Videos - **Vodlocker** - **VoiceRepublic** - **Vporn** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3ad7a2bc0..280afdd7f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.18' +__version__ = '2015.07.21' From 496ce6b3493757e806d857363955a2289410a4e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 21 Jul 2015 23:54:31 +0600 Subject: [PATCH 299/450] [snagfilms] Improve m3u8 extraction (Closes #6309) --- youtube_dl/extractor/snagfilms.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 
deletions(-) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py index cf495f310..0e820d250 100644 --- a/youtube_dl/extractor/snagfilms.py +++ b/youtube_dl/extractor/snagfilms.py @@ -23,6 +23,15 @@ class SnagFilmsEmbedIE(InfoExtractor): 'ext': 'mp4', 'title': '#whilewewatch', } + }, { + # invalid labels, 360p is better that 480p + 'url': 'http://www.snagfilms.com/embed/player?filmId=17ca0950-a74a-11e0-a92a-0026bb61d036', + 'md5': '882fca19b9eb27ef865efeeaed376a48', + 'info_dict': { + 'id': '17ca0950-a74a-11e0-a92a-0026bb61d036', + 'ext': 'mp4', + 'title': 'Life in Limbo', + } }, { 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017', 'only_matching': True, @@ -52,14 +61,15 @@ class SnagFilmsEmbedIE(InfoExtractor): if not file_: continue type_ = source.get('type') - format_id = source.get('label') ext = determine_ext(file_) - if any(_ == 'm3u8' for _ in (type_, ext)): + format_id = source.get('label') or ext + if all(_ == 'm3u8' for _ in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: bitrate = int_or_none(self._search_regex( - r'(\d+)kbps', file_, 'bitrate', default=None)) + [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], + file_, 'bitrate', default=None)) height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ From 28afa6e77a8fb95541ae9bead0ec0e57958cbfc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 21 Jul 2015 20:50:02 +0200 Subject: [PATCH 300/450] [snagfilms] Don't use '_' as a variable that is used --- youtube_dl/extractor/snagfilms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/snagfilms.py index 0e820d250..6977afb27 100644 --- a/youtube_dl/extractor/snagfilms.py +++ b/youtube_dl/extractor/snagfilms.py @@ -63,7 +63,7 @@ class 
SnagFilmsEmbedIE(InfoExtractor): type_ = source.get('type') ext = determine_ext(file_) format_id = source.get('label') or ext - if all(_ == 'm3u8' for _ in (type_, ext)): + if all(v == 'm3u8' for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: From b68a2613f856fbbf943abf8d6de6ffc6aff2e930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 22 Jul 2015 02:00:21 +0600 Subject: [PATCH 301/450] [viewster] Rewrite for new API (Closes #6317) --- youtube_dl/extractor/viewster.py | 218 ++++++++++++++++--------------- 1 file changed, 113 insertions(+), 105 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 1742e66f4..03c873608 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -1,129 +1,137 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_request +from ..compat import ( + compat_urllib_request, + compat_urllib_parse, +) +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, +) class ViewsterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?viewster\.com/movie/(?P\d+-\d+-\d+)' + _VALID_URL = r'http://(?:www\.)?viewster\.com/(?:serie|movie)/(?P\d+-\d+-\d+)' _TESTS = [{ - # movielink, paymethod=fre - 'url': 'http://www.viewster.com/movie/1293-19341-000/hout-wood/', - 'playlist': [{ - 'md5': '8f9d94b282d80c42b378dffdbb11caf3', - 'info_dict': { - 'id': '1293-19341-000-movie', - 'ext': 'flv', - 'title': "'Hout' (Wood) - Movie", - }, - }], - 'info_dict': { - 'id': '1293-19341-000', - 'title': "'Hout' (Wood)", - 'description': 'md5:925733185a9242ef96f436937683f33b', - } - }, { - # movielink, paymethod=adv + # movie, Type=Movie 'url': 'http://www.viewster.com/movie/1140-11855-000/the-listening-project/', - 'playlist': [{ - 'md5': '77a005453ca7396cbe3d35c9bea30aef', - 'info_dict': { - 'id': '1140-11855-000-movie', - 
'ext': 'flv', - 'title': "THE LISTENING PROJECT - Movie", - }, - }], + 'md5': '14d3cfffe66d57b41ae2d9c873416f01', 'info_dict': { 'id': '1140-11855-000', - 'title': "THE LISTENING PROJECT", - 'description': 'md5:714421ae9957e112e672551094bf3b08', - } + 'ext': 'flv', + 'title': 'The listening Project', + 'description': 'md5:bac720244afd1a8ea279864e67baa071', + 'timestamp': 1214870400, + 'upload_date': '20080701', + 'duration': 4680, + }, }, { - # direct links, no movielink - 'url': 'http://www.viewster.com/movie/1198-56411-000/sinister/', - 'playlist': [{ - 'md5': '0307b7eac6bfb21ab0577a71f6eebd8f', - 'info_dict': { - 'id': '1198-56411-000-trailer', - 'ext': 'mp4', - 'title': "Sinister - Trailer", - }, - }, { - 'md5': '80b9ee3ad69fb368f104cb5d9732ae95', - 'info_dict': { - 'id': '1198-56411-000-behind-scenes', - 'ext': 'mp4', - 'title': "Sinister - Behind Scenes", - }, - }, { - 'md5': '3b3ea897ecaa91fca57a8a94ac1b15c5', - 'info_dict': { - 'id': '1198-56411-000-scene-from-movie', - 'ext': 'mp4', - 'title': "Sinister - Scene from movie", - }, - }], + # series episode, Type=Episode + 'url': 'http://www.viewster.com/serie/1284-19427-001/the-world-and-a-wall/', + 'md5': 'd5434c80fcfdb61651cc2199a88d6ba3', 'info_dict': { - 'id': '1198-56411-000', - 'title': "Sinister", - 'description': 'md5:014c40b0488848de9683566a42e33372', - } + 'id': '1284-19427-001', + 'ext': 'flv', + 'title': 'The World and a Wall', + 'description': 'md5:24814cf74d3453fdf5bfef9716d073e3', + 'timestamp': 1428192000, + 'upload_date': '20150405', + 'duration': 1500, + }, + }, { + # serie, Type=Serie + 'url': 'http://www.viewster.com/serie/1303-19426-000/', + 'info_dict': { + 'id': '1303-19426-000', + 'title': 'Is It Wrong to Try to Pick up Girls in a Dungeon?', + 'description': 'md5:eeda9bef25b0d524b3a29a97804c2f11', + }, + 'playlist_count': 13, + }, { + # unfinished serie, no Type + 'url': 'http://www.viewster.com/serie/1284-19427-000/baby-steps-season-2/', + 'info_dict': { + 'id': '1284-19427-000', + 
'title': 'Baby Steps—Season 2', + 'description': 'md5:e7097a8fc97151e25f085c9eb7a1cdb1', + }, + 'playlist_mincount': 16, }] _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' + _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA==' + + def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): + request = compat_urllib_request.Request(url) + request.add_header('Accept', self._ACCEPT_HEADER) + request.add_header('Auth-token', self._AUTH_TOKEN) + return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) def _real_extract(self, url): video_id = self._match_id(url) - request = compat_urllib_request.Request( - 'http://api.live.viewster.com/api/v1/movie/%s' % video_id) - request.add_header('Accept', self._ACCEPT_HEADER) + info = self._download_json( + 'https://public-api.viewster.com/search/%s' % video_id, + video_id, 'Downloading entry JSON') - movie = self._download_json( - request, video_id, 'Downloading movie metadata JSON') + entry_id = info.get('Id') or info['id'] - title = movie.get('title') or movie['original_title'] - description = movie.get('synopsis') - thumbnail = movie.get('large_artwork') or movie.get('artwork') + # unfinished serie has no Type + if info.get('Type') in ['Serie', None]: + episodes = self._download_json( + 'https://public-api.viewster.com/series/%s/episodes' % entry_id, + video_id, 'Downloading series JSON') + entries = [ + self.url_result( + 'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') + for episode in episodes] + title = info.get('Title') or info['Synopsis']['Title'] + description = info.get('Synopsis', {}).get('Detailed') + return self.playlist_result(entries, video_id, title, description) - entries = [] - for clip in movie['play_list']: - entry = None - - # movielink api - link_request = clip.get('link_request') - if link_request: - request = compat_urllib_request.Request( - 
'http://api.live.viewster.com/api/v1/movielink?movieid=%(movieid)s&action=%(action)s&paymethod=%(paymethod)s&price=%(price)s¤cy=%(currency)s&language=%(language)s&subtitlelanguage=%(subtitlelanguage)s&ischromecast=%(ischromecast)s' - % link_request) - request.add_header('Accept', self._ACCEPT_HEADER) - - movie_link = self._download_json( - request, video_id, 'Downloading movie link JSON', fatal=False) - - if movie_link: - formats = self._extract_f4m_formats( - movie_link['url'] + '&hdcore=3.2.0&plugin=flowplayer-3.2.0.1', video_id) - self._sort_formats(formats) - entry = { - 'formats': formats, - } - - # direct link - clip_url = clip.get('clip_data', {}).get('url') - if clip_url: - entry = { - 'url': clip_url, - 'ext': 'mp4', - } - - if entry: - entry.update({ - 'id': '%s-%s' % (video_id, clip['canonical_title']), - 'title': '%s - %s' % (title, clip['title']), + formats = [] + for media_type in ('application/f4m+xml', 'application/x-mpegURL'): + media = self._download_json( + 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' + % (entry_id, compat_urllib_parse.quote(media_type)), + video_id, 'Downloading %s JSON' % media_type, fatal=False) + if not media: + continue + video_url = media.get('Uri') + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + video_url += '&' if '?' in video_url else '?' 
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', + fatal=False # m3u8 sometimes fail + )) + else: + formats.append({ + 'url': video_url, }) - entries.append(entry) + self._sort_formats(formats) - playlist = self.playlist_result(entries, video_id, title, description) - playlist['thumbnail'] = thumbnail - return playlist + synopsis = info.get('Synopsis', {}) + # Prefer title outside synopsis since it's less messy + title = info.get('Title') or synopsis['Title'].strip() + description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short') + duration = int_or_none(info.get('Duration')) + timestamp = parse_iso8601(info.get('ReleaseDate')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } From c84683c88bbcb3cfd8d27af54a418035f431371d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 22 Jul 2015 02:08:25 +0600 Subject: [PATCH 302/450] [viewster] Strip titles --- youtube_dl/extractor/viewster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 03c873608..6ef36290b 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -88,7 +88,7 @@ class ViewsterIE(InfoExtractor): self.url_result( 'http://www.viewster.com/movie/%s' % episode['OriginId'], 'Viewster') for episode in episodes] - title = info.get('Title') or info['Synopsis']['Title'] + title = (info.get('Title') or info['Synopsis']['Title']).strip() description = info.get('Synopsis', {}).get('Detailed') return self.playlist_result(entries, video_id, title, description) @@ -122,7 +122,7 @@ class ViewsterIE(InfoExtractor): synopsis = info.get('Synopsis', {}) # Prefer 
title outside synopsis since it's less messy - title = info.get('Title') or synopsis['Title'].strip() + title = (info.get('Title') or synopsis['Title']).strip() description = synopsis.get('Detailed') or info.get('Synopsis', {}).get('Short') duration = int_or_none(info.get('Duration')) timestamp = parse_iso8601(info.get('ReleaseDate')) From 70c857b7283270146864f92d159445e038f40781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 22 Jul 2015 11:49:54 +0200 Subject: [PATCH 303/450] Credit Zach Bruggeman for the appleconnect extractor (#6190) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 4fd65f46f..373e05c9f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -133,3 +133,4 @@ Remita Amine Aurélio A. Heckert Bernhard Minks sceext +Zach Bruggeman From 981b9cdc8c12d817eaf3ec6b030538c252efe48e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 22:09:01 +0800 Subject: [PATCH 304/450] [lecture2go] Improve some regular expressions --- youtube_dl/extractor/lecture2go.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index 9cf28e31c..fd115ff54 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class Lecture2GoIE(InfoExtractor): - _VALID_URL = r'https?://lecture2go.uni-hamburg.de/veranstaltungen/-/v/(?P[0-9]+)' + _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P\d+)' _TEST = { 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', @@ -21,9 +21,9 @@ class Lecture2GoIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.*?)', webpage, 'title') + title = self._html_search_regex(r']+class="title">(.+)', webpage, 'title') video_url = 
self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') - creator = self._html_search_regex(r'
(.*)
', webpage, 'creator') + creator = self._html_search_regex(r']+id="description">([^<]+)
', webpage, 'creator') return { 'id': video_id, From 795704f0f1f963d3f61a7e20074ce41eeb3cdf95 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 22:39:46 +0800 Subject: [PATCH 305/450] [lecture2go] Support more formats --- youtube_dl/extractor/lecture2go.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index fd115ff54..0075b8a2e 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import determine_ext class Lecture2GoIE(InfoExtractor): @@ -22,12 +25,26 @@ class Lecture2GoIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r']+class="title">(.+)', webpage, 'title') - video_url = self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') + + formats = [] + for url in set(re.findall(r'"src","([^"]+)"', webpage)): + ext = determine_ext(url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats(url, video_id)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(url, video_id)) + else: + formats.append({ + 'url': url, + }) + + self._sort_formats(formats) + creator = self._html_search_regex(r']+id="description">([^<]+)
', webpage, 'creator') return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'creator': creator } From 1e124295644e3760cd457bb1a6ae717e1cb2c0fc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:05:14 +0800 Subject: [PATCH 306/450] [lecture2go] Update _TEST --- youtube_dl/extractor/lecture2go.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index 0075b8a2e..d0e9416f5 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -11,12 +11,12 @@ class Lecture2GoIE(InfoExtractor): _VALID_URL = r'https?://lecture2go\.uni-hamburg\.de/veranstaltungen/-/v/(?P\d+)' _TEST = { 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', - 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', + 'md5': 'ac02b570883020d208d405d5a3fd2f7f', 'info_dict': { 'id': '17473', - 'ext': 'mp4', - 'url': 'https://fms1.rrz.uni-hamburg.de/abo/64.050_FrankHeitmann_2015-04-13_14-35.mp4', - 'title': '2 - Endliche Automaten und reguläre Sprachen' + 'ext': 'flv', + 'title': '2 - Endliche Automaten und reguläre Sprachen', + 'creator': 'Frank Heitmann', } } From 9c29bc69f7d6365835f495dff10f3c5f49671a55 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:15:22 +0800 Subject: [PATCH 307/450] [utils] Improve parse_duration Now dots are parsed. For example '87 Min.' 
--- test/test_utils.py | 1 + youtube_dl/utils.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e13e11b59..65692a9fb 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -324,6 +324,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('02:03:04'), 7384) self.assertEqual(parse_duration('01:02:03:04'), 93784) self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) + self.assertEqual(parse_duration('87 Min.'), 5220) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 942f76d24..ae813099d 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1309,10 +1309,10 @@ def parse_duration(s): m = re.match( r'''(?ix)(?:P?T)? (?: - (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| + (?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| - \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*| + \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| (?: (?: (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? 
From e9c6deffee26db40992293b3055df31804ca7e12 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:22:19 +0800 Subject: [PATCH 308/450] [lecture2go] Add more metadata fields --- youtube_dl/extractor/lecture2go.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index d0e9416f5..a2f9d5c54 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import ( + determine_ext, + parse_duration, + int_or_none, +) class Lecture2GoIE(InfoExtractor): @@ -17,6 +21,7 @@ class Lecture2GoIE(InfoExtractor): 'ext': 'flv', 'title': '2 - Endliche Automaten und reguläre Sprachen', 'creator': 'Frank Heitmann', + 'duration': 5220, } } @@ -41,10 +46,16 @@ class Lecture2GoIE(InfoExtractor): self._sort_formats(formats) creator = self._html_search_regex(r']+id="description">([^<]+)', webpage, 'creator') + duration = parse_duration(self._html_search_regex( + r'Duration:\s*\s*]*>([^<]+)', webpage, 'duration', fatal=False)) + view_count = int_or_none(self._html_search_regex( + r'Views:\s*\s*]+>(\d+)', webpage, 'view count', fatal=False)) return { 'id': video_id, 'title': title, 'formats': formats, - 'creator': creator + 'creator': creator, + 'duration': duration, + 'view_count': view_count, } From 40101dc311909523852a88ba69df76be9b6bc920 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 22 Jul 2015 23:25:32 +0800 Subject: [PATCH 309/450] [lecture2go] Make optional fields non-fatal --- youtube_dl/extractor/lecture2go.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index a2f9d5c54..40a3d2346 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -45,7 +45,8 @@ 
class Lecture2GoIE(InfoExtractor): self._sort_formats(formats) - creator = self._html_search_regex(r']+id="description">([^<]+)', webpage, 'creator') + creator = self._html_search_regex( + r']+id="description">([^<]+)', webpage, 'creator', fatal=False) duration = parse_duration(self._html_search_regex( r'Duration:\s*\s*]*>([^<]+)', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( From d609edf4f14e56e3dbb3ffc0be057585c0533666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 22 Jul 2015 22:49:00 +0600 Subject: [PATCH 310/450] [udemy] Handle already-logged-in scenario (Closes #6327) --- youtube_dl/extractor/udemy.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index e2bab52fe..4a0eaf65f 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -77,7 +77,11 @@ class UdemyIE(InfoExtractor): login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') - if login_popup == '
': + def is_logged(webpage): + return any(p in webpage for p in ['href="https://www.udemy.com/user/logout/', '>Logout<']) + + # already logged in + if is_logged(login_popup): return login_form = self._form_hidden_inputs('login-form', login_popup) @@ -95,8 +99,7 @@ class UdemyIE(InfoExtractor): response = self._download_webpage( request, None, 'Logging in as %s' % username) - if all(logout_pattern not in response - for logout_pattern in ['href="https://www.udemy.com/user/logout/', '>Logout<']): + if not is_logged(response): error = self._html_search_regex( r'(?s)]+class="form-errors[^"]*">(.+?)', response, 'error message', default=None) From ce1bafdce9f7f9e19c172b319a5ae75d1c85759b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 22 Jul 2015 23:49:08 +0600 Subject: [PATCH 311/450] [pbs] Clean up title construction rationale --- youtube_dl/extractor/pbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index ccbe2a9f3..ba6527b80 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -247,8 +247,8 @@ class PBSIE(InfoExtractor): 'url': closed_captions_url, }] - # video.pbs.org video.pbs.org/videoInfo/... frequently provides an obscure 'title' value, like - # 'Full Episode', 'Episode 5', etc. prepend program->title + # info['title'] is often incomplete (e.g. 
'Full Episode', 'Episode 5', etc) + # Try turning it to 'program - title' naming scheme if possible alt_title = info.get('program', {}).get('title') if alt_title: info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-\:]+', '', info['title']) From c7620992d20973320fddb85f936f5dc602d1d708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 22 Jul 2015 23:49:55 +0600 Subject: [PATCH 312/450] [pbs] No need to escape colon --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index ba6527b80..a53479aad 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -251,7 +251,7 @@ class PBSIE(InfoExtractor): # Try turning it to 'program - title' naming scheme if possible alt_title = info.get('program', {}).get('title') if alt_title: - info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-\:]+', '', info['title']) + info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) return { 'id': video_id, From f79ebf09a24f9bf0e91d0dc26782eeea02f10ad1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 23 Jul 2015 01:56:55 +0800 Subject: [PATCH 313/450] Credit @nichdu for Lecture2Go extractor --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 373e05c9f..e75e9885d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -134,3 +134,4 @@ Aurélio A. 
Heckert Bernhard Minks sceext Zach Bruggeman +Tjark Saul From 948199deac4d9ec6ba6bc4359ad43db7951a1c53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 00:30:48 +0600 Subject: [PATCH 314/450] [tagesschau] Relax _VALID_URL and simplify --- youtube_dl/extractor/tagesschau.py | 62 ++++++++---------------------- 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 636607db5..b84892364 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -8,7 +8,7 @@ from ..utils import parse_filesize, ExtractorError class TagesschauIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/(ts|tsg|tt|nm|bab/bab)|video/video|tsvorzwanzig)(?P-?[0-9]+)(?:~[-_a-zA-Z0-9]*)?\.html' + _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P-?[0-9]+)(?:~_[^/#?]+?)?\.html' _TESTS = [{ 'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html', @@ -29,57 +29,31 @@ class TagesschauIE(InfoExtractor): 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'thumbnail': 're:^http:.*\.jpg$', - } + }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', - 'md5': '90757268b49ef56deae90c7b48928d58', - 'info_dict': { - 'id': '3771', - 'ext': 'mp4', - 'description': None, - 'title': 'Sendung: tagesschau (mit Gebärdensprache) \t14.07.2015 20:00 Uhr', - 'thumbnail': 're:^http:.*\.jpg$', - } + 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tt-3827.html', - 'md5': '6e3ebdc75e8d67da966a8d06721eda71', - 'info_dict': { - 'id': '3827', - 'ext': 'mp4', - 'description': 'md5:d511d0e278b0ad341a95ad9ab992ce66', - 'title': 'Sendung: tagesthemen \t14.07.2015 22:15 Uhr', - 'thumbnail': 're:^http:.*\.jpg$', - } + 'only_matching': True, }, { 'url': 
'http://www.tagesschau.de/multimedia/sendung/nm-3475.html', - 'md5': '8a8875a568f0a5ae5ceef93c501a225f', - 'info_dict': { - 'id': '3475', - 'ext': 'mp4', - 'description': 'md5:ed149f5649cda3dac86813a9d777e131', - 'title': 'Sendung: nachtmagazin \t15.07.2015 00:15 Uhr', - 'thumbnail': 're:^http:.*\.jpg$', - } + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html', + 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html', - 'md5': 'be4d6f0421f2acd8abe25ea29f6f015b', - 'info_dict': { - 'id': '959', - 'ext': 'mp4', - 'description': None, - 'title': 'Sendung: tagesschau vor 20 Jahren \t14.07.2015 22:45 Uhr', - 'thumbnail': 're:^http:.*\.jpg$', - } + 'only_matching': True, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html', - 'md5': '42e3757018d9908581481a80cc1806da', - 'info_dict': { - 'id': '3299', - 'ext': 'mp4', - 'description': None, - 'title': 'Nach dem Referendum: Schaltgespräch nach Athen', - 'thumbnail': 're:^http:.*\.jpg$', - } + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', + 'only_matching': True, + }, { + 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', + 'only_matching': True, }] _FORMATS = { @@ -152,11 +126,9 @@ class TagesschauIE(InfoExtractor): thumbnail_fn = self._search_regex( r'(?s)Sendungsbild(.*?)

', - webpage, 'description', fatal=False, default=None) + webpage, 'description', default=None) title = self._html_search_regex( r'(.*?)', webpage, 'title') From a47b602b0877dcde1b795bf53bfe3629c6595870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 01:01:04 +0600 Subject: [PATCH 315/450] [tagesschau] Add support for audio --- youtube_dl/extractor/tagesschau.py | 37 +++++++++++++++++++----------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index b84892364..7fd0ba987 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -30,6 +30,16 @@ class TagesschauIE(InfoExtractor): 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', 'thumbnail': 're:^http:.*\.jpg$', }, + }, { + 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', + 'md5': 'aef45de271c4bf0a5db834aa40bf774c', + 'info_dict': { + 'id': '18407', + 'ext': 'mp3', + 'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich', + 'thumbnail': 're:^https?:.*\.jpg$', + }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html', 'only_matching': True, @@ -51,9 +61,6 @@ class TagesschauIE(InfoExtractor): }, { 'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html', 'only_matching': True, - }, { - 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', - 'only_matching': True, }] _FORMATS = { @@ -73,19 +80,26 @@ class TagesschauIE(InfoExtractor): playerpage = self._download_webpage( player_url, display_id, 'Downloading player page') - medias = re.findall( - r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"', - playerpage) formats = [] - for url, ext, res in medias: + for media in re.finditer( + r'''(?x) + (?P["\'])(?Phttp://media.+?)(?P=q_url) + 
,\s*type:(?P["\'])(?Pvideo|audio)/(?P.+?)(?P=q_type) + (?:,\s*quality:(?P["\'])(?P.+?)(?P=q_quality))? + ''', playerpage): + url = media.group('url') + type_ = media.group('type') + ext = media.group('ext') + res = media.group('quality') f = { - 'format_id': res + '_' + ext, + 'format_id': '%s_%s' % (res, ext) if res else ext, 'url': url, 'ext': ext, + 'vcodec': 'none' if type_ == 'audio' else None, } f.update(self._FORMATS.get(res, {})) formats.append(f) - thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1] + thumbnail = self._og_search_thumbnail(playerpage) title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() else: @@ -123,9 +137,7 @@ class TagesschauIE(InfoExtractor): 'filesize_approx': parse_filesize(m.group('filesize_approx')), }) formats.append(format) - thumbnail_fn = self._search_regex( - r'(?s)Sendungsbild(.*?)

', webpage, 'description', default=None) @@ -133,7 +145,6 @@ class TagesschauIE(InfoExtractor): r'(.*?)', webpage, 'title') self._sort_formats(formats) - thumbnail = 'http://www.tagesschau.de' + thumbnail_fn return { 'id': display_id, From 3e214851a49992b010ae90ef6dbaed11f70ceb3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 01:01:35 +0600 Subject: [PATCH 316/450] [tagesschau] Improve tests --- youtube_dl/extractor/tagesschau.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 7fd0ba987..1a7ea98d2 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -18,7 +18,7 @@ class TagesschauIE(InfoExtractor): 'ext': 'mp4', 'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt', 'description': 'md5:171feccd9d9b3dd54d05d501568f6359', - 'thumbnail': 're:^http:.*\.jpg$', + 'thumbnail': 're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html', @@ -28,7 +28,7 @@ class TagesschauIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:695c01bfd98b7e313c501386327aea59', 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr', - 'thumbnail': 're:^http:.*\.jpg$', + 'thumbnail': 're:^https?:.*\.jpg$', }, }, { 'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html', From c59b61c0da8fc0fb73d745de59122e290a33b122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 02:13:51 +0600 Subject: [PATCH 317/450] [viki] Fix height (Closes #6333) --- youtube_dl/extractor/viki.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 51cdc6b65..3a7e9a0f2 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -242,8 +242,8 @@ class VikiIE(VikiBaseIE): formats = [] for format_id, stream_dict in streams.items(): 
- height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': formats = self._extract_m3u8_formats( From 61be92e26a16e24a702c9c65b46f15c375726345 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 03:04:23 +0600 Subject: [PATCH 318/450] [prosiebensat1] Recognize DRM protected videos (#6334) --- youtube_dl/extractor/prosiebensat1.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index fec008ce7..0739234c6 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -9,6 +9,7 @@ from ..compat import ( compat_urllib_parse, ) from ..utils import ( + ExtractorError, determine_ext, int_or_none, unified_strdate, @@ -224,10 +225,13 @@ class ProSiebenSat1IE(InfoExtractor): 'ids': clip_id, }) - videos = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON') + video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0] - duration = float(videos[0]['duration']) - source_ids = [source['id'] for source in videos[0]['sources']] + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + duration = float(video['duration']) + source_ids = [source['id'] for source in video['sources']] source_ids_str = ','.join(map(str, source_ids)) g = '01!8d8F_)r9]4s[qeuXfP%' From 993df6bc228ca540bba45e0f5b6041ac9440d695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 03:05:16 +0600 Subject: [PATCH 319/450] [prosiebensat1] Modernize --- youtube_dl/extractor/prosiebensat1.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py 
b/youtube_dl/extractor/prosiebensat1.py index 0739234c6..effcf1db3 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, determine_ext, + float_or_none, int_or_none, unified_strdate, ) @@ -230,7 +231,7 @@ class ProSiebenSat1IE(InfoExtractor): if video.get('is_protected') is True: raise ExtractorError('This video is DRM protected.', expected=True) - duration = float(video['duration']) + duration = float_or_none(video.get('duration')) source_ids = [source['id'] for source in video['sources']] source_ids_str = ','.join(map(str, source_ids)) From ca4456eda8f49ed2f939c7ae5cfe55b45777217c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 22 Jul 2015 23:23:38 +0200 Subject: [PATCH 320/450] [tagesschau] Remove unused import --- youtube_dl/extractor/tagesschau.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tagesschau.py b/youtube_dl/extractor/tagesschau.py index 1a7ea98d2..73e7657d4 100644 --- a/youtube_dl/extractor/tagesschau.py +++ b/youtube_dl/extractor/tagesschau.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_filesize, ExtractorError +from ..utils import parse_filesize class TagesschauIE(InfoExtractor): From 87dc451108b278eb5e5600a0159bdbd337272392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 20 Jul 2015 20:35:26 +0200 Subject: [PATCH 321/450] [youtube] Don't use the DASH manifest from 'get_video_info' if 'use_cipher_signature' is True (#5118) Currently they give a 403 Forbidden error. 
--- youtube_dl/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3d8b31f98..323681960 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -967,7 +967,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id, note=False, errnote='unable to download video info webpage') get_video_info = compat_parse_qs(video_info_webpage) - add_dash_mpd(get_video_info) + if get_video_info.get('use_cipher_signature') != ['True']: + add_dash_mpd(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: From b37317d8b0391651b396fc2dc4e35b59e52fddf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 22 Jul 2015 23:33:49 +0200 Subject: [PATCH 322/450] [generic] Unescape HTML escape sequences in redirect urls (fixes #6311) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cd133a10c..6d2efb22e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1702,7 +1702,7 @@ class GenericIE(InfoExtractor): if refresh_header: found = re.search(REDIRECT_REGEX, refresh_header) if found: - new_url = compat_urlparse.urljoin(url, found.group(1)) + new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) self.report_following_redirect(new_url) return { '_type': 'url', From 41597d9bed9eaa5e55d5bb572f2ec3f5a312d392 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 23 Jul 2015 13:39:19 +0800 Subject: [PATCH 323/450] [viki] Fix description extraction (closes #6339) --- youtube_dl/extractor/viki.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 3a7e9a0f2..e987badbd 100644 --- a/youtube_dl/extractor/viki.py +++ 
b/youtube_dl/extractor/viki.py @@ -173,6 +173,19 @@ class VikiIE(VikiBaseIE): }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, + }, { + # non-English description + 'url': 'http://www.viki.com/videos/158036v-love-in-magic', + 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'info_dict': { + 'id': '158036v', + 'ext': 'mp4', + 'uploader': 'I Planet Entertainment', + 'upload_date': '20111122', + 'timestamp': 1321985454, + 'description': 'md5:44b1e46619df3a072294645c770cef36', + 'title': 'Love In Magic', + }, }] def _real_extract(self, url): @@ -192,8 +205,12 @@ class VikiIE(VikiBaseIE): container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]] title = '%s - %s' % (container_title, title) - descriptions = video.get('descriptions') - description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None + descriptions = video.get('descriptions', {}) + description = descriptions.get('en') + if description is None: + filtered_descriptions = list(filter(None, [descriptions.get(k) for k in titles.keys()])) + if filtered_descriptions: + description = filtered_descriptions[0] duration = int_or_none(video.get('duration')) timestamp = parse_iso8601(video.get('created_at')) From b73b14f72c8240466b5bded9fb891697549b89ec Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 23 Jul 2015 14:02:19 +0800 Subject: [PATCH 324/450] [viki] Rewrite dict selection codes --- youtube_dl/extractor/viki.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index e987badbd..ddbd395c8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -88,6 +88,14 @@ class VikiBaseIE(InfoExtractor): if not self._token: self.report_warning('Unable to get session token, login has probably failed') + @staticmethod + def dict_selection(dict_obj, preferred_key): + if preferred_key in dict_obj: + 
return dict_obj.get(preferred_key) + + filtered_dict = list(filter(None, [dict_obj.get(k) for k in dict_obj.keys()])) + return filtered_dict[0] if filtered_dict else None + class VikiIE(VikiBaseIE): IE_NAME = 'viki' @@ -194,23 +202,14 @@ class VikiIE(VikiBaseIE): video = self._call_api( 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') - title = None - titles = video.get('titles') - if titles: - title = titles.get('en') or titles[titles.keys()[0]] + title = self.dict_selection(video.get('titles', {}), 'en') if not title: title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = video.get('container', {}).get('titles') - if container_titles: - container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]] - title = '%s - %s' % (container_title, title) + container_titles = video.get('container', {}).get('titles', {}) + container_title = self.dict_selection(container_titles, 'en') + title = '%s - %s' % (container_title, title) - descriptions = video.get('descriptions', {}) - description = descriptions.get('en') - if description is None: - filtered_descriptions = list(filter(None, [descriptions.get(k) for k in titles.keys()])) - if filtered_descriptions: - description = filtered_descriptions[0] + description = self.dict_selection(video.get('descriptions', {}), 'en') duration = int_or_none(video.get('duration')) timestamp = parse_iso8601(video.get('created_at')) @@ -316,11 +315,9 @@ class VikiChannelIE(VikiBaseIE): 'containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') - titles = channel['titles'] - title = titles.get('en') or titles[titles.keys()[0]] + title = self.dict_selection(channel['titles'], 'en') - descriptions = channel['descriptions'] - description = descriptions.get('en') or descriptions[descriptions.keys()[0]] + description = self.dict_selection(channel['descriptions'], 'en') entries = [] for video_type in ('episodes', 
'clips', 'movies'): From 59db9f80187c4dc3768f8656ede281347a805e1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 23 Jul 2015 12:09:30 +0200 Subject: [PATCH 325/450] [downloader/dash] Improve 'combine_url' (fixes #6341) In some videos the base_url already ends with '/'. --- youtube_dl/downloader/dash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index a4685d307..8b6fa2753 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -37,7 +37,7 @@ class DashSegmentsFD(FileDownloader): def combine_url(base_url, target_url): if re.match(r'^https?://', target_url): return target_url - return '%s/%s' % (base_url, target_url) + return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) with open(tmpfilename, 'wb') as outf: append_url_to_file( From 53b8247cb5b5ac4a7822c82b94ec2f1221a40625 Mon Sep 17 00:00:00 2001 From: fnord Date: Thu, 23 Jul 2015 01:38:55 -0500 Subject: [PATCH 326/450] NationalGeographic._VALID_URL: work site-wide Closes #6343. --- youtube_dl/extractor/nationalgeographic.py | 35 +++++++++++++++------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index f793b72f5..6fc9e7b05 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -8,18 +8,30 @@ from ..utils import ( class NationalGeographicIE(InfoExtractor): - _VALID_URL = r'http://video\.nationalgeographic\.com/video/.*?' + _VALID_URL = r'http://video\.nationalgeographic\.com/.*?' 
- _TEST = { - 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo', - 'info_dict': { - 'id': '4DmDACA6Qtk_', - 'ext': 'flv', - 'title': 'Mating Crabs Busted by Sharks', - 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3', + _TESTS = [ + { + 'url': 'http://video.nationalgeographic.com/video/news/150210-news-crab-mating-vin?source=featuredvideo', + 'info_dict': { + 'id': '4DmDACA6Qtk_', + 'ext': 'flv', + 'title': 'Mating Crabs Busted by Sharks', + 'description': 'md5:16f25aeffdeba55aaa8ec37e093ad8b3', + }, + 'add_ie': ['ThePlatform'], }, - 'add_ie': ['ThePlatform'], - } + { + 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', + 'info_dict': { + 'id': '_JeBD_D7PlS5', + 'ext': 'flv', + 'title': 'The Real Jaws', + 'description': 'md5:8d3e09d9d53a85cd397b4b21b2c77be6', + }, + 'add_ie': ['ThePlatform'], + }, + ] def _real_extract(self, url): name = url_basename(url) @@ -37,5 +49,6 @@ class NationalGeographicIE(InfoExtractor): return self.url_result(smuggle_url( 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id, - # For some reason, the normal links don't work and we must force the use of f4m + # For some reason, the normal links don't work and we must force + # the use of f4m {'force_smil_url': True})) From 297a564beeb20ca8b00d94f5707532110631f409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 23 Jul 2015 13:20:21 +0200 Subject: [PATCH 327/450] [youtube] Extract end_time --- youtube_dl/extractor/common.py | 2 ++ youtube_dl/extractor/youtube.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9e8751877..1272834c5 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -185,6 +185,8 @@ class InfoExtractor(object): live stream that goes on instead of a fixed-length 
video. start_time: Time in seconds where the reproduction should start, as specified in the url. + end_time: Time in seconds where the reproduction should end, as + specified in the url. Unless mentioned otherwise, the fields should be Unicode strings. diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index afbd34f4a..117ef2e77 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -319,7 +319,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', @@ -332,6 +332,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'dislike_count': int, 'start_time': 1, + 'end_time': 9, } }, { @@ -893,12 +894,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else 'https') start_time = None + end_time = None parsed_url = compat_urllib_parse_urlparse(url) for component in [parsed_url.fragment, parsed_url.query]: query = compat_parse_qs(component) - if 't' in query: + if start_time is None and 't' in query: start_time = parse_duration(query['t'][0]) - break + if end_time is None and 'end' in query: + end_time = parse_duration(query['end'][0]) # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) @@ -1267,6 +1270,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'formats': formats, 'is_live': is_live, 'start_time': start_time, + 'end_time': end_time, } From 2929fa0e79dfd3a1366e7e23eb4344bc93dd3a10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 23 Jul 2015 13:21:18 +0200 Subject: [PATCH 328/450] [youtube] Also look into the 'start' field for start_time --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 117ef2e77..462d244d8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -900,6 +900,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): query = compat_parse_qs(component) if start_time is None and 't' in query: start_time = parse_duration(query['t'][0]) + if start_time is None and 'start' in query: + start_time = parse_duration(query['start'][0]) if end_time is None and 'end' in query: end_time = parse_duration(query['end'][0]) From d3f007af183a0f5ed278602128e6bba3cc1350b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 22:04:47 +0600 Subject: [PATCH 329/450] [daylimotion] Adapt to player v5 and modernize (Closes #6151, closes #6250) --- youtube_dl/extractor/dailymotion.py | 160 ++++++++++++++++++++-------- 1 file changed, 117 insertions(+), 43 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 1a41c0db1..8fcae7402 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -13,8 +13,10 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + determine_ext, int_or_none, orderedSet, + parse_iso8601, str_to_int, unescapeHTML, ) @@ -28,10 +30,12 @@ class DailymotionBaseInfoExtractor(InfoExtractor): request.add_header('Cookie', 'family_filter=off; ff=off') return request + def _download_webpage_no_ff(self, url, *args, **kwargs): + request = self._build_request(url) + return self._download_webpage(request, *args, **kwargs) + class DailymotionIE(DailymotionBaseInfoExtractor): - """Information Extractor for Dailymotion""" - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(embed|#)/)?video/(?P[^/?_]+)' IE_NAME = 'dailymotion' @@ -50,10 +54,17 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'info_dict': { 'id': 'x2iuewm', 'ext': 'mp4', - 'uploader': 'IGN', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 
'upload_date': '20150306', + 'description': 'Several come bundled with the Steam Controller.', + 'thumbnail': 're:^https?:.*\.(?:jpg|png)$', 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, + 'comment_count': int, } }, # Vevo video @@ -87,38 +98,106 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - url = 'https://www.dailymotion.com/video/%s' % video_id - # Retrieve video webpage to extract further information - request = self._build_request(url) - webpage = self._download_webpage(request, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - - # It may just embed a vevo video: - m_vevo = re.search( - r'[\w]*)', - webpage) - if m_vevo is not None: - vevo_id = m_vevo.group('id') - self.to_screen('Vevo video detected: %s' % vevo_id) - return self.url_result('vevo:%s' % vevo_id, ie='Vevo') + webpage = self._download_webpage_no_ff( + 'https://www.dailymotion.com/video/%s' % video_id, video_id) age_limit = self._rta_search(webpage) - video_upload_date = None - mobj = re.search(r'', webpage) - if mobj is not None: - video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') + + view_count = str_to_int(self._search_regex( + [r']+itemprop="interactionCount"[^>]+content="UserPlays:(\d+)"', + r'video_views_count[^>]+>\s+([\d\.,]+)'], + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r']+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"', + webpage, 'comment count', fatal=False)) + + player_v5 = self._search_regex( + r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', + webpage, 'player v5', default=None) + if player_v5: + player = self._parse_json(player_v5, video_id) + metadata = 
player['metadata'] + formats = [] + for quality, media_list in metadata['qualities'].items(): + for media in media_list: + media_url = media.get('url') + if not media_url: + continue + type_ = media.get('type') + if type_ == 'application/vnd.lumberjack.manifest': + continue + if type_ == 'application/x-mpegURL' or determine_ext(media_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls')) + else: + f = { + 'url': media_url, + 'format_id': quality, + } + m = re.search(r'H264-(?P\d+)x(?P\d+)', media_url) + if m: + f.update({ + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) + formats.append(f) + self._sort_formats(formats) + + title = metadata['title'] + duration = int_or_none(metadata.get('duration')) + timestamp = int_or_none(metadata.get('created_time')) + thumbnail = metadata.get('poster_url') + uploader = metadata.get('owner', {}).get('screenname') + uploader_id = metadata.get('owner', {}).get('id') + + subtitles = {} + for subtitle_lang, subtitle in metadata.get('subtitles', {}).get('data', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': determine_ext(subtitle_url), + 'url': subtitle_url, + } for subtitle_url in subtitle.get('urls', [])] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'age_limit': age_limit, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, + 'subtitles': subtitles, + } + + # vevo embed + vevo_id = self._search_regex( + r'[\w]*)', + webpage, 'vevo embed', default=None) + if vevo_id: + return self.url_result('vevo:%s' % vevo_id, 'Vevo') + + # fallback old player + embed_page = self._download_webpage_no_ff( + 'https://www.dailymotion.com/embed/video/%s' % video_id, + video_id, 'Downloading embed page') + + timestamp = parse_iso8601(self._html_search_meta( + 
'video:release_date', webpage, 'upload date')) + + info = self._parse_json( + self._search_regex( + r'var info = ({.*?}),$', embed_page, + 'video info', flags=re.MULTILINE), + video_id) - embed_url = 'https://www.dailymotion.com/embed/video/%s' % video_id - embed_request = self._build_request(embed_url) - embed_page = self._download_webpage( - embed_request, video_id, 'Downloading embed page') - info = self._search_regex(r'var info = ({.*?}),$', embed_page, - 'video info', flags=re.MULTILINE) - info = json.loads(info) if info.get('error') is not None: msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] raise ExtractorError(msg, expected=True) @@ -139,16 +218,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'width': width, 'height': height, }) - if not formats: - raise ExtractorError('Unable to extract video URL') + self._sort_formats(formats) # subtitles video_subtitles = self.extract_subtitles(video_id, webpage) - view_count = str_to_int(self._search_regex( - r'video_views_count[^>]+>\s+([\d\.,]+)', - webpage, 'view count', fatal=False)) - title = self._og_search_title(webpage, default=None) if title is None: title = self._html_search_regex( @@ -159,8 +233,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'id': video_id, 'formats': formats, 'uploader': info['owner.screenname'], - 'upload_date': video_upload_date, + 'timestamp': timestamp, 'title': title, + 'description': description, 'subtitles': video_subtitles, 'thumbnail': info['thumbnail_url'], 'age_limit': age_limit, @@ -201,9 +276,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): def _extract_entries(self, id): video_ids = [] for pagenum in itertools.count(1): - request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum)) - webpage = self._download_webpage(request, - id, 'Downloading page %s' % pagenum) + webpage = self._download_webpage_no_ff( + self._PAGE_TEMPLATE % (id, pagenum), + id, 'Downloading page %s' % pagenum) 
video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) @@ -286,8 +361,7 @@ class DailymotionCloudIE(DailymotionBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - request = self._build_request(url) - webpage = self._download_webpage(request, video_id) + webpage = self._download_webpage_no_ff(url, video_id) title = self._html_search_regex(r'([^>]+)', webpage, 'title') From ba911137fa53737b663cb2879d357bc8a35ab558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 22:50:59 +0600 Subject: [PATCH 330/450] [rts] Add support for articles with videos on rhs (Closes #6332) --- youtube_dl/extractor/rts.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 9fbe239d8..12639f08b 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -19,7 +19,16 @@ from ..utils import ( class RTSIE(InfoExtractor): IE_DESC = 'RTS.ch' - _VALID_URL = r'https?://(?:www\.)?rts\.ch/(?:(?:[^/]+/){2,}(?P[0-9]+)-(?P.+?)\.html|play/tv/[^/]+/video/(?P.+?)\?id=(?P[0-9]+))' + _VALID_URL = r'''(?x) + (?: + rts:(?P\d+)| + https?:// + (?:www\.)?rts\.ch/ + (?: + (?:[^/]+/){2,}(?P[0-9]+)-(?P.+?)\.html| + play/tv/[^/]+/video/(?P.+?)\?id=(?P[0-9]+) + ) + )''' _TESTS = [ { @@ -122,6 +131,15 @@ class RTSIE(InfoExtractor): 'view_count': int, }, }, + { + # article with videos on rhs + 'url': 'http://www.rts.ch/sport/hockey/6693917-hockey-davos-decroche-son-31e-titre-de-champion-de-suisse.html', + 'info_dict': { + 'id': '6693917', + 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', + }, + 'playlist_mincount': 5, + }, { 'url': 'http://www.rts.ch/play/tv/le-19h30/video/le-chantier-du-nouveau-parlement-vaudois-a-permis-une-trouvaille-historique?id=6348280', 'only_matching': True, @@ -130,7 +148,7 @@ class RTSIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) - 
video_id = m.group('id') or m.group('id_new') + video_id = m.group('rts_id') or m.group('id') or m.group('id_new') display_id = m.group('display_id') or m.group('display_id_new') def download_json(internal_id): @@ -143,6 +161,15 @@ class RTSIE(InfoExtractor): # video_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: page = self._download_webpage(url, display_id) + + # article with videos on rhs + videos = re.findall( + r']+class="content-item"[^>]*>\s*]+data-video-urn="urn:rts:video:(\d+)"', + page) + if videos: + entries = [self.url_result('rts:%s' % video_urn, 'RTS') for video_urn in videos] + return self.playlist_result(entries, video_id, self._og_search_title(page)) + internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, 'internal video id') From a8b7b26068fa8de9983ffef995ea6cd4fd3ce90a Mon Sep 17 00:00:00 2001 From: corone17 Date: Wed, 22 Jul 2015 22:38:04 +0200 Subject: [PATCH 331/450] Undo adaptive -> flash workaround For a couple of days now rtlXL's non-DRM adaptive (m3u8) streams don't work anymore. By undoing the adaptive -> flash workaround youtube-dl returns the DRM streams again. Too bad for foreigners (geoblock). The progressive streams still work fine. --- youtube_dl/extractor/rtlnl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e0c530d64..518f7a7b4 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -86,12 +86,12 @@ class RtlNlIE(InfoExtractor): # NB: nowadays, recent ffmpeg and avconv can handle these encrypted streams, so # this adaptive -> flash workaround is not required in general, but it also # allows bypassing georestriction therefore is retained for now. 
- videopath = material['videopath'].replace('/adaptive/', '/flash/') + videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') - video_urlpart = videopath.split('/flash/')[1][:-5] + video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ From a9e8f60ef686eeeb3d31f2e5211ce176dc8f5298 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 23:20:24 +0600 Subject: [PATCH 332/450] [rtlnl] Update unencrypted streams comment (#6337) --- youtube_dl/extractor/rtlnl.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 518f7a7b4..543d94417 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -82,10 +82,15 @@ class RtlNlIE(InfoExtractor): meta = info.get('meta', {}) - # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - # NB: nowadays, recent ffmpeg and avconv can handle these encrypted streams, so - # this adaptive -> flash workaround is not required in general, but it also - # allows bypassing georestriction therefore is retained for now. + # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. + # To workaround this previously adaptive -> flash trick was used to obtain + # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) + # and bypass georestrictions as well. + # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore + # unusable albeit can be fixed by simple string replacement (see + # https://github.com/rg3/youtube-dl/pull/6337) + # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted + # streams are used now. 
videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath From 10952eb2cf2c86c841e46690a3bfa6a39c82d67d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 23 Jul 2015 23:37:45 +0600 Subject: [PATCH 333/450] [extractor/common] Consistent URL spelling --- youtube_dl/extractor/common.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1272834c5..14b9b4fe2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -65,7 +65,7 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing + * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). Calculated from the format_id, width, height. @@ -155,7 +155,7 @@ class InfoExtractor(object): lower to higher preference, each element is a dictionary with the "ext" entry and one of: * "data": The subtitles file contents - * "url": A url pointing to the subtitles file + * "url": A URL pointing to the subtitles file automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. @@ -176,7 +176,7 @@ class InfoExtractor(object): Set to "root" to indicate that this is a comment to the original video. age_limit: Age restriction for the video, as an integer (years) - webpage_url: The url to the video webpage, if given to youtube-dl it + webpage_url: The URL to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example @@ -184,9 +184,9 @@ class InfoExtractor(object): is_live: True, False, or None (=unknown). 
Whether this video is a live stream that goes on instead of a fixed-length video. start_time: Time in seconds where the reproduction should start, as - specified in the url. + specified in the URL. end_time: Time in seconds where the reproduction should end, as - specified in the url. + specified in the URL. Unless mentioned otherwise, the fields should be Unicode strings. @@ -505,7 +505,7 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): - """Returns a url that points to a page that should be processed""" + """Returns a URL that points to a page that should be processed""" # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, @@ -639,7 +639,7 @@ class InfoExtractor(object): return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) + return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) @@ -1120,7 +1120,7 @@ class InfoExtractor(object): class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} Instances should define _SEARCH_KEY and _MAX_RESULTS. 
""" From 660f9459dabfc01c7235b4cc57f9c561de522e07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jul 2015 00:17:36 +0600 Subject: [PATCH 334/450] [canalplus] Fix m3u8 videos extension --- youtube_dl/extractor/canalplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 699b4f7d0..8671236dd 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -106,7 +106,7 @@ class CanalplusIE(InfoExtractor): continue format_id = fmt.tag if format_id == 'HLS': - hls_formats = self._extract_m3u8_formats(format_url, video_id, 'flv') + hls_formats = self._extract_m3u8_formats(format_url, video_id, 'mp4') for fmt in hls_formats: fmt['preference'] = preference(format_id) formats.extend(hls_formats) From f3f0b8e4030a07e07385afe6ebb4485ac33e5357 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jul 2015 00:21:13 +0600 Subject: [PATCH 335/450] [canalplus] Modernize --- youtube_dl/extractor/canalplus.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 8671236dd..57e0cda2c 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -106,15 +106,11 @@ class CanalplusIE(InfoExtractor): continue format_id = fmt.tag if format_id == 'HLS': - hls_formats = self._extract_m3u8_formats(format_url, video_id, 'mp4') - for fmt in hls_formats: - fmt['preference'] = preference(format_id) - formats.extend(hls_formats) + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', preference=preference(format_id))) elif format_id == 'HDS': - hds_formats = self._extract_f4m_formats(format_url + '?hdcore=2.11.3', video_id) - for fmt in hds_formats: - fmt['preference'] = preference(format_id) - formats.extend(hds_formats) + formats.extend(self._extract_f4m_formats( + format_url + 
'?hdcore=2.11.3', video_id, preference=preference(format_id))) else: formats.append({ 'url': format_url, From d97f5cd795b0bd8b6bd636b899bdbc8b10d5907b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jul 2015 02:56:54 +0600 Subject: [PATCH 336/450] [bbccouk] Make more robust (Closes #6345) --- youtube_dl/extractor/bbccouk.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 5825d2867..b2e5f7418 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -210,12 +210,12 @@ class BBCCoUkIE(InfoExtractor): def _extract_video(self, media, programme_id): formats = [] - vbr = int(media.get('bitrate')) + vbr = int_or_none(media.get('bitrate')) vcodec = media.get('encoding') service = media.get('service') - width = int(media.get('width')) - height = int(media.get('height')) - file_size = int(media.get('media_file_size')) + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) for connection in self._extract_connections(media): conn_formats = self._extract_connection(connection, programme_id) for format in conn_formats: @@ -232,7 +232,7 @@ class BBCCoUkIE(InfoExtractor): def _extract_audio(self, media, programme_id): formats = [] - abr = int(media.get('bitrate')) + abr = int_or_none(media.get('bitrate')) acodec = media.get('encoding') service = media.get('service') for connection in self._extract_connections(media): @@ -300,7 +300,7 @@ class BBCCoUkIE(InfoExtractor): if kind != 'programme' and kind != 'radioProgramme': continue programme_id = item.get('vpid') - duration = int(item.get('duration')) + duration = int_or_none(item.get('duration')) formats, subtitles = self._download_media_selector(programme_id) return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: @@ -332,7 +332,7 @@ class 
BBCCoUkIE(InfoExtractor): title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text programme_id = item.get('identifier') - duration = int(item.get('duration')) + duration = int_or_none(item.get('duration')) formats, subtitles = self._download_media_selector(programme_id) return programme_id, title, description, duration, formats, subtitles From 2b2ee140c3c6f08b4078cc6a5a289e5e74bec2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 24 Jul 2015 03:40:24 +0600 Subject: [PATCH 337/450] [dailymotion:user] Fix _VALID_URL (Closes #6346) --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 8fcae7402..5a4987772 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -302,7 +302,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|#|video|playlist)/)(?:(?:old/)?user/)?(?P[^/]+)' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', From 1a117a77287e7dbd4d92f29062dabcf4efb86cb5 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 24 Jul 2015 12:00:20 +0100 Subject: [PATCH 338/450] [clipfish] extract mp4 video link --- youtube_dl/extractor/clipfish.py | 37 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index a5c3cb7c6..09dfaac60 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,13 +1,11 @@ from __future__ import 
unicode_literals -import re -import time -import xml.etree.ElementTree - from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_duration, + int_or_none, + js_to_json, + determine_ext, ) @@ -17,37 +15,40 @@ class ClipfishIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P[0-9]+)/' _TEST = { 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', - 'md5': '2521cd644e862936cf2e698206e47385', + 'md5': '79bc922f3e8a9097b3d68a93780fd475', 'info_dict': { 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', 'duration': 82, - }, - 'skip': 'Blocked in the US' + } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_info = self._parse_json( + js_to_json(self._html_search_regex('var videoObject = ({[^}]+?})', webpage, 'videoObject')), + video_id + ) + info_url = self._parse_json( + js_to_json(self._html_search_regex('var globalFlashvars = ({[^}]+?})', webpage, 'globalFlashvars')), + video_id + )['data'] - info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % - (video_id, int(time.time()))) doc = self._download_xml( info_url, video_id, note='Downloading info page') title = doc.find('title').text video_url = doc.find('filename').text - if video_url is None: - xml_bytes = xml.etree.ElementTree.tostring(doc) - raise ExtractorError('Cannot find video URL in document %r' % - xml_bytes) thumbnail = doc.find('imageurl').text - duration = parse_duration(doc.find('duration').text) + duration = int_or_none(video_info['length']) + formats = [{'url': video_info['videourl']},{'url': video_url}] + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'thumbnail': thumbnail, 'duration': duration, } From 12434026574bcaaaa705c31ef14428cc91a5efad Mon Sep 17 00:00:00 2001 From: Yen Chi 
Hsuan Date: Fri, 24 Jul 2015 21:29:44 +0800 Subject: [PATCH 339/450] [dailymotion:playlist] Detect problematic redirection (fixes #6347) --- youtube_dl/extractor/dailymotion.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 5a4987772..85d945509 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -30,6 +30,10 @@ class DailymotionBaseInfoExtractor(InfoExtractor): request.add_header('Cookie', 'family_filter=off; ff=off') return request + def _download_webpage_handle_no_ff(self, url, *args, **kwargs): + request = self._build_request(url) + return self._download_webpage_handle(request, *args, **kwargs) + def _download_webpage_no_ff(self, url, *args, **kwargs): request = self._build_request(url) return self._download_webpage(request, *args, **kwargs) @@ -275,10 +279,17 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): def _extract_entries(self, id): video_ids = [] + processed_urls = set() for pagenum in itertools.count(1): - webpage = self._download_webpage_no_ff( - self._PAGE_TEMPLATE % (id, pagenum), - id, 'Downloading page %s' % pagenum) + page_url = self._PAGE_TEMPLATE % (id, pagenum) + webpage, urlh = self._download_webpage_handle_no_ff( + page_url, id, 'Downloading page %s' % pagenum) + if urlh.geturl() in processed_urls: + self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( + page_url, urlh.geturl()), id) + break + + processed_urls.add(urlh.geturl()) video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) @@ -311,6 +322,17 @@ class DailymotionUserIE(DailymotionPlaylistIE): 'title': 'Rémi Gaillard', }, 'playlist_mincount': 100, + }, { + 'url': 'http://www.dailymotion.com/user/UnderProject', + 'info_dict': { + 'id': 'UnderProject', + 'title': 'UnderProject', + }, + 'playlist_mincount': 1800, + 'expected_warnings': [ + 'Stopped at duplicated page', + ], + 
'skip': 'Takes too long time', }] def _real_extract(self, url): From 593ddd851b87eb7ac327a9216800162f07b2acca Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 24 Jul 2015 14:46:45 +0100 Subject: [PATCH 340/450] [letv] fix height --- youtube_dl/extractor/letv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index ba2ae8085..a28abb0f0 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -15,6 +15,7 @@ from ..utils import ( determine_ext, ExtractorError, parse_iso8601, + int_or_none, ) @@ -134,7 +135,7 @@ class LetvIE(InfoExtractor): } if format_id[-1:] == 'p': - url_info_dict['height'] = format_id[:-1] + url_info_dict['height'] = int_or_none(format_id[:-1]) urls.append(url_info_dict) From 678e436f2e77f1ae3a57c4b5d1fc3d74342ab412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 02:09:34 +0600 Subject: [PATCH 341/450] [youtube] Handle empty allowed regions (Closes #6351) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 229fc3a0f..4023a6e50 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -993,7 +993,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'reason' in video_info: if 'The uploader has not made this video available in your country.' 
in video_info['reason']: regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) - if regions_allowed is not None: + if regions_allowed: raise ExtractorError('YouTube said: This video is available in %s only' % ( ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), expected=True) From b14fa8e6874818a3f210b2a67cf53345000defdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 15:47:53 +0600 Subject: [PATCH 342/450] [soundcloud:set] Defer download link resolve (Closes #6354) --- youtube_dl/extractor/soundcloud.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 118ca4832..cdee8e2a3 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -282,9 +282,11 @@ class SoundcloudSetIE(SoundcloudIE): msgs = (compat_str(err['error_message']) for err in info['errors']) raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs)) + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in info['tracks']] + return { '_type': 'playlist', - 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']], + 'entries': entries, 'id': '%s' % info['id'], 'title': info['title'], } From 40a2d17052e9b542eb3c360a0ce067d244e07fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 15:48:44 +0600 Subject: [PATCH 343/450] [soundcloud:playlist] Defer download link resolve --- youtube_dl/extractor/soundcloud.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index cdee8e2a3..0a6c9fe72 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -381,9 +381,7 @@ class SoundcloudPlaylistIE(SoundcloudIE): data = self._download_json( base_url + data, playlist_id, 
'Downloading playlist') - entries = [ - self._extract_info_dict(t, quiet=True, secret_token=token) - for t in data['tracks']] + entries = [self.url_result(track['permalink_url'], 'Soundcloud') for track in data['tracks']] return { '_type': 'playlist', From eab7faa0c1e8511bc91c64347d0dffc28c94f101 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:39:01 +0800 Subject: [PATCH 344/450] [ir90tv] Test (?:www\.)? part in _VALID_URL --- youtube_dl/extractor/ir90tv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index b79529b1b..880a6e32f 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class Ir90TvIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P[0-9]+)/.*' - _TEST = { + _TESTS = [{ 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', 'md5': '411dbd94891381960cb9e13daa47a869', 'info_dict': { @@ -15,7 +15,10 @@ class Ir90TvIE(InfoExtractor): 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', 'thumbnail': 're:^https?://.*\.jpg$', } - } + }, { + 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 9700cd9097445d598515dc41fb3cb9421403b9b9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:42:40 +0800 Subject: [PATCH 345/450] [ir90tv] Improve title extraction --- youtube_dl/extractor/ir90tv.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 880a6e32f..92333c3ea 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import remove_start class Ir90TvIE(InfoExtractor): @@ -24,8 +25,8 @@ class Ir90TvIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'\n90tv.ir :: (.*?)', webpage, 'title') + title = remove_start(self._html_search_regex( + r'([^<]+)', webpage, 'title'), '90tv.ir :: ') video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') From 7523647391969f8d747ba0fc178592f7f3d5e453 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:43:07 +0800 Subject: [PATCH 346/450] [ir90tv] PEP8 --- youtube_dl/extractor/ir90tv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 92333c3ea..6418d6178 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -37,6 +37,6 @@ class Ir90TvIE(InfoExtractor): 'url': video_url, 'id': video_id, 'title': title, - 'video_url' : video_url, - 'thumbnail' : thumbnail, + 'video_url': video_url, + 'thumbnail': thumbnail, } From 2c7c721933e53ece49bee0140d2dad9a8219d6e4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 25 Jul 2015 18:48:00 +0800 Subject: [PATCH 347/450] [ir90tv] Optional fields should be non-fatal --- youtube_dl/extractor/ir90tv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 6418d6178..214bcd5b5 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -31,7 +31,7 @@ class Ir90TvIE(InfoExtractor): video_url = self._search_regex( r']+src="([^"]+)"', webpage, 'video url') - thumbnail = 
self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') + thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) return { 'url': video_url, From 9afa1770d1a6835bc8fee48dc86cd1a702d1f67a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 20:21:42 +0600 Subject: [PATCH 348/450] [bbc] Improve playlist extraction, refactor, expand support and document --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/bbc.py | 375 +++++++++++++++++++++---------- 2 files changed, 259 insertions(+), 121 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bc61cbdc5..d77ed3ba2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -43,7 +43,10 @@ from .azubu import AzubuIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE -from .bbc import BBCCoUkIE, BBCNewsIE +from .bbc import ( + BBCCoUkIE, + BBCIE, +) from .beeg import BeegIE from .behindkink import BehindKinkIE from .beatportpro import BeatportProIE diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 86327d8ed..2a0901ee4 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1,15 +1,18 @@ +# coding: utf-8 from __future__ import unicode_literals +import re import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, - parse_duration, + float_or_none, int_or_none, + parse_duration, + parse_iso8601, ) from ..compat import compat_HTTPError -import re class BBCCoUkIE(InfoExtractor): @@ -17,7 +20,7 @@ class BBCCoUkIE(InfoExtractor): IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' 
+ _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' _TESTS = [ { @@ -264,16 +267,21 @@ class BBCCoUkIE(InfoExtractor): return subtitles def _download_media_selector(self, programme_id): + return self._download_media_selector_url( + self._MEDIASELECTOR_URL % programme_id, programme_id) + + def _download_media_selector_url(self, url, programme_id=None): try: media_selection = self._download_xml( - self.mediaselector_url % programme_id, - programme_id, 'Downloading media selection XML') + url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) else: raise + return self._process_media_selector(media_selection, programme_id) + def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None @@ -312,10 +320,21 @@ class BBCCoUkIE(InfoExtractor): raise # fallback to legacy playlist - playlist = self._download_xml( - 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, - playlist_id, 'Downloading legacy playlist XML') + return self._process_legacy_playlist(playlist_id) + def _process_legacy_playlist_url(self, url, display_id): + playlist = self._download_legacy_playlist_url(url, display_id) + return self._extract_from_legacy_playlist(playlist, display_id) + + def _process_legacy_playlist(self, playlist_id): + return self._process_legacy_playlist_url( + 'http://www.bbc.co.uk/iplayer/playlist/%s' % playlist_id, playlist_id) + + def _download_legacy_playlist_url(self, url, playlist_id=None): + return self._download_xml( + url, playlist_id, 'Downloading legacy playlist XML') + + def _extract_from_legacy_playlist(self, playlist, playlist_id): no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems') if no_items is not None: reason = no_items.get('reason') @@ -335,8 +354,23 @@ class 
BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text - programme_id = item.get('identifier') + + def get_programme_id(item): + def get_from_attributes(item): + for p in('identifier', 'group'): + value = item.get(p) + if value and re.match(r'^[pb][\da-z]{7}$', value): + return value + get_from_attributes(item) + mediator = item.find('./{http://bbc.co.uk/2008/emp/playlist}mediator') + if mediator is not None: + return get_from_attributes(mediator) + + programme_id = get_programme_id(item) duration = int_or_none(item.get('duration')) + # TODO: programme_id can be None and media items can be incorporated right inside + # playlist's item (e.g. http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # as f4m and m3u8 formats, subtitles = self._download_media_selector(programme_id) return programme_id, title, description, duration, formats, subtitles @@ -383,175 +417,276 @@ class BBCCoUkIE(InfoExtractor): } -class BBCNewsIE(BBCCoUkIE): - IE_NAME = 'bbc.com' - IE_DESC = 'BBC news' - _VALID_URL = r'https?://(?:www\.)?bbc\.com/.+?/(?P[^/]+)$' +class BBCIE(BBCCoUkIE): + IE_NAME = 'bbc' + IE_DESC = 'BBC' + _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - mediaselector_url = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + # fails with notukerror for some videos + #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' _TESTS = [{ + # article with multiple videos embedded with data-media-meta containing + # playlist.sxml, externalId and no direct video links 'url': 'http://www.bbc.com/news/world-europe-32668511', 'info_dict': { 'id': 'world-europe-32668511', 'title': 'Russia stages massive WW2 parade despite Western boycott', + 
'description': 'md5:00ff61976f6081841f759a08bf78cc9c', }, 'playlist_count': 2, }, { + # article with multiple videos embedded with data-media-meta (more videos) 'url': 'http://www.bbc.com/news/business-28299555', 'info_dict': { 'id': 'business-28299555', 'title': 'Farnborough Airshow: Video highlights', + 'description': 'BBC reports and video highlights at the Farnborough Airshow.', }, 'playlist_count': 9, + 'skip': 'Save time', }, { + # single video embedded with mediaAssetPage.init() 'url': 'http://www.bbc.com/news/world-europe-32041533', - 'note': 'Video', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', - 'description': 'Germanwings plane crash site in aerial video - Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, + 'timestamp': 1427219242, 'upload_date': '20150324', - 'uploader': 'BBC News', }, 'params': { + # rtmp download 'skip_download': True, } }, { + # article with single video embedded with data-media-meta containing + # direct video links (for now these are extracted) and playlist.xml (with + # media items as f4m and m3u8 - currently unsupported) 'url': 'http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu', - 'note': 'Video', 'info_dict': { - 'id': 'NA', + 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', - 'title': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', - 'description': 'YPG: Tel Abyad\'\u0131n tamam\u0131 kontrol\xfcm\xfczde', + 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", 'duration': 47, + 'timestamp': 1434397334, 'upload_date': '20150615', - 'uploader': 'BBC News', }, 'params': { 'skip_download': True, } }, { + # single video embedded with mediaAssetPage.init() (regional section) 'url': 'http://www.bbc.com/mundo/video_fotos/2015/06/150619_video_honduras_militares_hospitales_corrupcion_aw', - 'note': 'Video', 'info_dict': { - 'id': '39275083', + 'id': 
'150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', - 'title': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', - 'description': 'Honduras militariza sus hospitales por nuevo esc\xe1ndalo de corrupci\xf3n', + 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', 'duration': 87, + 'timestamp': 1434713142, 'upload_date': '20150619', - 'uploader': 'BBC News', }, 'params': { 'skip_download': True, } + }, { + # single video story with digitalData + 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', + 'info_dict': { + 'id': 'p02q6gc4', + 'ext': 'flv', + 'title': 'Sri Lanka’s spicy secret', + 'description': 'As a new train line to Jaffna opens up the country’s north, travellers can experience a truly distinct slice of Tamil culture.', + 'timestamp': 1437674293, + 'upload_date': '20150723', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video story without digitalData + 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', + 'info_dict': { + 'id': 'p018zqqg', + 'ext': 'flv', + 'title': 'Hyundai Santa Fe Sport: Rock star', + 'description': 'md5:b042a26142c4154a6e472933cf20793d', + 'timestamp': 1368473503, + 'upload_date': '20130513', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist.sxml URL + 'url': 'http://www.bbc.com/sport/0/football/33653409', + 'info_dict': { + 'id': 'p02xycnp', + 'ext': 'flv', + 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', + 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', + 'duration': 140, + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + # single video with playlist URL from weather section + 'url': 'http://www.bbc.com/weather/features/33601775', + 'only_matching': True, + }, { + # custom redirection to www.bbc.com + 'url': 'http://www.bbc.co.uk/news/science-environment-33661876', + 
'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if BBCCoUkIE.suitable(url) else super(BBCIE, cls).suitable(url) + + def _extract_from_media_meta(self, media_meta, video_id): + # Direct links to media in media metadata (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + # TODO: there are also f4m and m3u8 streams incorporated in playlist.sxml + source_files = media_meta.get('sourceFiles') + if source_files: + return [{ + 'url': f['url'], + 'format_id': format_id, + 'ext': f.get('encoding'), + 'tbr': float_or_none(f.get('bitrate'), 1000), + 'filesize': int_or_none(f.get('filesize')), + } for format_id, f in source_files.items() if f.get('url')], [] + + programme_id = media_meta.get('externalId') + if programme_id: + return self._download_media_selector(programme_id) + + # Process playlist.sxml as legacy playlist + href = media_meta.get('href') + if href: + playlist = self._download_legacy_playlist_url(href) + _, _, _, _, formats, subtitles = self._extract_from_legacy_playlist(playlist, video_id) + return formats, subtitles + + return [], [] + def _real_extract(self, url): - list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) + playlist_id = self._match_id(url) - list_title = self._html_search_regex(r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'list title') + webpage = self._download_webpage(url, playlist_id) - pubdate = self._html_search_regex(r'"datePublished":\s*"(\d+-\d+-\d+)', webpage, 'date', default=None) - if pubdate: - pubdate = pubdate.replace('-', '') - - ret = [] - jsent = [] - - # works with bbc.com/news/something-something-123456 articles - jsent = map( - lambda m: self._parse_json(m, list_id), - re.findall(r"data-media-meta='({[^']+})'", webpage) - ) - - if len(jsent) == 0: - # http://www.bbc.com/news/video_and_audio/international - # and single-video articles - masset = self._html_search_regex(r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'mediaassets', 
default=None) - if masset: - jmasset = self._parse_json(masset, list_id) - for key, val in jmasset.get('videos', {}).items(): - for skey, sval in val.items(): - sval['id'] = skey - jsent.append(sval) - - if len(jsent) == 0: - # stubbornly generic extractor for {json with "image":{allvideoshavethis},etc} - # in http://www.bbc.com/news/video_and_audio/international - # prone to breaking if entries have sourceFiles list - jsent = map( - lambda m: self._parse_json(m, list_id), - re.findall(r"({[^{}]+image\":{[^}]+}[^}]+})", webpage) - ) - - if len(jsent) == 0: - raise ExtractorError('No video found', expected=True) - - for jent in jsent: - programme_id = jent.get('externalId') - xml_url = jent.get('href') - - title = jent.get('caption', '') - if title == '': - title = list_title - - duration = parse_duration(jent.get('duration')) - description = list_title - if jent.get('caption', '') != '': - description += ' - ' + jent.get('caption') - thumbnail = None - if jent.get('image') is not None: - thumbnail = jent['image'].get('href') - - formats = [] - subtitles = [] - - if programme_id: - formats, subtitles = self._download_media_selector(programme_id) - elif jent.get('sourceFiles') is not None: - # mediaselector not used at - # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu - for key, val in jent['sourceFiles'].items(): - formats.append({ - 'ext': val.get('encoding'), - 'url': val.get('url'), - 'filesize': int(val.get('filesize')), - 'format_id': key - }) - elif xml_url: - # Cheap fallback - # http://playlists.bbc.co.uk/news/(list_id)[ABC..]/playlist.sxml - xml = self._download_webpage(xml_url, programme_id, 'Downloading playlist.sxml for externalId (fallback)') - programme_id = self._search_regex(r']*identifier="(.+?)"', xml, 'playlist.sxml (externalId fallback)') - formats, subtitles = self._download_media_selector(programme_id) - - if len(formats) == 0: - raise ExtractorError('unsupported json media entry.\n ' + str(jent) + '\n') + timestamp = 
parse_iso8601(self._search_regex( + [r'"datePublished":\s*"([^"]+)', + r']+property="article:published_time"[^>]+content="([^"]+)"', + r'itemprop="datePublished"[^>]+datetime="([^"]+)"'], + webpage, 'date', default=None)) + # single video with playlist.sxml URL (e.g. http://www.bbc.com/sport/0/football/33653409) + playlist = self._search_regex( + r']+name="playlist"[^>]+value="([^"]+)"', + webpage, 'playlist', default=None) + if playlist: + programme_id, title, description, duration, formats, subtitles = \ + self._process_legacy_playlist_url(playlist, playlist_id) self._sort_formats(formats) - - id = jent.get('id') if programme_id is None else programme_id - if id is None: - id = 'NA' - - ret.append({ - 'id': id, - 'uploader': 'BBC News', - 'upload_date': pubdate, + return { + 'id': programme_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + # single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + programme_id = self._search_regex( + [r'data-video-player-vpid="([\da-z]{8})"', + r']+name="externalIdentifier"[^>]+value="([\da-z]{8})"'], + webpage, 'vpid', default=None) + if programme_id: + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + # digitalData may be missing (e.g. 
http://www.bbc.com/autos/story/20130513-hyundais-rock-star) + digital_data = self._parse_json( + self._search_regex( + r'var\s+digitalData\s*=\s*({.+?});?\n', webpage, 'digital data', default='{}'), + programme_id, fatal=False) + page_info = digital_data.get('page', {}).get('pageInfo', {}) + title = page_info.get('pageName') or self._og_search_title(webpage) + description = page_info.get('description') or self._og_search_description(webpage) + timestamp = parse_iso8601(page_info.get('publicationDate')) or timestamp + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'subtitles': subtitles, + } + + playlist_title = self._html_search_regex( + r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') + playlist_description = self._og_search_description(webpage) + + # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) + medias = list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(r"data-media-meta='({[^']+})'", webpage)))) + + if not medias: + # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) + media_asset_page = self._parse_json( + self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), + playlist_id) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + entries = [] + for num, media_meta in enumerate(medias, start=1): + formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) + if not formats: + continue + self._sort_formats(formats) + + video_id = media_meta.get('externalId') + if not video_id: + video_id = playlist_id if len(medias) == 1 else '%s-%s' % (playlist_id, num) + + title = media_meta.get('caption') + if not title: + title = playlist_title if len(medias) == 1 else '%s - Video %s' % (playlist_title, num) + + duration = int_or_none(media_meta.get('durationInSeconds')) or parse_duration(media_meta.get('duration')) + + images = [] + for image in media_meta.get('images', {}).values(): + images.extend(image.values()) + if 'image' in media_meta: + images.append(media_meta['image']) + + thumbnails = [{ + 'url': image.get('href'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in images] + + entries.append({ + 'id': video_id, + 'title': title, + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, 'formats': formats, 'subtitles': subtitles, }) - if len(ret) > 0: - return self.playlist_result(ret, list_id, list_title) - raise ExtractorError('No video found', expected=True) + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) From cf7e015f250d806916bed1ed2a1dfd8d943c05c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 21:30:34 +0600 Subject: [PATCH 349/450] [youtube] Add support for multifeed videos --- youtube_dl/extractor/youtube.py | 126 +++++++++++++++++++++++++------- 1 file changed, 99 insertions(+), 27 deletions(-) diff --git 
a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4023a6e50..afe0a781b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,9 +33,11 @@ from ..utils import ( int_or_none, orderedSet, parse_duration, + smuggle_url, str_to_int, unescapeHTML, unified_strdate, + unsmuggle_url, uppercase_escape, ISO3166Utils, ) @@ -558,6 +560,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'format': '135', # bestvideo } }, + { + # Multifeed videos (multiple cameras), URL is for Main Camera + 'url': 'https://www.youtube.com/watch?v=jqWvoWXjCVs', + 'info_dict': { + 'id': 'jqWvoWXjCVs', + 'title': 'teamPGP: Rocket League Noob Stream', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jqWvoWXjCVs', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }, { + 'info_dict': { + 'id': '6h8e8xoXJzg', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }, { + 'info_dict': { + 'id': 'PUOgX5z9xZw', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }, { + 'info_dict': { + 'id': 'teuwxikvS5k', + 'ext': 'mp4', + 'title': 'teamPGP: Rocket League Noob Stream (zim)', + 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'upload_date': '20150721', + 'uploader': 'Beer Games Beer', + 'uploader_id': 'beergamesbeer', + }, + }], + 'params': { + 'skip_download': True, + }, + } ] def __init__(self, *args, **kwargs): @@ -889,6 +944,8 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): return formats def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + proto = ( 'http' if self._downloader.params.get('prefer_insecure', False) else 'https') @@ -1005,6 +1062,48 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + # title + if 'title' in video_info: + video_title = video_info['title'][0] + else: + self._downloader.report_warning('Unable to extract video title') + video_title = '_' + + # description + video_description = get_element_by_id("eow-description", video_webpage) + if video_description: + video_description = re.sub(r'''(?x) + + [^<]+ +
+ ''', r'\1', video_description) + video_description = clean_html(video_description) + else: + fd_mobj = re.search(r'', @@ -1072,26 +1164,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None - # description - video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - video_description = re.sub(r'''(?x) - - [^<]+ - - ''', r'\1', video_description) - video_description = clean_html(video_description) - else: - fd_mobj = re.search(r']+>]+class="yt-uix-button-content"[^>]*>([\d,]+)' From d8f0a9ecea6aab5b148d06904934c6a504ba3b03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 22:17:06 +0600 Subject: [PATCH 350/450] [youtube] Respect noplaylist for multifeed videos --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index afe0a781b..bcd27408d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1089,7 +1089,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_description = '' - if not smuggled_data.get('force_singlefeed', False) and 'multifeed_metadata_list' in video_info: + if (not self._downloader.params.get('noplaylist') and + not smuggled_data.get('force_singlefeed', False) and + 'multifeed_metadata_list' in video_info): entries = [] multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) for feed in multifeed_metadata_list.split(','): From 51da40e6218f1dda2fc61650c308194e9b4acbc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 22:19:54 +0600 Subject: [PATCH 351/450] [bbc] PEP8 --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2a0901ee4..4b23f82ca 100644 --- a/youtube_dl/extractor/bbc.py +++ 
b/youtube_dl/extractor/bbc.py @@ -423,7 +423,7 @@ class BBCIE(BBCCoUkIE): _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' # fails with notukerror for some videos - #_MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' _TESTS = [{ From 7a896817226405a772baa3808d63062d4ad11c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 25 Jul 2015 22:32:54 +0600 Subject: [PATCH 352/450] [bbc] Skip DASH until supported --- youtube_dl/extractor/bbc.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 4b23f82ca..66e52641b 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -172,6 +172,7 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') if protocol == 'http': href = connection.get('href') + transfer_format = connection.get('transferFormat') # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -179,6 +180,9 @@ class BBCCoUkIE(InfoExtractor): 'url': ref, 'format_id': 'ref%s_%s' % (i, supplier), }) + # Skip DASH until supported + elif transfer_format == 'dash': + pass # Direct link else: formats.append({ From 5bdec59de15b9bde73a3077a6b9ce517c10b9906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 26 Jul 2015 09:51:54 +0600 Subject: [PATCH 353/450] [comcarcoff] Add support for singleshots (Closes #6366) --- youtube_dl/extractor/comcarcoff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 9c25b2223..81f3d7697 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -36,7 
+36,7 @@ class ComCarCoffIE(InfoExtractor): webpage, 'full data json')) video_id = full_data['activeVideo']['video'] - video_data = full_data['videos'][video_id] + video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] thumbnails = [{ 'url': video_data['images']['thumb'], }, { From aeb7b41d44313b6bb007b2f1cd0fc8cda84e59d5 Mon Sep 17 00:00:00 2001 From: tippfeler Date: Sun, 26 Jul 2015 12:57:06 +0200 Subject: [PATCH 354/450] [spiegel] Accept iframe urls Closes #6370. --- youtube_dl/extractor/spiegel.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index b868241d5..5bd3c0087 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -9,7 +9,7 @@ from .spiegeltv import SpiegeltvIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed)?(?:\.html)?(?:#.*)?$' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', 'md5': '2c2754212136f35fb4b19767d242f66e', @@ -39,6 +39,9 @@ class SpiegelIE(InfoExtractor): 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', } + }, { + 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', + 'only_matching': True, }] def _real_extract(self, url): From 4c6bd5b5b61adfd912e14f8d704fde47628d164e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 26 Jul 2015 14:14:28 +0200 Subject: [PATCH 355/450] [youtube] Use 'vp8' and 'vp9' in lowercase (fixes #6358) That's how YouTube reports them in their DASH manifest. --- youtube_dl/extractor/youtube.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4023a6e50..0e411bfb6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -281,13 +281,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '219': {'ext': 'webm', 
'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'vp9'}, '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, @@ -297,11 +297,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '272': {'ext': 'webm', 
'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'vp9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, From 2c7ed247967f3563d26879f2206e0f54dfdf9b23 Mon Sep 17 00:00:00 2001 From: Raphael Michel Date: Sun, 26 Jul 2015 16:37:51 +0200 Subject: [PATCH 356/450] Remove redundant (and wrong) class parameters --- youtube_dl/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ae813099d..88f9f9070 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -576,11 +576,9 @@ class 
ContentTooShortError(Exception): download is too small for what the server announced first, indicating the connection was probably interrupted. """ - # Both in bytes - downloaded = None - expected = None def __init__(self, downloaded, expected): + # Both in bytes self.downloaded = downloaded self.expected = expected From cb23bcba294563857561914a19e7d06990c71829 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 27 Jul 2015 04:24:04 -0500 Subject: [PATCH 357/450] BBCIE: Fix missing .mp4 formats on news sites Prior to merge BBCNewsIE from pr #6026 was rewritten into BBCIE in 9afa177. Support was added for non-news sites (/travel, /sports), however support for the news mediaselector was removed to support these sites. This removed support for .mp4. pr #6026 ( news site, news mediaseletor : http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/ ) format code extension resolution note journalism_nonuk_stream_h264_flv_200_akamai flv 400x224 h264@ 176k, 1001.77KiB journalism_nonuk_stream_h264_flv_400_akamai flv 400x224 h264@ 512k, 2.85MiB journalism_nonuk_stream_h264_flv_med_akamai flv 640x360 h264@ 800k, 4.43MiB journalism_nonuk_stream_h264_flv_hi_akamai flv 688x384 h264@1500k, 8.34MiB journalism_world_stream_h264_http_200_sis_news_http mp4 400x224 h264@ 176k, 1001.77KiB journalism_world_stream_h264_http_400_sis_news_http mp4 400x224 h264@ 512k, 2.85MiB journalism_world_stream_h264_http_hi_sis_news_http mp4 688x384 h264@1500k, 8.34MiB (best) # 9afa177 ( same url, non-news mediaselector: http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/ ) format code extension resolution note journalism_nonuk_stream_h264_flv_lo_akamai flv 384x216 h264@ 496k, 2.76MiB journalism_nonuk_stream_h264_flv_med_akamai flv 640x360 h264@ 800k, 4.43MiB journalism_nonuk_stream_h264_flv_hi_akamai flv 688x384 h264@1500k, 8.34MiB (best) This change corrects the above, by trying /mediaselector/5 if /mediaselector/4 fails. 
--- youtube_dl/extractor/bbc.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 66e52641b..0f0ea7cfd 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -14,7 +14,6 @@ from ..utils import ( ) from ..compat import compat_HTTPError - class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' @@ -271,8 +270,16 @@ class BBCCoUkIE(InfoExtractor): return subtitles def _download_media_selector(self, programme_id): - return self._download_media_selector_url( - self._MEDIASELECTOR_URL % programme_id, programme_id) + try: + return self._download_media_selector_url( + self._MEDIASELECTOR_URL % programme_id, programme_id) + except ExtractorError as e: + if hasattr(self, '_MEDIASELECTOR_ALT_URL') and str(e) == 'bbc returned error: notukerror': + # notukerror on bbc.com/travel using bbc news mediaselector: fallback to /mediaselector/5/ + return self._download_media_selector_url( + self._MEDIASELECTOR_ALT_URL % programme_id, programme_id) + else: + raise def _download_media_selector_url(self, url, programme_id=None): try: @@ -297,7 +304,6 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_video(media, programme_id)) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) - return formats, subtitles def _download_playlist(self, playlist_id): @@ -426,9 +432,10 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - # fails with notukerror for some videos - # _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' + # fails with notukerror for some videos ( non news sites such as bbc.com/travel ) + _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' + # 
limited selection of formats but may work where the above does not + _MEDIASELECTOR_ALT_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' _TESTS = [{ # article with multiple videos embedded with data-media-meta containing @@ -455,7 +462,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/news/world-europe-32041533', 'info_dict': { 'id': 'p02mprgb', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Aerial footage showed the site of the crash in the Alps - courtesy BFM TV', 'duration': 47, 'timestamp': 1427219242, @@ -515,7 +522,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/autos/story/20130513-hyundais-rock-star', 'info_dict': { 'id': 'p018zqqg', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hyundai Santa Fe Sport: Rock star', 'description': 'md5:b042a26142c4154a6e472933cf20793d', 'timestamp': 1368473503, @@ -530,7 +537,7 @@ class BBCIE(BBCCoUkIE): 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { 'id': 'p02xycnp', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Transfers: Cristiano Ronaldo to Man Utd, Arsenal to spend?', 'description': 'md5:398fca0e2e701c609d726e034fa1fc89', 'duration': 140, From 88ed52aec9f2e622188f304d74f2f5568b0caa1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 27 Jul 2015 22:05:51 +0600 Subject: [PATCH 358/450] [bbc] Add support for direct bbc.co.uk embeds --- youtube_dl/extractor/bbc.py | 34 +++++++++++++++++++++++++++++---- youtube_dl/extractor/generic.py | 8 -------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 66e52641b..c0433eabd 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -450,6 +450,14 @@ class BBCIE(BBCCoUkIE): }, 'playlist_count': 9, 'skip': 'Save time', + }, { + # article with multiple videos embedded with `new SMP()` + 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460', + 'info_dict': 
{ + 'id': '3662a707-0af9-3149-963f-47bea720b460', + 'title': 'BBC Blogs - Adam Curtis - BUGGER', + }, + 'playlist_count': 18, }, { # single video embedded with mediaAssetPage.init() 'url': 'http://www.bbc.com/news/world-europe-32041533', @@ -637,12 +645,30 @@ class BBCIE(BBCCoUkIE): playlist_title = self._html_search_regex( r'(.*?)(?:\s*-\s*BBC [^ ]+)?', webpage, 'playlist title') - playlist_description = self._og_search_description(webpage) + playlist_description = self._og_search_description(webpage, default=None) + + def extract_all(pattern): + return list(filter(None, map( + lambda s: self._parse_json(s, playlist_id, fatal=False), + re.findall(pattern, webpage)))) + + # Multiple video article (e.g. + # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]*)?' + entries = [] + for match in extract_all(r'new\s+SMP\(({.+?})\)'): + embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') + if embed_url and re.match(EMBED_URL, embed_url): + entries.append(embed_url) + entries.extend(re.findall( + r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) + if entries: + return self.playlist_result( + [self.url_result(entry, 'BBCCoUk') for entry in entries], + playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) - medias = list(filter(None, map( - lambda s: self._parse_json(s, playlist_id, fatal=False), - re.findall(r"data-media-meta='({[^']+})'", webpage)))) + medias = extract_all(r"data-media-meta='({[^']+})'") if not medias: # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..8cef61c3c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -276,14 +276,6 @@ class GenericIE(InfoExtractor): 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, }, - # BBC iPlayer embeds - { - 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', - 'info_dict': { - 'title': 'BBC - Blogs - Adam Curtis - BUGGER', - }, - 'playlist_mincount': 18, - }, # RUTV embed { 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html', From d247a2c8bf2fe4e071cb35a2e2777fe16acecab3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 28 Jul 2015 02:06:27 +0600 Subject: [PATCH 359/450] [bbc] Fix regex --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index c0433eabd..01d07c9c0 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -654,7 +654,7 @@ class BBCIE(BBCCoUkIE): # Multiple video article (e.g. # http://www.bbc.co.uk/blogs/adamcurtis/entries/3662a707-0af9-3149-963f-47bea720b460) - EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]*)?' + EMBED_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:[^/]+/)+[\da-z]{8}(?:\b[^"]+)?' 
entries = [] for match in extract_all(r'new\s+SMP\(({.+?})\)'): embed_url = match.get('playerSettings', {}).get('externalEmbedUrl') From 289bbb350e11ae1460db515560345088d7c58663 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 28 Jul 2015 11:28:33 +0200 Subject: [PATCH 360/450] release 2015.07.28 --- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 73445137f..657935dc6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -51,6 +51,7 @@ - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** + - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **BeatportPro** - **Beeg** @@ -224,6 +225,7 @@ - **InternetVideoArchive** - **IPrima** - **iqiyi**: 爱奇艺 + - **Ir90Tv** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **Izlesene** @@ -252,6 +254,7 @@ - **kuwo:song**: 酷我音乐 - **la7.tv** - **Laola1Tv** + - **Lecture2Go** - **Letv**: 乐视网 - **LetvPlaylist** - **LetvTv** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 280afdd7f..fa157cadb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.21' +__version__ = '2015.07.28' From f171bc8b59ecf4560dd4076be56570a4f090d519 Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 28 Jul 2015 18:14:06 +0300 Subject: [PATCH 361/450] [youtube] save keywords in info jason when --write-info-json is used --- youtube_dl/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0e411bfb6..15e327ec8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1072,6 +1072,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None + m = re.findall(r''''"]+?)['"]?\s*>''' + , video_webpage, re.DOTALL | re.IGNORECASE); + video_tags = ", ".join(m) # 
description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1259,6 +1262,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': video_title, 'thumbnail': video_thumbnail, 'description': video_description, + 'tags' : video_tags, 'categories': video_categories, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, From a6f774e9015995393a086273df8db1d7b0c098c4 Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 28 Jul 2015 18:29:13 +0300 Subject: [PATCH 362/450] [youtube]: tags key in info jason is now a list --- youtube_dl/extractor/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 15e327ec8..c0fafbfd5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1072,9 +1072,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None - m = re.findall(r''''"]+?)['"]?\s*>''' + video_tags = re.findall(r''''"]+?)['"]?\s*>''' , video_webpage, re.DOTALL | re.IGNORECASE); - video_tags = ", ".join(m) # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: From 5316bf7487b608b7c085950ff2fb0444f2c36dc0 Mon Sep 17 00:00:00 2001 From: Purdea Andrei Date: Tue, 28 Jul 2015 18:30:42 +0300 Subject: [PATCH 363/450] Documented tags as a possible dict key --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 14b9b4fe2..a227aeb9c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -187,6 +187,7 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + tags: A list of keywords attached to the video. Unless mentioned otherwise, the fields should be Unicode strings. 
From 95d8f7ea12f40a986b541ee17cdf384dbedcfa55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 02:26:16 +0600 Subject: [PATCH 364/450] [fragment] Generalize fragmented media file downloader --- youtube_dl/downloader/fragment.py | 110 ++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 youtube_dl/downloader/fragment.py diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py new file mode 100644 index 000000000..5f9d6796d --- /dev/null +++ b/youtube_dl/downloader/fragment.py @@ -0,0 +1,110 @@ +from __future__ import division, unicode_literals + +import os +import time + +from .common import FileDownloader +from .http import HttpFD +from ..utils import ( + encodeFilename, + sanitize_open, +) + + +class HttpQuietDownloader(HttpFD): + def to_screen(self, *args, **kargs): + pass + + +class FragmentFD(FileDownloader): + """ + A base file downloader class for fragmented media (e.g. f4m/m3u8 manifests). 
+ """ + + def _prepare_and_start_frag_download(self, ctx): + self._prepare_frag_download(ctx) + self._start_frag_download(ctx) + + def _prepare_frag_download(self, ctx): + self.to_screen('[%s] Total fragments: %d' % (self.FD_NAME, ctx['total_frags'])) + self.report_destination(ctx['filename']) + dl = HttpQuietDownloader( + self.ydl, + { + 'continuedl': True, + 'quiet': True, + 'noprogress': True, + 'ratelimit': self.params.get('ratelimit', None), + 'test': self.params.get('test', False), + } + ) + tmpfilename = self.temp_name(ctx['filename']) + dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb') + ctx.update({ + 'dl': dl, + 'dest_stream': dest_stream, + 'tmpfilename': tmpfilename, + }) + + def _start_frag_download(self, ctx): + total_frags = ctx['total_frags'] + # This dict stores the download progress, it's updated by the progress + # hook + state = { + 'status': 'downloading', + 'downloaded_bytes': 0, + 'frag_index': 0, + 'frag_count': total_frags, + 'filename': ctx['filename'], + 'tmpfilename': ctx['tmpfilename'], + } + start = time.time() + ctx['started'] = start + + def frag_progress_hook(s): + if s['status'] not in ('downloading', 'finished'): + return + + frag_total_bytes = s.get('total_bytes', 0) + if s['status'] == 'finished': + state['downloaded_bytes'] += frag_total_bytes + state['frag_index'] += 1 + + estimated_size = ( + (state['downloaded_bytes'] + frag_total_bytes) / + (state['frag_index'] + 1) * total_frags) + time_now = time.time() + state['total_bytes_estimate'] = estimated_size + state['elapsed'] = time_now - start + + if s['status'] == 'finished': + progress = self.calc_percent(state['frag_index'], total_frags) + else: + frag_downloaded_bytes = s['downloaded_bytes'] + frag_progress = self.calc_percent(frag_downloaded_bytes, + frag_total_bytes) + progress = self.calc_percent(state['frag_index'], total_frags) + progress += frag_progress / float(total_frags) + + state['eta'] = self.calc_eta( + start, time_now, estimated_size, 
state['downloaded_bytes'] + frag_downloaded_bytes) + state['speed'] = s.get('speed') + self._hook_progress(state) + + ctx['dl'].add_progress_hook(frag_progress_hook) + + return start + + def _finish_frag_download(self, ctx): + ctx['dest_stream'].close() + elapsed = time.time() - ctx['started'] + self.try_rename(ctx['tmpfilename'], ctx['filename']) + fsize = os.path.getsize(encodeFilename(ctx['filename'])) + + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': ctx['filename'], + 'status': 'finished', + 'elapsed': elapsed, + }) From ab81ef8fa7d0f9e814890024140c7ed9587d3151 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 02:27:50 +0600 Subject: [PATCH 365/450] [f4m] Implement f4m fd in terms of fragment fd --- youtube_dl/downloader/f4m.py | 98 +++++++----------------------------- 1 file changed, 17 insertions(+), 81 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b1a858c45..275564b59 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -7,8 +7,7 @@ import os import time import xml.etree.ElementTree as etree -from .common import FileDownloader -from .http import HttpFD +from .fragment import FragmentFD from ..compat import ( compat_urlparse, compat_urllib_error, @@ -16,8 +15,6 @@ from ..compat import ( from ..utils import ( struct_pack, struct_unpack, - encodeFilename, - sanitize_open, xpath_text, ) @@ -226,16 +223,13 @@ def _add_ns(prop): return '{http://ns.adobe.com/f4m/1.0}%s' % prop -class HttpQuietDownloader(HttpFD): - def to_screen(self, *args, **kargs): - pass - - -class F4mFD(FileDownloader): +class F4mFD(FragmentFD): """ A downloader for f4m manifests or AdobeHDS. 
""" + FD_NAME = 'f4m' + def _get_unencrypted_media(self, doc): media = doc.findall(_add_ns('media')) if not media: @@ -288,7 +282,7 @@ class F4mFD(FileDownloader): def real_download(self, filename, info_dict): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') - self.to_screen('[download] Downloading f4m manifest') + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) manifest = self.ydl.urlopen(man_url).read() doc = etree.fromstring(manifest) @@ -320,67 +314,20 @@ class F4mFD(FileDownloader): # For some akamai manifests we'll need to add a query to the fragment url akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) - self.report_destination(filename) - http_dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': True, - 'quiet': True, - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit', None), - 'test': self.params.get('test', False), - } - ) - tmpfilename = self.temp_name(filename) - (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') + ctx = { + 'filename': filename, + 'total_frags': total_frags, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] write_flv_header(dest_stream) if not live: write_metadata_tag(dest_stream, metadata) - # This dict stores the download progress, it's updated by the progress - # hook - state = { - 'status': 'downloading', - 'downloaded_bytes': 0, - 'frag_index': 0, - 'frag_count': total_frags, - 'filename': filename, - 'tmpfilename': tmpfilename, - } - start = time.time() - - def frag_progress_hook(s): - if s['status'] not in ('downloading', 'finished'): - return - - frag_total_bytes = s.get('total_bytes', 0) - if s['status'] == 'finished': - state['downloaded_bytes'] += frag_total_bytes - state['frag_index'] += 1 - - estimated_size = ( - (state['downloaded_bytes'] + frag_total_bytes) / - (state['frag_index'] + 1) * total_frags) - time_now = time.time() - state['total_bytes_estimate'] = estimated_size - state['elapsed'] = time_now - start - - if s['status'] == 
'finished': - progress = self.calc_percent(state['frag_index'], total_frags) - else: - frag_downloaded_bytes = s['downloaded_bytes'] - frag_progress = self.calc_percent(frag_downloaded_bytes, - frag_total_bytes) - progress = self.calc_percent(state['frag_index'], total_frags) - progress += frag_progress / float(total_frags) - - state['eta'] = self.calc_eta( - start, time_now, estimated_size, state['downloaded_bytes'] + frag_downloaded_bytes) - state['speed'] = s.get('speed') - self._hook_progress(state) - - http_dl.add_progress_hook(frag_progress_hook) + self._start_frag_download(ctx) frags_filenames = [] while fragments_list: @@ -391,9 +338,9 @@ class F4mFD(FileDownloader): url += '?' + akamai_pv.strip(';') if info_dict.get('extra_param_to_segment_url'): url += info_dict.get('extra_param_to_segment_url') - frag_filename = '%s-%s' % (tmpfilename, name) + frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: - success = http_dl.download(frag_filename, {'url': url}) + success = ctx['dl'].download(frag_filename, {'url': url}) if not success: return False with open(frag_filename, 'rb') as down: @@ -425,20 +372,9 @@ class F4mFD(FileDownloader): msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) self.report_warning(msg) - dest_stream.close() + self._finish_frag_download(ctx) - elapsed = time.time() - start - self.try_rename(tmpfilename, filename) for frag_file in frags_filenames: os.remove(frag_file) - fsize = os.path.getsize(encodeFilename(filename)) - self._hook_progress({ - 'downloaded_bytes': fsize, - 'total_bytes': fsize, - 'filename': filename, - 'status': 'finished', - 'elapsed': elapsed, - }) - return True From f9a5affad968c6629c022f489dc3c1561a4c57de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 02:28:30 +0600 Subject: [PATCH 366/450] [hls] Implement hlsnative fd in terms of fragment fd --- youtube_dl/downloader/hls.py | 81 +++++++++++++++++------------------- 1 file changed, 38 insertions(+), 43 
deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 8be4f4249..60dca0ab1 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -4,12 +4,11 @@ import os import re import subprocess -from ..postprocessor.ffmpeg import FFmpegPostProcessor from .common import FileDownloader -from ..compat import ( - compat_urlparse, - compat_urllib_request, -) +from .fragment import FragmentFD + +from ..compat import compat_urlparse +from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..utils import ( encodeArgument, encodeFilename, @@ -51,54 +50,50 @@ class HlsFD(FileDownloader): return False -class NativeHlsFD(FileDownloader): +class NativeHlsFD(FragmentFD): """ A more limited implementation that does not require ffmpeg """ - def real_download(self, filename, info_dict): - url = info_dict['url'] - self.report_destination(filename) - tmpfilename = self.temp_name(filename) + FD_NAME = 'hlsnative' - self.to_screen( - '[hlsnative] %s: Downloading m3u8 manifest' % info_dict['id']) - data = self.ydl.urlopen(url).read() - s = data.decode('utf-8', 'ignore') - segment_urls = [] + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) + manifest = self.ydl.urlopen(man_url).read() + + s = manifest.decode('utf-8', 'ignore') + fragment_urls = [] for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): segment_url = ( line if re.match(r'^https?://', line) - else compat_urlparse.urljoin(url, line)) - segment_urls.append(segment_url) - - is_test = self.params.get('test', False) - remaining_bytes = self._TEST_FILE_SIZE if is_test else None - byte_counter = 0 - with open(tmpfilename, 'wb') as outf: - for i, segurl in enumerate(segment_urls): - self.to_screen( - '[hlsnative] %s: Downloading segment %d / %d' % - (info_dict['id'], i + 1, len(segment_urls))) - seg_req = compat_urllib_request.Request(segurl) - if 
remaining_bytes is not None: - seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) - - segment = self.ydl.urlopen(seg_req).read() - if remaining_bytes is not None: - segment = segment[:remaining_bytes] - remaining_bytes -= len(segment) - outf.write(segment) - byte_counter += len(segment) - if remaining_bytes is not None and remaining_bytes <= 0: + else compat_urlparse.urljoin(man_url, line)) + fragment_urls.append(segment_url) + # We only download the first fragment during the test + if self.params.get('test', False): break - self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, + ctx = { 'filename': filename, - 'status': 'finished', - }) - self.try_rename(tmpfilename, filename) + 'total_frags': len(fragment_urls), + } + + self._prepare_and_start_frag_download(ctx) + + frags_filenames = [] + for i, frag_url in enumerate(fragment_urls): + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + with open(frag_filename, 'rb') as down: + ctx['dest_stream'].write(down.read()) + frags_filenames.append(frag_filename) + + self._finish_frag_download(ctx) + + for frag_file in frags_filenames: + os.remove(frag_file) + return True From 864f24bd2c0cf9bde034812a2049c3750c1bb05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 03:43:03 +0600 Subject: [PATCH 367/450] [extractor/common] Add _meta_regex and clarify tags field --- youtube_dl/extractor/common.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a227aeb9c..d54866d1f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -181,13 +181,13 @@ class InfoExtractor(object): by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags 
assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. start_time: Time in seconds where the reproduction should start, as specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. - tags: A list of keywords attached to the video. Unless mentioned otherwise, the fields should be Unicode strings. @@ -631,6 +631,12 @@ class InfoExtractor(object): template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -661,9 +667,7 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(name), + self._meta_regex(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): From 000b6b5ae5cc214906effe4ac5b78b579bc7db70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 03:43:32 +0600 Subject: [PATCH 368/450] [youtube] Improve tags extraction and add test --- youtube_dl/extractor/youtube.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c0fafbfd5..4c449fd74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20121002', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': 
['Science & Technology'], + 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', - 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', + 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', + 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', } @@ -1072,8 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_categories = None - video_tags = re.findall(r''''"]+?)['"]?\s*>''' - , video_webpage, re.DOTALL | re.IGNORECASE); + video_tags = [ + unescapeHTML(m.group('content')) + for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)] + # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1261,8 +1267,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': video_title, 'thumbnail': video_thumbnail, 'description': video_description, - 'tags' : video_tags, 'categories': video_categories, + 'tags': video_tags, 'subtitles': video_subtitles, 'automatic_captions': automatic_captions, 'duration': video_duration, From 5e1eddb939a4fbc2a5ef10111c8141c842cf01d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 29 Jul 2015 21:18:16 +0600 Subject: [PATCH 369/450] [youtube] Show info message for multifeed videos according to noplaylist option --- youtube_dl/extractor/youtube.py | 37 +++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bcd27408d..8a5ef2e70 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ 
-1089,22 +1089,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_description = '' - if (not self._downloader.params.get('noplaylist') and - not smuggled_data.get('force_singlefeed', False) and - 'multifeed_metadata_list' in video_info): - entries = [] - multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) - for feed in multifeed_metadata_list.split(','): - feed_data = compat_parse_qs(feed) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - return self.playlist_result(entries, video_id, video_title, video_description) + if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not self._downloader.params.get('noplaylist'): + entries = [] + feed_ids = [] + multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) + for feed in multifeed_metadata_list.split(','): + feed_data = compat_parse_qs(feed) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) if 'view_count' in video_info: view_count = int(video_info['view_count'][0]) From 2711e41bcdaf8e8234dd32e5df6fb76f23179f46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 29 Jul 2015 18:47:20 
+0200 Subject: [PATCH 370/450] Credit slangangular for the sportschau extractor (#6199) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index e75e9885d..aa6b88cc0 100644 --- a/AUTHORS +++ b/AUTHORS @@ -135,3 +135,4 @@ Bernhard Minks sceext Zach Bruggeman Tjark Saul +slangangular From d12a1a47d5ff0833a1ecd7fe47f4ffa67a9b4a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 30 Jul 2015 00:55:06 +0600 Subject: [PATCH 371/450] [bbc] Improve work with mediaselection URLs --- youtube_dl/extractor/bbc.py | 50 ++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 3d9366644..9a1b6e3dc 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -14,12 +14,15 @@ from ..utils import ( ) from ..compat import compat_HTTPError + class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:(?:programmes|iplayer(?:/[^/]+)?/(?:episode|playlist))/)|music/clips[/#])(?P[\da-z]{8})' - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' + _MEDIASELECTOR_URLS = [ + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + ] _TESTS = [ { @@ -161,6 +164,10 @@ class BBCCoUkIE(InfoExtractor): } ] + class MediaSelectionError(Exception): + def __init__(self, id): + self.id = id + def _extract_asx_playlist(self, connection, programme_id): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] @@ -211,8 +218,7 @@ class BBCCoUkIE(InfoExtractor): def _extract_medias(self, media_selection): error = media_selection.find('./{http://bbc.co.uk/2008/mp/mediaselection}error') if error is not None: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, 
error.get('id')), expected=True) + raise BBCCoUkIE.MediaSelectionError(error.get('id')) return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media') def _extract_connections(self, media): @@ -269,17 +275,23 @@ class BBCCoUkIE(InfoExtractor): ] return subtitles + def _raise_extractor_error(self, media_selection_error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, media_selection_error.id), + expected=True) + def _download_media_selector(self, programme_id): - try: - return self._download_media_selector_url( - self._MEDIASELECTOR_URL % programme_id, programme_id) - except ExtractorError as e: - if hasattr(self, '_MEDIASELECTOR_ALT_URL') and str(e) == 'bbc returned error: notukerror': - # notukerror on bbc.com/travel using bbc news mediaselector: fallback to /mediaselector/5/ - return self._download_media_selector_url( - self._MEDIASELECTOR_ALT_URL % programme_id, programme_id) - else: - raise + last_exception = None + for mediaselector_url in self._MEDIASELECTOR_URLS: + try: + return self._download_media_selector_url( + mediaselector_url % programme_id, programme_id) + except BBCCoUkIE.MediaSelectionError as e: + if e.id == 'notukerror': + last_exception = e + continue + self._raise_extractor_error(e) + self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): try: @@ -432,10 +444,14 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - # fails with notukerror for some videos ( non news sites such as bbc.com/travel ) - _MEDIASELECTOR_URL = 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s' - # limited selection of formats but may work where the above does not - _MEDIASELECTOR_ALT_URL = 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s' + _MEDIASELECTOR_URLS = [ + # Provides more formats, namely direct mp4 links, but fails on some videos with + # 
notukerror for non UK (?) users (e.g. + # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) + 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', + # Provides fewer formats, but works everywhere for everybody (hopefully) + 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + ] _TESTS = [{ # article with multiple videos embedded with data-media-meta containing From 799207e838e0404aaa5cb6658e41bef108aced16 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 29 Jul 2015 23:20:37 +0100 Subject: [PATCH 372/450] [viewster] extract the api auth token Closes #6406. --- youtube_dl/compat.py | 6 ++++++ youtube_dl/extractor/common.py | 8 ++++++++ youtube_dl/extractor/viewster.py | 5 ++++- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0c57c7aeb..e4b9286c0 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -42,6 +42,11 @@ try: except ImportError: # Python 2 import cookielib as compat_cookiejar +try: + import http.cookies as compat_cookies +except ImportError: # Python 2 + import Cookie as compat_cookies + try: import html.entities as compat_html_entities except ImportError: # Python 2 @@ -436,6 +441,7 @@ __all__ = [ 'compat_basestring', 'compat_chr', 'compat_cookiejar', + 'compat_cookies', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index d54866d1f..dc5080504 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -14,10 +14,12 @@ import xml.etree.ElementTree from ..compat import ( compat_cookiejar, + compat_cookies, compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urllib_request, compat_urlparse, compat_str, ) @@ -1074,6 +1076,12 @@ class InfoExtractor(object): None, '/', True, False, expire_time, '', None, None, None) 
self._downloader.cookiejar.set_cookie(cookie) + def _get_cookies(self, url): + """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + req = compat_urllib_request.Request(url) + self._downloader.cookiejar.add_cookie_header(req) + return compat_cookies.SimpleCookie(req.get_header('Cookie')) + def get_testcases(self, include_onlymatching=False): t = getattr(self, '_TEST', None) if t: diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 6ef36290b..393b63618 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -62,7 +62,6 @@ class ViewsterIE(InfoExtractor): }] _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - _AUTH_TOKEN = '/YqhSYsx8EaU9Bsta3ojlA==' def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): request = compat_urllib_request.Request(url) @@ -72,6 +71,10 @@ class ViewsterIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + # Get 'api_token' cookie + self._request_webpage(url, video_id) + cookies = self._get_cookies(url) + self._AUTH_TOKEN = compat_urllib_parse.unquote(cookies['api_token'].value) info = self._download_json( 'https://public-api.viewster.com/search/%s' % video_id, From 1f04873517e41d2f54fc4e65c46c08ba85b23010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 30 Jul 2015 19:12:37 +0200 Subject: [PATCH 373/450] [viewster] Use 'compat_urllib_parse_unquote' --- youtube_dl/extractor/viewster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 393b63618..65324d903 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_request, compat_urllib_parse, + compat_urllib_parse_unquote, ) from ..utils import ( determine_ext, @@ -74,7 +75,7 @@ 
class ViewsterIE(InfoExtractor): # Get 'api_token' cookie self._request_webpage(url, video_id) cookies = self._get_cookies(url) - self._AUTH_TOKEN = compat_urllib_parse.unquote(cookies['api_token'].value) + self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) info = self._download_json( 'https://public-api.viewster.com/search/%s' % video_id, From 80fb6d4aa47154a1e963b28a17a065dc40a436b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 31 Jul 2015 00:54:26 +0600 Subject: [PATCH 374/450] [soundcloud:user] Rework extractor (Closes #6399) --- youtube_dl/extractor/soundcloud.py | 131 ++++++++++++++++++++++------- 1 file changed, 101 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 0a6c9fe72..f1307dc83 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P[\w\d-]+)/ - (?!sets/|(?:likes|tracks)/?(?:$|[?#])) + (?!(?:tracks|sets|reposts|likes|spotlight)/?(?:$|[?#])) (?P[\w\d-]+)/? 
(?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -293,60 +293,131 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$' + _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|sets|reposts|likes|spotlight)/?)?(\?.*)?$' IE_NAME = 'soundcloud:user' _TESTS = [{ - 'url': 'https://soundcloud.com/the-concept-band', + 'url': 'https://soundcloud.com/the-akashic-chronicler', 'info_dict': { - 'id': '9615865', - 'title': 'The Royal Concept', + 'id': '114582580', + 'title': 'The Akashic Chronicler (All)', }, - 'playlist_mincount': 12 - }, { - 'url': 'https://soundcloud.com/the-concept-band/likes', - 'info_dict': { - 'id': '9615865', - 'title': 'The Royal Concept', - }, - 'playlist_mincount': 1, + 'playlist_mincount': 112, }, { 'url': 'https://soundcloud.com/the-akashic-chronicler/tracks', - 'only_matching': True, + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Tracks)', + }, + 'playlist_mincount': 50, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/sets', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Playlists)', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/reposts', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Reposts)', + }, + 'playlist_mincount': 9, + }, { + 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', + 'info_dict': { + 'id': '114582580', + 'title': 'The Akashic Chronicler (Likes)', + }, + 'playlist_mincount': 333, + }, { + 'url': 'https://soundcloud.com/grynpyret/spotlight', + 'info_dict': { + 'id': '7098329', + 'title': 'Grynpyret (Spotlight)', + }, + 'playlist_mincount': 1, }] + _API_BASE = 'https://api.soundcloud.com' + _API_V2_BASE = 'https://api-v2.soundcloud.com' + + _BASE_URL_MAP = { + 'all': 
'%s/profile/soundcloud:users:%%s' % _API_V2_BASE, + 'tracks': '%s/users/%%s/tracks' % _API_BASE, + 'sets': '%s/users/%%s/playlists' % _API_V2_BASE, + 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE, + 'likes': '%s/users/%%s/likes' % _API_V2_BASE, + 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE, + } + + _TITLE_MAP = { + 'all': 'All', + 'tracks': 'Tracks', + 'sets': 'Playlists', + 'reposts': 'Reposts', + 'likes': 'Likes', + 'spotlight': 'Spotlight', + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader = mobj.group('user') - resource = mobj.group('rsrc') - if resource is None: - resource = 'tracks' - elif resource == 'likes': - resource = 'favorites' url = 'http://soundcloud.com/%s/' % uploader resolv_url = self._resolv_url(url) user = self._download_json( resolv_url, uploader, 'Downloading user info') - base_url = 'http://api.soundcloud.com/users/%s/%s.json?' % (uploader, resource) + + resource = mobj.group('rsrc') or 'all' + base_url = self._BASE_URL_MAP[resource] % user['id'] + + next_href = None entries = [] for i in itertools.count(): - data = compat_urllib_parse.urlencode({ - 'offset': i * 50, - 'limit': 50, - 'client_id': self._CLIENT_ID, - }) - new_entries = self._download_json( - base_url + data, uploader, 'Downloading track page %s' % (i + 1)) - if len(new_entries) == 0: + if not next_href: + data = compat_urllib_parse.urlencode({ + 'offset': i * 50, + 'limit': 50, + 'client_id': self._CLIENT_ID, + 'linked_partitioning': '1', + 'representation': 'speedy', + }) + next_href = base_url + '?' 
+ data + + response = self._download_json( + next_href, uploader, 'Downloading track page %s' % (i + 1)) + + collection = response['collection'] + + if not collection: self.to_screen('%s: End page received' % uploader) break - entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries) + + def resolve_permalink_url(candidates): + for cand in candidates: + if isinstance(cand, dict): + permalink_url = cand.get('permalink_url') + if permalink_url and permalink_url.startswith('http'): + return permalink_url + + for e in collection: + permalink_url = resolve_permalink_url((e, e.get('track'), e.get('playlist'))) + if permalink_url: + entries.append(self.url_result(permalink_url)) + + if 'next_href' in response: + next_href = response['next_href'] + if not next_href: + break + else: + next_href = None return { '_type': 'playlist', 'id': compat_str(user['id']), - 'title': user['username'], + 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]), 'entries': entries, } From 09b6468d3027bbc0180cd8b10aa848180cae4005 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 31 Jul 2015 21:28:59 +0100 Subject: [PATCH 375/450] [vidme] extract the uploader --- youtube_dl/extractor/vidme.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index e0b55078b..575a62f3a 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -50,6 +50,7 @@ class VidmeIE(InfoExtractor): like_count = str_to_int(self._html_search_regex( r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', webpage, 'like count', fatal=False)) + uploader = self._html_search_regex('class="video_author_username">([^<]+)', webpage, 'uploader', fatal=False) return { 'id': video_id, @@ -63,4 +64,5 @@ class VidmeIE(InfoExtractor): 'duration': duration, 'view_count': view_count, 'like_count': like_count, + 'uploader': uploader, } From 
16a089780e87d57b5295c4dfb8da1217ef6eda70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Aug 2015 03:50:03 +0600 Subject: [PATCH 376/450] [soundcloud] Clean up _VALID_URLs --- youtube_dl/extractor/soundcloud.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index f1307dc83..6ce86cbcd 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -29,7 +29,7 @@ class SoundcloudIE(InfoExtractor): _VALID_URL = r'''(?x)^(?:https?://)? (?:(?:(?:www\.|m\.)?soundcloud\.com/ (?P<uploader>[\w\d-]+)/ - (?!(?:tracks|sets|reposts|likes|spotlight)/?(?:$|[?#])) + (?!(?:tracks|sets(?:/[^/?#]+)?|reposts|likes|spotlight)/?(?:$|[?#])) (?P<title>[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) @@ -293,7 +293,15 @@ class SoundcloudSetIE(SoundcloudIE): class SoundcloudUserIE(SoundcloudIE): - _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|sets|reposts|likes|spotlight)/?)?(\?.*)?$' + _VALID_URL = r'''(?x) + https?:// + (?:(?:www|m)\.)?soundcloud\.com/ + (?P<user>[^/]+) + (?:/ + (?P<rsrc>tracks|sets|reposts|likes|spotlight) + )? 
+ /?(?:[?#].*)?$ + ''' IE_NAME = 'soundcloud:user' _TESTS = [{ 'url': 'https://soundcloud.com/the-akashic-chronicler', From 3b58d94f71b0e7b3ace6f5965f7335aa95fd0c1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Aug 2015 04:03:52 +0600 Subject: [PATCH 377/450] [vidme] Make uploader optional and add test --- youtube_dl/extractor/vidme.py | 37 ++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 575a62f3a..157bb74fe 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -22,6 +22,27 @@ class VidmeIE(InfoExtractor): 'timestamp': 1406313244, 'upload_date': '20140725', 'thumbnail': 're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + }, + }, { + # tests uploader field + 'url': 'https://vid.me/4Iib', + 'info_dict': { + 'id': '4Iib', + 'ext': 'mp4', + 'title': 'The Carver', + 'description': 'md5:e9c24870018ae8113be936645b93ba3c', + 'duration': 97.859999999999999, + 'timestamp': 1433203629, + 'upload_date': '20150602', + 'uploader': 'Thomas', + 'thumbnail': 're:^https?://.*\.jpg', + 'view_count': int, + 'like_count': int, + }, + 'params': { + 'skip_download': True, }, }, { # From http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching @@ -40,17 +61,23 @@ class VidmeIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage, default='') thumbnail = self._og_search_thumbnail(webpage) - timestamp = int_or_none(self._og_search_property('updated_time', webpage, fatal=False)) - width = int_or_none(self._og_search_property('video:width', webpage, fatal=False)) - height = int_or_none(self._og_search_property('video:height', webpage, fatal=False)) + timestamp = int_or_none(self._og_search_property( + 'updated_time', webpage, fatal=False)) + width = int_or_none(self._og_search_property( + 'video:width', webpage, 
fatal=False)) + height = int_or_none(self._og_search_property( + 'video:height', webpage, fatal=False)) duration = float_or_none(self._html_search_regex( r'data-duration="([^"]+)"', webpage, 'duration', fatal=False)) view_count = str_to_int(self._html_search_regex( - r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', webpage, 'view count', fatal=False)) + r'<(?:li|span) class="video_views">\s*([\d,\.]+)\s*plays?', + webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( r'class="score js-video-vote-score"[^>]+data-score="([\d,\.\s]+)">', webpage, 'like count', fatal=False)) - uploader = self._html_search_regex('class="video_author_username">([^<]+)', webpage, 'uploader', fatal=False) + uploader = self._html_search_regex( + 'class="video_author_username"[^>]*>([^<]+)', + webpage, 'uploader', default=None) return { 'id': video_id, From 30a453884e6a4d228ced4cdddcfa8e79a768755a Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 31 Jul 2015 14:41:30 +0100 Subject: [PATCH 378/450] [viewster] use head request to extract api token Closes #6419. 
--- youtube_dl/extractor/viewster.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 65324d903..cda02ba24 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, int_or_none, parse_iso8601, + HEADRequest, ) @@ -73,7 +74,7 @@ class ViewsterIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) # Get 'api_token' cookie - self._request_webpage(url, video_id) + self._request_webpage(HEADRequest(url), video_id) cookies = self._get_cookies(url) self._AUTH_TOKEN = compat_urllib_parse_unquote(cookies['api_token'].value) From 44cae2fb2e7b8cffcdfdd57b1224115345009cc5 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 30 Jul 2015 23:55:57 +0100 Subject: [PATCH 379/450] [gdcvault] Add support for audio extraction (fixes #5784) Closes #6412. --- youtube_dl/extractor/gdcvault.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 43f916412..c3017d451 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -7,7 +7,10 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) -from ..utils import remove_end +from ..utils import ( + remove_end, + HEADRequest, +) class GDCVaultIE(InfoExtractor): @@ -73,10 +76,20 @@ class GDCVaultIE(InfoExtractor): return video_formats def _parse_flv(self, xml_description): - video_formats = [] + formats = [] akamai_url = xml_description.find('./metadata/akamaiHost').text + audios = xml_description.find('./metadata/audios') + if audios is not None: + for audio in audios: + formats.append({ + 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, + 'play_path': remove_end(audio.get('url'), '.flv'), + 'ext': 'flv', + 'vcodec': 'none', + 'format_id': audio.get('code'), + 
}) slide_video_path = xml_description.find('./metadata/slideVideo').text - video_formats.append({ + formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(slide_video_path, '.flv'), 'ext': 'flv', @@ -86,7 +99,7 @@ class GDCVaultIE(InfoExtractor): 'format_id': 'slides', }) speaker_video_path = xml_description.find('./metadata/speakerVideo').text - video_formats.append({ + formats.append({ 'url': 'rtmp://%s/ondemand?ovpfv=1.1' % akamai_url, 'play_path': remove_end(speaker_video_path, '.flv'), 'ext': 'flv', @@ -95,7 +108,7 @@ class GDCVaultIE(InfoExtractor): 'preference': -1, 'format_id': 'speaker', }) - return video_formats + return formats def _login(self, webpage_url, display_id): (username, password) = self._get_login_info() @@ -133,16 +146,18 @@ class GDCVaultIE(InfoExtractor): r's1\.addVariable\("file",\s*encodeURIComponent\("(/[^"]+)"\)\);', start_page, 'url', default=None) if direct_url: - video_url = 'http://www.gdcvault.com/' + direct_url title = self._html_search_regex( r'<td><strong>Session Name</strong></td>\s*<td>(.*?)</td>', start_page, 'title') + video_url = 'http://www.gdcvault.com' + direct_url + # resolve the url so that we can detect the correct extension + head = self._request_webpage(HEADRequest(video_url), video_id) + video_url = head.geturl() return { 'id': video_id, 'display_id': display_id, 'url': video_url, - 'ext': 'flv', 'title': title, } From 525a87f58ee9c4ee91b5b0384184dabf6d87eef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 1 Aug 2015 11:40:34 +0200 Subject: [PATCH 380/450] [gdcvault] Fix typo: xml_decription_url -> xml_description_url --- youtube_dl/extractor/gdcvault.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index c3017d451..a6834db43 100644 --- a/youtube_dl/extractor/gdcvault.py +++ 
b/youtube_dl/extractor/gdcvault.py @@ -183,8 +183,8 @@ class GDCVaultIE(InfoExtractor): # Fallback to the older format xml_name = self._html_search_regex(r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>', start_page, 'xml filename') - xml_decription_url = xml_root + 'xml/' + xml_name - xml_description = self._download_xml(xml_decription_url, display_id) + xml_description_url = xml_root + 'xml/' + xml_name + xml_description = self._download_xml(xml_description_url, display_id) video_title = xml_description.find('./metadata/title').text video_formats = self._parse_mp4(xml_description) From ee114368ad0bb9822449295910263a99f9de4e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Aug 2015 20:22:13 +0600 Subject: [PATCH 381/450] [utils] Make value optional for find_xpath_attr This allows selecting particular attributes by name but without specifying the value and similar to xpath syntax `[@attrib]` --- test/test_utils.py | 9 +++++++++ youtube_dl/utils.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 65692a9fb..a759b2da9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -235,12 +235,21 @@ class TestUtil(unittest.TestCase): <node x="a"/> <node x="a" y="c" /> <node x="b" y="d" /> + <node x="" /> </root>''' doc = xml.etree.ElementTree.fromstring(testxml) + self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1]) self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) + self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'b'), doc[3]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2]) 
self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3]) + self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4]) def test_xpath_with_ns(self): testxml = '''<root xmlns:media="http://example.com/"> diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 88f9f9070..78dc2b449 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -139,21 +139,24 @@ def write_json_file(obj, fn): if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) - expr = xpath + "[@%s='%s']" % (key, val) + if val: + assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! 
if isinstance(xpath, compat_str): xpath = xpath.encode('ascii') for f in node.findall(xpath): - if f.attrib.get(key) == val: + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: return f return None From 3f125c8c70e8109bc90d4446b40740133e343b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 1 Aug 2015 21:43:33 +0600 Subject: [PATCH 382/450] [nbcnews] Extend _VALID_URL --- youtube_dl/extractor/nbc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index dc2091be0..ccdbfb6c9 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -124,7 +124,7 @@ class NBCSportsIE(InfoExtractor): class NBCNewsIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P<id>\d+)| - (?:feature|nightly-news)/[^/]+/(?P<title>.+)) + (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+)) ''' _TESTS = [ @@ -169,6 +169,10 @@ class NBCNewsIE(InfoExtractor): 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', + 'only_matching': True, + }, ] def _real_extract(self, url): From 55eae65b39d754d699ad9de3f9c99fcdf62e0176 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 2 Aug 2015 00:42:23 +0800 Subject: [PATCH 383/450] Credit @cyb3r for the ir90tv extractor --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index aa6b88cc0..d16d34272 100644 --- a/AUTHORS +++ b/AUTHORS @@ -136,3 +136,4 @@ sceext Zach Bruggeman Tjark Saul slangangular +Behrouz Abbasi From a107193e4b7a3d5414dd7422263c34ac0e309ec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:13:21 +0600 Subject: [PATCH 384/450] [extractor/common] Extract f4m and m3u8 formats, subtitles and info --- youtube_dl/extractor/common.py | 
200 ++++++++++++++++++++++++--------- 1 file changed, 149 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dc5080504..f9578b838 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,6 +18,7 @@ from ..compat import ( compat_HTTPError, compat_http_client, compat_urllib_error, + compat_urllib_parse, compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, @@ -37,6 +38,7 @@ from ..utils import ( RegexNotFoundError, sanitize_filename, unescapeHTML, + url_basename, ) @@ -978,69 +980,165 @@ class InfoExtractor(object): self._sort_formats(formats) return formats - # TODO: improve extraction - def _extract_smil_formats(self, smil_url, video_id, fatal=True): - smil = self._download_xml( - smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + @staticmethod + def _xpath_ns(path, namespace=None): + if not namespace: + return path + out = [] + for c in path.split('/'): + if not c or c == '.': + out.append(c) + else: + out.append('{%s}%s' % (namespace, c)) + return '/'.join(out) + + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: assert not fatal return [] - base = smil.find('./head/meta').get('base') + namespace = self._search_regex( + r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + + return self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + + def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: + return {} + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) + + def _download_smil(self, smil_url, video_id, fatal=True): + return self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file', 
fatal=fatal) + + def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): + namespace = self._search_regex( + r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + + formats = self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subtitles = self._parse_smil_subtitles(smil, namespace=namespace) + + video_id = os.path.splitext(url_basename(smil_url))[0] + title = None + description = None + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + name = meta.attrib.get('name') + content = meta.attrib.get('content') + if not name or not content: + continue + if not title and name == 'title': + title = content + elif not description and name in ('description', 'abstract'): + description = content + + return { + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'formats': formats, + 'subtitles': subtitles, + } + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): + base = smil_url + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + b = meta.get('base') or meta.get('httpBase') + if b: + base = b + break formats = [] rtmp_count = 0 - if smil.findall('./body/seq/video'): - video = smil.findall('./body/seq/video')[0] - fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) - formats.extend(fmts) - else: - for video in smil.findall('./body/switch/video'): - fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) - formats.extend(fmts) + http_count = 0 + + videos = smil.findall(self._xpath_ns('.//video', namespace)) + for video in videos: + src = video.get('src') + if not src: + continue + + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + filesize = int_or_none(video.get('size') or video.get('fileSize')) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + ext = 
video.get('ext') + src_ext = determine_ext(src) + streamer = video.get('streamer') or base + + if proto == 'rtmp' or streamer.startswith('rtmp'): + rtmp_count += 1 + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + continue + + src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + + if proto == 'm3u8' or src_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls')) + continue + + if src_ext == 'f4m': + f4m_url = src_url + if not f4m_params: + f4m_params = { + 'hdcore': '3.2.0', + 'plugin': 'flowplayer-3.2.0.1', + } + f4m_url += '&' if '?' in f4m_url else '?' + f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8') + formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + continue + + if src_url.startswith('http'): + http_count += 1 + formats.append({ + 'url': src_url, + 'ext': ext or src_ext or 'flv', + 'format_id': 'http-%d' % (bitrate or http_count), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + continue self._sort_formats(formats) return formats - def _parse_smil_video(self, video, video_id, base, rtmp_count): - src = video.get('src') - if not src: - return [], rtmp_count - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - proto = video.get('proto') - if not proto: - if base: - if base.startswith('rtmp'): - proto = 'rtmp' - elif base.startswith('http'): - proto = 'http' - ext = video.get('ext') - if proto == 'm3u8': - return self._extract_m3u8_formats(src, video_id, ext), rtmp_count - elif proto == 'rtmp': - rtmp_count += 1 - streamer = video.get('streamer') or base - return ([{ - 'url': streamer, - 
'play_path': src, - 'ext': 'flv', - 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'width': width, - 'height': height, - }], rtmp_count) - elif proto.startswith('http'): - return ([{ - 'url': base + src, - 'ext': ext or 'flv', - 'tbr': bitrate, - 'width': width, - 'height': height, - }], rtmp_count) + def _parse_smil_subtitles(self, smil, namespace=None): + subtitles = {} + for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): + src = textstream.get('src') + if not src: + continue + ext = textstream.get('ext') or determine_ext(src) + if not ext: + type_ = textstream.get('type') + if type_ == 'text/srt': + ext = 'srt' + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') + subtitles.setdefault(lang, []).append({ + 'url': src, + 'ext': ext, + }) + return subtitles def _live_title(self, name): """ Generate the title for a live video """ From e5e8d20a3a65832c74b002f247866fcbb92e9246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:13:59 +0600 Subject: [PATCH 385/450] [extractor/generic] Improve generic SMIL detection --- youtube_dl/extractor/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8cef61c3c..6900ed96f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1110,11 +1110,13 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) - # Is it an RSS feed? + # Is it an RSS feed or a SMIL file? 
try: doc = parse_xml(webpage) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) + elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): + return self._parse_smil(doc, url, video_id) except compat_xml_parse_error: pass From 308cfe0ab3ec7122602ba2d6a4e3acd2caa7a757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:14:41 +0600 Subject: [PATCH 386/450] [test_downloader] Respect --force-generic-extractor --- test/test_download.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 1110357a7..284418834 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -136,7 +136,9 @@ def generator(test_case): # We're not using .download here sine that is just a shim # for outside error handling, and returns the exit code # instead of the result dict. - res_dict = ydl.extract_info(test_case['url']) + res_dict = ydl.extract_info( + test_case['url'], + force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): From 645f814544f9d40386e504a1eb8cf3558f2c109e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:15:33 +0600 Subject: [PATCH 387/450] [test/helper] Allow dicts for mincount --- test/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/helper.py b/test/helper.py index e1129e58f..c8b34654d 100644 --- a/test/helper.py +++ b/test/helper.py @@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict): elif isinstance(expected, compat_str) and expected.startswith('mincount:'): got = got_dict.get(info_field) self.assertTrue( - 
isinstance(got, list), - 'Expected field %s to be a list, but it is of type %s' % ( + isinstance(got, (list, dict)), + 'Expected field %s to be a list or a dict, but it is of type %s' % ( info_field, type(got).__name__)) expected_num = int(expected.partition(':')[2]) assertGreaterEqual( From 8765222d2211cd6f2a40611249181af0bbb2d531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:16:21 +0600 Subject: [PATCH 388/450] [extractor/generic] Add generic SMIL tests --- youtube_dl/extractor/generic.py | 68 +++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6900ed96f..27584c44c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -130,6 +130,74 @@ class GenericIE(InfoExtractor): 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, + # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng + { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', + 'info_dict': { + 'id': 'smil', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'formats': 'mincount:16', + 'subtitles': 'mincount:1', + }, + 'params': { + 'force_generic_extractor': True, + 'skip_download': True, + }, + }, + # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html + { + 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', + 'info_dict': { + 'id': 'hds', + 'ext': 'flv', + 'title': 'hds', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from https://www.restudy.dk/video/play/id/1637 + { + 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', + 'info_dict': { + 'id': 'video_1637', + 'ext': 'flv', + 'title': 'video_1637', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from 
http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm + { + 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', + 'info_dict': { + 'id': 'smil-service', + 'ext': 'flv', + 'title': 'smil-service', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 + { + 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 41c3a5a7beebbf5f60c5edb5093d564f0829c5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:20:49 +0600 Subject: [PATCH 389/450] [extractor/common] Fix python 3 --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f9578b838..c123d9fca 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1101,7 +1101,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' 
- f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8') + f4m_url += compat_urllib_parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) continue From 17712eeb1933f53696c1fc53606174e988a96472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:31:17 +0600 Subject: [PATCH 390/450] [extractor/common] Extract namespace parse routine --- youtube_dl/extractor/common.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c123d9fca..717dcec7b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -999,8 +999,7 @@ class InfoExtractor(object): assert not fatal return [] - namespace = self._search_regex( - r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + namespace = self._parse_smil_namespace(smil) return self._parse_smil_formats( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) @@ -1017,8 +1016,7 @@ class InfoExtractor(object): 'Unable to download SMIL file', fatal=fatal) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): - namespace = self._search_regex( - r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + namespace = self._parse_smil_namespace(smil) formats = self._parse_smil_formats( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) @@ -1045,6 +1043,10 @@ class InfoExtractor(object): 'subtitles': subtitles, } + def _parse_smil_namespace(self, smil): + return self._search_regex( + r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): From fa7a1cc5ef52a8dd9a355ab37a74be55ac2ddc1f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 22 Jul 2015 12:34:42 +0100 
Subject: [PATCH 391/450] [screenwavemedia] fix info extraction (fixes #6270) Closes #6330. --- youtube_dl/extractor/screenwavemedia.py | 84 +++++++++++-------------- 1 file changed, 36 insertions(+), 48 deletions(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index d1ab66b32..09c085dcf 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -1,12 +1,11 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + js_to_json, ) @@ -22,59 +21,48 @@ class ScreenwaveMediaIE(InfoExtractor): video_id = self._match_id(url) playerdata = self._download_webpage( - 'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id, + 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, video_id, 'Downloading player webpage') vidtitle = self._search_regex( r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') - vidurl = self._search_regex( - r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 'vidurl').replace('\\/', '/') - videolist_url = None + playerconfig = self._download_webpage( + 'http://player.screenwavemedia.com/player.js', + video_id, 'Downloading playerconfig webpage') - mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata) - if mobj: - videoserver = mobj.group('videoserver') - mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata) - vidid = mobj.group('vidid') if mobj else video_id - videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) - else: - mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata) - if mobj: - videolist_url = mobj.group('smil') + videoserver = self._search_regex(r"'videoserver'\s*:\s*'([^']+)", playerconfig, 'videoserver') - if videolist_url: - videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') - 
formats = [] - baseurl = vidurl[:vidurl.rfind('/') + 1] - for video in videolist.findall('.//video'): - src = video.get('src') - if not src: - continue - file_ = src.partition(':')[-1] - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - bitrate = int_or_none(video.get('system-bitrate'), scale=1000) - format = { - 'url': baseurl + file_, - 'format_id': src.rpartition('.')[0].rpartition('_')[-1], - } - if width or height: - format.update({ - 'tbr': bitrate, - 'width': width, - 'height': height, - }) - else: - format.update({ - 'abr': bitrate, - 'vcodec': 'none', - }) - formats.append(format) - else: - formats = [{ - 'url': vidurl, - }] + sources = self._parse_json( + js_to_json( + self._search_regex( + r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, + 'sources', + ).replace( + "' + thisObj.options.videoserver + '", + videoserver + ).replace( + "' + playerVidId + '", + video_id + ) + ), + video_id + ) + + formats = [] + for source in sources: + if source['type'] == 'hls': + formats.extend(self._extract_m3u8_formats(source['file'], video_id)) + else: + format_label = source.get('label') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_label, 'height', default=None)) + formats.append({ + 'url': source['file'], + 'format': format_label, + 'ext': source.get('type'), + 'height': height, + }) self._sort_formats(formats) return { From 9cc93c64aa321260475a2bdf7d8626cdd16bf8ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 1 Aug 2015 22:15:43 +0200 Subject: [PATCH 392/450] [screenwavemedia] Use the IP for the videoserver (fixes #6397) For http://cinemassacre.com/2015/07/28/avgn-seaman-for-dreamcast/ the other server returns a 403 error. 
--- youtube_dl/extractor/screenwavemedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 09c085dcf..3bc84989e 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -31,7 +31,7 @@ class ScreenwaveMediaIE(InfoExtractor): 'http://player.screenwavemedia.com/player.js', video_id, 'Downloading playerconfig webpage') - videoserver = self._search_regex(r"'videoserver'\s*:\s*'([^']+)", playerconfig, 'videoserver') + videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver') sources = self._parse_json( js_to_json( From cdc682d5a467b7188eb13b5eeb76eb5dd544d1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 04:21:16 +0600 Subject: [PATCH 393/450] [nowtv] Fix extraction (Closes #6357) --- youtube_dl/extractor/nowtv.py | 63 +++++++++++++++-------------------- 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 0b5ff4760..de6bc6d96 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + determine_ext, int_or_none, parse_iso8601, parse_duration, @@ -15,7 +16,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' _TESTS = [{ # rtl @@ -23,7 +24,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203519', 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Die neuen Bauern und eine Hochzeit', 'description': 
'md5:e234e1ed6d63cf06be5c070442612e7e', 'thumbnail': 're:^https?://.*\.jpg$', @@ -32,7 +33,7 @@ class NowTVIE(InfoExtractor): 'duration': 2786, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -41,7 +42,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203481', 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Berlin - Tag & Nacht (Folge 934)', 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', 'thumbnail': 're:^https?://.*\.jpg$', @@ -50,7 +51,7 @@ class NowTVIE(InfoExtractor): 'duration': 2641, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -59,7 +60,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '165780', 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Hals- und Beinbruch', 'description': 'md5:b50d248efffe244e6f56737f0911ca57', 'thumbnail': 're:^https?://.*\.jpg$', @@ -68,7 +69,7 @@ class NowTVIE(InfoExtractor): 'duration': 2742, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -77,7 +78,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '99205', 'display_id': 'medicopter-117/angst', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Angst!', 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', 'thumbnail': 're:^https?://.*\.jpg$', @@ -86,7 +87,7 @@ class NowTVIE(InfoExtractor): 'duration': 3025, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -95,7 +96,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203521', 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', 'thumbnail': 're:^https?://.*\.jpg$', @@ -104,7 +105,7 @@ class NowTVIE(InfoExtractor): 'duration': 1083, }, 'params': { - # m3u8 
download + # rtmp download 'skip_download': True, }, }, { @@ -113,7 +114,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '128953', 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', - 'ext': 'mp4', + 'ext': 'flv', 'title': "Büro-Fall / Chihuahua 'Joel'", 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', 'thumbnail': 're:^https?://.*\.jpg$', @@ -122,15 +123,13 @@ class NowTVIE(InfoExtractor): 'duration': 3092, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - station = mobj.group('station') + display_id = self._match_id(url) info = self._download_json( 'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, @@ -148,29 +147,19 @@ class NowTVIE(InfoExtractor): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - f = info.get('format', {}) - station = f.get('station') or station - - STATIONS = { - 'rtl': 'rtlnow', - 'rtl2': 'rtl2now', - 'vox': 'voxnow', - 'nitro': 'rtlnitronow', - 'ntv': 'n-tvnow', - 'superrtl': 'superrtlnow' - } - formats = [] for item in files['items']: - item_path = remove_start(item['path'], '/') - tbr = int_or_none(item['bitrate']) - m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) - m3u8_url = m3u8_url.replace('now/', 'now/videos/') + if determine_ext(item['path']) != 'f4v': + continue + app, play_path = remove_start(item['path'], '/').split('/', 1) formats.append({ - 'url': m3u8_url, - 'format_id': '%s-%sk' % (item['id'], tbr), - 'ext': 'mp4', - 'tbr': tbr, + 'url': 'rtmpe://fms.rtl.de', + 'app': app, + 'play_path': 'mp4:%s' % play_path, + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf', + 'tbr': int_or_none(item.get('bitrate')), }) self._sort_formats(formats) @@ -178,6 +167,8 
@@ class NowTVIE(InfoExtractor): description = info.get('articleLong') or info.get('articleShort') timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') return { From e422d7f4f78994de8483d2207ab4e00174a2408c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 04:26:59 +0600 Subject: [PATCH 394/450] [nowtv] Expand _VALID_URL --- youtube_dl/extractor/nowtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index de6bc6d96..11ce37168 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -16,7 +16,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl @@ -126,6 +126,9 @@ class NowTVIE(InfoExtractor): # rtmp download 'skip_download': True, }, + }, { + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', + 'only_matching': True, }] def _real_extract(self, url): From d41d04c0f513ad3b83ab6aee60cf2201710b6063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 06:35:35 +0600 Subject: [PATCH 395/450] [videolectures] Fix _VALID_URL --- youtube_dl/extractor/videolecturesnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index d6a7eb203..24584dc80 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -12,7 +12,7 @@ from ..utils import ( class 
VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$' IE_NAME = 'videolectures.net' _TEST = { From 5c45bbe57bd791debfd64052ab030298a7c6b718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 2 Aug 2015 15:19:30 +0200 Subject: [PATCH 396/450] [nowtv] Remove unused import --- youtube_dl/extractor/nowtv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 11ce37168..ad938fb62 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( From 25a4c5a9ed59eca0241922363e83e61172527658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 2 Aug 2015 15:19:57 +0200 Subject: [PATCH 397/450] [dailymotion:playlist] Use an iterator for the entries So that using '--playlist-end' only downloads the required pages (reported in #2175). 
--- youtube_dl/extractor/dailymotion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 85d945509..2d90b2224 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -15,7 +15,6 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, - orderedSet, parse_iso8601, str_to_int, unescapeHTML, @@ -278,7 +277,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): }] def _extract_entries(self, id): - video_ids = [] + video_ids = set() processed_urls = set() for pagenum in itertools.count(1): page_url = self._PAGE_TEMPLATE % (id, pagenum) @@ -291,12 +290,13 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): processed_urls.add(urlh.geturl()) - video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) + for video_id in re.findall(r'data-xid="(.+?)"', webpage): + if video_id not in video_ids: + yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + video_ids.add(video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break - return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') - for video_id in orderedSet(video_ids)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From d7d2a9a3dbf1cef78c5085a4aab5d2f336c64cff Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 2 Aug 2015 03:28:04 +0100 Subject: [PATCH 398/450] [utils] restart download if server does not support byte ranges --- youtube_dl/downloader/http.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..b2e82cfde 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -57,6 +57,20 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + + if resume_len > 0: + 
content_range = data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-', content_range) + if content_range_m: + # Content-Range is correct - go on + if resume_len == int(content_range_m.group(1)): + break + + # Content-Range is invalid - wipe the file and do entire redownload + resume_len = 0 + open_mode = 'wb' + break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: From 8d5b8b477e4b1051482b21ea451f0de1ce23bce7 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 2 Aug 2015 03:58:02 +0100 Subject: [PATCH 399/450] [utils] import re --- youtube_dl/downloader/http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b2e82cfde..f796ee113 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,6 +4,7 @@ import errno import os import socket import time +import re from .common import FileDownloader from ..compat import ( From c3124c3085e6a9a83ee31ace3a7d528a324c42da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:25:08 +0600 Subject: [PATCH 400/450] [downloader/http] Simplify --- youtube_dl/downloader/http.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index f796ee113..0862e90bb 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -58,20 +58,16 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) - if resume_len > 0: content_range = data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) - if content_range_m: - # Content-Range is correct - go on - if resume_len == int(content_range_m.group(1)): - break - + # Content-Range is correct - go on + if content_range_m and resume_len 
== int(content_range_m.group(1)): + break # Content-Range is invalid - wipe the file and do entire redownload resume_len = 0 open_mode = 'wb' - break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: From 10eaa8ef1d2a9699052af9262aa472456548e99b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:25:40 +0600 Subject: [PATCH 401/450] [downloader/http] Report unable to resume --- youtube_dl/downloader/http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 0862e90bb..2f8490f02 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -66,6 +66,7 @@ class HttpFD(FileDownloader): if content_range_m and resume_len == int(content_range_m.group(1)): break # Content-Range is invalid - wipe the file and do entire redownload + self.report_unable_to_resume() resume_len = 0 open_mode = 'wb' break From 84bc4dcb0f678f0a8c9f993e101b9769e3959f76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:27:47 +0600 Subject: [PATCH 402/450] [downloader/http] Clarify rationale for Content-Range check (#6426) --- youtube_dl/downloader/http.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 2f8490f02..a29f5cf31 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -58,14 +58,21 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. 
This is due to webservers + # that don't support resuming and serve a whole file with no Content-Range + # set in response despite the requested Range (see + # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) if resume_len > 0: content_range = data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) - # Content-Range is correct - go on - if content_range_m and resume_len == int(content_range_m.group(1)): - break - # Content-Range is invalid - wipe the file and do entire redownload + # Content-Range is present and matches requested Range, resume is possible + if content_range_m and resume_len == int(content_range_m.group(1)): + break + # Content-Range is either not present or invalid. Assuming remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload self.report_unable_to_resume() resume_len = 0 open_mode = 'wb' From 754e70cf3e74218ae5d840985fbf07bbe274332a Mon Sep 17 00:00:00 2001 From: George Brighton <george@gebn.co.uk> Date: Sun, 2 Aug 2015 19:21:10 +0100 Subject: [PATCH 403/450] [pornhub] Fix video url regular expression. PornHub seems to have subtly changed their JavaScript. Before, video URL strings were embedded directly in the video's `flashvars_*` object, but they are now assigned to variables of the form `player_quality_*`, which are then added to this object later under the relevant quality key.
--- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0b7886840..fbaa830d6 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"var player_quality_[0-9]{3}p = '([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) From 524229a2975c20887a9a71cae77132e775003537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:41:17 +0600 Subject: [PATCH 404/450] [pornhub] Improve --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index fbaa830d6..fec493046 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"var player_quality_[0-9]{3}p = '([^']+)'", webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) From 51a575159a5a83e4477b03544f419dcf2e9ff0fa Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 2 Aug 2015 22:52:12 +0100 Subject: [PATCH 
405/450] [facebook] extract uploader --- youtube_dl/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e17bb9aea..734de4da2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,6 +17,8 @@ from ..utils import ( int_or_none, limit_length, urlencode_postdata, + get_element_by_id, + clean_html, ) @@ -161,6 +163,7 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id + uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) return { 'id': video_id, @@ -168,4 +171,5 @@ class FacebookIE(InfoExtractor): 'formats': formats, 'duration': int_or_none(video_data.get('video_duration')), 'thumbnail': video_data.get('thumbnail_src'), + 'uploader': uploader, } From 67b8a28a2f69764259cf2e90c0a3785c05c55551 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 3 Aug 2015 00:09:21 +0100 Subject: [PATCH 406/450] [facebook] add uploader value to the tests --- youtube_dl/extractor/facebook.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 734de4da2..178a7ca4c 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -44,6 +44,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', + 'uploader': 'Tennis on Facebook', } }, { 'note': 'Video without discernible title', @@ -52,6 +53,7 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', + 'uploader': 'Asif Nawab Butt', }, 'expected_warnings': [ 'title' From 8de922724b8f3ad31ff7249799de371ff8a5c3ad Mon Sep 17 00:00:00 2001 From: "Sergey M." 
<dstftw@gmail.com> Date: Mon, 3 Aug 2015 05:36:17 +0600 Subject: [PATCH 407/450] [README.md] Clarify using cookies --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index ac54d7b67..2db3139ee 100644 --- a/README.md +++ b/README.md @@ -439,6 +439,12 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt youtube-dl -- -wNyEUrxzFU youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" +### How do I pass cookies to youtube-dl? + +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that cookies file must be in Mozilla/Netscape format and the first line of cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in cookies file and convert newlines if necessary to correspond your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. + +Passing cookies to youtube-dl is a good way to workaround login when particular extractor does not implement it explicitly. + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. 
From 47a8b7c14a085ce558db3b5a85ded850cd5df642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 3 Aug 2015 12:00:08 +0200 Subject: [PATCH 408/450] [mdr] Change XPath to make it work in python 2.6 (fixes #6443) The 'progressiveDownloadUrl' element is a direct child, so they should be equivalent. --- youtube_dl/extractor/mdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 5fdd19027..fc7499958 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -29,7 +29,7 @@ class MDRIE(InfoExtractor): doc = self._download_xml(domain + xmlurl, video_id) formats = [] for a in doc.findall('./assets/asset'): - url_el = a.find('.//progressiveDownloadUrl') + url_el = a.find('./progressiveDownloadUrl') if url_el is None: continue abr = int(a.find('bitrateAudio').text) // 1000 From 8f5639afcbb967f276fb8b35a24559cdcc3b6d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 19:37:48 +0600 Subject: [PATCH 409/450] [pornhub] Improve video quality regex --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index fec493046..7b0cdc41a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -94,7 +94,7 @@ class PornHubIE(InfoExtractor): format = path.split('/')[5].split('_')[:2] format = "-".join(format) - m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format) + m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format) if m is None: height = None tbr = None From e704f87f869b98bbed56d7dd0fe27710306c8272 Mon Sep 17 00:00:00 2001 From: Niklas Haas <git@nand.wakku.to> Date: Mon, 3 Aug 2015 01:54:21 +0200 Subject: [PATCH 410/450] [twitch] Parse start_time from 't' (closes #6441) Eg. 
for VOD links like http://www.twitch.tv/gamesdonequick/v/9136645?t=14h29m15s --- youtube_dl/extractor/twitch.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 73ce335b7..a2b6a35aa 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,12 +7,15 @@ import random from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse, + compat_urllib_parse_urlparse, compat_urllib_request, ) from ..utils import ( ExtractorError, + parse_duration, parse_iso8601, ) @@ -185,7 +188,7 @@ class TwitchVodIE(TwitchItemBaseIE): _ITEM_SHORTCUT = 'v' _TEST = { - 'url': 'http://www.twitch.tv/riotgames/v/6528877', + 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', 'info_dict': { 'id': 'v6528877', 'ext': 'mp4', @@ -197,6 +200,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'uploader': 'Riot Games', 'uploader_id': 'riotgames', 'view_count': int, + 'start_time': 310, }, 'params': { # m3u8 download @@ -216,6 +220,12 @@ class TwitchVodIE(TwitchItemBaseIE): item_id, 'mp4') self._prefer_source(formats) info['formats'] = formats + + parsed_url = compat_urllib_parse_urlparse(url) + query = compat_parse_qs(parsed_url.query) + if 't' in query: + info['start_time'] = parse_duration(query['t'][0]) + return info From d96d604e5311628ece0234733dbbfe73a58c8d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 3 Aug 2015 23:04:11 +0200 Subject: [PATCH 411/450] YoutubeDL: format spec: don't accept a bare '/' (#6124) --- test/test_YoutubeDL.py | 1 + youtube_dl/YoutubeDL.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 20f45f439..9a3c28f8c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -311,6 +311,7 @@ class TestFormatSelection(unittest.TestCase): 
assert_syntax_error('bestvideo,,best') assert_syntax_error('+bestaudio') assert_syntax_error('bestvideo+') + assert_syntax_error('/') def test_format_filtering(self): formats = [ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index efa3254ce..c608ff91a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -960,6 +960,8 @@ class YoutubeDL(object): selectors.append(current_selector) current_selector = None elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) From a346b1ff57a94382e80fd4edd5a6d4b91a7cb45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 4 Aug 2015 20:44:22 +0600 Subject: [PATCH 412/450] [bbc] Add support for vxp-playlist-data embeds (Closes #6453) --- youtube_dl/extractor/bbc.py | 45 ++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9a1b6e3dc..abc5a44a1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -526,6 +526,18 @@ class BBCIE(BBCCoUkIE): 'params': { 'skip_download': True, } + }, { + # single video from video playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + 'duration': 56, + }, + 'params': { + 'skip_download': True, + } }, { # single video story with digitalData 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', @@ -695,13 +707,36 @@ class BBCIE(BBCCoUkIE): if not medias: # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) - media_asset_page = self._parse_json( + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( self._search_regex( - r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), + r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>', + webpage, 'playlist data'), playlist_id) - medias = [] - for video in media_asset_page.get('videos', {}).values(): - medias.extend(video.values()) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias entries = [] for num, media_meta in enumerate(medias, start=1): From 232541df441741d3d55605f03e28ec3c34249a5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 4 Aug 2015 22:29:23 +0200 Subject: [PATCH 413/450] [YoutubeDL] format spec: correctly handle dashes and other unused operators 'mp4-baseline-16x9' must be handled as a single string, but the '-' was treated as an operator. 
--- test/test_YoutubeDL.py | 6 ++++++ youtube_dl/YoutubeDL.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 9a3c28f8c..0388c0bf3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -105,6 +105,7 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection(self): formats = [ {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'example-with-dashes', 'ext': 'webm', 'preference': 1, 'url': TEST_URL}, {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL}, @@ -136,6 +137,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '35') + ydl = YDL({'format': 'example-with-dashes'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'example-with-dashes') + def test_format_selection_audio(self): formats = [ {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c608ff91a..1446b3254 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -933,6 +933,37 @@ class YoutubeDL(object): else: filter_parts.append(string) + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the sourrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, 
last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None @@ -1111,7 +1142,7 @@ class YoutubeDL(object): stream = io.BytesIO(format_spec.encode('utf-8')) try: - tokens = list(compat_tokenize_tokenize(stream.readline)) + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) except tokenize.TokenError: raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) From 8a5601e42f6974e6694f01089b4c7e014b6a1b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 19:52:04 +0600 Subject: [PATCH 414/450] [lynda] Fix login (Closes #6462) --- youtube_dl/extractor/lynda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index a00f6e5e5..39214de2f 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -17,7 +17,7 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' - _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true' + _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn\s*:\s*true' _ACCOUNT_CREDENTIALS_HINT = 'Use 
--username and --password options to provide lynda.com account credentials.' _NETRC_MACHINE = 'lynda' From 5b7dab2dd640c93ec0f63ca8b901e701679a4c7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:06:48 +0600 Subject: [PATCH 415/450] [lynda] Make login more robust --- youtube_dl/extractor/lynda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 39214de2f..deead220a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -17,7 +17,6 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' - _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn\s*:\s*true' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' _NETRC_MACHINE = 'lynda' @@ -41,7 +40,7 @@ class LyndaBaseIE(InfoExtractor): request, None, 'Logging in as %s' % username) # Not (yet) logged in - m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) + m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page) if m is not None: response = m.group('json') response_json = json.loads(response) @@ -70,7 +69,7 @@ class LyndaBaseIE(InfoExtractor): request, None, 'Confirming log in and log out from another device') - if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): raise ExtractorError('Unable to log in') From 354b4b8604ec13ccf4bd89b9d1b77cb7246fe379 Mon Sep 17 00:00:00 2001 From: vijayanand nandam <vijay@cybrilla.com> Date: Wed, 5 Aug 2015 19:37:59 +0530 Subject: [PATCH 416/450] fixing xhamster file extraction --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 
b4ad513a0..9d025530f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -47,7 +47,7 @@ class XHamsterIE(InfoExtractor): def _real_extract(self, url): def extract_video_url(webpage): - mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage) + mp4 = re.search(r'file:\s+\'([^\']+)\'', webpage) if mp4 is None: raise ExtractorError('Unable to extract media URL') else: From be7a8379b47c35afe66abcc02aee597e5143b1d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:32:44 +0600 Subject: [PATCH 417/450] [xhamster] Make more robust --- youtube_dl/extractor/xhamster.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 9d025530f..481d79b89 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -46,12 +46,12 @@ class XHamsterIE(InfoExtractor): ] def _real_extract(self, url): - def extract_video_url(webpage): - mp4 = re.search(r'file:\s+\'([^\']+)\'', webpage) - if mp4 is None: - raise ExtractorError('Unable to extract media URL') - else: - return mp4.group(1) + def extract_video_url(webpage, name): + return self._search_regex( + [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', + r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', + r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], + webpage, name, group='mp4') def is_hd(webpage): return '<div class=\'icon iconHD\'' in webpage @@ -97,7 +97,9 @@ class XHamsterIE(InfoExtractor): hd = is_hd(webpage) - video_url = extract_video_url(webpage) + format_id = 'hd' if hd else 'sd' + + video_url = extract_video_url(webpage, format_id) formats = [{ 'url': video_url, 'format_id': 'hd' if hd else 'sd', @@ -108,7 +110,7 @@ class XHamsterIE(InfoExtractor): mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') webpage = 
self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') if is_hd(webpage): - video_url = extract_video_url(webpage) + video_url = extract_video_url(webpage, 'hd') formats.append({ 'url': video_url, 'format_id': 'hd', From 251a44b776264c17d7799e017b856143c6cacd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:36:37 +0600 Subject: [PATCH 418/450] [xhamster] Fix thumbnail extraction --- youtube_dl/extractor/xhamster.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 481d79b89..b57e7c813 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -78,7 +78,10 @@ class XHamsterIE(InfoExtractor): uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', webpage, 'uploader id', default='anonymous') - thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False) + thumbnail = self._search_regex( + [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', + r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>', webpage, 'duration', fatal=False)) From 3e4852247744b131600ba43275ab321eb1b32bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:41:40 +0600 Subject: [PATCH 419/450] [xhamster] Fix uploader extraction --- youtube_dl/extractor/xhamster.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index b57e7c813..06fedf840 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -22,7 +22,7 @@ class XHamsterIE(InfoExtractor): 'ext': 'mp4', 'title': 
'FemaleAgent Shy beauty takes the bait', 'upload_date': '20121014', - 'uploader_id': 'Ruseful2011', + 'uploader': 'Ruseful2011', 'duration': 893, 'age_limit': 18, } @@ -34,7 +34,7 @@ class XHamsterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Britney Spears Sexy Booty', 'upload_date': '20130914', - 'uploader_id': 'jojo747400', + 'uploader': 'jojo747400', 'duration': 200, 'age_limit': 18, } @@ -75,8 +75,9 @@ class XHamsterIE(InfoExtractor): if upload_date: upload_date = unified_strdate(upload_date) - uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', - webpage, 'uploader id', default='anonymous') + uploader = self._html_search_regex( + r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)", + webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', @@ -127,7 +128,7 @@ class XHamsterIE(InfoExtractor): 'title': title, 'description': description, 'upload_date': upload_date, - 'uploader_id': uploader_id, + 'uploader': uploader, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, From 54a9328b205e8a2c916d59fd81bdb1ede25cf87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 21:19:52 +0600 Subject: [PATCH 420/450] [generic] Expand jwplayer support --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8cef61c3c..6df89f814 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1655,7 +1655,7 @@ class GenericIE(InfoExtractor): if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( - r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if not found: # Flow 
player found = filter_video(re.findall(r'''(?xs) From c71a3195afa8c2a9ed5fe0ffa56ff6c969147d91 Mon Sep 17 00:00:00 2001 From: Delon <liuxi326@qq.com> Date: Wed, 5 Aug 2015 18:22:25 +0800 Subject: [PATCH 421/450] [tudou] Fix extracion --- youtube_dl/extractor/tudou.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c89de5ba4..9b934cb57 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -29,6 +29,8 @@ class TudouIE(InfoExtractor): } }] + _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' + def _url_for_id(self, id, quality=None): info_url = "http://v2.tudou.com/f?id=" + str(id) if quality: @@ -76,6 +78,9 @@ class TudouIE(InfoExtractor): 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, + 'http_headers': { + 'Referer': self._PLAYER_URL, + }, } result.append(part_info) From 238755752f4f9169a1edda91067c8627afe19cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 23:07:52 +0600 Subject: [PATCH 422/450] [tudou] Extract player URL from the webpage --- youtube_dl/extractor/tudou.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 9b934cb57..84fe71aef 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -30,7 +30,7 @@ class TudouIE(InfoExtractor): }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - + def _url_for_id(self, id, quality=None): info_url = "http://v2.tudou.com/f?id=" + str(id) if quality: @@ -56,6 +56,10 @@ class TudouIE(InfoExtractor): thumbnail_url = self._search_regex( r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + player_url = self._search_regex( + r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + webpage, 'player URL', default=self._PLAYER_URL) + segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 
'segments') segments = json.loads(segs_json) # It looks like the keys are the arguments that have to be passed as @@ -79,7 +83,7 @@ class TudouIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail_url, 'http_headers': { - 'Referer': self._PLAYER_URL, + 'Referer': player_url, }, } result.append(part_info) From f535ec8278c8f465b47919d3f451571ae8ccfc7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 23:08:26 +0600 Subject: [PATCH 423/450] [xhamster] Remove unused import --- youtube_dl/extractor/xhamster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 06fedf840..f76ee8fd4 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..utils import ( - ExtractorError, unified_strdate, str_to_int, int_or_none, From c73cdd800f0dc7b465ac0b36d338875bb80c23aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 23:08:55 +0600 Subject: [PATCH 424/450] [xhamster] flake8 --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index f76ee8fd4..97315750f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -81,7 +81,7 @@ class XHamsterIE(InfoExtractor): thumbnail = self._search_regex( [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], - webpage, 'thumbnail', fatal=False, group='thumbnail') + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>', webpage, 'duration', fatal=False)) From 51f267d9d4d26c3cd67f318a2040513946f2b4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= 
<dstftw@gmail.com> Date: Thu, 6 Aug 2015 22:01:01 +0600 Subject: [PATCH 425/450] [YoutubeDL:utils] Move percent encode non-ASCII URLs workaround to http_request and simplify (Closes #6457) --- youtube_dl/YoutubeDL.py | 21 --------------------- youtube_dl/utils.py | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1446b3254..079d42ce8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1860,27 +1860,6 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - req_is_string = isinstance(req, compat_basestring) - url = req if req_is_string else req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - if req_is_string: - req = url_escaped - else: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 78dc2b449..c7db75f80 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -651,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by 
websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + url_escaped, data=req.data, headers=req.headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + new_req.timeout = req.timeout + req = new_req + for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 # The dict keys are capitalized because of this bug by urllib From bd690a9f9368095f561184778fb2f3ef12c66342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 6 Aug 2015 22:01:31 +0600 Subject: [PATCH 426/450] [southpark:de] Add test for non-ASCII in URLs --- youtube_dl/extractor/southpark.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 7fb165a87..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -45,6 +45,14 @@ class SouthParkDeIE(SouthParkIE): 'title': 'The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', }, + }, { + # non-ASCII characters in initial URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', + 'playlist_count': 4, + }, { + # non-ASCII characters in redirect URL + 'url': 
'http://www.southpark.de/alle-episoden/s18e09', + 'playlist_count': 4, }] From 4f34cdb0a87a506d25a352ff265678c86cb9b979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 6 Aug 2015 23:56:44 +0600 Subject: [PATCH 427/450] [southpark:de] Skip test --- youtube_dl/extractor/southpark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 87b650468..ad63a8785 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -53,6 +53,7 @@ class SouthParkDeIE(SouthParkIE): # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', 'playlist_count': 4, + 'skip': 'Broken python 3', }] From 671302b5c0ff8cefa5f26e599423ef7799b19631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 00:08:11 +0600 Subject: [PATCH 428/450] [YoutubeDL] Remove unused imports --- youtube_dl/YoutubeDL.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 079d42ce8..cad6b026e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -28,7 +28,6 @@ if os.name == 'nt': import ctypes from .compat import ( - compat_basestring, compat_cookiejar, compat_expanduser, compat_get_terminal_size, @@ -40,7 +39,6 @@ from .compat import ( compat_urllib_request, ) from .utils import ( - escape_url, ContentTooShortError, date_from_str, DateRange, @@ -51,7 +49,6 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, - HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, From cd6b555e19c601d575679dd29da0080eda7f8890 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 6 Aug 2015 19:17:50 +0100 Subject: [PATCH 429/450] [dcn] add origin to api request and fix the test and check with flake8 --- youtube_dl/extractor/dcn.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 
insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index f76ebda9e..d44e8cef0 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,4 +1,9 @@ +# coding: utf-8 +from __future__ import unicode_literals + from .common import InfoExtractor +from ..compat import compat_urllib_request + class DcnIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' @@ -9,24 +14,29 @@ class DcnIE(InfoExtractor): 'id': '17375', 'ext': 'm3u8', 'title': 'رحلة العمر : الحلقة 1', - 'description': '"في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة1"', + 'description': 'في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة\n1', 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', 'duration': '2041' - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - json_data = self._download_json( - 'http://admin.mangomolo.com/analytics/index.php/plus/video?id='+video_id, - video_id + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=' + video_id, + headers={'Origin': 'http://www.dcndigital.ae'} ) - title = json_data['title_ar']; - thumbnail = 'http://admin.mangomolo.com/analytics/'+json_data['img']; - duration = json_data['duration']; - description = json_data['description_ar']; + json_data = self._download_json(request, video_id) + title = json_data['title_ar'] + thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data['img'] + duration = json_data['duration'] + description = json_data['description_ar'] webpage = self._download_webpage( - 
'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id='+json_data['id']+'&user_id='+json_data['user_id']+'&countries=Q0M=&w=100%&h=100%&filter=DENY&signature='+json_data['signature'], + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id=' + json_data['id'] + '&user_id=' + json_data['user_id'] + '&countries=Q0M=&w=100%&h=100%&filter=DENY&signature=' + json_data['signature'], video_id ) m3u8_url = self._html_search_regex( From 5a4d9ddb218e761fe7ab15d197690e0cb132a536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 01:26:40 +0600 Subject: [PATCH 430/450] [utils] Percent-encode redirect URL of Location header (Closes #6457) --- youtube_dl/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c7db75f80..e265c7574 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -715,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped return resp https_request = http_request From 9663bd3abb78911bddad75742bd41006677d628e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 01:27:07 +0600 Subject: [PATCH 431/450] [southpark:de] Enable non-ASCII redirect URL test --- youtube_dl/extractor/southpark.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index ad63a8785..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -53,7 +53,6 @@ class SouthParkDeIE(SouthParkIE): # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', 'playlist_count': 4, - 'skip': 'Broken python 3', }] From 3eb5fdb58112032a9831eda1d2e3b8a151ea217f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 6 Aug 2015 22:55:43 +0200 Subject: [PATCH 432/450] release 2015.08.06 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fa157cadb..b81d5e658 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.28' +__version__ = '2015.08.06' From 430b092a5f59fbe407b92ebcb0c42b9f7062a334 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 6 Aug 2015 23:06:21 +0200 Subject: [PATCH 433/450] release 2015.08.06.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b81d5e658..9f209499c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.06' +__version__ = '2015.08.06.1' From 6d30cf04db9c9662dbb30c2490e24eb5c6dca4c3 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 7 Aug 2015 10:01:18 +0100 Subject: [PATCH 434/450] [dcn] fix type and key errors --- youtube_dl/extractor/dcn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index d44e8cef0..22ff35b56 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -3,6 +3,7 @@ from __future__ import 
unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_request +from ..utils import int_or_none class DcnIE(InfoExtractor): @@ -16,7 +17,7 @@ class DcnIE(InfoExtractor): 'title': 'رحلة العمر : الحلقة 1', 'description': 'في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة\n1', 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', - 'duration': '2041' + 'duration': 2041 }, 'params': { # m3u8 download @@ -32,9 +33,9 @@ class DcnIE(InfoExtractor): ) json_data = self._download_json(request, video_id) title = json_data['title_ar'] - thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data['img'] - duration = json_data['duration'] - description = json_data['description_ar'] + thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data.get('img') + duration = int_or_none(json_data.get('duration')) + description = json_data.get('description_ar') webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id=' + json_data['id'] + '&user_id=' + json_data['user_id'] + '&countries=Q0M=&w=100%&h=100%&filter=DENY&signature=' + json_data['signature'], video_id From 8002ac9e0a88d918735c06599dbf8f2005f79666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 19:04:44 +0600 Subject: [PATCH 435/450] [nowtv] Add support for .at TLD --- youtube_dl/extractor/nowtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index ad938fb62..78e8851c0 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -14,7 +14,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' 
+ _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl From acc1adbe7ab93657cd4d303cee1fba4464931a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 19:50:54 +0600 Subject: [PATCH 436/450] [nowtv] Add support for .ch TLD --- youtube_dl/extractor/nowtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 78e8851c0..fc21d8e3f 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -14,7 +14,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' + _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl From 0f422256d6eea5aff062a4c35d7434cd118c7a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 19:51:09 +0600 Subject: [PATCH 437/450] [nowtv] Add .at test --- youtube_dl/extractor/nowtv.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index fc21d8e3f..66c627bec 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -127,6 +127,9 @@ class NowTVIE(InfoExtractor): }, { 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', 'only_matching': True, + }, { + 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'only_matching': True, }] def _real_extract(self, url): From f94639fadf91312bf3365802981f506ecba698dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 00:06:03 +0600 
Subject: [PATCH 438/450] [dcn] Improve --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/dcn.py | 78 ++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eb8ef1fe3..922d9b3d8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -118,7 +118,7 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE -from .dcn import DcnIE +from .dcn import DCNIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 22ff35b56..b98a6c032 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -2,22 +2,30 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_request -from ..utils import int_or_none +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ( + int_or_none, + parse_iso8601, +) -class DcnIE(InfoExtractor): +class DCNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' 
_TEST = { 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', 'info_dict': { 'id': '17375', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'رحلة العمر : الحلقة 1', - 'description': 'في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة\n1', - 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', - 'duration': 2041 + 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 2041, + 'timestamp': 1227504126, + 'upload_date': '20081124', }, 'params': { # m3u8 download @@ -27,30 +35,50 @@ class DcnIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + request = compat_urllib_request.Request( - 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=' + video_id, - headers={'Origin': 'http://www.dcndigital.ae'} - ) - json_data = self._download_json(request, video_id) - title = json_data['title_ar'] - thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data.get('img') - duration = int_or_none(json_data.get('duration')) - description = json_data.get('description_ar') + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + video = self._download_json(request, video_id) + title = video.get('title_en') or video['title_ar'] + webpage = self._download_webpage( - 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id=' + json_data['id'] + '&user_id=' + json_data['user_id'] + '&countries=Q0M=&w=100%&h=100%&filter=DENY&signature=' + json_data['signature'], - video_id - ) - m3u8_url = self._html_search_regex( - r'file:\s*"([^"]+)', - webpage, - 'm3u8_url' - ) - formats = self._extract_m3u8_formats(m3u8_url, video_id) + 
'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + + compat_urllib_parse.urlencode({ + 'id': video['id'], + 'user_id': video['user_id'], + 'signature': video['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), video_id) + + m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + rtsp_url = self._search_regex( + r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + + img = video.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video.get('duration')) + description = video.get('description_en') or video.get('description_ar') + timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') + return { 'id': video_id, 'title': title, + 'description': description, 'thumbnail': thumbnail, 'duration': duration, - 'description': description, + 'timestamp': timestamp, 'formats': formats, } From 4a7434d0b09e14b773c2d278c8299efa6225b84e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 00:19:40 +0600 Subject: [PATCH 439/450] [dcn] Simplify _VALID_URL --- youtube_dl/extractor/dcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index b98a6c032..82261e25c 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -13,7 +13,7 @@ from ..utils import ( class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' 
+ _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', 'info_dict': From fd5d8270dcd6d8baada3390a4a1cae5bdbcb6da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 01:10:41 +0600 Subject: [PATCH 440/450] [clipfish] Fix extraction, minimize requests, get rid of drm hds, extract m3u8 and more metadata --- youtube_dl/extractor/clipfish.py | 56 ++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 09dfaac60..7af903571 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,18 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, int_or_none, js_to_json, - determine_ext, + parse_iso8601, + remove_end, ) class ClipfishIE(InfoExtractor): - IE_NAME = 'clipfish' - - _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', 'md5': '79bc922f3e8a9097b3d68a93780fd475', @@ -20,35 +21,48 @@ class ClipfishIE(InfoExtractor): 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', + 'timestamp': 1370938118, + 'upload_date': '20130611', 'duration': 82, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_info = self._parse_json( - js_to_json(self._html_search_regex('var videoObject = ({[^}]+?})', webpage, 'videoObject')), - video_id - ) - info_url = self._parse_json( - 
js_to_json(self._html_search_regex('var globalFlashvars = ({[^}]+?})', webpage, 'globalFlashvars')), - video_id - )['data'] - doc = self._download_xml( - info_url, video_id, note='Downloading info page') - title = doc.find('title').text - video_url = doc.find('filename').text - thumbnail = doc.find('imageurl').text - duration = int_or_none(video_info['length']) - formats = [{'url': video_info['videourl']},{'url': video_url}] + webpage = self._download_webpage(url, video_id) + + video_info = self._parse_json( + js_to_json(self._html_search_regex( + '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), + video_id) + + formats = [] + for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.append({ + 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), + 'ext': 'mp4', + 'format_id': 'hls', + }) + else: + formats.append({ + 'url': video_url, + 'format_id': ext, + }) self._sort_formats(formats) + title = remove_end(self._og_search_title(webpage), ' - Video') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(video_info.get('length')) + timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date')) + return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': timestamp, } From 8a37aa1517ccc474b3e2831b77e48534cb8ed47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 01:55:59 +0600 Subject: [PATCH 441/450] [extractor/generic] Expand ooyala regex (Closes #6485) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6df89f814..649c0bce6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1320,7 +1320,7 @@ class GenericIE(InfoExtractor): return 
self.url_result(mobj.group('url')) # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) From bf94d763ba73e09fd77d25110c7219254b63c786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 02:00:49 +0600 Subject: [PATCH 442/450] [extractor/generic] Add test for #6485 --- youtube_dl/extractor/generic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 649c0bce6..469909a51 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -236,6 +236,19 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, + { + # ooyala video embedded with http://player.ooyala.com/iframe.js + 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', + 'info_dict': { + 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', + 'ext': 'mp4', + 'title': '"Steve Jobs: Man in the Machine" trailer', + 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + }, + 'params': { + 'skip_download': True, + }, + }, # multiple ooyala embeds on SBN network websites { 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', From e0ac521438218e978b9c4bbcd92cfc2d5fef79cb Mon Sep 17 00:00:00 2001 From: vijayanand nandam <vijay@cybrilla.com> Date: Thu, 6 Aug 2015 22:42:58 +0530 Subject: [PATCH 443/450] adding support for axel download manager --- youtube_dl/downloader/external.py | 
10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 1d5cc9904..30699934b 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -83,6 +83,16 @@ class CurlFD(ExternalFD): return cmd +class AxelFD(ExternalFD): + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + class WgetFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] From 5b0c40da24b5ddb789428de731e02ac8759a363c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 03:36:29 +0600 Subject: [PATCH 444/450] [extractor/common] Expand meta regex --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dc5080504..507ea5ec0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -636,7 +636,7 @@ class InfoExtractor(object): @staticmethod def _meta_regex(prop): return r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1) [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): From 3550821fb4ca2f0e47542a7fa16b6543b06df724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 03:38:55 +0600 Subject: [PATCH 445/450] [periscope] Add extractor (Closes #5850, closes #6459) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/periscope.py | 66 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 
youtube_dl/extractor/periscope.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 922d9b3d8..bd86a5be2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -432,6 +432,7 @@ from .orf import ( from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE +from .periscope import PeriscopeIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py new file mode 100644 index 000000000..5219e1a75 --- /dev/null +++ b/youtube_dl/extractor/periscope.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + unescapeHTML, +) + + +class PeriscopeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', + 'md5': '65b57957972e503fcbbaeed8f4fa04ca', + 'info_dict': { + 'id': '56102209', + 'ext': 'mp4', + 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗', + 'timestamp': 1438978559, + 'upload_date': '20150807', + 'uploader': 'Bec Boop', + 'uploader_id': '1465763', + }, + 'skip': 'Expires in 24 hours', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + replay = self._download_json( + 'https://api.periscope.tv/api/v2/getAccessPublic?token=%s' % video_id, video_id) + + video_url = replay['replay_url'] + + webpage = self._download_webpage(url, video_id) + + broadcast_data = self._parse_json( + unescapeHTML(self._html_search_meta( + 'broadcast-data', webpage, 'broadcast data', fatal=True)), + video_id) + + broadcast = broadcast_data['broadcast'] + status = broadcast['status'] + + uploader = 
broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name') + uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') + + title = '%s - %s' % (uploader, status) if uploader else status + timestamp = parse_iso8601(broadcast.get('created_at')) + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'url': video_url, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'title': title, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'thumbnails': thumbnails, + } From 621d6a9516e0f9cd8c45e12904f4d4b7615e7fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 04:00:52 +0600 Subject: [PATCH 446/450] [periscope] Switch to API for broadcast data --- youtube_dl/extractor/periscope.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 5219e1a75..11648a511 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -25,21 +25,17 @@ class PeriscopeIE(InfoExtractor): 'skip': 'Expires in 24 hours', } + def _call_api(self, method, token): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token) + def _real_extract(self, url): - video_id = self._match_id(url) - - replay = self._download_json( - 'https://api.periscope.tv/api/v2/getAccessPublic?token=%s' % video_id, video_id) + token = self._match_id(url) + replay = self._call_api('getAccessPublic', token) video_url = replay['replay_url'] - webpage = self._download_webpage(url, video_id) - - broadcast_data = self._parse_json( - unescapeHTML(self._html_search_meta( - 'broadcast-data', webpage, 'broadcast data', fatal=True)), - video_id) - + broadcast_data = self._call_api('getBroadcastPublic', 
token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -54,7 +50,7 @@ class PeriscopeIE(InfoExtractor): } for image in ('image_url', 'image_url_small') if broadcast.get(image)] return { - 'id': broadcast.get('id') or video_id, + 'id': broadcast.get('id') or token, 'url': video_url, 'ext': 'mp4', 'protocol': 'm3u8_native', From 1e83741c9a5d67e8bbe65510d41b558361496fe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 05:33:53 +0600 Subject: [PATCH 447/450] [periscope] Add support for running streams --- youtube_dl/extractor/periscope.py | 34 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 11648a511..de53b752d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -2,13 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - unescapeHTML, +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, ) +from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): + IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' _TEST = { 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -32,9 +34,6 @@ class PeriscopeIE(InfoExtractor): def _real_extract(self, url): token = self._match_id(url) - replay = self._call_api('getAccessPublic', token) - video_url = replay['replay_url'] - broadcast_data = self._call_api('getBroadcastPublic', token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -43,20 +42,37 @@ class PeriscopeIE(InfoExtractor): uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') title = '%s - %s' % (uploader, status) if uploader else status + state = broadcast.get('state').lower() + if state == 'running': 
+ title = self._live_title(title) timestamp = parse_iso8601(broadcast.get('created_at')) thumbnails = [{ 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + stream = self._call_api('getAccessPublic', token) + + formats = [] + for format_id in ('replay', 'rtmp', 'hls', 'https_hls'): + video_url = stream.get(format_id + '_url') + if not video_url: + continue + f = { + 'url': video_url, + 'ext': 'flv' if format_id == 'rtmp' else 'mp4', + } + if format_id != 'rtmp': + f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8' + formats.append(f) + self._sort_formats(formats) + return { 'id': broadcast.get('id') or token, - 'url': video_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native', 'title': title, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'thumbnails': thumbnails, + 'formats': formats, } From 428e4e4a850df81031e8267dddf759da605639e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 05:37:38 +0600 Subject: [PATCH 448/450] [quickscope] Add extractor --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/periscope.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd86a5be2..e38e77a27 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -432,7 +432,10 @@ from .orf import ( from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeIE, + QuickscopeIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index de53b752d..578b53a24 100644 --- a/youtube_dl/extractor/periscope.py +++ 
b/youtube_dl/extractor/periscope.py @@ -76,3 +76,24 @@ class PeriscopeIE(InfoExtractor): 'thumbnails': thumbnails, 'formats': formats, } + + +class QuickscopeIE(InfoExtractor): + IE_DESC = 'Quisck Scope' + _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)' + _TEST = { + 'url': 'https://watchonperiscope.com/broadcast/56180087', + 'only_matching': True, + } + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + request = compat_urllib_request.Request( + 'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({ + 'broadcast_id': broadcast_id, + 'entry_ticket': '', + 'from_push': 'false', + 'uses_sessions': 'true', + }).encode('utf-8')) + return self.url_result( + self._download_json(request, broadcast_id)['share_url'], 'Periscope') From b2f82948ee5eadc483c01dc589b82426bb32ba68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 05:40:41 +0600 Subject: [PATCH 449/450] [quickscope] Fix typo --- youtube_dl/extractor/periscope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 578b53a24..8ad936758 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -79,7 +79,7 @@ class PeriscopeIE(InfoExtractor): class QuickscopeIE(InfoExtractor): - IE_DESC = 'Quisck Scope' + IE_DESC = 'Quick Scope' _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)' _TEST = { 'url': 'https://watchonperiscope.com/broadcast/56180087', From 154655a85ae8b7740aa9fe7821544050fd65641b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 19:21:05 +0600 Subject: [PATCH 450/450] [downloader/external] Respect --no-check-certificate for wget --- youtube_dl/downloader/external.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 
30699934b..07ce59f7d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -51,6 +51,9 @@ class ExternalFD(FileDownloader): return [] return [command_option, source_address] + def _no_check_certificate(self, command_option): + return [command_option] if self.params.get('nocheckcertificate', False) else [] + def _configuration_args(self, default=[]): ex_args = self.params.get('external_downloader_args') if ex_args is None: @@ -99,6 +102,7 @@ class WgetFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._source_address('--bind-address') + cmd += self._no_check_certificate('--no-check-certificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd