From 78c2e1c978e745e0a23a5f4cc61ca9b0c0152f7a Mon Sep 17 00:00:00 2001 From: Guy Veis Date: Sun, 8 Mar 2015 15:20:58 +0200 Subject: [PATCH 0001/2145] Fixed parsing spaced xml files --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f9e8e2bad..1ce16a779 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -439,7 +439,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8').strip()) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', From 8bc516c67be299e9288b0ebb31bda0d1dc4e4e19 Mon Sep 17 00:00:00 2001 From: Guy Veis Date: Sun, 8 Mar 2015 15:24:30 +0200 Subject: [PATCH 0002/2145] Fixed walla extractor, added site-wide video extraction --- youtube_dl/extractor/walla.py | 63 +++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index 24efbd6e6..a1326df74 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -5,29 +5,43 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, xpath_text, int_or_none, ) class WallaIE(InfoExtractor): - _VALID_URL = r'http://vod\.walla\.co\.il/[^/]+/(?P\d+)/(?P.+)' - _TEST = { + _VALID_URL = r'http://[^\.]+\.walla\.co\.il/[^/]+/(?P\d+)' + _TESTS = [{ + 'url': 'http://news.walla.co.il/item/2835878', + 'info_dict': { + 'id': '2663876', + 'ext': 'mp4', + 'title': 'בנט יורה: "בפעם הבאה יהיה רב ראשי אחד לישראל"', + 'description': 'md5:5f3ac43a8abc132ccaa1a6894a137440', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 112, + }, + 'params': { + # stream download + 'skip_download': True, + } + }, { 'url': 
'http://vod.walla.co.il/movie/2642630/one-direction-all-for-one', 'info_dict': { 'id': '2642630', - 'display_id': 'one-direction-all-for-one', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'וואן דיירקשן: ההיסטריה', 'description': 'md5:de9e2512a92442574cdb0913c49bc4d8', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 3600, }, 'params': { - # rtmp download + # stream download 'skip_download': True, } - } + }] _SUBTITLE_LANGS = { 'עברית': 'heb', @@ -36,18 +50,27 @@ class WallaIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') video = self._download_xml( - 'http://video2.walla.co.il/?w=null/null/%s/@@/video/flv_pl' % video_id, - display_id) + 'http://video.walla.co.il/?w=//%s/@@/video/flv_pl' % video_id, + video_id) item = video.find('./items/item') + if item is None: + raise ExtractorError('The item doesn\'t exist or has no video.', expected=True) + title = xpath_text(item, './title', 'title') - description = xpath_text(item, './synopsis', 'description') + description = next( + item for item in [ + xpath_text(item, './synopsis', 'description'), + xpath_text(item, './subtitle', 'description'), + '', + ] if item is not None + ) thumbnail = xpath_text(item, './preview_pic', 'thumbnail') duration = int_or_none(xpath_text(item, './duration', 'duration')) + default_file = xpath_text(item, './src', 'src') subtitles = {} for subtitle in item.findall('./subtitles/subtitle'): @@ -57,16 +80,23 @@ class WallaIE(InfoExtractor): 'url': xpath_text(subtitle, './src'), }] + playlist_url = 'http://walla-s.vidnt.com/walla_vod/_definst_/%s.mp4/playlist.m3u8' + formats = [] + formats.append( + { + 'url': playlist_url % default_file, + 'ext': 'mp4', + 'format_id': 40, + } + ) + for quality in item.findall('./qualities/quality'): format_id = xpath_text(quality, './title') fmt = { - 'url': 'rtmp://wafla.walla.co.il/vod', - 'play_path': xpath_text(quality, './src'), - 'player_url': 
'http://isc.walla.co.il/w9/swf/video_swf/vod/WallaMediaPlayerAvod.swf', - 'page_url': url, - 'ext': 'flv', - 'format_id': xpath_text(quality, './title'), + 'url': playlist_url % xpath_text(quality, './src'), + 'ext': 'mp4', + 'format_id': quality.attrib['type'], } m = re.search(r'^(?P\d+)[Pp]', format_id) if m: @@ -76,7 +106,6 @@ class WallaIE(InfoExtractor): return { 'id': video_id, - 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, From 80a35f5321463457819eb20e1300dacfd9177a50 Mon Sep 17 00:00:00 2001 From: Guy Veis Date: Sun, 8 Mar 2015 15:37:49 +0200 Subject: [PATCH 0003/2145] Fixed test item URL --- youtube_dl/extractor/walla.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/walla.py b/youtube_dl/extractor/walla.py index a1326df74..9345efb11 100644 --- a/youtube_dl/extractor/walla.py +++ b/youtube_dl/extractor/walla.py @@ -14,7 +14,7 @@ from ..utils import ( class WallaIE(InfoExtractor): _VALID_URL = r'http://[^\.]+\.walla\.co\.il/[^/]+/(?P\d+)' _TESTS = [{ - 'url': 'http://news.walla.co.il/item/2835878', + 'url': 'http://news.walla.co.il/item/2663876', 'info_dict': { 'id': '2663876', 'ext': 'mp4', From f11554092b419baa919875432fe6ebc1f22f5307 Mon Sep 17 00:00:00 2001 From: Tjark Saul Date: Fri, 17 Apr 2015 09:21:54 +0200 Subject: [PATCH 0004/2145] [Lecture2Go] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/lecture2go.py | 33 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/lecture2go.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bbf3be41d..3d6e981b2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -249,6 +249,7 @@ from .krasview import KrasViewIE from .ku6 import Ku6IE from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lecture2go import Lecture2GoIE from .letv import ( LetvIE, LetvTvIE, diff 
--git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py new file mode 100644 index 000000000..9cf28e31c --- /dev/null +++ b/youtube_dl/extractor/lecture2go.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Lecture2GoIE(InfoExtractor): + _VALID_URL = r'https?://lecture2go.uni-hamburg.de/veranstaltungen/-/v/(?P[0-9]+)' + _TEST = { + 'url': 'https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473', + 'md5': 'a9e76f83b3ef58019c4b7dbc35f406c1', + 'info_dict': { + 'id': '17473', + 'ext': 'mp4', + 'url': 'https://fms1.rrz.uni-hamburg.de/abo/64.050_FrankHeitmann_2015-04-13_14-35.mp4', + 'title': '2 - Endliche Automaten und reguläre Sprachen' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'(.*?)', webpage, 'title') + video_url = self._search_regex(r'b.isFirefox..a.useHTML5\).b.setOption.a,"src","(.*.mp4)"\).else', webpage, 'video_url') + creator = self._html_search_regex(r'
(.*)
', webpage, 'creator') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'creator': creator + } From 233c1c0e76d64c9e13dc8968bfd8a014c49e66a8 Mon Sep 17 00:00:00 2001 From: Antti Ajanki Date: Sun, 3 May 2015 11:04:14 +0300 Subject: [PATCH 0005/2145] [downloader/f4m] Fragment filenames must be sanitized because the fragment was written to a file with a sanitized name by http_dl.download() --- youtube_dl/downloader/f4m.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b1a858c45..3cb07e15f 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -396,18 +396,19 @@ class F4mFD(FileDownloader): success = http_dl.download(frag_filename, {'url': url}) if not success: return False - with open(frag_filename, 'rb') as down: - down_data = down.read() - reader = FlvReader(down_data) - while True: - _, box_type, box_data = reader.read_box_info() - if box_type == b'mdat': - dest_stream.write(box_data) - break + (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') + down_data = down.read() + down.close() + reader = FlvReader(down_data) + while True: + _, box_type, box_data = reader.read_box_info() + if box_type == b'mdat': + dest_stream.write(box_data) + break if live: - os.remove(frag_filename) + os.remove(encodeFilename(frag_sanitized)) else: - frags_filenames.append(frag_filename) + frags_filenames.append(frag_sanitized) except (compat_urllib_error.HTTPError, ) as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. 
Continue @@ -430,7 +431,7 @@ class F4mFD(FileDownloader): elapsed = time.time() - start self.try_rename(tmpfilename, filename) for frag_file in frags_filenames: - os.remove(frag_file) + os.remove(encodeFilename(frag_file)) fsize = os.path.getsize(encodeFilename(filename)) self._hook_progress({ From ac6c358c2ab882427f74af47fe5df762dd348c20 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 7 May 2015 02:08:47 +0800 Subject: [PATCH 0006/2145] [teamcoco] Fix extracting preload data again --- youtube_dl/extractor/teamcoco.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 2381676b4..95d58ddd0 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import binascii import re from .common import InfoExtractor @@ -9,6 +10,7 @@ from ..utils import ( ExtractorError, qualities, ) +from ..compat import compat_ord class TeamcocoIE(InfoExtractor): @@ -66,7 +68,7 @@ class TeamcocoIE(InfoExtractor): video_id = self._html_search_regex( self._VIDEO_ID_REGEXES, webpage, 'video id') - preload = None + data = preload = None preloads = re.findall(r'"preload":\s*"([^"]+)"', webpage) if preloads: preload = max([(len(p), p) for p in preloads])[1] @@ -80,11 +82,27 @@ class TeamcocoIE(InfoExtractor): ], webpage.replace('","', ''), 'preload data', default=None) if not preload: + preload_codes = self._html_search_regex( + r'(function.+)setTimeout\(function\(\)\{playlist', + webpage, 'preload codes') + base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes) + base64_fragments.remove('init') + for i in range(len(base64_fragments)): + cur_sequence = (''.join(base64_fragments[i:] + base64_fragments[:i])).encode('ascii') + try: + raw_data = base64.b64decode(cur_sequence) + except (TypeError, binascii.Error): + continue + if compat_ord(raw_data[0]) == compat_ord('{'): 
+ data = self._parse_json(raw_data.decode('utf-8'), video_id, fatal=False) + + if not preload and not data: raise ExtractorError( 'Preload information could not be extracted', expected=True) - data = self._parse_json( - base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id) + if not data: + data = self._parse_json( + base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id) formats = [] get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) From d9a743d9178b0ed1e44168e42e8cec2d7dd8d63e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 7 May 2015 18:05:37 +0800 Subject: [PATCH 0007/2145] [vice] Remove a redundant print --- youtube_dl/extractor/vice.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 71f520fb5..04e2b0ba7 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -31,7 +31,6 @@ class ViceIE(InfoExtractor): r'embedCode=([^&\'"]+)', webpage, 'ooyala embed code') ooyala_url = OoyalaIE._url_for_embed_code(embed_code) - print(ooyala_url) except ExtractorError: raise ExtractorError('The page doesn\'t contain a video', expected=True) return self.url_result(ooyala_url, ie='Ooyala') From 05d5392cdaa558dba285c328182d4f3e82fb8e8b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 7 May 2015 18:06:22 +0800 Subject: [PATCH 0008/2145] [common] Ignore subtitles in m3u8 --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 34e27a35a..981e34bc7 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -896,7 +896,7 @@ class InfoExtractor(object): format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media else None + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None 
format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), From 84bf31aaf8b9b7397de5f3189295d93e8e93e5e2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 7 May 2015 18:12:01 +0800 Subject: [PATCH 0009/2145] [ooyala] Extract m3u8 information (#2292) --- youtube_dl/extractor/ooyala.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index d5b05c18f..b33e8230d 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( unescapeHTML, ExtractorError, + determine_ext, ) @@ -44,11 +45,21 @@ class OoyalaIE(InfoExtractor): ie=cls.ie_key()) def _extract_result(self, info, more_info): + embedCode = info['embedCode'] + video_url = info.get('ipad_url') or info['url'] + + if determine_ext(video_url) == 'm3u8': + formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') + else: + formats = [{ + 'url': video_url, + 'ext': 'mp4', + }] + return { - 'id': info['embedCode'], - 'ext': 'mp4', + 'id': embedCode, 'title': unescapeHTML(info['title']), - 'url': info.get('ipad_url') or info['url'], + 'formats': formats, 'description': unescapeHTML(more_info['description']), 'thumbnail': more_info['promo'], } From c09593c04e0b345df02cae663dc064d29e241cba Mon Sep 17 00:00:00 2001 From: blissland Date: Thu, 7 May 2015 15:07:11 +0100 Subject: [PATCH 0010/2145] [BildIE] Escape ampersands in xml and update test thumbnail --- youtube_dl/extractor/bild.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py index 77b562d99..ba0c185eb 100644 --- a/youtube_dl/extractor/bild.py +++ b/youtube_dl/extractor/bild.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none 
+from ..utils import ( + int_or_none, + fix_xml_ampersands, +) class BildIE(InfoExtractor): @@ -15,7 +18,7 @@ class BildIE(InfoExtractor): 'id': '38184146', 'ext': 'mp4', 'title': 'BILD hat sie getestet', - 'thumbnail': 'http://bilder.bild.de/fotos/stand-das-koennen-die-neuen-ipads-38184138/Bild/1.bild.jpg', + 'thumbnail': 'http://bilder.bild.de/fotos/bild-hat-sie-getestet-das-koennen-apples-neue-ipads-38184138/Bild/1.bild.jpg', 'duration': 196, 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ', } @@ -25,7 +28,7 @@ class BildIE(InfoExtractor): video_id = self._match_id(url) xml_url = url.split(".bild.html")[0] + ",view=xml.bild.xml" - doc = self._download_xml(xml_url, video_id) + doc = self._download_xml(xml_url, video_id, transform_source=fix_xml_ampersands) duration = int_or_none(doc.attrib.get('duration'), scale=1000) From aafe2739909882931f7f624e83fe532af0bfafc1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 7 May 2015 22:07:32 +0800 Subject: [PATCH 0011/2145] [ooyala] Use SAS API to extract info (fixes #4336) --- youtube_dl/extractor/ooyala.py | 43 ++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index b33e8230d..0b049274a 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,12 +1,14 @@ from __future__ import unicode_literals import re import json +import base64 from .common import InfoExtractor from ..utils import ( unescapeHTML, ExtractorError, determine_ext, + int_or_none, ) @@ -33,6 +35,17 @@ class OoyalaIE(InfoExtractor): 'description': '', }, }, + { + # Information available only through SAS api + # From http://community.plm.automation.siemens.com/t5/News-NX-Manufacturing/Tool-Path-Divide/ba-p/4187 + 'url': 'http://player.ooyala.com/player.js?embedCode=FiOG81ZTrvckcchQxmalf4aQj590qTEx', + 'md5': 
'a84001441b35ea492bc03736e59e7935', + 'info_dict': { + 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', + 'ext': 'mp4', + 'title': 'Ooyala video', + } + } ] @staticmethod @@ -88,6 +101,36 @@ class OoyalaIE(InfoExtractor): mobile_player, 'info', fatal=False, default=None) if videos_info: break + + if not videos_info: + formats = [] + auth_data = self._download_json( + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode), + embedCode) + + cur_auth_data = auth_data['authorization_data'][embedCode] + + for stream in cur_auth_data['streams']: + formats.append({ + 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), + 'ext': stream.get('delivery_type'), + 'format': stream.get('video_codec'), + 'format_id': stream.get('profile'), + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + }) + if len(formats): + return { + 'id': embedCode, + 'formats': formats, + 'title': 'Ooyala video', + } + + if not cur_auth_data['authorized']: + raise ExtractorError(cur_auth_data['message'], expected=True) + if not videos_info: raise ExtractorError('Unable to extract info') videos_info = videos_info.replace('\\"', '"') From bc08873cff6d36ba175e5121b0ba1ad270c664c9 Mon Sep 17 00:00:00 2001 From: blissland Date: Thu, 7 May 2015 15:09:27 +0100 Subject: [PATCH 0012/2145] Fix indents --- youtube_dl/extractor/bild.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py index ba0c185eb..41dd1cbc1 100644 --- a/youtube_dl/extractor/bild.py +++ b/youtube_dl/extractor/bild.py @@ -3,8 +3,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - int_or_none, - fix_xml_ampersands, + int_or_none, + fix_xml_ampersands, ) From 
f22834a3723dd59f1e04c27d0bc3a373ebc17183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 May 2015 20:20:43 +0600 Subject: [PATCH 0013/2145] [bild] Relax thumbnail test check --- youtube_dl/extractor/bild.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bild.py b/youtube_dl/extractor/bild.py index 41dd1cbc1..4d8cce1ef 100644 --- a/youtube_dl/extractor/bild.py +++ b/youtube_dl/extractor/bild.py @@ -18,7 +18,7 @@ class BildIE(InfoExtractor): 'id': '38184146', 'ext': 'mp4', 'title': 'BILD hat sie getestet', - 'thumbnail': 'http://bilder.bild.de/fotos/bild-hat-sie-getestet-das-koennen-apples-neue-ipads-38184138/Bild/1.bild.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 196, 'description': 'Mit dem iPad Air 2 und dem iPad Mini 3 hat Apple zwei neue Tablet-Modelle präsentiert. BILD-Reporter Sven Stein durfte die Geräte bereits testen. ', } From 3799834dcff27ffeea66e3ec96166f8da8fa73ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 May 2015 20:46:11 +0600 Subject: [PATCH 0014/2145] [YoutubeDL] Do not force bestvideo+bestaudio when outtmpl is stdout (#5627) --- youtube_dl/YoutubeDL.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eb7470f72..d8583a8eb 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1086,7 +1086,9 @@ class YoutubeDL(object): req_format = self.params.get('format') if req_format is None: req_format_list = [] - if info_dict['extractor'] in ['youtube', 'ted'] and FFmpegMergerPP(self).available: + if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' + and info_dict['extractor'] in ['youtube', 'ted'] + and FFmpegMergerPP(self).available): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) From 406224be5231e602b543579706ad6056b75fbe68 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 7 May 2015 21:02:59 +0600 Subject: [PATCH 0015/2145] [extractor/generic] Fix following incomplete redirects (#5640) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d09e85665..cd7c47d6d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1453,7 +1453,7 @@ class GenericIE(InfoExtractor): if refresh_header: found = re.search(REDIRECT_REGEX, refresh_header) if found: - new_url = found.group(1) + new_url = compat_urlparse.urljoin(url, found.group(1)) self.report_following_redirect(new_url) return { '_type': 'url', From 5268a05e4722d74e125a97b023d92943745bb249 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 7 May 2015 17:04:15 +0200 Subject: [PATCH 0016/2145] [ooyala] Style fix --- youtube_dl/extractor/ooyala.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 0b049274a..c0e6d643d 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -121,7 +121,7 @@ class OoyalaIE(InfoExtractor): 'abr': int_or_none(stream.get('audio_bitrate')), 'vbr': int_or_none(stream.get('video_bitrate')), }) - if len(formats): + if formats: return { 'id': embedCode, 'formats': formats, From 09b412dafa4bc95b9850d77b3bce5f7eac47a578 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 8 May 2015 02:12:28 +0800 Subject: [PATCH 0017/2145] [nhl] Partial support for hlg id (fixes #4285) --- youtube_dl/extractor/nhl.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 407465998..b572370c2 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -21,6 +21,9 @@ class NHLBaseInfoExtractor(InfoExtractor): return 
json_string.replace('\\\'', '\'') def _real_extract_video(self, video_id): + vid_parts = video_id.split(',') + if len(vid_parts) == 3: + video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id data = self._download_json( json_url, video_id, transform_source=self._fix_json) @@ -60,7 +63,7 @@ class NHLBaseInfoExtractor(InfoExtractor): class NHLIE(NHLBaseInfoExtractor): IE_NAME = 'nhl.com' - _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)id=(?P[-0-9a-zA-Z]+)' + _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console)?(?:\?(?:.*?[?&])?)(?:id|hlg)=(?P[-0-9a-zA-Z,]+)' _TESTS = [{ 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', @@ -101,6 +104,17 @@ class NHLIE(NHLBaseInfoExtractor): }, { 'url': 'http://video.nhl.com/videocenter/?id=736722', 'only_matching': True, + }, { + 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', + 'md5': '076fcb88c255154aacbf0a7accc3f340', + 'info_dict': { + 'id': '2014020299-X-h', + 'ext': 'mp4', + 'title': 'Penguins at Islanders / Game Highlights', + 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', + 'duration': 268, + 'upload_date': '20141122', + } }] def _real_extract(self, url): From 46be82b811d91be0b0876cf141e6a94e65b8fd7f Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Thu, 7 May 2015 21:58:03 +0300 Subject: [PATCH 0018/2145] [vessel] Use `main_video_asset` when searching for video_asset (Fixes #5623) --- youtube_dl/extractor/vessel.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 6215f0642..3c8d2a943 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -38,9 +38,13 @@ class VesselIE(InfoExtractor): return req 
@staticmethod - def find_assets(data, asset_type): + def find_assets(data, asset_type, asset_id=None): for asset in data.get('assets', []): - if asset.get('type') == asset_type: + if not asset.get('type') == asset_type: + continue + elif asset_id is not None and not asset.get('id') == asset_id: + continue + else: yield asset def _check_access_rights(self, data): @@ -82,11 +86,13 @@ class VesselIE(InfoExtractor): req = VesselIE.make_json_request( self._API_URL_TEMPLATE % asset_id, {'client': 'web'}) data = self._download_json(req, video_id) + video_asset_id = data.get('main_video_asset') self._check_access_rights(data) try: - video_asset = next(VesselIE.find_assets(data, 'video')) + video_asset = next( + VesselIE.find_assets(data, 'video', asset_id=video_asset_id)) except StopIteration: raise ExtractorError('No video assets found') From 156fc83a55b14258bb4a2fa1ec3b02d4db679603 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 8 May 2015 03:02:34 +0800 Subject: [PATCH 0019/2145] [downloader/rtmp] Fix a typo --- youtube_dl/downloader/rtmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 6865b5e2f..7d19bb808 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -131,7 +131,7 @@ class RtmpFD(FileDownloader): if play_path is not None: basic_args += ['--playpath', play_path] if tc_url is not None: - basic_args += ['--tcUrl', url] + basic_args += ['--tcUrl', tc_url] if test: basic_args += ['--stop', '1'] if flash_version is not None: From 7ef00afe9da87c7d0fdbea93af39b47d5447f1a0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 8 May 2015 03:09:19 +0800 Subject: [PATCH 0020/2145] [nhl] Support RTMP videos (fixes #4481) --- youtube_dl/extractor/nhl.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index b572370c2..279b18386 100644 --- 
a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -50,7 +50,7 @@ class NHLBaseInfoExtractor(InfoExtractor): video_url = initial_video_url join = compat_urlparse.urljoin - return { + ret = { 'id': video_id, 'title': info['name'], 'url': video_url, @@ -59,6 +59,15 @@ class NHLBaseInfoExtractor(InfoExtractor): 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), } + if video_url.startswith('rtmp:'): + mobj = re.match(r'(?Prtmp://[^/]+/(?P[a-z0-9/]+))/(?Pmp4:.*)', video_url) + ret.update({ + 'tc_url': mobj.group('tc_url'), + 'play_path': mobj.group('play_path'), + 'app': mobj.group('app'), + 'no_resume': True, + }) + return ret class NHLIE(NHLBaseInfoExtractor): @@ -115,6 +124,18 @@ class NHLIE(NHLBaseInfoExtractor): 'duration': 268, 'upload_date': '20141122', } + }, { + 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', + 'info_dict': { + 'id': '691469', + 'ext': 'mp4', + 'title': 'RAW | Craig MacTavish Full Press Conference', + 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', + 'upload_date': '20141205', + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + } }] def _real_extract(self, url): From a745475808e125a590afb14df48c565309d3f75c Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 02:50:46 +0200 Subject: [PATCH 0021/2145] Ir90Tv Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ir90tv.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/ir90tv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5dfa781f8..ee05a6958 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -229,6 +229,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE 
from .iprima import IPrimaIE +from .ir90tv import Ir90TvIE from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py new file mode 100644 index 000000000..5aa9d6ff4 --- /dev/null +++ b/youtube_dl/extractor/ir90tv.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class Ir90TvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P[0-9]+)/.*' + _TEST = { + 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', + 'md5': '411dbd94891381960cb9e13daa47a869', + 'info_dict': { + 'id': '95719', + 'ext': 'mp4', + 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for example ... 
+ title = self._html_search_regex( + r'\n90tv.ir :: (.*?)', webpage, 'title') + + video_url = self._search_regex( + r']+src="([^"]+)"', webpage, 'video url') + + thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') + print thumbnail + + + return { + 'url': video_url, + 'id': video_id, + 'title': title, + 'video_url' : video_url, + 'thumbnail' : thumbnail, + } \ No newline at end of file From 54b31d149e7be08eb7be9981a9eec398d11f17ef Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 02:55:01 +0200 Subject: [PATCH 0022/2145] Ir90Tv Add new extractor --- youtube_dl/extractor/ir90tv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 5aa9d6ff4..3a3cb4887 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -38,4 +38,4 @@ class Ir90TvIE(InfoExtractor): 'title': title, 'video_url' : video_url, 'thumbnail' : thumbnail, - } \ No newline at end of file + } From a650110ba762b2658c64392317c1afd2a284dd3d Mon Sep 17 00:00:00 2001 From: Behrooz Date: Fri, 8 May 2015 04:32:08 +0200 Subject: [PATCH 0023/2145] remove print --- youtube_dl/extractor/ir90tv.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/ir90tv.py b/youtube_dl/extractor/ir90tv.py index 3a3cb4887..b79529b1b 100644 --- a/youtube_dl/extractor/ir90tv.py +++ b/youtube_dl/extractor/ir90tv.py @@ -21,7 +21,6 @@ class Ir90TvIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - # TODO more code goes here, for example ... 
title = self._html_search_regex( r'\n90tv.ir :: (.*?)', webpage, 'title') @@ -29,8 +28,6 @@ class Ir90TvIE(InfoExtractor): r']+src="([^"]+)"', webpage, 'video url') thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url') - print thumbnail - return { 'url': video_url, From 541168039d8f3e7680a15cc366fcc94335308d81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 8 May 2015 11:01:24 +0200 Subject: [PATCH 0024/2145] [utils] get_exe_version: encode executable name (fixes #5647) It failed in python 2.x when $PATH contains a directory with non-ascii characters. --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1013f7c18..de09b53b2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1380,7 +1380,7 @@ def get_exe_version(exe, args=['--version'], or False if the executable is not present """ try: out, _ = subprocess.Popen( - [exe] + args, + [encodeArgument(exe)] + args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate() except OSError: return False From 249962ffa2b155795ebfe0a267eb025cd5e30c56 Mon Sep 17 00:00:00 2001 From: blissland Date: Thu, 7 May 2015 16:56:15 +0100 Subject: [PATCH 0025/2145] [bet] Use unique part of xml url as the video id and fix tests (closes #5642) The guid changes often. 
--- youtube_dl/extractor/bet.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index d2abd4d77..26b934543 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -16,11 +16,11 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', 'info_dict': { - 'id': '740ab250-bb94-4a8a-8787-fe0de7c74471', + 'id': 'news/national/2014/a-conversation-with-president-obama', 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', 'ext': 'flv', - 'title': 'BET News Presents: A Conversation With President Obama', - 'description': 'md5:5a88d8ae912c1b33e090290af7ec33c6', + 'title': 'A Conversation With President Obama', + 'description': 'md5:699d0652a350cf3e491cd15cc745b5da', 'duration': 1534, 'timestamp': 1418075340, 'upload_date': '20141208', @@ -35,7 +35,7 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', 'info_dict': { - 'id': 'bcd1b1df-673a-42cf-8d01-b282db608f2d', + 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts', 'display_id': 'justice-for-ferguson-a-community-reacts', 'ext': 'flv', 'title': 'Justice for Ferguson: A Community Reacts', @@ -61,6 +61,9 @@ class BetIE(InfoExtractor): [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], webpage, 'media URL')) + video_id = self._search_regex( + r'/video/(.*)/_jcr_content/', media_url, 'video id') + mrss = self._download_xml(media_url, display_id) item = mrss.find('./channel/item') @@ -75,8 +78,6 @@ class BetIE(InfoExtractor): description = xpath_text( item, './description', 'description', fatal=False) - video_id = xpath_text(item, './guid', 'video id', fatal=False) - timestamp = parse_iso8601(xpath_text( item, xpath_with_ns('./dc:date', NS_MAP), 'upload date', fatal=False)) From 43837189c18af635cfb1cd8fe503265b4b218c32 Mon 
Sep 17 00:00:00 2001 From: blissland Date: Fri, 8 May 2015 10:40:25 +0100 Subject: [PATCH 0026/2145] Fix URL template extraction for netzkino. Fixes #5614 --- youtube_dl/extractor/netzkino.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/netzkino.py b/youtube_dl/extractor/netzkino.py index bc17e20aa..0d165a82a 100644 --- a/youtube_dl/extractor/netzkino.py +++ b/youtube_dl/extractor/netzkino.py @@ -49,7 +49,7 @@ class NetzkinoIE(InfoExtractor): 'http://www.netzkino.de/beta/dist/production.min.js', video_id, note='Downloading player code') avo_js = self._search_regex( - r'window\.avoCore\s*=.*?urlTemplate:\s*(\{.*?"\})', + r'var urlTemplate=(\{.*?"\})', production_js, 'URL templates') templates = self._parse_json( avo_js, video_id, transform_source=js_to_json) From d1feb308116f57ceae3888db5e1b93394300f564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 20:07:53 +0600 Subject: [PATCH 0027/2145] [mlb] Fallback to extracting video id from webpage for all URLs that does not contain it explicitly (Closes #5630) --- youtube_dl/extractor/mlb.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index ee9ff73bf..109eecefd 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -10,7 +10,21 @@ from ..utils import ( class MLBIE(InfoExtractor): - _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/(?:embed|m-internal-embed)\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?Pn?\d+)' + _VALID_URL = r'''(?x) + https?:// + m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/ + (?: + (?: + (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v| + (?: + shared/video/embed/(?:embed|m-internal-embed)\.html| + [^/]+/video/play\.jsp + )\?.*?\bcontent_id= + ) + (?Pn?\d+)| + (?P.+?) 
+ ) + ''' _TESTS = [ { 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', @@ -95,6 +109,12 @@ class MLBIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + if not video_id: + video_path = mobj.group('path') + webpage = self._download_webpage(url, video_path) + video_id = self._search_regex( + r'data-videoid="(\d+)"', webpage, 'video id') + detail = self._download_xml( 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) From 4e6e9d21bd62c4e2ab2576347e066891092a5783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 21:48:47 +0600 Subject: [PATCH 0028/2145] [mlb] Improve _VALID_URL --- youtube_dl/extractor/mlb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 109eecefd..4e054fb53 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -22,7 +22,7 @@ class MLBIE(InfoExtractor): )\?.*?\bcontent_id= ) (?Pn?\d+)| - (?P.+?) 
+ (?:[^/]+/)*(?P[^/]+) ) ''' _TESTS = [ From 34e7dc81a94d39d48c5b4aac8cddcca46edba94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 22:03:03 +0600 Subject: [PATCH 0029/2145] [vgtv] Add support for generic bt.no URLs (#5620) --- youtube_dl/extractor/vgtv.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 69dc9a759..b0f0b3bc2 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -8,7 +8,8 @@ from ..utils import float_or_none class VGTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?vgtv\.no/#!/[^/]+/(?P[0-9]+)' + IE_DESC = 'VGTV and BTTV' + _VALID_URL = r'http://(?:www\.)?(?Pvgtv|bt)\.no/(?:(?:tv/)?#!/(?:video|live)/(?P[0-9]+)|(?:[^/]+/)*(?P[^/]+))' _TESTS = [ { # streamType: vod @@ -64,12 +65,25 @@ class VGTVIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', + 'only_matching': True, + }, ] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') + + HOST_WEBSITES = { + 'vgtv': 'vgtv', + 'bt': 'bttv', + } + data = self._download_json( - 'http://svp.vg.no/svp/api/v1/vgtv/assets/%s?appName=vgtv-website' % video_id, + 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' + % (host, video_id, HOST_WEBSITES[host]), video_id, 'Downloading media JSON') streams = data['streamUrls'] From 0ceab8474924c4e7a6e28497c8da40cc5002c8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 22:18:43 +0600 Subject: [PATCH 0030/2145] [vgtv] Add support for bt.no articles (#5620) --- youtube_dl/extractor/__init__.py | 5 +++- youtube_dl/extractor/vgtv.py | 39 +++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5dfa781f8..587a45940 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -587,7 +587,10 @@ from .veoh import VeohIE from .vessel import VesselIE from .vesti import VestiIE from .vevo import VevoIE -from .vgtv import VGTVIE +from .vgtv import ( + BTArticleIE, + VGTVIE, +) from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index b0f0b3bc2..ad07e54c9 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -9,7 +9,18 @@ from ..utils import float_or_none class VGTVIE(InfoExtractor): IE_DESC = 'VGTV and BTTV' - _VALID_URL = r'http://(?:www\.)?(?Pvgtv|bt)\.no/(?:(?:tv/)?#!/(?:video|live)/(?P[0-9]+)|(?:[^/]+/)*(?P[^/]+))' + _VALID_URL = r'''(?x) + (?: + vgtv:| + http://(?:www\.)? + ) + (?Pvgtv|bt) + (?: + :| + \.no/(?:tv/)?#!/(?:video|live)/ + ) + (?P[0-9]+) + ''' _TESTS = [ { # streamType: vod @@ -129,3 +140,29 @@ class VGTVIE(InfoExtractor): 'view_count': data['displays'], 'formats': formats, } + + +class BTArticleIE(InfoExtractor): + IE_DESC = 'Bergens Tidende' + _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' + _TEST = { + 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', + 'md5': 'd055e8ee918ef2844745fcfd1a4175fb', + 'info_dict': { + 'id': '23199', + 'ext': 'mp4', + 'title': 'Alrekstad internat', + 'description': 'md5:dc81a9056c874fedb62fc48a300dac58', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 191, + 'timestamp': 1289991323, + 'upload_date': '20101117', + 'view_count': int, + }, + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, self._match_id(url)) + video_id = self._search_regex( + r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') + return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') From 2c0c9dc46cda490137b6788d6d66f31ca092f58f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 22:50:01 +0600 Subject: [PATCH 0031/2145] [xstream] Move xstream to separate extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/xstream.py | 115 +++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 youtube_dl/extractor/xstream.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 587a45940..5cc35c8eb 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -653,6 +653,7 @@ from .xboxclips import XboxClipsIE from .xhamster import XHamsterIE from .xminus import XMinusIE from .xnxx import XNXXIE +from .xstream import XstreamIE from .xvideos import XVideosIE from .xtube import XTubeUserIE, XTubeIE from .xuite import XuiteIE diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py new file mode 100644 index 000000000..71584c291 --- /dev/null +++ b/youtube_dl/extractor/xstream.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + xpath_with_ns, + xpath_text, + find_xpath_attr, +) + + +class XstreamIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + xstream:| + https?://frontend\.xstream\.(?:dk|net)/ + ) + (?P[^/]+) + (?: + :| + /feed/video/\?.*?\bid= + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'http://frontend.xstream.dk/btno/feed/video/?platform=web&id=86588', + 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', + 'info_dict': { + 'id': '86588', + 'ext': 'mov', + 'title': 'Otto Wollertsen', + 'description': 'Vestlendingen Otto Fredrik Wollertsen', + 'timestamp': 1430473209, + 'upload_date': '20150501', + }, + }, { + 'url': 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=21039', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + partner_id = mobj.group('partner_id') + 
video_id = mobj.group('id') + + data = self._download_xml( + 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' + % (partner_id, video_id), + video_id) + + NS_MAP = { + 'atom': 'http://www.w3.org/2005/Atom', + 'xt': 'http://xstream.dk/', + 'media': 'http://search.yahoo.com/mrss/', + } + + entry = data.find(xpath_with_ns('./atom:entry', NS_MAP)) + + title = xpath_text( + entry, xpath_with_ns('./atom:title', NS_MAP), 'title') + description = xpath_text( + entry, xpath_with_ns('./atom:summary', NS_MAP), 'description') + timestamp = parse_iso8601(xpath_text( + entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date')) + + formats = [] + media_group = entry.find(xpath_with_ns('./media:group', NS_MAP)) + for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)): + media_url = media_content.get('url') + if not media_url: + continue + tbr = int_or_none(media_content.get('bitrate')) + mobj = re.search(r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', media_url) + if mobj: + formats.append({ + 'url': mobj.group('url'), + 'play_path': 'mp4:%s' % mobj.group('playpath'), + 'app': mobj.group('app'), + 'ext': 'flv', + 'tbr': tbr, + 'format_id': 'rtmp-%d' % tbr, + }) + else: + formats.append({ + 'url': media_url, + 'tbr': tbr, + }) + self._sort_formats(formats) + + link = find_xpath_attr( + entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') + if link is not None: + formats.append({ + 'url': link.get('href'), + 'format_id': link.get('rel'), + }) + + thumbnails = [{ + 'url': splash.get('url'), + 'width': int_or_none(splash.get('width')), + 'height': int_or_none(splash.get('height')), + } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'formats': formats, + 'thumbnails': thumbnails, + } From cbe443362f91ab111e2a01fe8246e17a98668f88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 
May 2015 22:52:20 +0600 Subject: [PATCH 0032/2145] [aftenposten] Implement in terms of xtream extractor --- youtube_dl/extractor/aftenposten.py | 77 +---------------------------- 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py index e15c015fb..0c00acfb5 100644 --- a/youtube_dl/extractor/aftenposten.py +++ b/youtube_dl/extractor/aftenposten.py @@ -1,21 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_iso8601, - xpath_with_ns, - xpath_text, - find_xpath_attr, -) class AftenpostenIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P\d+)' - _TEST = { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', 'md5': 'fd828cd29774a729bf4d4425fe192972', @@ -30,69 +20,4 @@ class AftenpostenIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_xml( - 'http://frontend.xstream.dk/ap/feed/video/?platform=web&id=%s' % video_id, video_id) - - NS_MAP = { - 'atom': 'http://www.w3.org/2005/Atom', - 'xt': 'http://xstream.dk/', - 'media': 'http://search.yahoo.com/mrss/', - } - - entry = data.find(xpath_with_ns('./atom:entry', NS_MAP)) - - title = xpath_text( - entry, xpath_with_ns('./atom:title', NS_MAP), 'title') - description = xpath_text( - entry, xpath_with_ns('./atom:summary', NS_MAP), 'description') - timestamp = parse_iso8601(xpath_text( - entry, xpath_with_ns('./atom:published', NS_MAP), 'upload date')) - - formats = [] - media_group = entry.find(xpath_with_ns('./media:group', NS_MAP)) - for media_content in media_group.findall(xpath_with_ns('./media:content', NS_MAP)): - media_url = media_content.get('url') - if not media_url: - continue - tbr = int_or_none(media_content.get('bitrate')) - mobj = re.search(r'^(?Prtmp://[^/]+/(?P[^/]+))/(?P.+)$', 
media_url) - if mobj: - formats.append({ - 'url': mobj.group('url'), - 'play_path': 'mp4:%s' % mobj.group('playpath'), - 'app': mobj.group('app'), - 'ext': 'flv', - 'tbr': tbr, - 'format_id': 'rtmp-%d' % tbr, - }) - else: - formats.append({ - 'url': media_url, - 'tbr': tbr, - }) - self._sort_formats(formats) - - link = find_xpath_attr( - entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') - if link is not None: - formats.append({ - 'url': link.get('href'), - 'format_id': link.get('rel'), - }) - - thumbnails = [{ - 'url': splash.get('url'), - 'width': int_or_none(splash.get('width')), - 'height': int_or_none(splash.get('height')), - } for splash in media_group.findall(xpath_with_ns('./xt:splash', NS_MAP))] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') From fe373287ebdda002ed84dca1d8b9d6f8a5686138 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 22:59:50 +0600 Subject: [PATCH 0033/2145] [vgtv] Add support for bt vestlendingen (Closes #5620) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vgtv.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5cc35c8eb..96cf28efe 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -589,6 +589,7 @@ from .vesti import VestiIE from .vevo import VevoIE from .vgtv import ( BTArticleIE, + BTVestlendingenIE, VGTVIE, ) from .vh1 import VH1IE diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index ad07e54c9..db7a4bdb1 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -143,7 +143,8 @@ class VGTVIE(InfoExtractor): class BTArticleIE(InfoExtractor): - IE_DESC = 'Bergens Tidende' + IE_NAME = 
'bt:article' + IE_DESC = 'Bergens Tidende Articles' _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', @@ -166,3 +167,26 @@ class BTArticleIE(InfoExtractor): video_id = self._search_regex( r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') + + +class BTVestlendingenIE(InfoExtractor): + IE_NAME = 'bt:vestlendingen' + IE_DESC = 'Bergens Tidende - Vestlendingen' + _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' + _TEST = { + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', + 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', + 'info_dict': { + 'id': '86588', + 'ext': 'mov', + 'title': 'Otto Wollertsen', + 'description': 'Vestlendingen Otto Fredrik Wollertsen', + 'timestamp': 1430473209, + 'upload_date': '20150501', + }, + } + + def _real_extract(self, url): + return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') + + From 4384cf9e7d59492141ebd45f77830238097c695c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 8 May 2015 23:04:27 +0600 Subject: [PATCH 0034/2145] [extractor/__init__] Fix alphabetic order --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 96cf28efe..7f0070784 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -655,9 +655,9 @@ from .xhamster import XHamsterIE from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE -from .xvideos import XVideosIE from .xtube import XTubeUserIE, XTubeIE from .xuite import XuiteIE +from .xvideos import XVideosIE from .xxxymovies import XXXYMoviesIE from .yahoo import ( YahooIE, From bb03fdae0d4da9c591c2967044c5e30bf797c22a Mon Sep 17 00:00:00 2001 From: "Sergey M." 
Date: Fri, 8 May 2015 23:19:57 +0600 Subject: [PATCH 0035/2145] [README.md] Clarify format selection when streaming to stdout --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9aeb114f3..b6e9429df 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ The simplest case is requesting a specific format, for example `-f 22`. You can If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. -Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. 
If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded an muxed. If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. From 50b901306406d5c37f31880860e2a4dbb5e0a165 Mon Sep 17 00:00:00 2001 From: "Sergey M." 
Date: Fri, 8 May 2015 23:21:23 +0600 Subject: [PATCH 0036/2145] [README.md] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b6e9429df..3d9436456 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ The simplest case is requesting a specific format, for example `-f 22`. You can If you want to download multiple videos and they don't have the same formats available, you can specify the order of preference using slashes, as in `-f 22/17/18`. You can also filter the video results by putting a condition in brackets, as in `-f "best[height=720]"` (or `-f "[filesize>10M]"`). This works for filesize, height, width, tbr, abr, vbr, asr, and fps and the comparisons <, <=, >, >=, =, != and for ext, acodec, vcodec, container, and protocol and the comparisons =, != . Formats for which the value is not known are excluded unless you put a question mark (?) after the operator. You can combine format filters, so `-f "[height <=? 720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. Use commas to download multiple formats, such as `-f 136/137/mp4/bestvideo,140/m4a/bestaudio`. You can merge the video and audio of two formats into a single file using `-f +` (requires ffmpeg or avconv), for example `-f bestvideo+bestaudio`. -Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. 
If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded an muxed. +Since the end of April 2015 and version 2015.04.26 youtube-dl uses `-f bestvideo+bestaudio/best` as default format selection (see #5447, #5456). If ffmpeg or avconv are installed this results in downloading `bestvideo` and `bestaudio` separately and muxing them together into a single file giving the best overall quality available. Otherwise it falls back to `best` and results in downloading best available quality served as a single file. `best` is also needed for videos that don't come from YouTube because they don't provide the audio and video in two different files. If you want to only download some dash formats (for example if you are not interested in getting videos with a resolution higher than 1080p), you can add `-f bestvideo[height<=?1080]+bestaudio/best` to your configuration file. Note that if you use youtube-dl to stream to `stdout` (and most likely to pipe it to your media player then), i.e. you explicitly specify output template as `-o -`, youtube-dl still uses `-f best` format selection in order to start content delivery immediately to your player and not to wait until `bestvideo` and `bestaudio` are downloaded and muxed. If you want to preserve the old format selection behavior (prior to youtube-dl 2015.04.26), i.e. you want to download best available quality media served as a single file, you should explicitly specify your choice with `-f best`. 
You may want to add it to the [configuration file](#configuration) in order not to type it every time you run youtube-dl. From 79998cd5afb2e16fe14cebdbec81b21c45c24c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 May 2015 00:12:42 +0600 Subject: [PATCH 0037/2145] [svtplay] Generalize svt extractors and add svt.se extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/svtplay.py | 98 ++++++++++++++++++++------------ 2 files changed, 67 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7f0070784..79236c6c1 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -505,7 +505,10 @@ from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE from .sunporno import SunPornoIE -from .svtplay import SVTPlayIE +from .svtplay import ( + SVTIE, + SVTPlayIE, +) from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svtplay.py index 433dfd1cb..732f02048 100644 --- a/youtube_dl/extractor/svtplay.py +++ b/youtube_dl/extractor/svtplay.py @@ -9,41 +9,9 @@ from ..utils import ( ) -class SVTPlayIE(InfoExtractor): - IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?Psvtplay|oppetarkiv)\.se/video/(?P[0-9]+)' - _TESTS = [{ - 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final', - 'md5': 'ade3def0643fa1c40587a422f98edfd9', - 'info_dict': { - 'id': '2609989', - 'ext': 'flv', - 'title': 'SM veckan vinter, Örebro - Rally, final', - 'duration': 4500, - 'thumbnail': 're:^https?://.*[\.-]jpg$', - 'age_limit': 0, - }, - }, { - 'url': 'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', - 'md5': 'c3101a17ce9634f4c1f9800f0746c187', - 'info_dict': { - 'id': '1058509', - 'ext': 'flv', - 
'title': 'Farlig kryssning', - 'duration': 2566, - 'thumbnail': 're:^https?://.*[\.-]jpg$', - 'age_limit': 0, - }, - 'skip': 'Only works from Sweden', - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - host = mobj.group('host') - - info = self._download_json( - 'http://www.%s.se/video/%s?output=json' % (host, video_id), video_id) +class SVTBaseIE(InfoExtractor): + def _extract_video(self, url, video_id): + info = self._download_json(url, video_id) title = info['context']['title'] thumbnail = info['context'].get('thumbnailImage') @@ -80,3 +48,63 @@ class SVTPlayIE(InfoExtractor): 'duration': duration, 'age_limit': age_limit, } + + +class SVTIE(SVTBaseIE): + _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P\d+)&.*?\barticleId=(?P\d+)' + _TEST = { + 'url': 'http://www.svt.se/wd?widgetId=23991§ionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false', + 'md5': '9648197555fc1b49e3dc22db4af51d46', + 'info_dict': { + 'id': '2900353', + 'ext': 'flv', + 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', + 'duration': 27, + 'age_limit': 0, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + widget_id = mobj.group('widget_id') + article_id = mobj.group('id') + return self._extract_video( + 'http://www.svt.se/wd?widgetId=%s&articleId=%s&format=json&type=embed&output=json' % (widget_id, article_id), + article_id) + + +class SVTPlayIE(SVTBaseIE): + IE_DESC = 'SVT Play and Öppet arkiv' + _VALID_URL = r'https?://(?:www\.)?(?Psvtplay|oppetarkiv)\.se/video/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.svtplay.se/video/2609989/sm-veckan/sm-veckan-rally-final-sasong-1-sm-veckan-rally-final', + 'md5': 'ade3def0643fa1c40587a422f98edfd9', + 'info_dict': { + 'id': '2609989', + 'ext': 'flv', + 'title': 'SM veckan vinter, Örebro - Rally, final', + 'duration': 4500, + 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'age_limit': 0, + }, + }, { + 'url': 
'http://www.oppetarkiv.se/video/1058509/rederiet-sasong-1-avsnitt-1-av-318', + 'md5': 'c3101a17ce9634f4c1f9800f0746c187', + 'info_dict': { + 'id': '1058509', + 'ext': 'flv', + 'title': 'Farlig kryssning', + 'duration': 2566, + 'thumbnail': 're:^https?://.*[\.-]jpg$', + 'age_limit': 0, + }, + 'skip': 'Only works from Sweden', + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') + return self._extract_video( + 'http://www.%s.se/video/%s?output=json' % (host, video_id), + video_id) From 322915014f0378e2675a2a17cd67fe89a6e6a7d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 May 2015 00:13:40 +0600 Subject: [PATCH 0038/2145] [svtplay] Rename to svt --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{svtplay.py => svt.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename youtube_dl/extractor/{svtplay.py => svt.py} (100%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 79236c6c1..0a18dba5c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -505,7 +505,7 @@ from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE from .sunporno import SunPornoIE -from .svtplay import ( +from .svt import ( SVTIE, SVTPlayIE, ) diff --git a/youtube_dl/extractor/svtplay.py b/youtube_dl/extractor/svt.py similarity index 100% rename from youtube_dl/extractor/svtplay.py rename to youtube_dl/extractor/svt.py From bab19a8e91153705d6600fe1d1a0b0aa0bf93bb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 May 2015 00:23:35 +0600 Subject: [PATCH 0039/2145] [extractor/generic] Add support for svt embeds (Closes #5622) --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/svt.py | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py 
index cd7c47d6d..046bcb0f0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -37,6 +37,7 @@ from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE from .bliptv import BlipTVIE +from .svt import SVTIE class GenericIE(InfoExtractor): @@ -1091,6 +1092,11 @@ class GenericIE(InfoExtractor): if bliptv_url: return self.url_result(bliptv_url, 'BlipTV') + # Look for SVT player + svt_url = SVTIE._extract_url(webpage) + if svt_url: + return self.url_result(svt_url, 'SVT') + # Look for embedded condenast player matches = re.findall( r' Date: Sat, 9 May 2015 00:27:37 +0600 Subject: [PATCH 0040/2145] [extractor/generic] Add test for svt embed --- youtube_dl/extractor/generic.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 046bcb0f0..7c38bce7c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -659,6 +659,17 @@ class GenericIE(InfoExtractor): 'title': 'Facebook Creates "On This Day" | Crunch Report', }, }, + # SVT embed + { + 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun', + 'info_dict': { + 'id': '2900353', + 'ext': 'flv', + 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)', + 'duration': 27, + 'age_limit': 0, + }, + }, # RSS feed with enclosure { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', From de765f6c3188802bb2dea704a645f539fa61c8aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 May 2015 02:15:51 +0600 Subject: [PATCH 0041/2145] [foxsports] Support some more URLs (#5611) --- youtube_dl/extractor/foxsports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index 363866b64..df7665176 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -5,7 
+5,7 @@ from ..utils import smuggle_url class FoxSportsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?foxsports\.com/video\?vid=(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P[^/]+)' _TEST = { 'url': 'http://www.foxsports.com/video?vid=432609859715', From 3dbec410a0e195036025aa3a3792932783f371d2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 May 2015 13:19:54 +0800 Subject: [PATCH 0042/2145] [sohu] Enhance error handling --- youtube_dl/extractor/sohu.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index f8a4840f7..13b9e9133 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,7 +8,10 @@ from ..compat import ( compat_str, compat_urllib_request ) -from ..utils import sanitize_url_path_consecutive_slashes +from ..utils import ( + sanitize_url_path_consecutive_slashes, + ExtractorError, +) class SohuIE(InfoExtractor): @@ -117,6 +120,15 @@ class SohuIE(InfoExtractor): r'var vid ?= ?["\'](\d+)["\']', webpage, 'video path') vid_data = _fetch_data(vid, mytv) + if vid_data['play'] != 1: + if vid_data.get('status') == 12: + raise ExtractorError( + 'Sohu said: There\'s something wrong in the video.', + expected=True) + else: + raise ExtractorError( + 'Sohu said: The video is only licensed to users in Mainland China.', + expected=True) formats_json = {} for format_id in ('nor', 'high', 'super', 'ori', 'h2644k', 'h2654k'): From 32060c6d6b618fa858b2ce43db34d02fd43bc542 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 May 2015 13:54:28 +0800 Subject: [PATCH 0043/2145] [sohu] Update extractor The original extraction logic always fails for all test videos --- youtube_dl/extractor/sohu.py | 44 ++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 13b9e9133..eab4adfca 100644 --- 
a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,10 +8,7 @@ from ..compat import ( compat_str, compat_urllib_request ) -from ..utils import ( - sanitize_url_path_consecutive_slashes, - ExtractorError, -) +from ..utils import ExtractorError class SohuIE(InfoExtractor): @@ -31,7 +28,7 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': '699060e75cf58858dd47fb9c03c42cfb', + 'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', 'info_dict': { 'id': '409385080', 'ext': 'mp4', @@ -39,7 +36,7 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '9bf34be48f2f4dadcb226c74127e203c', + 'md5': '49308ff6dafde5ece51137d04aec311e', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -53,7 +50,7 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': 'bdbfb8f39924725e6589c146bc1883ad', + 'md5': '492923eac023ba2f13ff69617c32754a', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -61,7 +58,7 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', + 'md5': 'de604848c0e8e9c4a4dde7e1347c0637', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -69,7 +66,7 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '8407e634175fdac706766481b9443450', + 'md5': '93584716ee0657c0b205b8aa3d27aa13', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', @@ -144,24 +141,21 @@ class SohuIE(InfoExtractor): for i in range(part_count): formats = [] for format_id, format_data in formats_json.items(): - allot = format_data['allot'] - prot = format_data['prot'] - data = format_data['data'] - clips_url = data['clipsURL'] - su = data['su'] - part_str = self._download_webpage( - 'http://%s/?prot=%s&file=%s&new=%s' % - (allot, prot, clips_url[i], su[i]), - video_id, - 'Downloading %s video URL part %d of %d' - % (format_id, i + 1, part_count)) - - 
part_info = part_str.split('|') - - video_url = sanitize_url_path_consecutive_slashes( - '%s%s?key=%s' % (part_info[0], su[i], part_info[3])) + # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable + # so retry until got a working URL + video_url = 'newflv.sohu.ccgslb.net' + retries = 0 + while 'newflv.sohu.ccgslb.net' in video_url and retries < 5: + download_note = 'Download information from CDN gateway for format ' + format_id + if retries > 0: + download_note += ' (retry #%d)' % retries + retries += 1 + cdn_info = self._download_json( + 'http://data.vod.itc.cn/cdnList?new=' + data['su'][i], + video_id, download_note) + video_url = cdn_info['url'] formats.append({ 'url': video_url, From 6d14d08e062ff3d6e0fd17f04cb341099097902c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 May 2015 17:36:07 +0800 Subject: [PATCH 0044/2145] [yam] Fix title and uploader id --- youtube_dl/extractor/yam.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 19f8762ae..9d851bae3 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, month_by_abbreviation, ExtractorError, + get_element_by_attribute, ) @@ -23,6 +24,7 @@ class YamIE(InfoExtractor): 'id': '2283921', 'ext': 'mp3', 'title': '發現 - 趙薇 京華煙雲主題曲', + 'description': '發現 - 趙薇 京華煙雲主題曲', 'uploader_id': 'princekt', 'upload_date': '20080807', 'duration': 313.0, @@ -55,6 +57,17 @@ class YamIE(InfoExtractor): 'ext': 'mp4', }, 'skip': 'invalid YouTube URL', + }, { + 'url': 'http://mymedia.yam.com/m/2373534', + 'md5': '7ff74b91b7a817269d83796f8c5890b1', + 'info_dict': { + 'id': '2373534', + 'ext': 'mp3', + 'title': '林俊傑&蔡卓妍-小酒窩', + 'description': 'md5:904003395a0fcce6cfb25028ff468420', + 'upload_date': '20080928', + 'uploader_id': 'onliner2', + } }] def _real_extract(self, url): @@ -75,15 +88,19 @@ class YamIE(InfoExtractor): if youtube_url: 
return self.url_result(youtube_url, 'Youtube') + title = self._html_search_regex( + r']+class="heading"[^>]*>\s*(.+)\s*', page, 'title') + api_page = self._download_webpage( 'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id, note='Downloading API page') api_result_obj = compat_urlparse.parse_qs(api_page) + info_table = get_element_by_attribute('class', 'info', page) uploader_id = self._html_search_regex( - r':[\n ]+(?P[A-Z][a-z]{2}) ' + + r':[\n ]+(?P[A-Z][a-z]{2})\s+' + r'(?P\d{1,2}), (?P\d{4})', page) if mobj: upload_date = '%s%02d%02d' % ( @@ -97,7 +114,8 @@ class YamIE(InfoExtractor): return { 'id': video_id, 'url': api_result_obj['mp3file'][0], - 'title': self._html_search_meta('description', page), + 'title': title, + 'description': self._html_search_meta('description', page), 'duration': duration, 'uploader_id': uploader_id, 'upload_date': upload_date, From d39e0f05db226ef5691f5730d40da796aec6bac6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 May 2015 17:37:39 +0800 Subject: [PATCH 0045/2145] [utils] Remove sanitize_url_path_consecutive_slashes() This function is used only in SohuIE, which is updated to use a new extraction logic. 
--- test/test_utils.py | 21 --------------------- youtube_dl/utils.py | 7 ------- 2 files changed, 28 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 032d3656a..86b110a7d 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -40,7 +40,6 @@ from youtube_dl.utils import ( read_batch_urls, sanitize_filename, sanitize_path, - sanitize_url_path_consecutive_slashes, prepend_extension, replace_extension, shell_quote, @@ -176,26 +175,6 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') - def test_sanitize_url_path_consecutive_slashes(self): - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/foo//bar/filename.html'), - 'http://hostname/foo/bar/filename.html') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname//foo/bar/filename.html'), - 'http://hostname/foo/bar/filename.html') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname//'), - 'http://hostname/') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/foo/bar/filename.html'), - 'http://hostname/foo/bar/filename.html') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/'), - 'http://hostname/') - self.assertEqual( - sanitize_url_path_consecutive_slashes('http://hostname/abc//'), - 'http://hostname/abc/') - def test_prepend_extension(self): self.assertEqual(prepend_extension('abc.ext', 'temp'), 'abc.temp.ext') self.assertEqual(prepend_extension('abc.ext', 'temp', 'ext'), 'abc.temp.ext') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index de09b53b2..d73efcf25 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -327,13 +327,6 @@ def sanitize_path(s): return os.path.join(*sanitized_path) -def sanitize_url_path_consecutive_slashes(url): - """Collapses consecutive slashes in URLs' path""" - parsed_url = list(compat_urlparse.urlparse(url)) - parsed_url[2] = 
re.sub(r'/{2,}', '/', parsed_url[2]) - return compat_urlparse.urlunparse(parsed_url) - - def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] From 5c0b2c16a80c509dbcee48f48da3de0bf9912cda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 9 May 2015 12:34:45 +0200 Subject: [PATCH 0046/2145] [vgtv] Escape '#' in _VALID_URL and remove empty newlines at the end In verbose mode, '#' is interpreted as the start of a comment. --- youtube_dl/extractor/vgtv.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index db7a4bdb1..eb2652fb1 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -17,7 +17,7 @@ class VGTVIE(InfoExtractor): (?Pvgtv|bt) (?: :| - \.no/(?:tv/)?#!/(?:video|live)/ + \.no/(?:tv/)?\#!/(?:video|live)/ ) (?P[0-9]+) ''' @@ -188,5 +188,3 @@ class BTVestlendingenIE(InfoExtractor): def _real_extract(self, url): return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') - - From 3b5f65a64c06859cdee0b93f319c80d5c116cedd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 9 May 2015 12:41:56 +0200 Subject: [PATCH 0047/2145] [mlb] Fix extraction of articles And move test from generic, since it's directly handled by MLBIE --- youtube_dl/extractor/generic.py | 13 ------------- youtube_dl/extractor/mlb.py | 14 +++++++++++++- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7c38bce7c..3d756e848 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -414,19 +414,6 @@ class GenericIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, - # MLB articles - { - 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 
'b190e70141fb9a1552a85426b4da1b5d', - 'info_dict': { - 'id': '75609783', - 'ext': 'mp4', - 'title': 'Must C: Pillar climbs for catch', - 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429124820, - 'upload_date': '20150415', - } - }, # Wistia embed { 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 4e054fb53..40c9ecb35 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -82,6 +82,18 @@ class MLBIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, + { + 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', + 'md5': 'b190e70141fb9a1552a85426b4da1b5d', + 'info_dict': { + 'id': '75609783', + 'ext': 'mp4', + 'title': 'Must C: Pillar climbs for catch', + 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', + 'timestamp': 1429124820, + 'upload_date': '20150415', + } + }, { 'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb', 'only_matching': True, @@ -113,7 +125,7 @@ class MLBIE(InfoExtractor): video_path = mobj.group('path') webpage = self._download_webpage(url, video_path) video_id = self._search_regex( - r'data-videoid="(\d+)"', webpage, 'video id') + r'data-video-?id="(\d+)"', webpage, 'video id') detail = self._download_xml( 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' From d592b42f5c37c4b0d9f1587dd76f225b0287959f Mon Sep 17 00:00:00 2001 From: blissland Date: Sat, 9 May 2015 15:26:00 +0100 Subject: [PATCH 0048/2145] Updated two tests for BRIE --- youtube_dl/extractor/br.py | 30 
+++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 45ba51732..04a3ecd17 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -16,27 +16,27 @@ class BRIE(InfoExtractor): _TESTS = [ { - 'url': 'http://www.br.de/mediathek/video/sendungen/heimatsound/heimatsound-festival-2014-trailer-100.html', - 'md5': '93556dd2bcb2948d9259f8670c516d59', + 'url': 'http://www.br.de/mediathek/video/sendungen/abendschau/betriebliche-altersvorsorge-104.html', + 'md5': '83a0477cf0b8451027eb566d88b51106', 'info_dict': { - 'id': '25e279aa-1ffd-40fd-9955-5325bd48a53a', + 'id': '48f656ef-287e-486f-be86-459122db22cc', 'ext': 'mp4', - 'title': 'Wenn das Traditions-Theater wackelt', - 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', - 'duration': 34, - 'uploader': 'BR', - 'upload_date': '20140802', + 'title': 'Die böse Überraschung', + 'description': 'Betriebliche Altersvorsorge: Die böse Überraschung', + 'duration': 180, + 'uploader': 'Reinhard Weber', + 'upload_date': '20150422', } }, { - 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html', - 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820', + 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', + 'md5': 'a44396d73ab6a68a69a568fae10705bb', 'info_dict': { - 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab', - 'ext': 'aac', - 'title': '"Keine neuen Schulden im nächsten Jahr"', - 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"', - 'duration': 64, + 'id': 'a4b83e34-123d-4b81-9f4e-c0d3121a4e05', + 'ext': 'mp4', + 'title': 'Manfred Schreiber ist tot', + 'description': 'Abendschau kompakt: Manfred Schreiber ist tot', + 'duration': 26, } }, { From 0892090a56e04726175c247d13ecce7f6c9cb839 Mon Sep 17 00:00:00 2001 From: blissland Date: Sat, 9 May 2015 16:02:07 +0100 Subject: [PATCH 0049/2145] 
Added audio test for BRIE --- youtube_dl/extractor/br.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 04a3ecd17..66e394e10 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -39,6 +39,17 @@ class BRIE(InfoExtractor): 'duration': 26, } }, + { + 'url': 'http://www.br.de/radio/br-klassik/sendungen/allegro/premiere-urauffuehrung-the-land-2015-dance-festival-muenchen-100.html', + 'md5': '8b5b27c0b090f3b35eac4ab3f7a73d3d', + 'info_dict': { + 'id': '74c603c9-26d3-48bb-b85b-079aeed66e0b', + 'ext': 'aac', + 'title': 'Kurzweilig und sehr bewegend', + 'description': '"The Land" von Peeping Tom: Kurzweilig und sehr bewegend', + 'duration': 296, + } + }, { 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', From 32fffff2ccc044c639c8723281981aa347423762 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 May 2015 21:19:09 +0600 Subject: [PATCH 0050/2145] [eroprofile] Fix video URL extraction (Closes #5657) --- youtube_dl/extractor/eroprofile.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 0cbca90b0..316033cf1 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -4,7 +4,10 @@ import re from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + unescapeHTML +) class EroProfileIE(InfoExtractor): @@ -75,8 +78,8 @@ class EroProfileIE(InfoExtractor): [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], webpage, 'video id', default=None) - video_url = self._search_regex( - r'([^<]+)', webpage, 'title') thumbnail = self._search_regex( From f2e0056579ac507b776ce2c86b5281fc28bbc275 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 May 2015 21:23:09 +0600 Subject: [PATCH 0051/2145] [vgtv] Avoid duplicate format_id --- youtube_dl/extractor/vgtv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index eb2652fb1..e6ee1e471 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -103,11 +103,14 @@ class VGTVIE(InfoExtractor): hls_url = streams.get('hls') if hls_url: - formats.extend(self._extract_m3u8_formats(hls_url, video_id, 'mp4')) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', m3u8_id='hls')) hds_url = streams.get('hds') if hds_url: - formats.extend(self._extract_f4m_formats(hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id)) + formats.extend(self._extract_f4m_formats( + hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds')) mp4_url = streams.get('mp4') if mp4_url: From 480065172d4c97f00973b3f0bf24cd1b8e567627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 00:26:42 +0600 Subject: [PATCH 0052/2145] [lifenews] Add support for video URLs (Closes #5660) --- youtube_dl/extractor/lifenews.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 081016b80..92031e843 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -14,7 +14,7 @@ from ..utils import ( class LifeNewsIE(InfoExtractor): IE_NAME = 'lifenews' IE_DESC = 'LIFE | NEWS' - _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?news/(?P\d+)' + _VALID_URL = r'http://lifenews\.ru/(?:mobile/)?(?P
news|video)/(?P\d+)' _TESTS = [{ 'url': 'http://lifenews.ru/news/126342', @@ -55,12 +55,15 @@ class LifeNewsIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + section = mobj.group('section') - webpage = self._download_webpage('http://lifenews.ru/news/%s' % video_id, video_id, 'Downloading page') + webpage = self._download_webpage( + 'http://lifenews.ru/%s/%s' % (section, video_id), + video_id, 'Downloading page') videos = re.findall(r'[^"]+)".*?src="(?P', webpage) iframe_link = self._html_search_regex( - ']+src="([^"]+)', webpage, 'iframe link', default=None) + ']+src=["\']([^"\']+)["\']', webpage, 'iframe link', default=None) if not videos and not iframe_link: raise ExtractorError('No media links available for %s' % video_id) From 057ebeaca3da40b901b2592e2302a0d4bbab48e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 00:27:49 +0600 Subject: [PATCH 0053/2145] [lifenews] Add test for #5660 --- youtube_dl/extractor/lifenews.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 92031e843..7d5b8621b 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -50,6 +50,9 @@ class LifeNewsIE(InfoExtractor): 'upload_date': '20150505', 'uploader': 'embed.life.ru', } + }, { + 'url': 'http://lifenews.ru/video/13035', + 'only_matching': True, }] def _real_extract(self, url): From 754270313a2b337eda98fa95232bd1064e294173 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 01:03:26 +0600 Subject: [PATCH 0054/2145] [life:embed] Move to separated extractor and extract m3u8 formats --- youtube_dl/extractor/__init__.py | 5 +++- youtube_dl/extractor/lifenews.py | 47 ++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 
0a18dba5c..f117578a2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -258,7 +258,10 @@ from .letv import ( LetvPlaylistIE ) from .libsyn import LibsynIE -from .lifenews import LifeNewsIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 7d5b8621b..7f39fa4cf 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( + determine_ext, int_or_none, unified_strdate, ExtractorError, @@ -119,3 +121,48 @@ class LifeNewsIE(InfoExtractor): return make_entry(video_id, videos[0]) else: return [make_entry(video_id, media, video_number + 1) for video_number, media in enumerate(videos)] + + +class LifeEmbedIE(InfoExtractor): + IE_NAME = 'life:embed' + _VALID_URL = r'http://embed\.life\.ru/embed/(?P[\da-f]{32})' + + _TEST = { + 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', + 'md5': 'b889715c9e49cb1981281d0e5458fbbe', + 'info_dict': { + 'id': 'e50c2dec2867350528e2574c899b8291', + 'ext': 'mp4', + 'title': 'e50c2dec2867350528e2574c899b8291', + 'thumbnail': 're:http://.*\.jpg', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = [] + for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): + video_url = compat_urlparse.urljoin(url, video_url) + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='m3u8')) + else: + formats.append({ + 'url': video_url, + 'format_id': ext, + 'preference': 1, + }) + + thumbnail = self._search_regex( + r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) + + 
return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } From 69fe3a5f0961c0ae602da531d2b0fb3f11b9d7c9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 10 May 2015 01:05:24 +0200 Subject: [PATCH 0055/2145] release 2015.05.10 --- docs/supportedsites.md | 9 ++++++++- youtube_dl/version.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6d2e496a8..98b625380 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -64,6 +64,8 @@ - **BR**: Bayerischer Rundfunk Mediathek - **Break** - **Brightcove** + - **bt:article**: Bergens Tidende Articles + - **bt:vestlendingen**: Bergens Tidende - Vestlendingen - **BuzzFeed** - **BYUtv** - **Camdemy** @@ -240,6 +242,7 @@ - **LetvPlaylist** - **LetvTv** - **Libsyn** + - **life:embed** - **lifenews**: LIFE | NEWS - **LiveLeak** - **livestream** @@ -328,6 +331,7 @@ - **ntv.ru** - **Nuvid** - **NYTimes** + - **NYTimesArticle** - **ocw.mit.edu** - **Odnoklassniki** - **OktoberfestTV** @@ -434,6 +438,7 @@ - **southpark.cc.com** - **southpark.cc.com:español** - **southpark.de** + - **southpark.nl** - **Space** - **SpankBang** - **Spankwire** @@ -453,6 +458,7 @@ - **StreamCZ** - **StreetVoice** - **SunPorno** + - **SVT** - **SVTPlay**: SVT Play and Öppet arkiv - **SWRMediathek** - **Syfy** @@ -529,7 +535,7 @@ - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - - **VGTV** + - **VGTV**: VGTV and BTTV - **vh1.com** - **Vice** - **Viddler** @@ -587,6 +593,7 @@ - **XHamster** - **XMinus** - **XNXX** + - **Xstream** - **XTube** - **XTubeUser**: XTube user profile - **Xuite** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a5a81bcd2..83c5a1659 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.05.04' +__version__ = '2015.05.10' From 3800b908b1976f242b41d5a2d114418559ce3b48 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 06:14:34 +0600 Subject: [PATCH 0056/2145] [mlb] Fix #5663 --- youtube_dl/extractor/mlb.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 40c9ecb35..e242b897f 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -12,13 +12,13 @@ from ..utils import ( class MLBIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/ + (?:[\da-z_-]+\.)*mlb\.com/ (?: (?: (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| - [^/]+/video/play\.jsp + (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) (?Pn?\d+)| @@ -114,6 +114,10 @@ class MLBIE(InfoExtractor): # From http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer 'url': 'http://mlb.mlb.com/shared/video/embed/m-internal-embed.html?content_id=75609783&property=mlb&autoplay=true&hashmode=false&siteSection=mlb/multimedia/article_118550098/article_embed&club=mlb', 'only_matching': True, + }, + { + 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', + 'only_matching': True, } ] @@ -125,7 +129,7 @@ class MLBIE(InfoExtractor): video_path = mobj.group('path') webpage = self._download_webpage(url, video_path) video_id = self._search_regex( - r'data-video-?id="(\d+)"', webpage, 'video id') + [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') detail = self._download_xml( 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' From c6ddbdb66c5d6ead5e198013c54ef53d641063f1 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 12:30:07 +1200 Subject: [PATCH 0057/2145] [voicerepublic] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/voicerepublic.py | 55 +++++++++++++++++++++++++++ 
2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/voicerepublic.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f117578a2..5cb3c304d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -634,6 +634,7 @@ from .vk import ( VKUserVideosIE, ) from .vodlocker import VodlockerIE +from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py new file mode 100644 index 000000000..1a90693cb --- /dev/null +++ b/youtube_dl/extractor/voicerepublic.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import ( + compat_urllib_request, +) + + +class VoiceRepublicIE(InfoExtractor): + _VALID_URL = r'https?://voicerepublic\.com/talks/(?P[0-9a-z-]+)' + _TEST = { + 'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', + 'md5': '0554a24d1657915aa8e8f84e15dc9353', + 'info_dict': { + 'id': '2296', + 'ext': 'm4a', + 'title': 'Watching the Watchers: Building a Sousveillance State', + 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', + 'description': 'md5:715ba964958afa2398df615809cfecb1', + 'creator': 'M. C. McGrath', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + req = compat_urllib_request.Request(url) + # Older versions of Firefox get redirected to an "upgrade browser" page + req.add_header('User-Agent', 'youtube-dl') + webpage = self._download_webpage(req, display_id) + thumbnail = self._og_search_thumbnail(webpage) + video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') + + if '
', webpage, 'author', fatal=False), + } From 95eb1adda8692a61db639fb21344ad22d1847044 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 08:54:50 +0600 Subject: [PATCH 0058/2145] [life:embed] Sort formats --- youtube_dl/extractor/lifenews.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 7f39fa4cf..42cb6e35f 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -156,6 +156,7 @@ class LifeEmbedIE(InfoExtractor): 'format_id': ext, 'preference': 1, }) + self._sort_formats(formats) thumbnail = self._search_regex( r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) From f900dc3fb9e17e399b0f33925ee239696cc46010 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:01:58 +1200 Subject: [PATCH 0059/2145] [voicerepublic] Extract author using _html_search_meta --- youtube_dl/extractor/voicerepublic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 1a90693cb..7d255d6fa 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -51,5 +51,5 @@ class VoiceRepublicIE(InfoExtractor): 'url': self._og_search_url(webpage), 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'creator': self._search_regex(r'', webpage, 'author', fatal=False), + 'creator': self._html_search_meta('author', webpage), } From 03f760b1c0478c1f65cf6e978d7592be46873313 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:40:09 +1200 Subject: [PATCH 0060/2145] [voicerepublic] Remove creator field --- youtube_dl/extractor/voicerepublic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 7d255d6fa..960974e16 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ 
b/youtube_dl/extractor/voicerepublic.py @@ -19,7 +19,6 @@ class VoiceRepublicIE(InfoExtractor): 'title': 'Watching the Watchers: Building a Sousveillance State', 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', 'description': 'md5:715ba964958afa2398df615809cfecb1', - 'creator': 'M. C. McGrath', } } @@ -51,5 +50,4 @@ class VoiceRepublicIE(InfoExtractor): 'url': self._og_search_url(webpage), 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), - 'creator': self._html_search_meta('author', webpage), } From f03a8a3c4ec4dc95164c12181ffc1ddcb7583ef6 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 15:12:29 +1200 Subject: [PATCH 0061/2145] [voicerepublic] Raise ExtractorError if audio is still being processed --- youtube_dl/extractor/voicerepublic.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 960974e16..d3e35a815 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -2,10 +2,8 @@ from __future__ import unicode_literals from .common import InfoExtractor - -from ..compat import ( - compat_urllib_request, -) +from ..compat import compat_urllib_request +from ..utils import ExtractorError class VoiceRepublicIE(InfoExtractor): @@ -31,17 +29,16 @@ class VoiceRepublicIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') - if '
Queued for processing, please stand by...' in webpage: + raise ExtractorError('Audio is still queued for processing') + + formats = [{ + 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + } for ext in ['m4a', 'mp3', 'ogg']] + self._sort_formats(formats) return { 'id': video_id, From 28ebef0b1b1b7b97137fbd8e093c09cb51954606 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 16:03:09 +1200 Subject: [PATCH 0062/2145] [voicerepublic] Detect list of available formats from the web page --- youtube_dl/extractor/voicerepublic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index d3e35a815..d150b5b5e 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ExtractorError @@ -32,12 +34,15 @@ class VoiceRepublicIE(InfoExtractor): if 'Queued for processing, please stand by...' 
in webpage: raise ExtractorError('Audio is still queued for processing') + ext_matches = re.finditer(r'data-\w+=\'/vrmedia/\d+-clean\.(\w+)\'', webpage) + exts = [match.group(1) for match in ext_matches] + formats = [{ 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), 'ext': ext, 'format_id': ext, 'vcodec': 'none', - } for ext in ['m4a', 'mp3', 'ogg']] + } for ext in exts] self._sort_formats(formats) return { From 1dcb52188d3709711b3ea5ae1ff6bdb985e79c62 Mon Sep 17 00:00:00 2001 From: Duncan Date: Sun, 10 May 2015 16:38:26 +1200 Subject: [PATCH 0063/2145] [voicerepublic] Remove hardcoded paths to media files --- youtube_dl/extractor/voicerepublic.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index d150b5b5e..a3e40b940 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -34,15 +34,12 @@ class VoiceRepublicIE(InfoExtractor): if 'Queued for processing, please stand by...' in webpage: raise ExtractorError('Audio is still queued for processing') - ext_matches = re.finditer(r'data-\w+=\'/vrmedia/\d+-clean\.(\w+)\'', webpage) - exts = [match.group(1) for match in ext_matches] - formats = [{ - 'url': 'https://voicerepublic.com/vrmedia/{}-clean.{}'.format(video_id, ext), + 'url': 'https://voicerepublic.com' + path, 'ext': ext, 'format_id': ext, 'vcodec': 'none', - } for ext in exts] + } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)] self._sort_formats(formats) return { From a909e6ad43f9d9661691739a810d7b8853e17175 Mon Sep 17 00:00:00 2001 From: ping Date: Sun, 10 May 2015 15:27:55 +0800 Subject: [PATCH 0064/2145] [dailymotion] Patch upload_date detection. 
(closes #5665) --- youtube_dl/extractor/dailymotion.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index aa595af20..db10b8d00 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -52,6 +52,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'ext': 'mp4', 'uploader': 'IGN', 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'upload_date': '20150306', } }, # Vevo video @@ -106,9 +107,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): age_limit = self._rta_search(webpage) video_upload_date = None - mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) + mobj = re.search(r'', webpage) if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) + video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) embed_url = 'https://www.dailymotion.com/embed/video/%s' % video_id embed_request = self._build_request(embed_url) From 1934f3a0eaf16ae1d1644178b7128806b8629867 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 10 May 2015 18:22:07 +0800 Subject: [PATCH 0065/2145] [ndr] Extended to support n-joy.de as well (closes #4527) According to http://en.wikipedia.org/wiki/N-Joy, n-joy.de is a service hosted by NDR, so I put them together. --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/ndr.py | 90 ++++++++++++++++++++------------ 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f117578a2..66adb4de7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -324,7 +324,10 @@ from .nbc import ( NBCSportsIE, NBCSportsVPlayerIE, ) -from .ndr import NDRIE +from .ndr import ( + NDRIE, + NJoyIE, +) from .ndtv import NDTVIE from .netzkino import NetzkinoIE from .nerdcubed import NerdCubedFeedIE diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index f49c66690..afb9eda27 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -8,41 +8,11 @@ from ..utils import ( ExtractorError, int_or_none, qualities, + parse_duration, ) -class NDRIE(InfoExtractor): - IE_NAME = 'ndr' - IE_DESC = 'NDR.de - Mediathek' - _VALID_URL = r'https?://www\.ndr\.de/.+?(?P\d+)\.html' - - _TESTS = [ - { - 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', - 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', - 'note': 'Video file', - 'info_dict': { - 'id': '25866', - 'ext': 'mp4', - 'title': 'Kartoffeltage in der Lewitz', - 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', - 
'duration': 166, - } - }, - { - 'url': 'http://www.ndr.de/info/audio51535.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'note': 'Audio file', - 'info_dict': { - 'id': '51535', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', - 'duration': 884, - } - } - ] - +class NDRBaseIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -54,7 +24,11 @@ class NDRIE(InfoExtractor): if description: description = description.strip() - duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', fatal=False)) + duration = int_or_none(self._html_search_regex(r'duration: (\d+),\n', page, 'duration', default=None)) + if not duration: + duration = parse_duration(self._html_search_regex( + r'(\d+:\d+)', + page, 'duration', default=None)) formats = [] @@ -92,3 +66,53 @@ class NDRIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class NDRIE(NDRBaseIE): + IE_NAME = 'ndr' + IE_DESC = 'NDR.de - Mediathek' + _VALID_URL = r'https?://www\.ndr\.de/.+?(?P\d+)\.html' + + _TESTS = [ + { + 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', + 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', + 'note': 'Video file', + 'info_dict': { + 'id': '25866', + 'ext': 'mp4', + 'title': 'Kartoffeltage in der Lewitz', + 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', + 'duration': 166, + } + }, + { + 'url': 'http://www.ndr.de/info/audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'note': 'Audio file', + 'info_dict': { + 'id': '51535', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'duration': 884, + } + } + ] + + +class NJoyIE(NDRBaseIE): + IE_NAME = 'N-JOY' + _VALID_URL = r'https?://www\.n-joy\.de/.+?(?P\d+)\.html' + + _TEST = { + 'url': 
'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', + 'md5': 'cb63be60cd6f9dd75218803146d8dc67', + 'info_dict': { + 'id': '2480', + 'ext': 'mp4', + 'title': 'Benaissa beim NDR Comedy Contest', + 'description': 'Von seinem sehr "behaarten" Leben lässt sich Benaissa trotz aller Schwierigkeiten nicht unterkriegen.', + 'duration': 654, + } + } From 63cbd19f500eb4d90c1fc7c09f04de5df43a6a04 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 10 May 2015 18:30:26 +0800 Subject: [PATCH 0066/2145] [ndr] Replace the 404 test case --- youtube_dl/extractor/ndr.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index afb9eda27..79a13958b 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -84,7 +84,19 @@ class NDRIE(NDRBaseIE): 'title': 'Kartoffeltage in der Lewitz', 'description': 'md5:48c4c04dde604c8a9971b3d4e3b9eaa8', 'duration': 166, - } + }, + 'skip': '404 Not found', + }, + { + 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': 'dadc003c55ae12a5d2f6bd436cd73f59', + 'info_dict': { + 'id': '988', + 'ext': 'mp4', + 'title': 'Party, Pötte und Parade', + 'description': 'Hunderttausende feiern zwischen Speicherstadt und St. Pauli den 826. Hafengeburtstag. 
Die NDR Sondersendung zeigt die schönsten und spektakulärsten Bilder vom Auftakt.', + 'duration': 3498, + }, }, { 'url': 'http://www.ndr.de/info/audio51535.html', From a6762c4a22325b5b69770de82df8725d2eb5c3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 18:29:15 +0600 Subject: [PATCH 0067/2145] [voicerepublic] Make more robust and extract more metadata --- youtube_dl/extractor/voicerepublic.py | 95 ++++++++++++++++++++------- 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index a3e40b940..1106c655b 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,52 +1,99 @@ -# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_request -from ..utils import ExtractorError +from ..compat import ( + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, +) class VoiceRepublicIE(InfoExtractor): - _VALID_URL = r'https?://voicerepublic\.com/talks/(?P[0-9a-z-]+)' - _TEST = { - 'url': 'https://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', + _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P[0-9a-z-]+)' + _TESTS = [{ + 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state', 'md5': '0554a24d1657915aa8e8f84e15dc9353', 'info_dict': { 'id': '2296', + 'display_id': 'watching-the-watchers-building-a-sousveillance-state', 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', - 'thumbnail': 'https://voicerepublic.com/system/flyer/2296.png', 'description': 'md5:715ba964958afa2398df615809cfecb1', + 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', + 'duration': 1800, + 'view_count': int, } - } + }, { + 'url': 
'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) - req = compat_urllib_request.Request(url) + + req = compat_urllib_request.Request( + compat_urlparse.urljoin(url, '/talks/%s' % display_id)) # Older versions of Firefox get redirected to an "upgrade browser" page req.add_header('User-Agent', 'youtube-dl') webpage = self._download_webpage(req, display_id) - thumbnail = self._og_search_thumbnail(webpage) - video_id = self._search_regex(r'/(\d+)\.png', thumbnail, 'id') - if 'Queued for processing, please stand by...' in webpage: - raise ExtractorError('Audio is still queued for processing') + if '>Queued for processing, please stand by...<' in webpage: + raise ExtractorError( + 'Audio is still queued for processing', expected=True) - formats = [{ - 'url': 'https://voicerepublic.com' + path, - 'ext': ext, - 'format_id': ext, - 'vcodec': 'none', - } for ext, path in re.findall(r"data-([^=]+)='(/[^']+\.\1)'", webpage)] + data = self._parse_json( + self._search_regex( + r'(?s)return ({.+?});\s*\n', webpage, + 'data', default=None), + display_id, fatal=False) + + if data: + title = data['title'] + description = data.get('teaser') + talk_id = data.get('talk_id') or display_id + talk = data['talk'] + duration = int_or_none(talk.get('duration')) + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in talk['links'].items()] + else: + title = self._og_search_title(webpage) + description = self._html_search_regex( + r"(?s)
]*>(.+?)
", + webpage, 'description', fatal=False) + talk_id = self._search_regex( + [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], + webpage, 'talk id', default=None) or display_id + duration = None + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", webpage)] self._sort_formats(formats) + thumbnail = self._og_search_thumbnail(webpage) + view_count = int_or_none(self._search_regex( + r"class='play-count[^']*'>\s*(\d+) plays", + webpage, 'play count', fatal=False)) + return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'formats': formats, - 'url': self._og_search_url(webpage), + 'id': talk_id, + 'display_id': display_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, - 'description': self._og_search_description(webpage), + 'duration': duration, + 'view_count': view_count, + 'formats': formats, } From 370b39e8ece9f475d489eda721130eec9a9f15e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 May 2015 18:37:52 +0600 Subject: [PATCH 0068/2145] [voicerepublic] Fix fallback branch formats extraction --- youtube_dl/extractor/voicerepublic.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index 1106c655b..254383d6c 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -47,12 +47,10 @@ class VoiceRepublicIE(InfoExtractor): raise ExtractorError( 'Audio is still queued for processing', expected=True) - data = self._parse_json( - self._search_regex( - r'(?s)return ({.+?});\s*\n', webpage, - 'data', default=None), - display_id, fatal=False) - + config = self._search_regex( + r'(?s)return ({.+?});\s*\n', webpage, + 'data', default=None) + data = self._parse_json(config, display_id, 
fatal=False) if config else None if data: title = data['title'] description = data.get('teaser') @@ -74,12 +72,14 @@ class VoiceRepublicIE(InfoExtractor): [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], webpage, 'talk id', default=None) or display_id duration = None + player = self._search_regex( + r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') formats = [{ 'url': compat_urlparse.urljoin(url, talk_url), 'format_id': format_id, 'ext': determine_ext(talk_url) or format_id, 'vcodec': 'none', - } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", webpage)] + } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) From 95c5534f8ed016a81f715f291ab09c4ea2c3679c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 10 May 2015 17:41:11 +0200 Subject: [PATCH 0069/2145] ExecAfterDownloadPP, YoutubeDL: remove unused parameters --- youtube_dl/YoutubeDL.py | 1 - youtube_dl/__init__.py | 2 -- youtube_dl/postprocessor/execafterdownload.py | 3 +-- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d8583a8eb..4cf83c510 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -260,7 +260,6 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. 
- exec_cmd: Arbitrary command to run after downloading """ params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c88489f29..9cc9f851f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -246,7 +246,6 @@ def _real_main(argv=None): if opts.exec_cmd: postprocessors.append({ 'key': 'ExecAfterDownload', - 'verboseOutput': opts.verbose, 'exec_cmd': opts.exec_cmd, }) if opts.xattr_set_filesize: @@ -345,7 +344,6 @@ def _real_main(argv=None): 'default_search': opts.default_search, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, 'encoding': opts.encoding, - 'exec_cmd': opts.exec_cmd, 'extract_flat': opts.extract_flat, 'merge_output_format': opts.merge_output_format, 'postprocessors': postprocessors, diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py index 341437575..765fd8fe4 100644 --- a/youtube_dl/postprocessor/execafterdownload.py +++ b/youtube_dl/postprocessor/execafterdownload.py @@ -8,8 +8,7 @@ from ..utils import PostProcessingError class ExecAfterDownloadPP(PostProcessor): - def __init__(self, downloader=None, verboseOutput=None, exec_cmd=None): - self.verboseOutput = verboseOutput + def __init__(self, downloader=None, exec_cmd=None): self.exec_cmd = exec_cmd def run(self, information): From 69b46b3d956220e4b3a3d5eda55768753a67f19d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 10 May 2015 17:47:49 +0200 Subject: [PATCH 0070/2145] ExecAfterDownloadPP: fix __init__ method --- youtube_dl/postprocessor/execafterdownload.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py index 765fd8fe4..13794b7ba 100644 --- a/youtube_dl/postprocessor/execafterdownload.py +++ b/youtube_dl/postprocessor/execafterdownload.py @@ -8,7 +8,8 @@ from ..utils import PostProcessingError class 
ExecAfterDownloadPP(PostProcessor): - def __init__(self, downloader=None, exec_cmd=None): + def __init__(self, downloader, exec_cmd): + super(ExecAfterDownloadPP, self).__init__(downloader) self.exec_cmd = exec_cmd def run(self, information): From 70484b9f8ae629ccb87e8c0569f8f4bf2dfdb0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 00:26:39 +0600 Subject: [PATCH 0071/2145] [postprocessor/ffmpeg] Extract `check_outdated` method --- youtube_dl/postprocessor/ffmpeg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 214de39f9..211faf69a 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -36,7 +36,9 @@ class FFmpegPostProcessor(PostProcessor): def check_version(self): if not self.available: raise FFmpegPostProcessorError('ffmpeg or avconv not found. Please install one.') + self.check_outdated() + def check_outdated(self): required_version = '10-0' if self.basename == 'avconv' else '1.0' if is_outdated_version( self._versions[self.basename], required_version): @@ -44,6 +46,8 @@ class FFmpegPostProcessor(PostProcessor): self.basename, self.basename, required_version) if self._downloader: self._downloader.report_warning(warning) + return True + return False @staticmethod def get_versions(downloader=None): From 7fcb605b82796e79a5f559624808ca9404df1154 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 00:27:29 +0600 Subject: [PATCH 0072/2145] [YoutubeDL] Fallback to `-f best` when merger is outdated --- youtube_dl/YoutubeDL.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4cf83c510..7c3bdb964 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1086,9 +1086,10 @@ class YoutubeDL(object): if req_format is None: req_format_list = [] if (self.params.get('outtmpl', DEFAULT_OUTTMPL) 
!= '-' - and info_dict['extractor'] in ['youtube', 'ted'] - and FFmpegMergerPP(self).available): - req_format_list.append('bestvideo+bestaudio') + and info_dict['extractor'] in ['youtube', 'ted']): + merger = FFmpegMergerPP(self) + if merger.available and not merger.check_outdated(): + req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) formats_to_download = [] From 13763ce599c8fbba43e57d2d79a9b007cfbd4ced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 02:00:31 +0600 Subject: [PATCH 0073/2145] [postprocessor/ffmpeg] Add `can_merge` method --- youtube_dl/postprocessor/ffmpeg.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 211faf69a..cc65b34e7 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -36,9 +36,7 @@ class FFmpegPostProcessor(PostProcessor): def check_version(self): if not self.available: raise FFmpegPostProcessorError('ffmpeg or avconv not found. 
Please install one.') - self.check_outdated() - def check_outdated(self): required_version = '10-0' if self.basename == 'avconv' else '1.0' if is_outdated_version( self._versions[self.basename], required_version): @@ -46,8 +44,6 @@ class FFmpegPostProcessor(PostProcessor): self.basename, self.basename, required_version) if self._downloader: self._downloader.report_warning(warning) - return True - return False @staticmethod def get_versions(downloader=None): @@ -595,6 +591,23 @@ class FFmpegMergerPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) return info['__files_to_merge'], info + def can_merge(self): + # TODO: figure out merge-capable ffmpeg version + if self.basename != 'avconv': + return True + + required_version = '10-0' + if is_outdated_version( + self._versions[self.basename], required_version): + warning = ('Your copy of %s is outdated and unable to properly mux separate video and audio files, ' + 'youtube-dl will download single file media. 
' + 'Update %s to version %s or newer to fix this.') % ( + self.basename, self.basename, required_version) + if self._downloader: + self._downloader.report_warning(warning) + return False + return True + class FFmpegFixupStretchedPP(FFmpegPostProcessor): def run(self, info): From 97fcf1bbd07ae0c5b6e530dcf2623d199452a76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 02:01:16 +0600 Subject: [PATCH 0074/2145] [YoutubeDL] Check if merger can actually merge --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 7c3bdb964..00f86b342 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1088,7 +1088,7 @@ class YoutubeDL(object): if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and info_dict['extractor'] in ['youtube', 'ted']): merger = FFmpegMergerPP(self) - if merger.available and not merger.check_outdated(): + if merger.available and merger.can_merge(): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) From e41f450f2860ab5aa3f3a04bc646594c6dbc6714 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 11 May 2015 20:04:05 +0800 Subject: [PATCH 0075/2145] [tmz] Add support for articles (fixes #5477) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/tmz.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e808f2734..b376fd279 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -543,7 +543,10 @@ from .thesixtyone import TheSixtyOneIE from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE -from .tmz import TMZIE +from .tmz import ( + TMZIE, + TMZArticleIE, +) from .tnaflix import TNAFlixIE from .thvideo import ( THVideoIE, diff --git 
a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py index c5c6fdc51..7dbe68b5c 100644 --- a/youtube_dl/extractor/tmz.py +++ b/youtube_dl/extractor/tmz.py @@ -30,3 +30,31 @@ class TMZIE(InfoExtractor): 'description': self._og_search_description(webpage), 'thumbnail': self._html_search_meta('ThumbURL', webpage), } + + +class TMZArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/]+)/?' + _TEST = { + 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'md5': 'e482a414a38db73087450e3a6ce69d00', + 'info_dict': { + 'id': '0_6snoelag', + 'ext': 'mp4', + 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', + 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + embedded_video_info_str = self._html_search_regex( + r'tmzVideoEmbedV2\("([^)]+)"\);', webpage, 'embedded video info') + + embedded_video_info = self._parse_json( + embedded_video_info_str, video_id, + transform_source=lambda s: s.replace('\\', '')) + + return self.url_result( + 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) From 1f92865494c6efa1a0d5d90ffa849e85b80c8248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 21:05:39 +0600 Subject: [PATCH 0076/2145] [dumpert] Add cpc cookie (Closes #5672) --- youtube_dl/extractor/dumpert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py index 9c594b757..999fb5620 100644 --- a/youtube_dl/extractor/dumpert.py +++ b/youtube_dl/extractor/dumpert.py @@ -26,7 +26,7 @@ class DumpertIE(InfoExtractor): video_id = self._match_id(url) req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'nsfw=1') + 
req.add_header('Cookie', 'nsfw=1; cpc=10') webpage = self._download_webpage(req, video_id) files_base64 = self._search_regex( From 511565282861e26913caddc1bcc0c865a9eec786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 21:31:36 +0600 Subject: [PATCH 0077/2145] [zingmp3] Capture error message --- youtube_dl/extractor/zingmp3.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py index 1afbe68ed..7dc1e2f2b 100644 --- a/youtube_dl/extractor/zingmp3.py +++ b/youtube_dl/extractor/zingmp3.py @@ -4,12 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError class ZingMp3BaseInfoExtractor(InfoExtractor): - @staticmethod - def _extract_item(item): + def _extract_item(self, item): + error_message = item.find('./errormessage').text + if error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), + expected=True) + title = item.find('./title').text.strip() source = item.find('./source').text extension = item.attrib['type'] From 81ed3bb9c0edb9a11b43964459ef57cca5683461 Mon Sep 17 00:00:00 2001 From: rrooij Date: Mon, 11 May 2015 17:38:08 +0200 Subject: [PATCH 0078/2145] [southpark] Sort alphabetically --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b376fd279..3368edf7c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -489,8 +489,8 @@ from .soundgasm import ( ) from .southpark import ( SouthParkIE, - SouthParkEsIE, SouthParkDeIE, + SouthParkEsIE, SouthParkNlIE ) from .space import SpaceIE From 968ee176777a1bf4e33cfb849a7241b0ac45d254 Mon Sep 17 00:00:00 2001 From: rrooij Date: Mon, 11 May 2015 18:02:25 +0200 Subject: [PATCH 0079/2145] [southparkdk] Add extractor --- 
youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/southpark.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3368edf7c..de19dfd7a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -490,6 +490,7 @@ from .soundgasm import ( from .southpark import ( SouthParkIE, SouthParkDeIE, + SouthParkDkIE, SouthParkEsIE, SouthParkNlIE ) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 59e31198c..83e5a7659 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -57,3 +57,13 @@ class SouthParkNlIE(SouthParkIE): 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', 'playlist_count': 4, }] + +class SouthParkDkIE(SouthParkIE): + IE_NAME = 'southpark.dk' + _VALID_URL = r'https?://(?:www\.)?(?Psouthparkstudios\.dk/(?:clips|full-episodes)/(?P.+?)(\?|#|$))' + _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' + + _TESTS = [{ + 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', + 'playlist_count': 4, + }] From 6d3f5935e516760964052718e6b90324c6f07391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 May 2015 23:47:50 +0600 Subject: [PATCH 0080/2145] [southpark] Fix IE_NAME --- youtube_dl/extractor/southpark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 83e5a7659..7fb165a87 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -58,8 +58,9 @@ class SouthParkNlIE(SouthParkIE): 'playlist_count': 4, }] + class SouthParkDkIE(SouthParkIE): - IE_NAME = 'southpark.dk' + IE_NAME = 'southparkstudios.dk' _VALID_URL = r'https?://(?:www\.)?(?Psouthparkstudios\.dk/(?:clips|full-episodes)/(?P.+?)(\?|#|$))' _FEED_URL = 
'http://www.southparkstudios.dk/feeds/video-player/mrss/' From d4b963d0a68f81f4fef5495af14e2e41add21a0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 12 May 2015 01:54:56 +0600 Subject: [PATCH 0081/2145] [vine] Relax `alt_title` (Closes #5677) --- youtube_dl/extractor/vine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 65c459fad..c733a48fa 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -75,7 +75,7 @@ class VineIE(InfoExtractor): return { 'id': video_id, 'title': self._og_search_title(webpage), - 'alt_title': self._og_search_description(webpage), + 'alt_title': self._og_search_description(webpage, default=None), 'description': data['description'], 'thumbnail': data['thumbnailUrl'], 'upload_date': unified_strdate(data['created']), From 5332fd91bf16867b6777bd6cfd0b5086f84112c5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 12 May 2015 12:42:13 +0800 Subject: [PATCH 0082/2145] [nytimes] Correct _VALID_URL of NYTimesArticleIE --- youtube_dl/extractor/nytimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 6ffbe3863..7f254b867 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -89,7 +89,7 @@ class NYTimesIE(NYTimesBaseIE): class NYTimesArticleIE(NYTimesBaseIE): - _VALID_URL = r'https?://(?:www)?\.nytimes\.com/(.(?[^.]+)(?:\.html)?' + _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?[^.]+)(?:\.html)?' 
_TESTS = [{ 'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0', 'md5': 'e2076d58b4da18e6a001d53fd56db3c9', From 7dff03636a843a6990e52200edb3ecca1246b3df Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 12 May 2015 12:47:37 +0800 Subject: [PATCH 0083/2145] [utils] Support 'dur' field in TTML --- test/test_utils.py | 2 +- youtube_dl/utils.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 86b110a7d..b40107037 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -600,7 +600,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')

The following line contains Chinese characters and special symbols

第二行
♪♪

-

Third
Line

+

Third
Line

''' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d73efcf25..5439fcb35 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1866,10 +1866,14 @@ def dfxp2srt(dfxp_data): paras = dfxp.findall(_x('.//ttml:p')) for para, index in zip(paras, itertools.count(1)): + begin_time = parse_dfxp_time_expr(para.attrib['begin']) + end_time = parse_dfxp_time_expr(para.attrib.get('end')) + if not end_time: + end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) out.append('%d\n%s --> %s\n%s\n\n' % ( index, - format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))), - format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))), + format_srt_time(begin_time), + format_srt_time(end_time), parse_node(para))) return ''.join(out) From 1c7e2e64f6328024711d5fa999d4498396f4cb5c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 12 May 2015 12:55:14 +0800 Subject: [PATCH 0084/2145] [nrk] Remove TTML to srt conversion codes A common routine is implemented in utils.py and can be used via --convert-subtitles. --- youtube_dl/extractor/nrk.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index e91d3a248..cc70c2950 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -200,20 +199,10 @@ class NRKTVIE(InfoExtractor): url = "%s%s" % (baseurl, subtitlesurl) self._debug_print('%s: Subtitle url: %s' % (video_id, url)) captions = self._download_xml( - url, video_id, 'Downloading subtitles', - transform_source=lambda s: s.replace(r'
', '\r\n')) + url, video_id, 'Downloading subtitles') lang = captions.get('lang', 'no') - ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/ns/ttml}')) - srt = '' - for pos, p in enumerate(ps): - begin = parse_duration(p.get('begin')) - duration = parse_duration(p.get('dur')) - starttime = self._subtitles_timecode(begin) - endtime = self._subtitles_timecode(begin + duration) - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (compat_str(pos), starttime, endtime, p.text) return {lang: [ {'ext': 'ttml', 'url': url}, - {'ext': 'srt', 'data': srt}, ]} def _extract_f4m(self, manifest_url, video_id): From c1c924abfeda45f29b991bb74f315f0e79dcf126 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 12 May 2015 13:04:54 +0800 Subject: [PATCH 0085/2145] [utils,common] Merge format_srt_time and _subtitles_timecode format_srt_time uses a comma as the delimiter between seconds and milliseconds while _subtitles_timecode uses a dot. All .srt examples I found on the Internet uses a comma, so I use a comma in the merged version. 
See http://matroska.org/technical/specs/subtitles/srt.html and http://devel.aegisub.org/wiki/SubtitleFormats/SRT --- youtube_dl/extractor/common.py | 3 --- youtube_dl/extractor/kanalplay.py | 5 +++-- youtube_dl/utils.py | 12 ++++-------- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 981e34bc7..65bb77086 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1072,9 +1072,6 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") - def _subtitles_timecode(self, seconds): - return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) - class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/kanalplay.py b/youtube_dl/extractor/kanalplay.py index 2bb078036..4597d1b96 100644 --- a/youtube_dl/extractor/kanalplay.py +++ b/youtube_dl/extractor/kanalplay.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, + srt_subtitles_timecode, ) @@ -39,8 +40,8 @@ class KanalPlayIE(InfoExtractor): '%s\r\n%s --> %s\r\n%s' % ( num, - self._subtitles_timecode(item['startMillis'] / 1000.0), - self._subtitles_timecode(item['endMillis'] / 1000.0), + srt_subtitles_timecode(item['startMillis'] / 1000.0), + srt_subtitles_timecode(item['endMillis'] / 1000.0), item['text'], ) for num, item in enumerate(subs, 1)) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 5439fcb35..ed9ed9ed6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1835,12 +1835,8 @@ def parse_dfxp_time_expr(time_expr): return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3)) -def format_srt_time(seconds): - (mins, secs) = divmod(seconds, 60) - (hours, mins) = divmod(mins, 60) - millisecs = (secs - int(secs)) * 1000 - secs = int(secs) - return 
'%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs) +def srt_subtitles_timecode(seconds): + return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) def dfxp2srt(dfxp_data): @@ -1872,8 +1868,8 @@ def dfxp2srt(dfxp_data): end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur']) out.append('%d\n%s --> %s\n%s\n\n' % ( index, - format_srt_time(begin_time), - format_srt_time(end_time), + srt_subtitles_timecode(begin_time), + srt_subtitles_timecode(end_time), parse_node(para))) return ''.join(out) From 41333b97b9471316cf0f395db59196e6571fc776 Mon Sep 17 00:00:00 2001 From: ping Date: Tue, 12 May 2015 22:35:16 +0800 Subject: [PATCH 0086/2145] [qqmusic] Add support for charts / top lists --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/qqmusic.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index de19dfd7a..8ec0c1032 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -414,6 +414,7 @@ from .qqmusic import ( QQMusicIE, QQMusicSingerIE, QQMusicAlbumIE, + QQMusicToplistIE, ) from .quickvid import QuickVidIE from .r7 import R7IE diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 174c8e0ae..d4a85d8c3 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, + js_to_json, ) from ..compat import compat_urllib_request @@ -168,3 +169,57 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): album_page, 'album details', default=None) return self.playlist_result(entries, mid, album_name, album_detail) + + +class QQMusicToplistIE(QQPlaylistBaseIE): + _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P(top|global)_[0-9]+)' + + _TESTS = [{ + 'url': 'http://y.qq.com/#type=toplist&p=global_12', + 'info_dict': { + 'id': 
'global_12', + 'title': 'itunes榜', + }, + 'playlist_count': 10, + }, { + 'url': 'http://y.qq.com/#type=toplist&p=top_6', + 'info_dict': { + 'id': 'top_6', + 'title': 'QQ音乐巅峰榜·欧美', + }, + 'playlist_count': 100, + }] + + @staticmethod + def strip_qq_jsonp(code): + return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code)) + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_type = list_id.split("_")[0] + num_id = list_id.split("_")[1] + + list_page = self._download_webpage("http://y.qq.com/y/static/toplist/index/%s.html" % list_id, list_id, 'Download toplist page') + entries = [] + if list_type == 'top': + list = self._download_json( + "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id, + list_id, note='Retrieve toplist json', errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) + + for song in list['l']: + s = song['s'] + song_mid = s.split("|")[20] + entries.append(self.url_result( + 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', + song_mid)) + + elif list_type == 'global': + entries = self.get_entries_from_page(list_page) + + list_name = self._html_search_regex( + r'

([^\']+)

', list_page, 'top list name', + default=None) + list_desc = None + + return self.playlist_result(entries, list_id, list_name, list_desc) \ No newline at end of file From b480e7874b45862eae343ab8484aa43381cd28fa Mon Sep 17 00:00:00 2001 From: ping Date: Tue, 12 May 2015 22:41:37 +0800 Subject: [PATCH 0087/2145] [qqmusic] Fix code formatting --- youtube_dl/extractor/qqmusic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index d4a85d8c3..bca4a8f90 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -200,12 +200,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_type = list_id.split("_")[0] num_id = list_id.split("_")[1] - list_page = self._download_webpage("http://y.qq.com/y/static/toplist/index/%s.html" % list_id, list_id, 'Download toplist page') + list_page = self._download_webpage( + "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, + list_id, 'Download toplist page') entries = [] if list_type == 'top': list = self._download_json( "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id, - list_id, note='Retrieve toplist json', errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) + list_id, note='Retrieve toplist json', errnote='Unable to get toplist json', + transform_source=self.strip_qq_jsonp) for song in list['l']: s = song['s'] @@ -222,4 +225,5 @@ class QQMusicToplistIE(QQPlaylistBaseIE): default=None) list_desc = None - return self.playlist_result(entries, list_id, list_name, list_desc) \ No newline at end of file + return self.playlist_result(entries, list_id, list_name, list_desc) + \ No newline at end of file From fd4eefed39595850b864d3be9711224e4e8e9dd4 Mon Sep 17 00:00:00 2001 From: ping Date: Wed, 13 May 2015 01:14:02 +0800 Subject: [PATCH 0088/2145] [qqmusic] Fix extraction for global list --- youtube_dl/extractor/qqmusic.py | 34 ++++++++++++++++++++------------- 1 file changed, 21 
insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bca4a8f90..3401dcaef 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -188,6 +188,13 @@ class QQMusicToplistIE(QQPlaylistBaseIE): 'title': 'QQ音乐巅峰榜·欧美', }, 'playlist_count': 100, + }, { + 'url': 'http://y.qq.com/#type=toplist&p=global_5', + 'info_dict': { + 'id': 'global_5', + 'title': '韩国mnet排行榜', + }, + 'playlist_count': 50, }] @staticmethod @@ -203,22 +210,23 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_page = self._download_webpage( "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, list_id, 'Download toplist page') + entries = [] + jsonp_url = "" if list_type == 'top': - list = self._download_json( - "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id, - list_id, note='Retrieve toplist json', errnote='Unable to get toplist json', - transform_source=self.strip_qq_jsonp) - - for song in list['l']: - s = song['s'] - song_mid = s.split("|")[20] - entries.append(self.url_result( - 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', - song_mid)) - + jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id elif list_type == 'global': - entries = self.get_entries_from_page(list_page) + jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id + + list = self._download_json(jsonp_url, list_id, note='Retrieve toplist json', + errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) + + for song in list['l']: + s = song['s'] + song_mid = s.split("|")[20] + entries.append(self.url_result( + 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', + song_mid)) list_name = self._html_search_regex( r'

([^\']+)

', list_page, 'top list name', From 86ec1e487c4908f4d0d0ece512007a2e5fedc593 Mon Sep 17 00:00:00 2001 From: ping Date: Wed, 13 May 2015 01:37:56 +0800 Subject: [PATCH 0089/2145] [qqmusic] Code fixes --- youtube_dl/extractor/qqmusic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 3401dcaef..bae2ce31a 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -212,10 +212,9 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_id, 'Download toplist page') entries = [] - jsonp_url = "" if list_type == 'top': jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id - elif list_type == 'global': + else: jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id list = self._download_json(jsonp_url, list_id, note='Retrieve toplist json', From 0b4253fa3710c656e12b6147ed7c1f7843bb9aae Mon Sep 17 00:00:00 2001 From: blissland Date: Tue, 12 May 2015 18:57:06 +0100 Subject: [PATCH 0090/2145] [BYUtvIE] Change thumbnail regex so test does not fail --- youtube_dl/extractor/byutv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 6252be05b..3b2de517e 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -16,7 +16,7 @@ class BYUtvIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:5438d33774b6bdc662f9485a340401cc', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*promo.*' + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, From 3749e36e9f0e6be2a3a3ab1b15c0c02be5a50e2f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 13 May 2015 21:16:45 +0800 Subject: [PATCH 0091/2145] [YoutubeDL] Fix PEP8 W503 --- youtube_dl/YoutubeDL.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4cf83c510..84d50dab7 
100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1085,9 +1085,9 @@ class YoutubeDL(object): req_format = self.params.get('format') if req_format is None: req_format_list = [] - if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' - and info_dict['extractor'] in ['youtube', 'ted'] - and FFmpegMergerPP(self).available): + if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and + info_dict['extractor'] in ['youtube', 'ted'] and + FFmpegMergerPP(self).available): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) From 372744c544ec3de1b35583e7d6fc2cbc4cc39f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 13 May 2015 22:26:30 +0600 Subject: [PATCH 0092/2145] [odnoklassniki] Fix extraction (Closes #5671) --- youtube_dl/extractor/odnoklassniki.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 155d0ee6a..fbc521d1a 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -6,6 +6,7 @@ from ..utils import ( unified_strdate, int_or_none, qualities, + unescapeHTML, ) @@ -36,8 +37,8 @@ class OdnoklassnikiIE(InfoExtractor): webpage = self._download_webpage(url, video_id) player = self._parse_json( - self._search_regex( - r"OKVideo\.start\(({.+?})\s*,\s*'VideoAutoplay_player'", webpage, 'player'), + unescapeHTML(self._search_regex( + r'data-attributes="([^"]+)"', webpage, 'player')), video_id) metadata = self._parse_json(player['flashvars']['metadata'], video_id) From 8e595397529abc71093264e3695fb00d95be4d78 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 02:32:00 +0800 Subject: [PATCH 0093/2145] [postprocessor/embedthumbnail] Use thumbnails downloaded by YoutubeDL --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/__init__.py | 8 +++++++- youtube_dl/postprocessor/embedthumbnail.py | 23 
+++++++++++----------- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 84d50dab7..0fbfe9642 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1848,7 +1848,7 @@ class YoutubeDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9cc9f851f..ace17857c 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -240,7 +240,13 @@ def _real_main(argv=None): if opts.xattrs: postprocessors.append({'key': 'XAttrMetadata'}) if opts.embedthumbnail: - postprocessors.append({'key': 'EmbedThumbnail'}) + already_have_thumbnail = opts.writethumbnail or opts.write_all_thumbnails + postprocessors.append({ + 'key': 'EmbedThumbnail', + 'already_have_thumbnail': already_have_thumbnail + }) + if not already_have_thumbnail: + opts.writethumbnail = True # Please keep ExecAfterDownload towards the bottom as it allows the user to modify the final file in any way. # So if the user is able to remove the file before your postprocessor runs it might cause a few problems. 
if opts.exec_cmd: diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 4868a42fd..95c52f65f 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -7,11 +7,7 @@ import subprocess from .ffmpeg import FFmpegPostProcessor -from ..compat import ( - compat_urlretrieve, -) from ..utils import ( - determine_ext, check_executable, encodeFilename, PostProcessingError, @@ -25,26 +21,30 @@ class EmbedThumbnailPPError(PostProcessingError): class EmbedThumbnailPP(FFmpegPostProcessor): + def __init__(self, downloader=None, already_have_thumbnail=False): + super(EmbedThumbnailPP, self).__init__(downloader) + self._already_have_thumbnail = already_have_thumbnail + def run(self, info): filename = info['filepath'] temp_filename = prepend_extension(filename, 'temp') - temp_thumbnail = filename + '.' + determine_ext(info['thumbnail']) - if not info.get('thumbnail'): + if not info.get('thumbnails'): raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.') - compat_urlretrieve(info['thumbnail'], temp_thumbnail) + thumbnail_filename = info['thumbnails'][-1]['filename'] if info['ext'] == 'mp3': options = [ - '-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1', + '-i', thumbnail_filename, '-c', 'copy', '-map', '0', '-map', '1', '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename) self.run_ffmpeg(filename, temp_filename, options) - os.remove(encodeFilename(temp_thumbnail)) + if not self._already_have_thumbnail: + os.remove(encodeFilename(thumbnail_filename)) os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) @@ -52,7 +52,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): if not check_executable('AtomicParsley', ['-v']): raise EmbedThumbnailPPError('AtomicParsley was not found. 
Please install.') - cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename] + cmd = ['AtomicParsley', filename, '--artwork', thumbnail_filename, '-o', temp_filename] self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename) @@ -66,7 +66,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor): msg = stderr.decode('utf-8', 'replace').strip() raise EmbedThumbnailPPError(msg) - os.remove(encodeFilename(temp_thumbnail)) + if not self._already_have_thumbnail: + os.remove(encodeFilename(thumbnail_filename)) # for formats that don't support thumbnails (like 3gp) AtomicParsley # won't create to the temporary file if b'No changes' in stdout: From bb8ca1d112e95cd3fe48fff5af980a62a9db2572 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 02:35:28 +0800 Subject: [PATCH 0094/2145] [postprocessor/embedthumbnail] Use run_ffmpeg_multiple_files --- youtube_dl/postprocessor/embedthumbnail.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 95c52f65f..4e08c2709 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -36,12 +36,12 @@ class EmbedThumbnailPP(FFmpegPostProcessor): if info['ext'] == 'mp3': options = [ - '-i', thumbnail_filename, '-c', 'copy', '-map', '0', '-map', '1', + '-c', 'copy', '-map', '0', '-map', '1', '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"'] self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename) - self.run_ffmpeg(filename, temp_filename, options) + self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) if not self._already_have_thumbnail: os.remove(encodeFilename(thumbnail_filename)) From 2cc6d135479c5dbd6e715a1e767c5be163cd22ce Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 04:41:30 +0800 Subject: [PATCH 0095/2145] 
[postprocessor/embedthumbnail] Encode arguments in calling AtomicParsley --- youtube_dl/postprocessor/embedthumbnail.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 4e08c2709..8f825f785 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -9,6 +9,7 @@ from .ffmpeg import FFmpegPostProcessor from ..utils import ( check_executable, + encodeArgument, encodeFilename, PostProcessingError, prepend_extension, @@ -52,7 +53,12 @@ class EmbedThumbnailPP(FFmpegPostProcessor): if not check_executable('AtomicParsley', ['-v']): raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - cmd = ['AtomicParsley', filename, '--artwork', thumbnail_filename, '-o', temp_filename] + cmd = [encodeFilename('AtomicParsley', True), + encodeFilename(filename, True), + encodeArgument('--artwork'), + encodeFilename(thumbnail_filename, True), + encodeArgument('-o'), + encodeFilename(temp_filename, True)] self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename) From 86c7fdb17c0dcbff88a8daa131fddc57b6304b83 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 14:26:47 +0800 Subject: [PATCH 0096/2145] [xattr] Enhance error handling to catch ENOSPC Fixes #5589 --- youtube_dl/postprocessor/xattrpp.py | 61 +++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 93d0abcf6..16f2966e9 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -3,18 +3,32 @@ from __future__ import unicode_literals import os import subprocess import sys +import errno from .common import PostProcessor -from ..compat import ( - subprocess_check_output -) from ..utils import ( check_executable, hyphenate_date, version_tuple, + 
PostProcessingError, + encodeArgument, + encodeFilename, ) +class XAttrMetadataError(PostProcessingError): + def __init__(self, code=None, msg='Unknown error'): + super(XAttrMetadataError, self).__init__(msg) + self.code = code + + # Parsing code and msg + if (self.code in (errno.ENOSPC, errno.EDQUOT) or + 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + self.reason = 'NO_SPACE' + else: + self.reason = 'NOT_SUPPORTED' + + class XAttrMetadataPP(PostProcessor): # @@ -51,7 +65,10 @@ class XAttrMetadataPP(PostProcessor): raise ImportError def write_xattr(path, key, value): - return xattr.setxattr(path, key, value) + try: + xattr.set(path, key, value) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) except ImportError: if os.name == 'nt': @@ -62,8 +79,11 @@ class XAttrMetadataPP(PostProcessor): assert os.path.exists(path) ads_fn = path + ":" + key - with open(ads_fn, "wb") as f: - f.write(value) + try: + with open(ads_fn, "wb") as f: + f.write(value) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, e.strerror) else: user_has_setfattr = check_executable("setfattr", ['--version']) user_has_xattr = check_executable("xattr", ['-h']) @@ -71,12 +91,24 @@ class XAttrMetadataPP(PostProcessor): if user_has_setfattr or user_has_xattr: def write_xattr(path, key, value): + value = value.decode('utf-8') if user_has_setfattr: - cmd = ['setfattr', '-n', key, '-v', value, path] + executable = 'setfattr' + opts = ['-n', key, '-v', value] elif user_has_xattr: - cmd = ['xattr', '-w', key, value, path] + executable = 'xattr' + opts = ['-w', key, value] - subprocess_check_output(cmd) + cmd = ([encodeFilename(executable, True)] + + [encodeArgument(o) for o in opts] + + [encodeFilename(path, True)]) + + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + stdout, stderr = p.communicate() + stderr = stderr.decode('utf-8', 'replace') + if p.returncode != 0: + raise 
XAttrMetadataError(p.returncode, stderr) else: # On Unix, and can't find pyxattr, setfattr, or xattr. @@ -121,6 +153,13 @@ class XAttrMetadataPP(PostProcessor): return [], info - except (subprocess.CalledProcessError, OSError): - self._downloader.report_error("This filesystem doesn't support extended attributes. (You may have to enable them in your /etc/fstab)") + except XAttrMetadataError as e: + if e.reason == 'NO_SPACE': + self._downloader.report_warning( + 'There\'s no disk space left or disk quota exceeded. ' + + 'Extended attributes are not written.') + else: + self._downloader.report_error( + 'This filesystem doesn\'t support extended attributes. ' + + '(You may have to enable them in your /etc/fstab)') return [], info From fbff30d2dbc6462c628384ea5960c2461e7cdcca Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 14:51:00 +0800 Subject: [PATCH 0097/2145] [xattr] Catch 'Argument list too long' --- youtube_dl/postprocessor/xattrpp.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 16f2966e9..27e273000 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -25,6 +25,8 @@ class XAttrMetadataError(PostProcessingError): if (self.code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): self.reason = 'NO_SPACE' + elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: + self.reason = 'VALUE_TOO_LONG' else: self.reason = 'NOT_SUPPORTED' @@ -103,8 +105,11 @@ class XAttrMetadataPP(PostProcessor): [encodeArgument(o) for o in opts] + [encodeFilename(path, True)]) - p = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + try: + p = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + except EnvironmentError as e: + raise XAttrMetadataError(e.errno, 
e.strerror) stdout, stderr = p.communicate() stderr = stderr.decode('utf-8', 'replace') if p.returncode != 0: @@ -158,6 +163,9 @@ class XAttrMetadataPP(PostProcessor): self._downloader.report_warning( 'There\'s no disk space left or disk quota exceeded. ' + 'Extended attributes are not written.') + elif e.reason == 'VALUE_TOO_LONG': + self._downloader.report_warning( + 'Unable to write extended attributes due to too long values.') else: self._downloader.report_error( 'This filesystem doesn\'t support extended attributes. ' + From 509c630db8cdaff473f95805cda1ae350107e36b Mon Sep 17 00:00:00 2001 From: blissland Date: Thu, 14 May 2015 08:09:56 +0100 Subject: [PATCH 0098/2145] [CanalplusIE] Update tests that were no longer working --- youtube_dl/extractor/canalplus.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1b14471e5..71801488a 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -25,14 +25,13 @@ class CanalplusIE(InfoExtractor): } _TESTS = [{ - 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', - 'md5': '3db39fb48b9685438ecf33a1078023e4', + 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', 'info_dict': { - 'id': '922470', + 'id': '1263092', 'ext': 'flv', - 'title': 'Zapping - 26/08/13', - 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', - 'upload_date': '20130826', + 'title': 'Le Zapping - 13/05/15', + 'description': 'md5:09738c0d06be4b5d06a0940edb0da73f', + 'upload_date': '20150513', }, }, { 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', @@ -56,7 +55,6 @@ class CanalplusIE(InfoExtractor): 'skip': 'videos get deleted after a while', }, { 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', - 'md5': 
'65aa83ad62fe107ce29e564bb8712580', 'info_dict': { 'id': '1213714', 'ext': 'flv', From c827d4cfdb9ce47d13ccbec32d2b32dfb429ea8a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 16:53:10 +0800 Subject: [PATCH 0099/2145] [xattr] Enhanced error messages on Windows --- youtube_dl/postprocessor/xattrpp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 27e273000..7d88e1308 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -167,7 +167,10 @@ class XAttrMetadataPP(PostProcessor): self._downloader.report_warning( 'Unable to write extended attributes due to too long values.') else: - self._downloader.report_error( - 'This filesystem doesn\'t support extended attributes. ' + - '(You may have to enable them in your /etc/fstab)') + msg = 'This filesystem doesn\'t support extended attributes. ' + if os.name == 'nt': + msg += 'You need to use NTFS.' 
+ else: + msg += '(You may have to enable them in your /etc/fstab)' + self._downloader.report_error(msg) return [], info From 7d57d2e18be416faa593364966ccf667243fd3ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 May 2015 14:59:27 +0600 Subject: [PATCH 0100/2145] [canalplus] Restore checksums in tests --- youtube_dl/extractor/canalplus.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 71801488a..699b4f7d0 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -26,6 +26,7 @@ class CanalplusIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1263092', + 'md5': 'b3481d7ca972f61e37420798d0a9d934', 'info_dict': { 'id': '1263092', 'ext': 'flv', @@ -55,6 +56,7 @@ class CanalplusIE(InfoExtractor): 'skip': 'videos get deleted after a while', }, { 'url': 'http://www.itele.fr/france/video/aubervilliers-un-lycee-en-colere-111559', + 'md5': 'f3a46edcdf28006598ffaf5b30e6a2d4', 'info_dict': { 'id': '1213714', 'ext': 'flv', From 82245a6de77f4755b063310258c5611c15f5ffbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 May 2015 15:21:27 +0600 Subject: [PATCH 0101/2145] [YoutubeDL] Restore filename for thumbnails --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4c8196d08..691f3e09f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1849,7 +1849,7 @@ class YoutubeDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' 
+ thumb_ext if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % From fa6a16996e4a1aeee4e421b172efc6c351b1b123 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 18:00:57 +0800 Subject: [PATCH 0102/2145] [worldstarhiphop] Support Android URLs (fixes #5629) --- youtube_dl/extractor/worldstarhiphop.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index d5c26a032..a3ea26feb 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -6,8 +6,8 @@ from .common import InfoExtractor class WorldStarHipHopIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P.*)' - _TEST = { + _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P.*)' + _TESTS = [{ "url": "http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { @@ -15,7 +15,15 @@ class WorldStarHipHopIE(InfoExtractor): "ext": "mp4", "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } - } + }, { + 'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO', + 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3', + 'info_dict': { + 'id': 'wshh6a7q1ny0G34ZwuIO', + 'ext': 'mp4', + "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" + } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -26,19 +34,22 @@ class WorldStarHipHopIE(InfoExtractor): return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') video_url = self._search_regex( - r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL') + [r'so\.addVariable\("file","(.*?)"\)', + r'
\s*]+href="([^"]+)">'], + webpage, 'video URL') if 'youtube' in video_url: return self.url_result(video_url, ie='Youtube') video_title = self._html_search_regex( - r'(?s)
\s*

(.*?)

', + [r'(?s)
\s*

(.*?)

', + r']+class="tc-sp-pinned-title">(.*)'], webpage, 'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. thumbnail = self._html_search_regex( r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', - fatal=False) + default=None) if not thumbnail: _title = r'candytitles.*>(.*)' mobj = re.search(_title, webpage) From 7a012d5a16632a103466f9e9794dd98ad573ce88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 May 2015 16:39:35 +0600 Subject: [PATCH 0103/2145] [screenwavemedia] Add support for player2 URLs (Closes #5696) --- youtube_dl/extractor/screenwavemedia.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 74fb1983a..d1ab66b32 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -11,7 +11,7 @@ from ..utils import ( class ScreenwaveMediaIE(InfoExtractor): - _VALID_URL = r'http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=(?P.+)' + _VALID_URL = r'http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=(?P.+)' _TESTS = [{ 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', @@ -20,7 +20,10 @@ class ScreenwaveMediaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - playerdata = self._download_webpage(url, video_id, 'Downloading player webpage') + + playerdata = self._download_webpage( + 'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id, + video_id, 'Downloading player webpage') vidtitle = self._search_regex( r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') @@ -99,7 +102,7 @@ class TeamFourIE(InfoExtractor): webpage = self._download_webpage(url, display_id) playerdata_url = self._search_regex( - 
r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + r'src="(http://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', webpage, 'player data URL') video_title = self._html_search_regex( From 548897396158d7822020f45c10301e9ca3c46453 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 23:25:43 +0800 Subject: [PATCH 0104/2145] [qqmusic] flake8 --- youtube_dl/extractor/qqmusic.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bae2ce31a..5ddbb183e 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -173,7 +173,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P(top|global)_[0-9]+)' - + _TESTS = [{ 'url': 'http://y.qq.com/#type=toplist&p=global_12', 'info_dict': { @@ -200,7 +200,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): @staticmethod def strip_qq_jsonp(code): return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code)) - + def _real_extract(self, url): list_id = self._match_id(url) @@ -208,7 +208,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): num_id = list_id.split("_")[1] list_page = self._download_webpage( - "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, + "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, list_id, 'Download toplist page') entries = [] @@ -216,10 +216,11 @@ class QQMusicToplistIE(QQPlaylistBaseIE): jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id else: jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id - - list = self._download_json(jsonp_url, list_id, note='Retrieve toplist json', + + list = self._download_json( + jsonp_url, list_id, note='Retrieve toplist json', errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) - + for song in 
list['l']: s = song['s'] song_mid = s.split("|")[20] @@ -233,4 +234,3 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_desc = None return self.playlist_result(entries, list_id, list_name, list_desc) - \ No newline at end of file From 29ea57283e473e94c72cf9cbc065c3c05a14830f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 23:28:42 +0800 Subject: [PATCH 0105/2145] [qqmusic] Refactoring QQMusicToplistIE --- youtube_dl/extractor/qqmusic.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 5ddbb183e..59e93a1dd 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -204,8 +204,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - list_type = list_id.split("_")[0] - num_id = list_id.split("_")[1] + list_type, num_id = list_id.split("_") list_page = self._download_webpage( "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, @@ -217,11 +216,11 @@ class QQMusicToplistIE(QQPlaylistBaseIE): else: jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id - list = self._download_json( + toplist_json = self._download_json( jsonp_url, list_id, note='Retrieve toplist json', errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) - for song in list['l']: + for song in toplist_json['l']: s = song['s'] song_mid = s.split("|")[20] entries.append(self.url_result( @@ -231,6 +230,5 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_name = self._html_search_regex( r'

([^\']+)

', list_page, 'top list name', default=None) - list_desc = None - return self.playlist_result(entries, list_id, list_name, list_desc) + return self.playlist_result(entries, list_id, list_name) From 7ec676bb3dd6cba4b56fccb2d5aae08e66086b4e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 May 2015 23:32:36 +0800 Subject: [PATCH 0106/2145] [qqmusic] Add IE_NAME for all extractors --- youtube_dl/extractor/qqmusic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 59e93a1dd..13113820b 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -15,6 +15,7 @@ from ..compat import compat_urllib_request class QQMusicIE(InfoExtractor): + IE_NAME = 'qqmusic' _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', @@ -97,6 +98,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:singer' _VALID_URL = r'http://y.qq.com/#type=singer&mid=(?P[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', @@ -140,6 +142,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:album' _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P[0-9A-Za-z]+)' _TEST = { @@ -172,6 +175,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): class QQMusicToplistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:toplist' _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P(top|global)_[0-9]+)' _TESTS = [{ From 1ae72fb23df709687091133602fd715ab6cb7b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 14 May 2015 22:28:42 +0600 Subject: [PATCH 0107/2145] [soundcloud:user] Defer download link resolve (Closes #5248) Looks like final download links can expire before downloading process reach them. So, resolving download links right before actual downloading. 
--- youtube_dl/extractor/soundcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 183ff50f4..c23c5ee0f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -336,7 +336,7 @@ class SoundcloudUserIE(SoundcloudIE): if len(new_entries) == 0: self.to_screen('%s: End page received' % uploader) break - entries.extend(self._extract_info_dict(e, quiet=True) for e in new_entries) + entries.extend(self.url_result(e['permalink_url'], 'Soundcloud') for e in new_entries) return { '_type': 'playlist', From 3a105f7b20e8a3f742ac86cc1a6b02935b831778 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 15 May 2015 02:17:22 +0800 Subject: [PATCH 0108/2145] [teamcoco] Rewrite preload data extraction Idea: "puncture" some consecutive fragments and check whether the b64decode result of a punctured string is a valid JSON or not. It's a O(N^3) algorithm, but should be fast for a small N (less than 30 fragments in all test cases) --- youtube_dl/extractor/teamcoco.py | 53 ++++++++++++++++---------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 95d58ddd0..410eb7d3a 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import base64 import binascii import re +import json from .common import InfoExtractor from ..utils import ( @@ -68,41 +69,39 @@ class TeamcocoIE(InfoExtractor): video_id = self._html_search_regex( self._VIDEO_ID_REGEXES, webpage, 'video id') - data = preload = None - preloads = re.findall(r'"preload":\s*"([^"]+)"', webpage) - if preloads: - preload = max([(len(p), p) for p in preloads])[1] + data = None - if not preload: - preload = ''.join(re.findall(r'this\.push\("([^"]+)"\);', webpage)) + preload_codes = self._html_search_regex( + 
r'(function.+)setTimeout\(function\(\)\{playlist', + webpage, 'preload codes') + base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes) + base64_fragments.remove('init') - if not preload: - preload = self._html_search_regex([ - r'player,\[?"([^"]+)"\]?', r'player.init\(\[?"([^"]+)"\]?\)' - ], webpage.replace('","', ''), 'preload data', default=None) - - if not preload: - preload_codes = self._html_search_regex( - r'(function.+)setTimeout\(function\(\)\{playlist', - webpage, 'preload codes') - base64_fragments = re.findall(r'"([a-zA-z0-9+/=]+)"', preload_codes) - base64_fragments.remove('init') - for i in range(len(base64_fragments)): - cur_sequence = (''.join(base64_fragments[i:] + base64_fragments[:i])).encode('ascii') + def _check_sequence(cur_fragments): + if not cur_fragments: + return + for i in range(len(cur_fragments)): + cur_sequence = (''.join(cur_fragments[i:] + cur_fragments[:i])).encode('ascii') try: raw_data = base64.b64decode(cur_sequence) - except (TypeError, binascii.Error): + if compat_ord(raw_data[0]) == compat_ord('{'): + return json.loads(raw_data.decode('utf-8')) + except (TypeError, binascii.Error, UnicodeDecodeError, ValueError): continue - if compat_ord(raw_data[0]) == compat_ord('{'): - data = self._parse_json(raw_data.decode('utf-8'), video_id, fatal=False) - if not preload and not data: - raise ExtractorError( - 'Preload information could not be extracted', expected=True) + def _check_data(): + for i in range(len(base64_fragments) + 1): + for j in range(i, len(base64_fragments) + 1): + data = _check_sequence(base64_fragments[:i] + base64_fragments[j:]) + if data: + return data + + self.to_screen('Try to compute possible data sequence. 
This may take some time.') + data = _check_data() if not data: - data = self._parse_json( - base64.b64decode(preload.encode('ascii')).decode('utf-8'), video_id) + raise ExtractorError( + 'Preload information could not be extracted', expected=True) formats = [] get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) From 12675275a1d2158fbe409361888569e4cb52ef07 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 15 May 2015 02:27:41 +0800 Subject: [PATCH 0109/2145] [teamcoco] Detect expired videos (#5626) --- youtube_dl/extractor/teamcoco.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 410eb7d3a..56be52638 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -62,7 +62,9 @@ class TeamcocoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) + webpage, urlh = self._download_webpage_handle(url, display_id) + if 'src=expired' in urlh.geturl(): + raise ExtractorError('This video is expired.', expected=True) video_id = mobj.group('video_id') if not video_id: From 2bc43303031215436b201e656094b60ab3ec7e9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 14 May 2015 23:41:27 +0200 Subject: [PATCH 0110/2145] [youtube:history] Fix extraction (fixes #5702) It uses the same method as YoutubeSubscriptionsIE, if other feed starts using it we should consider using base class. 
--- youtube_dl/extractor/youtube.py | 37 +++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0869c9fd4..e58184adc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1667,13 +1667,42 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE): return self._extract_playlist('WL') -class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): +class YoutubeHistoryIE(YoutubePlaylistIE): IE_NAME = 'youtube:history' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' - _FEED_NAME = 'history' - _PERSONAL_FEED = True - _PLAYLIST_TITLE = 'Youtube Watch History' + _TESTS = [] + + def _real_extract(self, url): + title = 'Youtube History' + page = self._download_webpage('https://www.youtube.com/feed/history', title) + + # The extraction process is the same as for playlists, but the regex + # for the video ids doesn't contain an index + ids = [] + more_widget_html = content_html = page + + for page_num in itertools.count(1): + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + new_ids = orderedSet(matches) + ids.extend(new_ids) + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), title, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + + return { + '_type': 'playlist', + 'title': title, + 'entries': self._ids_to_results(ids), + } class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): From c4fc559f45ea5c40409eab44867ff2b4f08976c2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 15 May 2015 10:13:43 +0200 Subject: [PATCH 0111/2145] release 2015.05.15 --- 
docs/supportedsites.md | 11 ++++++++--- youtube_dl/version.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 98b625380..43fbe8b1d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -291,6 +291,7 @@ - **MySpass** - **myvideo** - **MyVidster** + - **N-JOY** - **n-tv.de** - **NationalGeographic** - **Naver** @@ -368,9 +369,10 @@ - **prosiebensat1**: ProSiebenSat.1 Digital - **Puls4** - **Pyvideo** - - **QQMusic** - - **QQMusicAlbum** - - **QQMusicSinger** + - **qqmusic** + - **qqmusic:album** + - **qqmusic:singer** + - **qqmusic:toplist** - **QuickVid** - **R7** - **radio.de** @@ -439,6 +441,7 @@ - **southpark.cc.com:español** - **southpark.de** - **southpark.nl** + - **southparkstudios.dk** - **Space** - **SpankBang** - **Spankwire** @@ -492,6 +495,7 @@ - **tlc.com** - **tlc.de** - **TMZ** + - **TMZArticle** - **TNAFlix** - **tou.tv** - **Toypics**: Toypics user profile @@ -569,6 +573,7 @@ - **vk.com** - **vk.com:user-videos**: vk.com:All of a user's videos - **Vodlocker** + - **VoiceRepublic** - **Vporn** - **VRT** - **vube**: Vube.com diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 83c5a1659..38f00bc9b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.05.10' +__version__ = '2015.05.15' From 3884dcf313223040049e4153e0c398fbc36b5117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 May 2015 14:03:00 +0200 Subject: [PATCH 0112/2145] YoutubeDL: ignore indexes from 'playlist_items' that are not in the list (fixes #5706) We ignore them instead of failing to match the behaviour of the 'playliststart' parameter. 
--- youtube_dl/YoutubeDL.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 691f3e09f..5df889945 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -759,7 +759,9 @@ class YoutubeDL(object): if isinstance(ie_entries, list): n_all_entries = len(ie_entries) if playlistitems: - entries = [ie_entries[i - 1] for i in playlistitems] + entries = [ + ie_entries[i - 1] for i in playlistitems + if -n_all_entries <= i - 1 < n_all_entries] else: entries = ie_entries[playliststart:playlistend] n_entries = len(entries) From e9eaf3fbcf497e76a55d2ba15d5880af83a065d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 May 2015 14:06:19 +0200 Subject: [PATCH 0113/2145] [test/YoutubeDL] Add tests for 'playliststart', 'playlistend' and 'playlist_items' --- test/test_YoutubeDL.py | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 82b827536..a13c09ef4 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -12,6 +12,7 @@ import copy from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_str from youtube_dl.extractor import YoutubeIE from youtube_dl.postprocessor.common import PostProcessor from youtube_dl.utils import match_filter_func @@ -507,6 +508,51 @@ class TestYoutubeDL(unittest.TestCase): res = get_videos(f) self.assertEqual(res, ['1']) + def test_playlist_items_selection(self): + entries = [{ + 'id': compat_str(i), + 'title': compat_str(i), + 'url': TEST_URL, + } for i in range(1, 5)] + playlist = { + '_type': 'playlist', + 'id': 'test', + 'entries': entries, + 'extractor': 'test:playlist', + 'extractor_key': 'test:playlist', + 'webpage_url': 'http://example.com', + } + + def get_ids(params): + ydl = YDL(params) + # make a copy because the dictionary can be 
modified + ydl.process_ie_result(playlist.copy()) + return [int(v['id']) for v in ydl.downloaded_info_dicts] + + result = get_ids({}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 10}) + self.assertEqual(result, [1, 2, 3, 4]) + + result = get_ids({'playlistend': 2}) + self.assertEqual(result, [1, 2]) + + result = get_ids({'playliststart': 10}) + self.assertEqual(result, []) + + result = get_ids({'playliststart': 2}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2-4'}) + self.assertEqual(result, [2, 3, 4]) + + result = get_ids({'playlist_items': '2,4'}) + self.assertEqual(result, [2, 4]) + + result = get_ids({'playlist_items': '10'}) + self.assertEqual(result, []) + if __name__ == '__main__': unittest.main() From 15da7ce7fb89203247f4c959a748281ecf353e2a Mon Sep 17 00:00:00 2001 From: blissland Date: Fri, 15 May 2015 12:28:10 +0100 Subject: [PATCH 0114/2145] Fix file format extraction regex and update test file checksum --- youtube_dl/extractor/ccc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ccc.py b/youtube_dl/extractor/ccc.py index 2a5d4be18..6924eac70 100644 --- a/youtube_dl/extractor/ccc.py +++ b/youtube_dl/extractor/ccc.py @@ -16,7 +16,7 @@ class CCCIE(InfoExtractor): _TEST = { 'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video', - 'md5': '205a365d0d57c0b1e43a12c9ffe8f9be', + 'md5': '3a1eda8f3a29515d27f5adb967d7e740', 'info_dict': { 'id': '20131228183', 'ext': 'mp4', @@ -51,7 +51,7 @@ class CCCIE(InfoExtractor): matches = re.finditer(r'''(?xs) <(?:span|div)\s+class='label\s+filetype'>(?P.*?)\s* - [^']+)'>\s* + [^']+)'>\s* (?: .*? [^']+\.torrent)' From a7b8467ac0baecd02a815b1f57731ae9bb10ab87 Mon Sep 17 00:00:00 2001 From: Vitaliy Syrchikov Date: Fri, 15 May 2015 16:52:11 +0400 Subject: [PATCH 0115/2145] Sportbox extractor fix. 
--- youtube_dl/extractor/sportbox.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index becdf658f..830220543 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -11,7 +11,7 @@ from ..utils import ( class SportBoxIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/Vidy_sporta/(?:[^/]+/)+spbvideo_NI\d+_(?P.+)' + _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P.+)' _TESTS = [ { 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', @@ -50,7 +50,7 @@ class SportBoxIE(InfoExtractor): display_id, 'Downloading player webpage') hls = self._search_regex( - r"var\s+original_hls_file\s*=\s*'([^']+)'", player, 'hls file') + r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]+([^\"]+)['\"]+", player, 'hls file') formats = self._extract_m3u8_formats(hls, display_id, 'mp4') From ae670a6ed8019f1b69bbe345621f51c8b32789ec Mon Sep 17 00:00:00 2001 From: Vitaliy Syrchikov Date: Fri, 15 May 2015 17:53:05 +0400 Subject: [PATCH 0116/2145] Sportbox source fix. HD videos support. 
--- youtube_dl/extractor/sportbox.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 830220543..695b3ff82 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -14,7 +14,7 @@ class SportBoxIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P.+)' _TESTS = [ { - 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', + 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', 'md5': 'ff56a598c2cf411a9a38a69709e97079', 'info_dict': { 'id': '80822', @@ -42,11 +42,15 @@ class SportBoxIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'src="/vdl/player/media/(\d+)"', webpage, 'video id') + sobj = re.search(r'src="/vdl/player/(?P\w+)/(?P\d+)"', webpage) + if (sobj): + video_id = sobj.group('video_id') + media_type = sobj.group('media_type') + else: + raise RegexNotFoundError('Unable to extract video_id') player = self._download_webpage( - 'http://news.sportbox.ru/vdl/player/media/%s' % video_id, + 'http://news.sportbox.ru/vdl/player/%s/%s' % (media_type, video_id), display_id, 'Downloading player webpage') hls = self._search_regex( From 25f14e9f93295a787e0cb436a5f6179d6174733d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 21:06:59 +0600 Subject: [PATCH 0117/2145] [youtube] Separate feed extractor --- youtube_dl/extractor/youtube.py | 143 +++++++++----------------------- 1 file changed, 37 insertions(+), 106 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e58184adc..9096a2975 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -49,6 +49,11 @@ class 
YoutubeBaseInfoExtractor(InfoExtractor): # YouTube sets the expire time to about two months expire_time=time.time() + 2 * 30 * 24 * 3600) + def _ids_to_results(self, ids): + return [ + self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + def _login(self): """ Attempt to log in to YouTube. @@ -1261,11 +1266,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _ids_to_results(self, ids): - return [ - self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] - def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -1601,20 +1601,10 @@ class YoutubeShowIE(InfoExtractor): class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ - Base class for extractors that fetch info from - http://www.youtube.com/feed_ajax + Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - # use action_load_personal_feed instead of action_load_system_feed - _PERSONAL_FEED = False - - @property - def _FEED_TEMPLATE(self): - action = 'action_load_system_feed' - if self._PERSONAL_FEED: - action = 'action_load_personal_feed' - return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): @@ -1624,58 +1614,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): self._login() def _real_extract(self, url): - feed_entries = [] - paging = 0 - for i in itertools.count(1): - info = self._download_json( - self._FEED_TEMPLATE % paging, - '%s feed' % self._FEED_NAME, - 'Downloading page %s' % i, - transform_source=uppercase_escape) - feed_html = info.get('feed_html') or info.get('content_html') - load_more_widget_html = info.get('load_more_widget_html') or feed_html - m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) - ids = orderedSet(m.group(1) for m in m_ids) - 
feed_entries.extend( - self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids) - mobj = re.search( - r'data-uix-load-more-href="/?[^"]+paging=(?P\d+)', - load_more_widget_html) - if mobj is None: - break - paging = mobj.group('paging') - return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) - - -class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): - IE_NAME = 'youtube:recommended' - IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' - _FEED_NAME = 'recommended' - _PLAYLIST_TITLE = 'Youtube Recommended videos' - - -class YoutubeWatchLaterIE(YoutubePlaylistIE): - IE_NAME = 'youtube:watchlater' - IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' - - _TESTS = [] # override PlaylistIE tests - - def _real_extract(self, url): - return self._extract_playlist('WL') - - -class YoutubeHistoryIE(YoutubePlaylistIE): - IE_NAME = 'youtube:history' - IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' - _TESTS = [] - - def _real_extract(self, url): - title = 'Youtube History' - page = self._download_webpage('https://www.youtube.com/feed/history', title) + page = self._download_webpage( + 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) # The extraction process is the same as for playlists, but the regex # for the video ids doesn't contain an index @@ -1692,17 +1632,25 @@ class YoutubeHistoryIE(YoutubePlaylistIE): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), title, + 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE, 'Downloading page #%s' % page_num, transform_source=uppercase_escape) content_html = 
more['content_html'] more_widget_html = more['load_more_widget_html'] - return { - '_type': 'playlist', - 'title': title, - 'entries': self._ids_to_results(ids), - } + return self.playlist_result( + self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE) + + +class YoutubeWatchLaterIE(YoutubePlaylistIE): + IE_NAME = 'youtube:watchlater' + IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater' + + _TESTS = [] # override PlaylistIE tests + + def _real_extract(self, url): + return self._extract_playlist('WL') class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): @@ -1717,42 +1665,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): return self.url_result(playlist_id, 'YoutubePlaylist') -class YoutubeSubscriptionsIE(YoutubePlaylistIE): - IE_NAME = 'youtube:subscriptions' +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _FEED_NAME = 'recommended' + _PLAYLIST_TITLE = 'Youtube Recommended videos' + + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' 
- _TESTS = [] + _FEED_NAME = 'subscriptions' + _PLAYLIST_TITLE = 'Youtube Subscriptions' - def _real_extract(self, url): - title = 'Youtube Subscriptions' - page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) - # The extraction process is the same as for playlists, but the regex - # for the video ids doesn't contain an index - ids = [] - more_widget_html = content_html = page - - for page_num in itertools.count(1): - matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - new_ids = orderedSet(matches) - ids.extend(new_ids) - - mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), title, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - return { - '_type': 'playlist', - 'title': title, - 'entries': self._ids_to_results(ids), - } +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' + _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PLAYLIST_TITLE = 'Youtube History' class YoutubeTruncatedURLIE(InfoExtractor): From 62c95fd5fcb8dbea2faeb4edac4c5177cbac5912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 21:42:34 +0600 Subject: [PATCH 0118/2145] [youtube:feed] Check each 'load more' portion for unique video ids --- youtube_dl/extractor/youtube.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9096a2975..1f9940cf5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1621,10 +1621,16 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): # for the video ids doesn't contain 
an index ids = [] more_widget_html = content_html = page - for page_num in itertools.count(1): matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) - new_ids = orderedSet(matches) + + # 'recommended' feed has infinite 'load more' and each new portion spins + # the same videos in (sometimes) slightly different order, so we'll check + # for unicity and break when portion has no new videos + new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches)) + if not new_ids: + break + ids.extend(new_ids) mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) From e9ca615a9872e85a6986061fdf54257244ce1f77 Mon Sep 17 00:00:00 2001 From: Vitaliy Syrchikov Date: Fri, 15 May 2015 19:57:54 +0400 Subject: [PATCH 0119/2145] New test --- youtube_dl/extractor/sportbox.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 695b3ff82..cb1515eff 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -30,10 +30,29 @@ class SportBoxIE(InfoExtractor): # m3u8 download 'skip_download': True, }, - }, { + }, + { + 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', + 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'info_dict': { + 'id': '211355', + 'ext': 'mp4', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'description': '16 детских коллективов приняли участие в суперфинале турнира «Поле славы боевой».', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1426237001, + 'upload_date': '20150313', + 'duration': 292, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', 'only_matching': True, - } + }, ] def _real_extract(self, url): From 
34fe5a94baf9e7ea437de68621a5fa73780c0f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 May 2015 18:42:59 +0200 Subject: [PATCH 0120/2145] [gamespot] Add support for videos that don't use 'f4m_stream' (fixes #5707) --- youtube_dl/extractor/gamespot.py | 60 +++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 47373e215..5927455f6 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -15,7 +15,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P\d+)/?' - _TEST = { + _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', 'info_dict': { @@ -23,8 +23,16 @@ class GameSpotIE(InfoExtractor): 'ext': 'mp4', 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', - } - } + }, + }, { + 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'info_dict': { + 'id': 'gs-2300-6424837', + 'ext': 'flv', + 'title': 'The Witcher 3: Wild Hunt [Xbox ONE] - Now Playing', + 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', + }, + }] def _real_extract(self, url): page_id = self._match_id(url) @@ -32,25 +40,37 @@ class GameSpotIE(InfoExtractor): data_video_json = self._search_regex( r'data-video=["\'](.*?)["\']', webpage, 'data video') data_video = json.loads(unescapeHTML(data_video_json)) + streams = data_video['videoStreams'] - # Transform the manifest url to a link to the mp4 files - # they are used in mobile devices. 
- f4m_url = data_video['videoStreams']['f4m_stream'] - f4m_path = compat_urlparse.urlparse(f4m_url).path - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') - http_path = f4m_path[1:].split('/', 1)[1] - http_template = re.sub(QUALITIES_RE, r'%s', http_path) - http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin( - 'http://video.gamespotcdn.com/', http_template) formats = [] - for q in qualities: - formats.append({ - 'url': http_template % q, - 'ext': 'mp4', - 'format_id': q, - }) + f4m_url = streams.get('f4m_stream') + if f4m_url is not None: + # Transform the manifest url to a link to the mp4 files + # they are used in mobile devices. + f4m_path = compat_urlparse.urlparse(f4m_url).path + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') + http_path = f4m_path[1:].split('/', 1)[1] + http_template = re.sub(QUALITIES_RE, r'%s', http_path) + http_template = http_template.replace('.csmil/manifest.f4m', '') + http_template = compat_urlparse.urljoin( + 'http://video.gamespotcdn.com/', http_template) + for q in qualities: + formats.append({ + 'url': http_template % q, + 'ext': 'mp4', + 'format_id': q, + }) + else: + for quality in ['sd', 'hd']: + # It's actually a link to a flv file + flv_url = streams.get('f4m_{0}'.format(quality)) + if flv_url is not None: + formats.append({ + 'url': flv_url, + 'ext': 'flv', + 'format_id': quality, + }) return { 'id': data_video['guid'], From eeb23eb7ea6953d7e90ccf669cd0e636d10b2b91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 15 May 2015 18:44:08 +0200 Subject: [PATCH 0121/2145] [gamespot] The protocol is not optional --- youtube_dl/extractor/gamespot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 
5927455f6..2d33fa7f5 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P\d+)/?' + _VALID_URL = r'http://(?:www\.)?gamespot\.com/.*-(?P\d+)/?' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', From 3a7382950b6f498f50173c8813f6cb1db3739277 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 22:50:44 +0600 Subject: [PATCH 0122/2145] [sportbox:embed] Add extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/sportbox.py | 138 +++++++++++++++++-------------- 2 files changed, 82 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8ec0c1032..f293bc2a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -502,7 +502,10 @@ from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .sport5 import Sport5IE -from .sportbox import SportBoxIE +from .sportbox import ( + SportBoxIE, + SportBoxEmbedIE, +) from .sportdeutschland import SportDeutschlandIE from .srf import SrfIE from .srmediathek import SRMediathekIE diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index cb1515eff..10c45eb74 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( parse_duration, parse_iso8601, @@ -12,48 +13,30 @@ from ..utils import ( class SportBoxIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P.+)' - _TESTS = [ - { - 'url': 
'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', - 'info_dict': { - 'id': '80822', - 'ext': 'mp4', - 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1411896237, - 'upload_date': '20140928', - 'duration': 4846, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', - 'info_dict': { - 'id': '211355', - 'ext': 'mp4', - 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', - 'description': '16 детских коллективов приняли участие в суперфинале турнира «Поле славы боевой».', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1426237001, - 'upload_date': '20150313', - 'duration': 292, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', - 'only_matching': True, + _TESTS = [{ + 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', + 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'info_dict': { + 'id': '80822', + 'ext': 'mp4', + 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', + 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1411896237, + 'upload_date': '20140928', + 'duration': 4846, }, - ] + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', + 
'only_matching': True, + }, { + 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -61,39 +44,74 @@ class SportBoxIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - sobj = re.search(r'src="/vdl/player/(?P\w+)/(?P\d+)"', webpage) - if (sobj): - video_id = sobj.group('video_id') - media_type = sobj.group('media_type') - else: - raise RegexNotFoundError('Unable to extract video_id') - - player = self._download_webpage( - 'http://news.sportbox.ru/vdl/player/%s/%s' % (media_type, video_id), - display_id, 'Downloading player webpage') - - hls = self._search_regex( - r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]+([^\"]+)['\"]+", player, 'hls file') - - formats = self._extract_m3u8_formats(hls, display_id, 'mp4') + player = self._search_regex( + r'src="/?(vdl/player/[^"]+)"', webpage, 'player') title = self._html_search_regex( r'

([^<]+)

', webpage, 'title') description = self._html_search_regex( - r'(?s)
(.+?)
', webpage, 'description', fatal=False) + r'(?s)
(.+?)
', + webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) timestamp = parse_iso8601(self._search_regex( - r'([^<]+)', webpage, 'timestamp', fatal=False)) + r'([^<]+)', + webpage, 'timestamp', fatal=False)) duration = parse_duration(self._html_search_regex( - r'', webpage, 'duration', fatal=False)) + r'', + webpage, 'duration', fatal=False)) return { - 'id': video_id, + '_type': 'url_transparent', + 'url': compat_urlparse.urljoin(url, '/%s' % player), 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, + } + + +class SportBoxEmbedIE(InfoExtractor): + _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P\d+)' + _TESTS = [{ + 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', + 'info_dict': { + 'id': '211355', + 'ext': 'mp4', + 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + hls = self._search_regex( + r"sportboxPlayer\.jwplayer_common_params\.file\s*=\s*['\"]([^'\"]+)['\"]", + webpage, 'hls file') + + formats = self._extract_m3u8_formats(hls, video_id, 'mp4') + + title = self._search_regex( + r'sportboxPlayer\.node_title\s*=\s*"([^"]+)"', webpage, 'title') + + thumbnail = self._search_regex( + r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, 'formats': formats, } From e8cfacae3710c2c225488e4b2d41b84268217a55 Mon Sep 17 00:00:00 2001 From: blissland Date: Fri, 15 May 2015 17:57:32 +0100 
Subject: [PATCH 0123/2145] [CBSNewsIE] Relax thumbnail regex so test passes --- youtube_dl/extractor/cbsnews.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7e47960ab..52e61d85b 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -32,7 +32,7 @@ class CBSNewsIE(InfoExtractor): 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', 'ext': 'flv', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', - 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg', + 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, }, 'params': { From 1436a6835e0f3489a4c37cca3da5087567b68158 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 23:08:44 +0600 Subject: [PATCH 0124/2145] [sportbox:embed] Add `_extract_urls` --- youtube_dl/extractor/sportbox.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 10c45eb74..a869a1b25 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -91,6 +91,12 @@ class SportBoxEmbedIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) From ef28a6cb26630f8f198a72eee34a2b5c8bd2f802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 23:09:10 +0600 Subject: [PATCH 0125/2145] [sportbox:embed] Relax thumbnail --- youtube_dl/extractor/sportbox.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 
a869a1b25..8686f9d11 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -113,7 +113,7 @@ class SportBoxEmbedIE(InfoExtractor): thumbnail = self._search_regex( r'sportboxPlayer\.jwplayer_common_params\.image\s*=\s*"([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) return { 'id': video_id, From d40a3b5b55973d7ed65538179b71990c1828845a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 23:09:34 +0600 Subject: [PATCH 0126/2145] [generic] Add support for sportbox embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3d756e848..9230c3bb0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -32,6 +32,7 @@ from .brightcove import BrightcoveIE from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE from .condenast import CondeNastIE from .udn import UDNEmbedIE @@ -1229,6 +1230,11 @@ class GenericIE(InfoExtractor): if rutv_url: return self.url_result(rutv_url, 'RUTV') + # Look for embedded SportBox player + sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) + if sportbox_urls: + return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + # Look for embedded TED player mobj = re.search( r']+?src=(["\'])(?Phttps?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) From b827a6015c145d67a4d4e9ea38aa54ebe347d3fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 23:18:21 +0600 Subject: [PATCH 0127/2145] [generic] Add test for sportbox embeds --- youtube_dl/extractor/generic.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9230c3bb0..610e33091 100644 --- 
a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -225,6 +225,37 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # SportBox embed + { + 'url': 'http://www.vestifinance.ru/articles/25753', + 'info_dict': { + 'id': '25753', + 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + }, + 'playlist': [{ + 'info_dict': { + 'id': '370908', + 'title': 'Госзаказ. День 3', + 'ext': 'mp4', + } + }, { + 'info_dict': { + 'id': '370905', + 'title': 'Госзаказ. День 2', + 'ext': 'mp4', + } + }, { + 'info_dict': { + 'id': '370902', + 'title': 'Госзаказ. День 1', + 'ext': 'mp4', + } + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', From 25c3a7348f7971c0af32dcea2d7fd57bd5c63f05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 23:23:51 +0600 Subject: [PATCH 0128/2145] [generic] Fix typo --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 610e33091..9a7b0d25d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1425,7 +1425,7 @@ class GenericIE(InfoExtractor): # Look for Senate ISVP iframe senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) if senate_isvp_url: - return self.url_result(surl, 'SenateISVP') + return self.url_result(senate_isvp_url, 'SenateISVP') def check_video(vurl): if YoutubeIE.suitable(vurl): From 70d0d43b5eeff04b41b089e499401e38c115e456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 15 May 2015 23:32:25 +0600 Subject: [PATCH 0129/2145] [rts] Check formats (Closes #5711) --- youtube_dl/extractor/rts.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index d0981115d..9fbe239d8 100644 --- 
a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -190,6 +190,7 @@ class RTSIE(InfoExtractor): 'tbr': media['rate'] or extract_bitrate(media['url']), } for media in info['media'] if media.get('rate')]) + self._check_formats(formats, video_id) self._sort_formats(formats) return { From 0d7f03642976e7859e290b06db41d20a4bfd3a38 Mon Sep 17 00:00:00 2001 From: ping Date: Sat, 16 May 2015 15:43:13 +0800 Subject: [PATCH 0130/2145] [viki] Add support for shows --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/viki.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f293bc2a4..cb6635610 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -639,7 +639,10 @@ from .vine import ( VineIE, VineUserIE, ) -from .viki import VikiIE +from .viki import ( + VikiIE, + VikiShowIE, +) from .vk import ( VKIE, VKUserVideosIE, diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index cf6af1e5c..4d185c0e6 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -145,3 +145,36 @@ class VikiIE(InfoExtractor): 'ext': 'vtt', }] return res + + +class VikiShowIE(InfoExtractor): + IE_NAME = 'viki:show' + _VALID_URL = r'^https?://(?:www\.)?viki\.com/tv/(?P[0-9]+c)' + _TESTS = [{ + 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', + 'info_dict': { + 'id': '50c', + 'title': 'Boys Over Flowers', + 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + }, + 'playlist_count': 25, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + show_page = self._download_webpage(url, show_id, 'Download show page') + + title = self._og_search_title(show_page) + description = self._og_search_description(show_page) + + show_json = self._download_json( + 'http://api.viki.io/v4/containers/%s/episodes.json?app=100000a&per_page=999&sort=number&direction=asc' % 
show_id, + show_id, note='Retrieve show json', errnote='Unable to get show json' + ) + entries = [] + for video in show_json['response']: + video_id = video['id'] + entries.append(self.url_result( + 'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id)) + + return self.playlist_result(entries, show_id, title, description) From 2f3bdab2b90c6695c0a478f352967b0c9da4f23f Mon Sep 17 00:00:00 2001 From: ping Date: Sat, 16 May 2015 15:56:37 +0800 Subject: [PATCH 0131/2145] [viki] Fix code format --- youtube_dl/extractor/viki.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 4d185c0e6..40a73f561 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -169,8 +169,7 @@ class VikiShowIE(InfoExtractor): show_json = self._download_json( 'http://api.viki.io/v4/containers/%s/episodes.json?app=100000a&per_page=999&sort=number&direction=asc' % show_id, - show_id, note='Retrieve show json', errnote='Unable to get show json' - ) + show_id, note='Retrieve show json', errnote='Unable to get show json') entries = [] for video in show_json['response']: video_id = video['id'] From 1c97b0a777f52c520587e93e7e61721fa6195977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 May 2015 20:00:40 +0600 Subject: [PATCH 0132/2145] [ooyala:external] Add extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/ooyala.py | 221 +++++++++++++++++++------------ 2 files changed, 137 insertions(+), 89 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f293bc2a4..1731f4fb2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -376,7 +376,10 @@ from .nytimes import ( from .nuvid import NuvidIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE -from .ooyala import OoyalaIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) 
from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index c0e6d643d..9f4fe5b29 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -12,7 +12,100 @@ from ..utils import ( ) -class OoyalaIE(InfoExtractor): +class OoyalaBaseIE(InfoExtractor): + + def _extract_result(self, info, more_info): + embedCode = info['embedCode'] + video_url = info.get('ipad_url') or info['url'] + + if determine_ext(video_url) == 'm3u8': + formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') + else: + formats = [{ + 'url': video_url, + 'ext': 'mp4', + }] + + return { + 'id': embedCode, + 'title': unescapeHTML(info['title']), + 'formats': formats, + 'description': unescapeHTML(more_info['description']), + 'thumbnail': more_info['promo'], + } + + def _extract(self, player_url, video_id): + player = self._download_webpage(player_url, video_id) + mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', + player, 'mobile player url') + # Looks like some videos are only available for particular devices + # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 + # is only available for ipad) + # Working around with fetching URLs for all the devices found starting with 'unknown' + # until we succeed or eventually fail for each device. 
+ devices = re.findall(r'device\s*=\s*"([^"]+)";', player) + devices.remove('unknown') + devices.insert(0, 'unknown') + for device in devices: + mobile_player = self._download_webpage( + '%s&device=%s' % (mobile_url, device), video_id, + 'Downloading mobile player JS for %s device' % device) + videos_info = self._search_regex( + r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', + mobile_player, 'info', fatal=False, default=None) + if videos_info: + break + + if not videos_info: + formats = [] + auth_data = self._download_json( + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode), + video_id) + + cur_auth_data = auth_data['authorization_data'][video_id] + + for stream in cur_auth_data['streams']: + formats.append({ + 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), + 'ext': stream.get('delivery_type'), + 'format': stream.get('video_codec'), + 'format_id': stream.get('profile'), + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + }) + if formats: + return { + 'id': video_id, + 'formats': formats, + 'title': 'Ooyala video', + } + + if not cur_auth_data['authorized']: + raise ExtractorError(cur_auth_data['message'], expected=True) + + if not videos_info: + raise ExtractorError('Unable to extract info') + videos_info = videos_info.replace('\\"', '"') + videos_more_info = self._search_regex( + r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') + videos_info = json.loads(videos_info) + videos_more_info = json.loads(videos_more_info) + + if videos_more_info.get('lineup'): + videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] + return { + '_type': 'playlist', + 'id': video_id, + 'title': 
unescapeHTML(videos_more_info['title']), + 'entries': videos, + } + else: + return self._extract_result(videos_info[0], videos_more_info) + + +class OoyalaIE(OoyalaBaseIE): _VALID_URL = r'(?:ooyala:|https?://.+?\.ooyala\.com/.*?(?:embedCode|ec)=)(?P.+?)(&|$)' _TESTS = [ @@ -57,95 +150,47 @@ class OoyalaIE(InfoExtractor): return cls.url_result(cls._url_for_embed_code(embed_code), ie=cls.ie_key()) - def _extract_result(self, info, more_info): - embedCode = info['embedCode'] - video_url = info.get('ipad_url') or info['url'] + def _real_extract(self, url): + embed_code = self._match_id(url) + player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code + return self._extract(player_url, embed_code) - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') - else: - formats = [{ - 'url': video_url, - 'ext': 'mp4', - }] - return { - 'id': embedCode, - 'title': unescapeHTML(info['title']), - 'formats': formats, - 'description': unescapeHTML(more_info['description']), - 'thumbnail': more_info['promo'], - } +class OoyalaExternalIE(OoyalaBaseIE): + _VALID_URL = r'''(?x) + (?: + ooyalaexternal:| + https?://.+?\.ooyala\.com/.*?\bexternalId= + ) + (?P[^:]+) + : + (?P.+) + (?: + :| + .*?&pcode= + ) + (?P.+?) 
+ (&|$) + ''' + + _TEST = { + 'url': 'https://player.ooyala.com/player.js?externalId=espn:10365079&pcode=1kNG061cgaoolOncv54OAO1ceO-I&adSetCode=91cDU6NuXTGKz3OdjOxFdAgJVtQcKJnI&callback=handleEvents&hasModuleParams=1&height=968&playerBrandingId=7af3bd04449c444c964f347f11873075&targetReplaceId=videoPlayer&width=1656&wmode=opaque&allowScriptAccess=always', + 'info_dict': { + 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', + 'ext': 'mp4', + 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', + 'description': '', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - embedCode = mobj.group('id') - player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embedCode - player = self._download_webpage(player_url, embedCode) - mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', - player, 'mobile player url') - # Looks like some videos are only available for particular devices - # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 - # is only available for ipad) - # Working around with fetching URLs for all the devices found starting with 'unknown' - # until we succeed or eventually fail for each device. 
- devices = re.findall(r'device\s*=\s*"([^"]+)";', player) - devices.remove('unknown') - devices.insert(0, 'unknown') - for device in devices: - mobile_player = self._download_webpage( - '%s&device=%s' % (mobile_url, device), embedCode, - 'Downloading mobile player JS for %s device' % device) - videos_info = self._search_regex( - r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, 'info', fatal=False, default=None) - if videos_info: - break - - if not videos_info: - formats = [] - auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode), - embedCode) - - cur_auth_data = auth_data['authorization_data'][embedCode] - - for stream in cur_auth_data['streams']: - formats.append({ - 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), - 'ext': stream.get('delivery_type'), - 'format': stream.get('video_codec'), - 'format_id': stream.get('profile'), - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - }) - if formats: - return { - 'id': embedCode, - 'formats': formats, - 'title': 'Ooyala video', - } - - if not cur_auth_data['authorized']: - raise ExtractorError(cur_auth_data['message'], expected=True) - - if not videos_info: - raise ExtractorError('Unable to extract info') - videos_info = videos_info.replace('\\"', '"') - videos_more_info = self._search_regex( - r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') - videos_info = json.loads(videos_info) - videos_more_info = json.loads(videos_more_info) - - if videos_more_info.get('lineup'): - videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] - return { - '_type': 'playlist', - 'id': embedCode, - 
'title': unescapeHTML(videos_more_info['title']), - 'entries': videos, - } - else: - return self._extract_result(videos_info[0], videos_more_info) + partner_id = mobj.group('partner_id') + video_id = mobj.group('id') + pcode = mobj.group('pcode') + player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode) + return self._extract(player_url, video_id) From 9354a5fad4521687eb9f08c1a42848621857400b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 May 2015 20:15:31 +0600 Subject: [PATCH 0133/2145] [ooyala] Fix unresolved reference --- youtube_dl/extractor/ooyala.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 9f4fe5b29..a262a9f6d 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -59,7 +59,7 @@ class OoyalaBaseIE(InfoExtractor): if not videos_info: formats = [] auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (embedCode, embedCode), + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id), video_id) cur_auth_data = auth_data['authorization_data'][video_id] From ef2dcbe4adce4478d409397faaae7ec6453ecf7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 May 2015 21:07:29 +0600 Subject: [PATCH 0134/2145] [sbs] Fix extraction (Closes #5725) --- youtube_dl/extractor/sbs.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index b8775c2f9..3073e5e86 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -33,16 +33,18 @@ class SBSIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = 
mobj.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - release_urls_json = js_to_json(self._search_regex( + player = self._search_regex( r'(?s)playerParams\.releaseUrls\s*=\s*(\{.*?\n\});\n', - webpage, '')) - release_urls = json.loads(release_urls_json) - theplatform_url = ( - release_urls.get('progressive') or release_urls.get('standard')) + webpage, 'player') + player = re.sub(r"'\s*\+\s*[\da-zA-Z_]+\s*\+\s*'", '', player) + + release_urls = self._parse_json(js_to_json(player), video_id) + + theplatform_url = release_urls.get('progressive') or release_urls['standard'] title = remove_end(self._og_search_title(webpage), ' (The Feed)') description = self._html_search_meta('description', webpage) @@ -52,7 +54,6 @@ class SBSIE(InfoExtractor): '_type': 'url_transparent', 'id': video_id, 'url': theplatform_url, - 'title': title, 'description': description, 'thumbnail': thumbnail, From 7e760fc18897663db7c0717434e28a8cca9f3810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 16 May 2015 21:14:19 +0600 Subject: [PATCH 0135/2145] [espn] Add extractor (#4396) Unfinished --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/espn.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/espn.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1731f4fb2..6b19eb6f8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -141,6 +141,7 @@ from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE +from .espn import ESPNIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py new file mode 100644 index 000000000..e6f8f0337 --- /dev/null +++ b/youtube_dl/extractor/espn.py @@ -0,0 
+1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class ESPNIE(InfoExtractor): + _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P[^/]+)' + _WORKING = False + _TESTS = [{ + 'url': 'http://espn.go.com/video/clip?id=10365079', + 'info_dict': { + 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', + 'ext': 'mp4', + 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', + 'description': '', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/nba/recap?gameId=400793786', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/blog/golden-state-warriors/post/_/id/593/how-warriors-rapidly-regained-a-winning-edge', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/sports/endurance/story/_/id/12893522/dzhokhar-tsarnaev-sentenced-role-boston-marathon-bombings', + 'only_matching': True, + }, { + 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'class="video-play-button"[^>]+data-id="(\d+)', + webpage, 'video id') + + player = self._download_webpage( + 'https://espn.go.com/video/iframe/twitter/?id=%s' % video_id, video_id) + + pcode = self._search_regex( + r'["\']pcode=([^"\']+)["\']', player, 'pcode') + + return self.url_result( + 'ooyalaexternal:espn:%s:%s' % (video_id, pcode), + 'OoyalaExternal') From 4d52f2eb7f4b16ea5491f20abf0b29a1fcb24a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 16 May 2015 18:38:28 +0200 Subject: [PATCH 0136/2145] [sbs] Remove unused import --- youtube_dl/extractor/sbs.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 3073e5e86..d4bd1a0d7 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( From 1c18de00192d195357989861563cc1fad9256128 Mon Sep 17 00:00:00 2001 From: ping Date: Sun, 17 May 2015 01:38:50 +0800 Subject: [PATCH 0137/2145] [viki] Add proper paging and include clips --- youtube_dl/extractor/viki.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 40a73f561..4d477b03c 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -93,7 +93,7 @@ class VikiIE(InfoExtractor): 'Video %s is blocked from your location.' % video_id, expected=True) else: - raise ExtractorError('Viki said: ' + err_msg) + raise ExtractorError('Viki said: %s %s' % (err_msg, url)) mobj = re.search( r']+type="(?P[^"]+)"[^>]+src="(?P[^"]+)"', info_webpage) if not mobj: @@ -157,7 +157,15 @@ class VikiShowIE(InfoExtractor): 'title': 'Boys Over Flowers', 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', }, - 'playlist_count': 25, + 'playlist_count': 70, + }, { + 'url': 'http://www.viki.com/tv/1354c-poor-nastya-complete', + 'info_dict': { + 'id': '1354c', + 'title': 'Poor Nastya [COMPLETE]', + 'description': 'md5:05bf5471385aa8b21c18ad450e350525', + }, + 'playlist_count': 127, }] def _real_extract(self, url): @@ -167,13 +175,16 @@ class VikiShowIE(InfoExtractor): title = self._og_search_title(show_page) description = self._og_search_description(show_page) - show_json = self._download_json( - 'http://api.viki.io/v4/containers/%s/episodes.json?app=100000a&per_page=999&sort=number&direction=asc' % show_id, - show_id, note='Retrieve show json', errnote='Unable to get show json') entries = [] - for video in show_json['response']: - 
video_id = video['id'] - entries.append(self.url_result( - 'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id)) + for video_type in ['episodes', 'clips']: + json_url = 'http://api.viki.io/v4/containers/%s/%s.json?app=100000a&per_page=25&sort=number&direction=asc&with_paging=true&page=1' % (show_id, video_type) + while json_url is not None: + show_json = self._download_json( + json_url, show_id, note='Retrieve show json', errnote='Unable to get show json') + for video in show_json['response']: + video_id = video['id'] + entries.append(self.url_result( + 'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id)) + json_url = show_json['pagination']['next'] return self.playlist_result(entries, show_id, title, description) From baa43cbaf01a575eacb8e1bb39c7200f68c36daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 May 2015 02:59:35 +0600 Subject: [PATCH 0138/2145] [extractor/common] Relax valid url check verbosity --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 65bb77086..cecf917ff 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -786,8 +786,8 @@ class InfoExtractor(object): return True except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): - self.report_warning( - '%s URL is invalid, skipping' % item, video_id) + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) return False raise From bc0f937b55aae6ce731d259a7658b0281c2e62ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 May 2015 03:01:52 +0600 Subject: [PATCH 0139/2145] [tv2] Add extractor (#5724) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tv2.py | 93 ++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 youtube_dl/extractor/tv2.py diff --git a/youtube_dl/extractor/__init__.py 
b/youtube_dl/extractor/__init__.py index 6b19eb6f8..fb4f63ca3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -572,6 +572,7 @@ from .tumblr import TumblrIE from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE +from .tv2 import TV2IE from .tv4 import TV4IE from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py new file mode 100644 index 000000000..2dcc0e971 --- /dev/null +++ b/youtube_dl/extractor/tv2.py @@ -0,0 +1,93 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + float_or_none, + parse_iso8601, +) + + +class TV2IE(InfoExtractor): + _VALID_URL = 'http://(?:www\.)?tv2\.no/v/(?P\d+)' + _TEST = { + 'url': 'http://www.tv2.no/v/916509/', + 'md5': '9cb9e3410b18b515d71892f27856e9b1', + 'info_dict': { + 'id': '916509', + 'ext': 'flv', + 'title': 'Se Gryttens hyllest av Steven Gerrard', + 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', + 'timestamp': 1431715610, + 'upload_date': '20150515', + 'duration': 156.967, + 'view_count': int, + 'categories': list, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats = [] + format_urls = [] + for protocol in ('HDS', 'HLS'): + data = self._download_json( + 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol), + video_id, 'Downloading play JSON')['playback'] + for item in data['items']['item']: + video_url = item.get('url') + if not video_url or video_url in format_urls: + continue + format_id = '%s-%s' % (protocol.lower(), item.get('mediaFormat')) + if not self._is_valid_url(video_url, video_id, format_id): + continue + format_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + 
video_url, video_id, f4m_id=format_id)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=format_id)) + elif ext == 'ism' or video_url.endswith('.ism/Manifest'): + pass + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'tbr': int_or_none(item.get('bitrate')), + 'filesize': int_or_none(item.get('fileSize')), + }) + self._sort_formats(formats) + + asset = self._download_json( + 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id, + video_id, 'Downloading metadata JSON')['asset'] + + title = asset['title'] + description = asset.get('description') + timestamp = parse_iso8601(asset.get('createTime')) + duration = float_or_none(asset.get('accurateDuration') or asset.get('duration')) + view_count = int_or_none(asset.get('views')) + categories = asset.get('keywords', '').split(',') + + thumbnails = [{ + 'id': thumbnail.get('@type'), + 'url': thumbnail.get('url'), + } for _, thumbnail in asset.get('imageVersions', {}).items()] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnails': thumbnails, + 'timestamp': timestamp, + 'duration': duration, + 'view_count': view_count, + 'categories': categories, + 'formats': formats, + } From 588b82bbf8c90981c54f180eca40e6c743f8f89f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 May 2015 03:32:53 +0600 Subject: [PATCH 0140/2145] [tv2:article] Add extractor (Closes #5724) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/tv2.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb4f63ca3..6f8c261d5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -572,7 +572,10 @@ from .tumblr import TumblrIE from .tunein import TuneInIE from .turbo import TurboIE from .tutv import TutvIE -from .tv2 import TV2IE 
+from .tv2 import ( + TV2IE, + TV2ArticleIE, +) from .tv4 import TV4IE from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 2dcc0e971..fa338b936 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -1,12 +1,15 @@ # encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, float_or_none, parse_iso8601, + remove_end, ) @@ -91,3 +94,33 @@ class TV2IE(InfoExtractor): 'categories': categories, 'formats': formats, } + + +class TV2ArticleIE(InfoExtractor): + _VALID_URL = 'http://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', + 'info_dict': { + 'id': '6930542', + 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', + 'description': 'md5:339573779d3eea3542ffe12006190954', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.tv2.no/a/6930542', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') + for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + + title = remove_end(self._og_search_title(webpage), ' - TV2.no') + description = remove_end(self._og_search_description(webpage), ' - TV2.no') + + return self.playlist_result(entries, playlist_id, title, description) From 8da0e0e94682faa0463f33d991df70a2402b5a86 Mon Sep 17 00:00:00 2001 From: ping Date: Sun, 17 May 2015 06:19:38 +0800 Subject: [PATCH 0141/2145] [viki] Change IE name to channel, better message output --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/viki.py | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/__init__.py 
b/youtube_dl/extractor/__init__.py index cb6635610..21f7b7290 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -641,7 +641,7 @@ from .vine import ( ) from .viki import ( VikiIE, - VikiShowIE, + VikiChannelIE, ) from .vk import ( VKIE, diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 4d477b03c..9bdbdc3e4 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -147,8 +147,8 @@ class VikiIE(InfoExtractor): return res -class VikiShowIE(InfoExtractor): - IE_NAME = 'viki:show' +class VikiChannelIE(InfoExtractor): + IE_NAME = 'viki:channel' _VALID_URL = r'^https?://(?:www\.)?viki\.com/tv/(?P[0-9]+c)' _TESTS = [{ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', @@ -167,6 +167,7 @@ class VikiShowIE(InfoExtractor): }, 'playlist_count': 127, }] + _PER_PAGE = 25 def _real_extract(self, url): show_id = self._match_id(url) @@ -177,10 +178,12 @@ class VikiShowIE(InfoExtractor): entries = [] for video_type in ['episodes', 'clips']: - json_url = 'http://api.viki.io/v4/containers/%s/%s.json?app=100000a&per_page=25&sort=number&direction=asc&with_paging=true&page=1' % (show_id, video_type) + json_url = 'http://api.viki.io/v4/containers/%s/%s.json?app=100000a&per_page=%d&sort=number&direction=asc&with_paging=true&page=1' % (show_id, video_type, self._PER_PAGE) while json_url is not None: show_json = self._download_json( - json_url, show_id, note='Retrieve show json', errnote='Unable to get show json') + json_url, show_id, + note='Downloading %s json page #%s' % + (video_type, re.search(r'[?&]page=([0-9]+)', json_url).group(1))) for video in show_json['response']: video_id = video['id'] entries.append(self.url_result( From 725652e9247e1171110b624d748e20fa1c88260e Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sat, 16 May 2015 19:50:58 -0500 Subject: [PATCH 0142/2145] [karrierevideos] add support for www.karrierevideos.at (closes #5354) --- youtube_dl/extractor/__init__.py | 1 + 
youtube_dl/extractor/karrierevideos.py | 52 ++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 youtube_dl/extractor/karrierevideos.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb4f63ca3..d131d3ec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -244,6 +244,7 @@ from .kaltura import KalturaIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py new file mode 100644 index 000000000..59d29e845 --- /dev/null +++ b/youtube_dl/extractor/karrierevideos.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class KarriereVideosIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?karrierevideos\.at/berufsvideos/([a-z-]+)/(?P[a-z-]+)' + _TEST = { + 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', + 'info_dict': { + 'id': 'altenpflegerin', + 'ext': 'mp4', + 'title': 'AltenpflegerIn', + 'thumbnail': 're:^http://.*\.png\?v=[0-9]+', + 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2' + }, + 'params': { + 'skip_download': 'requires rtmpdump' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + description = self._html_search_regex( + r'
\n{0,}?\s{0,}

(.*?)

', + webpage, 'description') + + playlist = self._html_search_regex(r'/config/video/(.*?)\.xml', webpage, 'playlist') + playlist = self._download_xml( + 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % playlist, + video_id) + + namespace = { + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' + } + + item = playlist.find('tracklist/item') + streamer = item.find('jwplayer:streamer', namespace).text + + return { + 'id': video_id, + 'title': self._html_search_meta('title', webpage), + 'description': description, + 'thumbnail': 'http://www.karrierevideos.at' + self._html_search_meta('thumbnail', webpage), + 'protocol': 'rtmp', + 'url': streamer.replace('rtmpt', 'http'), + 'play_path': 'mp4:' + item.find('jwplayer:file', namespace).text, + 'tc_url': streamer, + 'ext': 'mp4' + } From ba9d16291b8ace3bd412bcfc0c128c047545e509 Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sun, 17 May 2015 03:35:08 -0500 Subject: [PATCH 0143/2145] manually specify namespace --- youtube_dl/extractor/karrierevideos.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index 59d29e845..a05e8ab76 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -29,15 +29,13 @@ class KarriereVideosIE(InfoExtractor): playlist = self._html_search_regex(r'/config/video/(.*?)\.xml', webpage, 'playlist') playlist = self._download_xml( - 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % playlist, + 'http://www.karrierevideos.at/player-playlist.xml.php?p=' + playlist, video_id) - namespace = { - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' - } + namespace = 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' item = playlist.find('tracklist/item') - streamer = item.find('jwplayer:streamer', namespace).text + streamer = item.find('{%s}streamer' % namespace).text return { 'id': 
video_id, @@ -46,7 +44,7 @@ class KarriereVideosIE(InfoExtractor): 'thumbnail': 'http://www.karrierevideos.at' + self._html_search_meta('thumbnail', webpage), 'protocol': 'rtmp', 'url': streamer.replace('rtmpt', 'http'), - 'play_path': 'mp4:' + item.find('jwplayer:file', namespace).text, + 'play_path': 'mp4:' + item.find('{%s}file' % namespace).text, 'tc_url': streamer, 'ext': 'mp4' } From 4a5a898a8fa392d02102672f9767f33a39a73066 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 17 May 2015 20:56:03 +0600 Subject: [PATCH 0144/2145] [YoutubeDL] Clarify incompatible formats merge message When `-f` is not specified it's misleading to see `You have requested ...` as user did not actually request any formats. --- youtube_dl/YoutubeDL.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5df889945..58b34e087 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1368,7 +1368,7 @@ class YoutubeDL(object): postprocessors = [] self.report_warning('You have requested multiple ' 'formats but ffmpeg or avconv are not installed.' - ' The formats won\'t be merged') + ' The formats won\'t be merged.') else: postprocessors = [merger] @@ -1395,8 +1395,8 @@ class YoutubeDL(object): requested_formats = info_dict['requested_formats'] if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats): info_dict['ext'] = 'mkv' - self.report_warning('You have requested formats incompatible for merge. 
' - 'The formats will be merged into mkv') + self.report_warning( + 'Requested formats are incompatible for merge and will be merged into mkv.') # Ensure filename always has a correct extension for successful merge filename = '%s.%s' % (filename_wo_ext, info_dict['ext']) if os.path.exists(encodeFilename(filename)): From fc6e75dd57f3497b99def659b3d0f173b195b7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 18 May 2015 11:21:09 +0200 Subject: [PATCH 0145/2145] [instagram] Only recognize https urls (fixes #5739) http urls redirect to them. --- youtube_dl/extractor/instagram.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 65f6ca103..b10755788 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -7,9 +7,9 @@ from ..utils import int_or_none class InstagramIE(InfoExtractor): - _VALID_URL = r'https?://instagram\.com/p/(?P[\da-zA-Z]+)' + _VALID_URL = r'https://instagram\.com/p/(?P[\da-zA-Z]+)' _TEST = { - 'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc', + 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', 'info_dict': { 'id': 'aye83DjauH', @@ -41,11 +41,11 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'http://instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https://instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { - 'url': 'http://instagram.com/porsche', + 'url': 'https://instagram.com/porsche', 'info_dict': { 'id': 'porsche', 'title': 'porsche', From 5bdc520cf19f404247ec2be1ffc1e83449fa2375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 May 2015 21:23:05 +0600 Subject: [PATCH 0146/2145] [xminus] Fix extraction --- youtube_dl/extractor/xminus.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/xminus.py b/youtube_dl/extractor/xminus.py index 8c6241aed..7c9d8af6f 100644 --- a/youtube_dl/extractor/xminus.py +++ b/youtube_dl/extractor/xminus.py @@ -43,7 +43,7 @@ class XMinusIE(InfoExtractor): r'minus_track\.dur_sec=\'([0-9]*?)\'', webpage, 'duration', fatal=False)) filesize_approx = parse_filesize(self._html_search_regex( - r'
\s*([0-9.]+\s*[a-zA-Z][bB])', + r'
]*>\s*↓\s*([0-9.]+\s*[a-zA-Z][bB])', webpage, 'approximate filesize', fatal=False)) tbr = int_or_none(self._html_search_regex( r'
\s*([0-9]+)\s*kbps', @@ -58,7 +58,7 @@ class XMinusIE(InfoExtractor): description = re.sub(' *\r *', '\n', description) enc_token = self._html_search_regex( - r'minus_track\.tkn="(.+?)"', webpage, 'enc_token') + r'minus_track\.s?tkn="(.+?)"', webpage, 'enc_token') token = ''.join( c if pos == 3 else compat_chr(compat_ord(c) - 1) for pos, c in enumerate(reversed(enc_token))) From 4f514c7e88d2ce8ebe9c2478183e8797cfb2a4c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 May 2015 21:29:41 +0600 Subject: [PATCH 0147/2145] [wimp] Fix youtube extraction (Closes #5690) --- youtube_dl/extractor/wimp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index d6dec25ca..f69d46a28 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -37,7 +37,8 @@ class WimpIE(InfoExtractor): video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", webpage, 'video URL') + [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], + webpage, 'video URL') if YoutubeIE.suitable(video_url): self.to_screen('Found YouTube video') return { From 2328f2fe684f9a9025217c6f149e92a403a4c437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 May 2015 21:34:20 +0600 Subject: [PATCH 0148/2145] [vulture] Fix extraction --- youtube_dl/extractor/vulture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py index 1eb24a3d6..faa167e65 100644 --- a/youtube_dl/extractor/vulture.py +++ b/youtube_dl/extractor/vulture.py @@ -44,7 +44,7 @@ class VultureIE(InfoExtractor): query_webpage = self._download_webpage( query_url, display_id, note='Downloading query page') params_json = self._search_regex( - r'(?sm)new 
MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n', + r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n?,\n', query_webpage, 'player params') params = json.loads(params_json) From 5d8dcb5342c97b05c037c8c4e80002540db261b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 May 2015 21:39:15 +0600 Subject: [PATCH 0149/2145] [vuclip] Fix extraction --- youtube_dl/extractor/vuclip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index c3fde53f5..a6d9b5fee 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -49,7 +49,7 @@ class VuClipIE(InfoExtractor): links_code = self._search_regex( r'''(?xs) (?: - | + | \s*
) (.*?) From 484c9d2d5b669220c24c865947c3f65049916b56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 18 May 2015 21:43:54 +0600 Subject: [PATCH 0150/2145] [vier] Fix extraction --- youtube_dl/extractor/vier.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 619039e51..15377097e 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -38,11 +38,14 @@ class VierIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_id = self._search_regex( - r'"nid"\s*:\s*"(\d+)"', webpage, 'video id') + [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], + webpage, 'video id') application = self._search_regex( - r'"application"\s*:\s*"([^"]+)"', webpage, 'application', default='vier_vod') + [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], + webpage, 'application', default='vier_vod') filename = self._search_regex( - r'"filename"\s*:\s*"([^"]+)"', webpage, 'filename') + [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], + webpage, 'filename') playlist_url = 'http://vod.streamcloud.be/%s/mp4:_definst_/%s.mp4/playlist.m3u8' % (application, filename) formats = self._extract_m3u8_formats(playlist_url, display_id, 'mp4') From 2aa64b89b3ac8f387d4c0c27ce7de64bc0ff68de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 18 May 2015 17:58:53 +0200 Subject: [PATCH 0151/2145] tox: Pass HOME environment variable Since version 2.0 it only passes a limited set of variables and we need HOME for the tests --- tox.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tox.ini b/tox.ini index 00c6e00e3..cd805fe8a 100644 --- a/tox.ini +++ b/tox.ini @@ -4,6 +4,8 @@ envlist = py26,py27,py33,py34 deps = nose coverage +# We need a valid $HOME for test_compat_expanduser +passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude 
test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py From 1b0427e6c433c0b6db5e210db6e3173e19e702ed Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 19 May 2015 00:45:01 +0800 Subject: [PATCH 0152/2145] [utils] Support TTML without default namespace In a strict sense such TTML is invalid, but Yahoo uses it. --- test/test_utils.py | 15 +++++++++++++++ youtube_dl/utils.py | 9 ++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index b40107037..e13e11b59 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -621,6 +621,21 @@ Line ''' self.assertEqual(dfxp2srt(dfxp_data), srt_data) + dfxp_data_no_default_namespace = ''' + + +
+

The first line

+
+ +
''' + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +The first line + +''' + self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ed9ed9ed6..507f07383 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1848,9 +1848,9 @@ def dfxp2srt(dfxp_data): out = str_or_empty(node.text) for child in node: - if child.tag == _x('ttml:br'): + if child.tag in (_x('ttml:br'), 'br'): out += '\n' + str_or_empty(child.tail) - elif child.tag == _x('ttml:span'): + elif child.tag in (_x('ttml:span'), 'span'): out += str_or_empty(parse_node(child)) else: out += str_or_empty(xml.etree.ElementTree.tostring(child)) @@ -1859,7 +1859,10 @@ def dfxp2srt(dfxp_data): dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') + + if not paras: + raise ValueError('Invalid dfxp/TTML subtitle') for para, index in zip(paras, itertools.count(1)): begin_time = parse_dfxp_time_expr(para.attrib['begin']) From ecee5724110847b832a6074c66ca4a63758100f4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 19 May 2015 00:50:24 +0800 Subject: [PATCH 0153/2145] [yahoo] Add support for closed captions (closes #5714) --- youtube_dl/extractor/yahoo.py | 18 ++++++++++++++++++ youtube_dl/utils.py | 1 + 2 files changed, 19 insertions(+) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index bf4e659ac..f9afbdbab 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -15,6 +15,7 @@ from ..utils import ( unescapeHTML, ExtractorError, int_or_none, + mimetype2ext, ) from .nbc import NBCSportsVPlayerIE @@ -236,6 +237,22 @@ class YahooIE(InfoExtractor): self._sort_formats(formats) + closed_captions = self._html_search_regex( + r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions', + default='[]') + + cc_json = 
self._parse_json(closed_captions, video_id, fatal=False) + subtitles = {} + if cc_json: + for closed_caption in cc_json: + lang = closed_caption['lang'] + if lang not in subtitles: + subtitles[lang] = [] + subtitles[lang].append({ + 'url': closed_caption['url'], + 'ext': mimetype2ext(closed_caption['content_type']), + }) + return { 'id': video_id, 'display_id': display_id, @@ -244,6 +261,7 @@ class YahooIE(InfoExtractor): 'description': clean_html(meta['description']), 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), 'duration': int_or_none(meta.get('duration')), + 'subtitles': subtitles, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 507f07383..52d198fa3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1665,6 +1665,7 @@ def mimetype2ext(mt): return { 'x-ms-wmv': 'wmv', 'x-mp4-fragmented': 'mp4', + 'ttml+xml': 'ttml', }.get(res, res) From b813d8caf1b23821036b77b851e42ba0a0ad35a7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 19 May 2015 01:01:42 +0800 Subject: [PATCH 0154/2145] [qqmusic] Unescape '\\n' in description (#5705) --- youtube_dl/extractor/qqmusic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 13113820b..b540033e2 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -26,7 +26,7 @@ class QQMusicIE(InfoExtractor): 'title': '可惜没如果', 'upload_date': '20141227', 'creator': '林俊杰', - 'description': 'md5:4348ff1dd24036906baa7b6f973f8d30', + 'description': 'md5:d327722d0361576fde558f1ac68a7065', } }] @@ -60,6 +60,8 @@ class QQMusicIE(InfoExtractor): lrc_content = self._html_search_regex( r'
]*>([^<>]+)
', detail_info_page, 'LRC lyrics', default=None) + if lrc_content: + lrc_content = lrc_content.replace('\\n', '\n') guid = self.m_r_get_ruin() From d9d747a06ab3b4c36c6063074ffb42aeb185431f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 19 May 2015 21:28:41 +0600 Subject: [PATCH 0155/2145] [ultimedia] Fix extraction --- youtube_dl/extractor/ultimedia.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index 96c809eaf..c4751050e 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, qualities, @@ -44,9 +45,9 @@ class UltimediaIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - deliver_url = self._search_regex( - r']+src="(https?://(?:www\.)?ultimedia\.com/deliver/[^"]+)"', - webpage, 'deliver URL') + deliver_url = self._proto_relative_url(self._search_regex( + r']+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"', + webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':') deliver_page = self._download_webpage( deliver_url, video_id, 'Downloading iframe page') @@ -57,7 +58,8 @@ class UltimediaIE(InfoExtractor): player = self._parse_json( self._search_regex( - r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", deliver_page, 'player'), + r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", + deliver_page, 'player'), video_id) quality = qualities(['flash', 'html5']) From f670ef1c8ebe0329a68b3a3d5c2b7e07ae5c9425 Mon Sep 17 00:00:00 2001 From: ping Date: Wed, 20 May 2015 13:51:43 +0800 Subject: [PATCH 0156/2145] [dramafever] Add new extractor for dramafever.com --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/dramafever.py | 131 
+++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 youtube_dl/extractor/dramafever.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6f8c261d5..ca857a75f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,10 @@ from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE from .douyutv import DouyuTVIE +from .dramafever import ( + DramaFeverIE, + DramaFeverSeriesIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py new file mode 100644 index 000000000..8fac99cc5 --- /dev/null +++ b/youtube_dl/extractor/dramafever.py @@ -0,0 +1,131 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class DramaFeverIE(InfoExtractor): + IE_NAME = 'dramafever' + _VALID_URL = r'^https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)/' + _TESTS = [{ + 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', + 'info_dict': { + 'id': '4512.1', + 'ext': 'flv', + 'title': 'Cooking with Shin 4512.1', + 'upload_date': '20140702', + 'description': 'Served at all special occasions and featured in the hit drama Heirs, Shin cooks Red Bean Rice.', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url).replace("/", ".") + + consumer_secret = self._get_consumer_secret(video_id) + + ep_json = self._download_json( + "http://www.dramafever.com/amp/episode/feed.json?guid=%s" % video_id, + video_id, note='Downloading episode metadata', + errnote="Video may not be available for your location")["channel"]["item"] + + title = ep_json["media-group"]["media-title"] + description = ep_json["media-group"]["media-description"] + thumbnail = ep_json["media-group"]["media-thumbnail"]["@attributes"]["url"] + duration = 
int(ep_json["media-group"]["media-content"][0]["@attributes"]["duration"]) + mobj = re.match(r"([0-9]{4})-([0-9]{2})-([0-9]{2})", ep_json["pubDate"]) + upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) if mobj is not None else None + + formats = [] + for vid_format in ep_json["media-group"]["media-content"]: + src = vid_format["@attributes"]["url"] + if '.f4m' in src: + formats.extend(self._extract_f4m_formats(src, video_id)) + + self._sort_formats(formats) + video_subtitles = self.extract_subtitles(video_id, consumer_secret) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'formats': formats, + 'subtitles': video_subtitles, + } + + def _get_consumer_secret(self, video_id): + df_js = self._download_webpage( + "http://www.dramafever.com/static/126960d/v2/js/plugins/jquery.threadedcomments.js", video_id) + return self._search_regex(r"'cs': '([0-9a-zA-Z]+)'", df_js, "cs") + + def _get_episodes(self, series_id, consumer_secret, episode_filter=None): + _PAGE_SIZE = 60 + + curr_page = 1 + max_pages = curr_page + 1 + results = [] + while max_pages >= curr_page: + page_url = "http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d" % \ + (consumer_secret, series_id, _PAGE_SIZE, curr_page) + series = self._download_json( + page_url, series_id, note="Downloading series json page #%d" % curr_page) + max_pages = series['num_pages'] + results.extend([ep for ep in series['value'] if episode_filter is None or episode_filter(ep)]) + curr_page += 1 + return results + + def _get_subtitles(self, video_id, consumer_secret): + + def match_episode(ep): + return ep['guid'] == video_id + + res = None + info = self._get_episodes( + video_id.split(".")[0], consumer_secret, episode_filter=match_episode) + if len(info) == 1 and info[0]['subfile'] != '': + res = {'en': [{'url': info[0]['subfile'], 'ext': 'srt'}]} + return res + + 
+class DramaFeverSeriesIE(DramaFeverIE): + IE_NAME = 'dramafever:series' + _VALID_URL = r'^https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)/\d*[a-zA-Z_][a-zA-Z0-9_]*/' + _TESTS = [{ + 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', + 'info_dict': { + 'id': '4512', + 'title': 'Cooking with Shin', + 'description': 'Professional chef and cooking instructor Shin Kim takes some of the delicious dishes featured in your favorite dramas and shows you how to make them right at home.', + }, + 'playlist_count': 4, + }, { + 'url': 'http://www.dramafever.com/drama/124/IRIS/', + 'info_dict': { + 'id': '124', + 'title': 'IRIS', + 'description': 'Lee Byung Hun and Kim Tae Hee star in this powerhouse drama and ratings megahit of action, intrigue and romance.', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + consumer_secret = self._get_consumer_secret(series_id) + + series_json = self._download_json( + "http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s" % (consumer_secret, series_id), + series_id, note='Downloading series metadata')["series"][series_id] + + title = series_json["name"] + description = series_json["description_short"] + + episodes = self._get_episodes(series_id, consumer_secret) + entries = [] + for ep in episodes: + entries.append(self.url_result( + 'http://www.dramafever.com%s' % ep['episode_url'], 'DramaFever', ep['guid'])) + return self.playlist_result(entries, series_id, title, description) From 051df9ad99d0a29d9eb984970e3e431795b6e445 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 20 May 2015 14:08:23 +0800 Subject: [PATCH 0157/2145] [letv/sohu] Skip tests relying on external proxies The proxy is currently broken. 
See #5655 and zhuzhuor/Unblock-Youku#427 --- youtube_dl/extractor/letv.py | 4 +--- youtube_dl/extractor/sohu.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 1484ac0d2..da896caf1 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -50,9 +50,7 @@ class LetvIE(InfoExtractor): 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, - 'params': { - 'cn_verification_proxy': 'http://proxy.uku.im:8888' - }, + 'skip': 'Only available in China', }] @staticmethod diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index eab4adfca..29bd9ce6f 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -23,9 +23,7 @@ class SohuIE(InfoExtractor): 'ext': 'mp4', 'title': 'MV:Far East Movement《The Illest》', }, - 'params': { - 'cn_verification_proxy': 'proxy.uku.im:8888' - } + 'skip': 'Only available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', 'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', From 137597b0ea88a92d174341b44b8f395b8897a2bf Mon Sep 17 00:00:00 2001 From: ping Date: Wed, 20 May 2015 15:15:28 +0800 Subject: [PATCH 0158/2145] [dramafever] Streamline code --- youtube_dl/extractor/dramafever.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 8fac99cc5..40787ffcd 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -80,12 +80,11 @@ class DramaFeverIE(InfoExtractor): def _get_subtitles(self, video_id, consumer_secret): - def match_episode(ep): - return ep['guid'] == video_id - res = None info = self._get_episodes( - video_id.split(".")[0], consumer_secret, episode_filter=match_episode) + video_id.split(".")[0], consumer_secret, + episode_filter=lambda x: x['guid'] == video_id) + if len(info) == 1 and info[0]['subfile'] != '': res = {'en': 
[{'url': info[0]['subfile'], 'ext': 'srt'}]} return res From 2632941f327c8b013e5fbc736317fc897876ab73 Mon Sep 17 00:00:00 2001 From: ping Date: Wed, 20 May 2015 15:53:45 +0800 Subject: [PATCH 0159/2145] [soompi] Add new extractor for tv.soompi.com --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/soompi.py | 130 +++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 youtube_dl/extractor/soompi.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6f8c261d5..2a5cf9547 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -482,6 +482,10 @@ from .smotri import ( from .snotr import SnotrIE from .sockshare import SockshareIE from .sohu import SohuIE +from .soompi import ( + SoompiIE, + SoompiShowIE, +) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py new file mode 100644 index 000000000..5ecf40b7f --- /dev/null +++ b/youtube_dl/extractor/soompi.py @@ -0,0 +1,130 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re +import json +import base64 +import xml.etree.ElementTree + +# Soompi uses the same subtitle encryption as crunchyroll +from .crunchyroll import CrunchyrollIE + + +class SoompiIE(CrunchyrollIE): + IE_NAME = 'soompi' + _VALID_URL = r'^https?://tv\.soompi\.com/en/watch/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://tv.soompi.com/en/watch/23363', + 'info_dict': { + 'id': '23363', + 'ext': 'mp4', + 'title': 'Liar Game CM1', + 'description': '15sec' + }, + 'params': { + 'skip_download': True, + }, + }] + + def _get_episodes(self, webpage, episode_filter=None): + episodes = json.loads( + self._search_regex(r'\s+VIDEOS\s+= (\[.+?\]);', webpage, "episodes meta")) + return [ep for ep in episodes if episode_filter is None or episode_filter(ep)] + + def _get_subtitles(self, video_id, show_format_xml): + subtitles = {} + subtitle_info_nodes = 
show_format_xml.findall('./{default}preload/subtitles/subtitle') + subtitle_nodes = show_format_xml.findall('./{default}preload/subtitle') + + sub_langs = {} + for i in subtitle_info_nodes: + sub_langs[i.attrib["id"]] = i.attrib["title"] + + for s in subtitle_nodes: + lang_code = sub_langs.get(s.attrib["id"], None) + if lang_code is None: + continue + + sub_id = int(s.attrib["id"]) + iv = base64.b64decode(s.find("iv").text) + data = base64.b64decode(s.find("data").text) + subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') + sub_root = xml.etree.ElementTree.fromstring(subtitle) + + subtitles[lang_code] = [{ + 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root) + }, { + 'ext': 'ass', 'data': self._convert_subtitles_to_ass(sub_root) + }] + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + url, video_id, note="Downloading episode page", + errnote="Video may not be available for your location") + vid_formats = re.findall(r"\?quality=q([0-9]+)", webpage) + + show_meta = json.loads( + self._search_regex(r'\s+var show = (\{.+?\});', webpage, "show meta")) + episodes = self._get_episodes( + webpage, episode_filter=lambda x: x['id'] == video_id) + + title = episodes[0]["name"] + description = episodes[0]["description"] + duration = int(episodes[0]["duration"]) + slug = show_meta["slug"] + + formats = [] + show_format_xml = None + for vf in vid_formats: + show_format_url = "http://tv.soompi.com/en/show/%s/%s-config.xml?mode=hls&quality=q%s" \ + % (slug, video_id, vf) + show_format_xml = self._download_xml( + show_format_url, video_id, note="Downloading q%s show xml" % vf) + avail_formats = self._extract_m3u8_formats( + show_format_xml.find('./{default}preload/stream_info/file').text, + video_id, ext="mp4", m3u8_id=vf, preference=int(vf)) + formats.extend(avail_formats) + self._sort_formats(formats) + + subtitles = self.extract_subtitles(video_id, show_format_xml) + + return 
{ + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles + } + + +class SoompiShowIE(SoompiIE): + IE_NAME = 'soompi:show' + _VALID_URL = r'^https?://tv\.soompi\.com/en/shows/(?P[0-9a-zA-Z\-_]+)' + _TESTS = [{ + 'url': 'http://tv.soompi.com/en/shows/liar-game', + 'info_dict': { + 'id': 'liar-game', + 'title': 'Liar Game', + 'description': 'md5:52c02bce0c1a622a95823591d0589b66', + }, + 'playlist_count': 14, + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + + webpage = self._download_webpage(url, show_id, note="Downloading show page") + title = self._og_search_title(webpage).replace("SoompiTV | ", "") + description = self._og_search_description(webpage) + + episodes = self._get_episodes(webpage) + entries = [] + for ep in episodes: + entries.append(self.url_result( + 'http://tv.soompi.com/en/watch/%s' % ep['id'], 'Soompi', ep['id'])) + + return self.playlist_result(entries, show_id, title, description) From 0b9f7cd074786abafcd35b26db4ecb4d92814393 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 20 May 2015 10:01:48 +0200 Subject: [PATCH 0160/2145] release 2015.05.20 --- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 43fbe8b1d..a4879bd9a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -142,6 +142,7 @@ - **Eporner** - **EroProfile** - **Escapist** + - **ESPN** (Currently broken) - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -338,6 +339,7 @@ - **OktoberfestTV** - **on.aol.com** - **Ooyala** + - **OoyalaExternal** - **OpenFilm** - **orf:fm4**: radio FM4 - **orf:iptv**: iptv.ORF.at @@ -451,6 +453,7 @@ - **Spike** - **Sport5** - **SportBox** + - **SportBoxEmbed** - **SportDeutschland** - **Srf** - **SRMediathek**: Saarländischer Rundfunk @@ -510,6 +513,8 @@ - **Turbo** - **Tutv** - **tv.dfb.de** + - 
**TV2** + - **TV2Article** - **TV4**: tv4.se and tv4play.se - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 38f00bc9b..b33385153 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.05.15' +__version__ = '2015.05.20' From 5137adb94dcce98a3c14fb3892c5c72f70ff34ea Mon Sep 17 00:00:00 2001 From: ping Date: Wed, 20 May 2015 16:16:10 +0800 Subject: [PATCH 0161/2145] [soompi] Switch to non-geoblocked test video --- youtube_dl/extractor/soompi.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py index 5ecf40b7f..4726872dc 100644 --- a/youtube_dl/extractor/soompi.py +++ b/youtube_dl/extractor/soompi.py @@ -14,12 +14,12 @@ class SoompiIE(CrunchyrollIE): IE_NAME = 'soompi' _VALID_URL = r'^https?://tv\.soompi\.com/en/watch/(?P[0-9]+)' _TESTS = [{ - 'url': 'http://tv.soompi.com/en/watch/23363', + 'url': 'http://tv.soompi.com/en/watch/29235', 'info_dict': { - 'id': '23363', + 'id': '29235', 'ext': 'mp4', - 'title': 'Liar Game CM1', - 'description': '15sec' + 'title': 'Episode 1096', + 'description': '2015-05-20' }, 'params': { 'skip_download': True, From b0d619fde2b187f2b36b077a1eb11d766429f88c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 20 May 2015 21:28:04 +0600 Subject: [PATCH 0162/2145] [viki:channel] Extract title from JSON --- youtube_dl/extractor/viki.py | 40 +++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 9bdbdc3e4..fc585c299 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -23,7 +23,7 @@ class VikiIE(InfoExtractor): # iPad2 _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 
Mobile/8F191 Safari/6533.18.5' - _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' + _VALID_URL = r'https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' _TESTS = [{ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -149,7 +149,7 @@ class VikiIE(InfoExtractor): class VikiChannelIE(InfoExtractor): IE_NAME = 'viki:channel' - _VALID_URL = r'^https?://(?:www\.)?viki\.com/tv/(?P[0-9]+c)' + _VALID_URL = r'https?://(?:www\.)?viki\.com/tv/(?P[0-9]+c)' _TESTS = [{ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', 'info_dict': { @@ -167,27 +167,35 @@ class VikiChannelIE(InfoExtractor): }, 'playlist_count': 127, }] + _API_BASE = 'http://api.viki.io/v4/containers' + _APP = '100000a' _PER_PAGE = 25 def _real_extract(self, url): - show_id = self._match_id(url) - show_page = self._download_webpage(url, show_id, 'Download show page') + channel_id = self._match_id(url) - title = self._og_search_title(show_page) - description = self._og_search_description(show_page) + channel = self._download_json( + '%s/%s.json?app=%s' % (self._API_BASE, channel_id, self._APP), + channel_id, 'Downloading channel JSON') + + titles = channel['titles'] + title = titles.get('en') or titles[titles.keys()[0]] + + descriptions = channel['descriptions'] + description = descriptions.get('en') or descriptions[descriptions.keys()[0]] entries = [] - for video_type in ['episodes', 'clips']: - json_url = 'http://api.viki.io/v4/containers/%s/%s.json?app=100000a&per_page=%d&sort=number&direction=asc&with_paging=true&page=1' % (show_id, video_type, self._PER_PAGE) - while json_url is not None: - show_json = self._download_json( - json_url, show_id, - note='Downloading %s json page #%s' % - (video_type, re.search(r'[?&]page=([0-9]+)', json_url).group(1))) - for video in show_json['response']: + for video_type in ('episodes', 'clips'): + page_url = '%s/%s/%s.json?app=%s&per_page=%d&sort=number&direction=asc&with_paging=true&page=1' % (self._API_BASE, channel_id, 
video_type, self._APP, self._PER_PAGE) + while page_url: + page = self._download_json( + page_url, channel_id, + 'Downloading %s JSON page #%s' + % (video_type, re.search(r'[?&]page=([0-9]+)', page_url).group(1))) + for video in page['response']: video_id = video['id'] entries.append(self.url_result( 'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id)) - json_url = show_json['pagination']['next'] + page_url = page['pagination']['next'] - return self.playlist_result(entries, show_id, title, description) + return self.playlist_result(entries, channel_id, title, description) From 1a83c731bd58ed85f6f7695cee9c88d09a224bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 01:44:05 +0600 Subject: [PATCH 0163/2145] [viki] Switch extraction to API --- youtube_dl/extractor/viki.py | 230 +++++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 88 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index fc585c299..234649ca8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,29 +1,64 @@ from __future__ import unicode_literals import re +import time +import hmac +import hashlib -from ..compat import ( - compat_urlparse, - compat_urllib_request, -) from ..utils import ( ExtractorError, - unescapeHTML, - unified_strdate, - US_RATINGS, - determine_ext, - mimetype2ext, + int_or_none, + parse_age_limit, + parse_iso8601, ) from .common import InfoExtractor -class VikiIE(InfoExtractor): +class VikiBaseIE(InfoExtractor): + _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' + _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' + + _APP = '65535a' + _APP_VERSION = '2.2.5.1428709186' + _APP_SECRET = '-$iJ}@p7!G@SyU/je1bEyWg}upLu-6V6-Lg9VD(]siH,r.,m-r|ulZ,U4LC/SeR)' + + def _prepare_call(self, path, timestamp=None): + path += '?' if '?' 
not in path else '&' + if not timestamp: + timestamp = int(time.time()) + query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) + sig = hmac.new( + self._APP_SECRET.encode('ascii'), + query.encode('ascii'), + hashlib.sha1 + ).hexdigest() + return self._API_URL_TEMPLATE % (query, sig) + + def _call_api(self, path, video_id, note, timestamp=None): + resp = self._download_json( + self._prepare_call(path, timestamp), video_id, note) + + error = resp.get('error') + if error: + if error == 'invalid timestamp': + resp = self._download_json( + self._prepare_call(path, int(resp['current_timestamp'])), + video_id, '%s (retry)' % note) + error = resp.get('error') + if error: + self._raise_error(resp['error']) + + return resp + + def _raise_error(self, error): + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error), + expected=True) + + +class VikiIE(VikiBaseIE): IE_NAME = 'viki' - - # iPad2 - _USER_AGENT = 'Mozilla/5.0(iPad; U; CPU OS 4_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8F191 Safari/6533.18.5' - - _VALID_URL = r'https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' + _VALID_URL = r'https?://(?:www\.)?viki\.com/(?:videos|player)/(?P[0-9]+v)' _TESTS = [{ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -37,115 +72,134 @@ class VikiIE(InfoExtractor): }, 'skip': 'Blocked in the US', }, { + # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': 'ca6493e6f0a6ec07da9aa8d6304b4b2c', + 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', 'info_dict': { 'id': '1067139v', 'ext': 'mp4', + 'title': "'The Avengers: Age of Ultron' Press Conference", 'description': 'md5:d70b2f9428f5488321bfe1db10d612ea', + 'duration': 352, + 'timestamp': 1430380829, 'upload_date': '20150430', - 'title': '\'The Avengers: Age of Ultron\' Press Conference', + 'uploader': 'Arirang TV', + 'like_count': int, + 'age_limit': 0, } }, { 'url': 
'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { 'id': '1048879v', 'ext': 'mp4', - 'upload_date': '20140820', - 'description': 'md5:54ff56d51bdfc7a30441ec967394e91c', 'title': 'Ankhon Dekhi', + 'duration': 6512, + 'timestamp': 1408532356, + 'upload_date': '20140820', + 'uploader': 'Spuul', + 'like_count': int, + 'age_limit': 13, }, 'params': { - # requires ffmpeg + # m3u8 download 'skip_download': True, } + }, { + # episode + 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', + 'md5': '190f3ef426005ba3a080a63325955bc3', + 'info_dict': { + 'id': '44699v', + 'ext': 'mp4', + 'title': 'Boys Over Flowers - Episode 1', + 'description': 'md5:52617e4f729c7d03bfd4bcbbb6e946f2', + 'duration': 4155, + 'timestamp': 1270496524, + 'upload_date': '20100405', + 'uploader': 'group8', + 'like_count': int, + 'age_limit': 13, + } + }, { + 'url': 'http://www.viki.com/player/44699v', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') - uploader_m = re.search( - r'Broadcast Network: \s*([^<]*)<', webpage) - if uploader_m is None: - uploader = None - else: - uploader = uploader_m.group(1).strip() + formats = [] + for format_id, stream_dict in streams.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + for protocol, format_dict in stream_dict.items(): + if format_id == 'm3u8': + formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + else: + formats.append({ + 'url': format_dict['url'], + 'format_id': '%s-%s' % (format_id, protocol), + 'height': height, + }) + self._sort_formats(formats) - rating_str = 
self._html_search_regex( - r'Rating: \s*([^<]*)<', webpage, - 'rating information', default='').strip() - age_limit = US_RATINGS.get(rating_str) + video = self._call_api( + 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') - req = compat_urllib_request.Request( - 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id) - req.add_header('User-Agent', self._USER_AGENT) - info_webpage = self._download_webpage( - req, video_id, note='Downloading info page') - err_msg = self._html_search_regex(r']+class="video-error[^>]+>(.+)
', info_webpage, 'error message', default=None) - if err_msg: - if 'not available in your region' in err_msg: - raise ExtractorError( - 'Video %s is blocked from your location.' % video_id, - expected=True) - else: - raise ExtractorError('Viki said: %s %s' % (err_msg, url)) - mobj = re.search( - r']+type="(?P[^"]+)"[^>]+src="(?P[^"]+)"', info_webpage) - if not mobj: - raise ExtractorError('Unable to find video URL') - video_url = unescapeHTML(mobj.group('url')) - video_ext = mimetype2ext(mobj.group('mime_type')) + title = None + titles = video.get('titles') + if titles: + title = titles.get('en') or titles[titles.keys()[0]] + if not title: + title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = video.get('container', {}).get('titles') + if container_titles: + container_title = container_titles.get('en') or container_titles[titles.keys()[0]] + title = '%s - %s' % (container_title, title) - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats( - video_url, video_id, ext=video_ext) - else: - formats = [{ - 'url': video_url, - 'ext': video_ext, - }] + descriptions = video.get('descriptions') + description = descriptions.get('en') or descriptions[titles.keys()[0]] if descriptions else None - upload_date_str = self._html_search_regex( - r'"created_at":"([^"]+)"', info_webpage, 'upload date') - upload_date = ( - unified_strdate(upload_date_str) - if upload_date_str is not None - else None - ) + duration = int_or_none(video.get('duration')) + timestamp = parse_iso8601(video.get('created_at')) + uploader = video.get('author') + like_count = int_or_none(video.get('likes', {}).get('count')) + age_limit = parse_age_limit(video.get('rating')) - # subtitles - video_subtitles = self.extract_subtitles(video_id, info_webpage) + thumbnails = [] + for thumbnail_id, thumbnail in video.get('images', {}).items(): + thumbnails.append({ + 'id': thumbnail_id, + 'url': thumbnail.get('url'), 
+ }) + + subtitles = {} + for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': subtitles_format, + 'url': self._prepare_call( + 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), + } for subtitles_format in ('srt', 'vtt')] return { 'id': video_id, 'title': title, - 'formats': formats, 'description': description, - 'thumbnail': thumbnail, - 'age_limit': age_limit, + 'duration': duration, + 'timestamp': timestamp, 'uploader': uploader, - 'subtitles': video_subtitles, - 'upload_date': upload_date, + 'like_count': like_count, + 'age_limit': age_limit, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, } - def _get_subtitles(self, video_id, info_webpage): - res = {} - for sturl_html in re.findall(r'[a-z]+)\.vtt', sturl) - if not m: - continue - res[m.group('lang')] = [{ - 'url': compat_urlparse.urljoin('http://www.viki.com', sturl), - 'ext': 'vtt', - }] - return res - class VikiChannelIE(InfoExtractor): IE_NAME = 'viki:channel' From ac20d95f9766aa130748aac07fa90ee5dfa566d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 01:56:02 +0600 Subject: [PATCH 0164/2145] [viki] Add support for youtube externals --- youtube_dl/extractor/viki.py | 70 ++++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 234649ca8..68d5cac6e 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -120,6 +120,23 @@ class VikiIE(VikiBaseIE): 'like_count': int, 'age_limit': 13, } + }, { + # youtube external + 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', + 'md5': '216d1afdc0c64d1febc1e9f2bd4b864b', + 'info_dict': { + 'id': '50562v', + 'ext': 'mp4', + 'title': 'Poor Nastya [COMPLETE] - Episode 1', + 'description': '', + 'duration': 607, + 'timestamp': 1274949505, + 'upload_date': '20101213', + 
'uploader': 'ad14065n', + 'uploader_id': 'ad14065n', + 'like_count': int, + 'age_limit': 13, + } }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, @@ -128,26 +145,6 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - formats = [] - for format_id, stream_dict in streams.items(): - height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) - for protocol, format_dict in stream_dict.items(): - if format_id == 'm3u8': - formats = self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) - else: - formats.append({ - 'url': format_dict['url'], - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - }) - self._sort_formats(formats) - video = self._call_api( 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') @@ -186,7 +183,7 @@ class VikiIE(VikiBaseIE): 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), } for subtitles_format in ('srt', 'vtt')] - return { + result = { 'id': video_id, 'title': title, 'description': description, @@ -196,10 +193,39 @@ class VikiIE(VikiBaseIE): 'like_count': like_count, 'age_limit': age_limit, 'thumbnails': thumbnails, - 'formats': formats, 'subtitles': subtitles, } + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + formats = [] + for format_id, stream_dict in streams.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + for protocol, format_dict in stream_dict.items(): + if format_id == 'm3u8': + formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + else: + 
formats.append({ + 'url': format_dict['url'], + 'format_id': '%s-%s' % (format_id, protocol), + 'height': height, + }) + self._sort_formats(formats) + + result['formats'] = formats + return result + class VikiChannelIE(InfoExtractor): IE_NAME = 'viki:channel' From bc56355ec6bc823fe96e31688cd3123dc18ae627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 02:08:13 +0600 Subject: [PATCH 0165/2145] [viki:channel] Switch to API --- youtube_dl/extractor/viki.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 68d5cac6e..071e280fb 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -4,6 +4,7 @@ import re import time import hmac import hashlib +import itertools from ..utils import ( ExtractorError, @@ -227,7 +228,7 @@ class VikiIE(VikiBaseIE): return result -class VikiChannelIE(InfoExtractor): +class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' _VALID_URL = r'https?://(?:www\.)?viki\.com/tv/(?P[0-9]+c)' _TESTS = [{ @@ -247,16 +248,15 @@ class VikiChannelIE(InfoExtractor): }, 'playlist_count': 127, }] - _API_BASE = 'http://api.viki.io/v4/containers' - _APP = '100000a' + _PER_PAGE = 25 def _real_extract(self, url): channel_id = self._match_id(url) - channel = self._download_json( - '%s/%s.json?app=%s' % (self._API_BASE, channel_id, self._APP), - channel_id, 'Downloading channel JSON') + channel = self._call_api( + 'containers/%s.json' % channel_id, channel_id, + 'Downloading channel JSON') titles = channel['titles'] title = titles.get('en') or titles[titles.keys()[0]] @@ -266,16 +266,16 @@ class VikiChannelIE(InfoExtractor): entries = [] for video_type in ('episodes', 'clips'): - page_url = '%s/%s/%s.json?app=%s&per_page=%d&sort=number&direction=asc&with_paging=true&page=1' % (self._API_BASE, channel_id, video_type, self._APP, self._PER_PAGE) - while page_url: - page = self._download_json( - 
page_url, channel_id, - 'Downloading %s JSON page #%s' - % (video_type, re.search(r'[?&]page=([0-9]+)', page_url).group(1))) + for page_num in itertools.count(1): + page = self._call_api( + 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' + % (channel_id, video_type, self._PER_PAGE, page_num), channel_id, + 'Downloading %s JSON page #%d' % (video_type, page_num)) for video in page['response']: video_id = video['id'] entries.append(self.url_result( - 'http://www.viki.com/videos/%s' % video_id, 'Viki', video_id)) - page_url = page['pagination']['next'] + 'http://www.viki.com/videos/%s' % video_id, 'Viki')) + if not page['pagination']['next']: + break return self.playlist_result(entries, channel_id, title, description) From d01924f48810db69d572bc121ab98021f04ac957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 02:30:04 +0600 Subject: [PATCH 0166/2145] [viki:channel] Extend matching URLs and extract movies --- youtube_dl/extractor/viki.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 071e280fb..3acb481f9 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -230,7 +230,7 @@ class VikiIE(VikiBaseIE): class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' - _VALID_URL = r'https?://(?:www\.)?viki\.com/tv/(?P[0-9]+c)' + _VALID_URL = r'https?://(?:www\.)?viki\.com/(?:tv|news|movies|artists)/(?P[0-9]+c)' _TESTS = [{ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', 'info_dict': { @@ -247,6 +247,15 @@ class VikiChannelIE(VikiBaseIE): 'description': 'md5:05bf5471385aa8b21c18ad450e350525', }, 'playlist_count': 127, + }, { + 'url': 'http://www.viki.com/news/24569c-showbiz-korea', + 'only_matching': True, + }, { + 'url': 'http://www.viki.com/movies/22047c-pride-and-prejudice-2005', + 'only_matching': True, + }, { + 'url': 'http://www.viki.com/artists/2141c-shinee', + 
'only_matching': True, }] _PER_PAGE = 25 @@ -265,7 +274,7 @@ class VikiChannelIE(VikiBaseIE): description = descriptions.get('en') or descriptions[descriptions.keys()[0]] entries = [] - for video_type in ('episodes', 'clips'): + for video_type in ('episodes', 'clips', 'movies'): for page_num in itertools.count(1): page = self._call_api( 'containers/%s/%s.json?per_page=%d&sort=number&direction=asc&with_paging=true&page=%d' From 4d8ee01389c4229f14fad45f0aa7b033a2509aef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 02:38:43 +0600 Subject: [PATCH 0167/2145] [viki] Fix typo --- youtube_dl/extractor/viki.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 3acb481f9..0ec8ef0ef 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -157,7 +157,7 @@ class VikiIE(VikiBaseIE): title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id container_titles = video.get('container', {}).get('titles') if container_titles: - container_title = container_titles.get('en') or container_titles[titles.keys()[0]] + container_title = container_titles.get('en') or container_titles[container_titles.keys()[0]] title = '%s - %s' % (container_title, title) descriptions = video.get('descriptions') From 4d2f42361e02bb67de7c2017c6817b46ff3b2bd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 21 May 2015 11:42:20 +0200 Subject: [PATCH 0168/2145] [viki] remove unused import --- youtube_dl/extractor/viki.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 0ec8ef0ef..fe7229952 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import re import time import hmac import hashlib From 
e7752cd57853718c6875b02517613d14c4c7221d Mon Sep 17 00:00:00 2001 From: frenchy1983 Date: Thu, 21 May 2015 11:47:16 +0200 Subject: [PATCH 0169/2145] [TNAFlix] Allow dot (and more) in cat_id and display_id URLs with dots were raising a "UnsupportedError: Unsupported URL" error. --- youtube_dl/extractor/tnaflix.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index d48cbbf14..725edd3c7 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,23 +10,23 @@ from ..utils import ( class TNAFlixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P[\w-]+)/(?P[\w-]+)/video(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P[^/]+)/(?P[^/]+)/video(?P\d+)' _TITLE_REGEX = r'(.+?) - TNAFlix Porn Videos' _DESCRIPTION_REGEX = r'

([^<]+)

' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' _TEST = { - 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', + 'md5': '6c431ea56756497e227fb3f01a687869', 'info_dict': { - 'id': '553878', - 'display_id': 'Carmella-Decesare-striptease', + 'id': '358632', + 'display_id': 'bunzHD-Ms.Donk', 'ext': 'mp4', - 'title': 'Carmella Decesare - striptease', - 'description': '', + 'title': 'bunzHD Ms.Donk', + 'description': 'bubble booty ebony teen goddess Ms.Donk has a firm ass and acts like she is shy but really she is a freak in the sheets watch her 20 min XX rated vid at bunzHD.com click on the catalog link', 'thumbnail': 're:https?://.*\.jpg$', - 'duration': 91, + 'duration': 394, 'age_limit': 18, } } From 6ad9cb224a7d9156109fe0b0100d277b954063d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 21 May 2015 12:02:53 +0200 Subject: [PATCH 0170/2145] [mitele] It now uses m3u8 (#5764) It should also be possible to use Adobe HDS, but it would require more work. 
--- youtube_dl/extractor/mitele.py | 9 +++++++-- youtube_dl/extractor/telecinco.py | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index d8897eb90..7091f3335 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -20,7 +20,6 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '6a75fe9d0d3275bead0cb683c616fddb', 'info_dict': { 'id': '0fce117d', 'ext': 'mp4', @@ -29,6 +28,10 @@ class MiTeleIE(InfoExtractor): 'display_id': 'programa-144', 'duration': 2913, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -56,12 +59,14 @@ class MiTeleIE(InfoExtractor): episode, transform_source=strip_jsonp ) + formats = self._extract_m3u8_formats( + token_info['tokenizedUrl'], episode, ext='mp4') return { 'id': embed_data['videoId'], 'display_id': episode, 'title': info_el.find('title').text, - 'url': token_info['tokenizedUrl'], + 'formats': formats, 'description': get_element_by_attribute('class', 'text', webpage), 'thumbnail': info_el.find('thumb').text, 'duration': parse_duration(info_el.find('duration').text), diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 251a68680..a0c744fd1 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -16,6 +16,10 @@ class TelecincoIE(MiTeleIE): 'title': 'Con Martín Berasategui, hacer un bacalao al ...', 'duration': 662, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, From 663004ac2b001b9be03bd951d539a62cf83c58ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 22:06:25 +0600 Subject: [PATCH 0171/2145] [options] 
Clarify `--metadata-from-title` additional templates --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 22dbc3aec..dd07266b7 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -713,7 +713,7 @@ def parseOpts(overrideArguments=None): help='Parse additional metadata like song title / artist from the video title. ' 'The format syntax is the same as --output, ' 'the parsed parameters replace existing values. ' - 'Additional templates: %(album), %(artist). ' + 'Additional templates: %(album)s, %(artist)s. ' 'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like ' '"Coldplay - Paradise"') postproc.add_option( From 53de95da5e40aa1a465668977e507ccc914099f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 22:27:22 +0600 Subject: [PATCH 0172/2145] [viki] Extend _VALID_URLs --- youtube_dl/extractor/viki.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index fe7229952..7f2fb1ca8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -15,6 +15,7 @@ from .common import InfoExtractor class VikiBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?viki\.(?:com|net|mx|jp|fr)/' _API_QUERY_TEMPLATE = '/v4/%sapp=%s&t=%s&site=www.viki.com' _API_URL_TEMPLATE = 'http://api.viki.io%s&sig=%s' @@ -58,7 +59,7 @@ class VikiBaseIE(InfoExtractor): class VikiIE(VikiBaseIE): IE_NAME = 'viki' - _VALID_URL = r'https?://(?:www\.)?viki\.com/(?:videos|player)/(?P[0-9]+v)' + _VALID_URL = r'%s(?:videos|player)/(?P[0-9]+v)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -229,7 +230,7 @@ class VikiIE(VikiBaseIE): class VikiChannelIE(VikiBaseIE): IE_NAME = 'viki:channel' - _VALID_URL = 
r'https?://(?:www\.)?viki\.com/(?:tv|news|movies|artists)/(?P[0-9]+c)' + _VALID_URL = r'%s(?:tv|news|movies|artists)/(?P[0-9]+c)' % VikiBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'http://www.viki.com/tv/50c-boys-over-flowers', 'info_dict': { From 5cd47a5e4f54033bcf6d80908e00eff4c75a51c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 21 May 2015 23:58:46 +0600 Subject: [PATCH 0173/2145] [videott] Fix for python 3.2 --- youtube_dl/extractor/videott.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py index ececc7ee0..591024ead 100644 --- a/youtube_dl/extractor/videott.py +++ b/youtube_dl/extractor/videott.py @@ -43,7 +43,7 @@ class VideoTtIE(InfoExtractor): formats = [ { - 'url': base64.b64decode(res['u']).decode('utf-8'), + 'url': base64.b64decode(res['u'].encode('utf-8')).decode('utf-8'), 'ext': 'flv', 'format_id': res['l'], } for res in settings['res'] if res['u'] From 06947add03b6b619292812a771993d3365b0e7e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 May 2015 00:03:47 +0600 Subject: [PATCH 0174/2145] [chilloutzone] Fix for python 3.2 --- youtube_dl/extractor/chilloutzone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index c922f6959..0206d96db 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -57,7 +57,7 @@ class ChilloutzoneIE(InfoExtractor): base64_video_info = self._html_search_regex( r'var cozVidData = "(.+?)";', webpage, 'video data') - decoded_video_info = base64.b64decode(base64_video_info).decode("utf-8") + decoded_video_info = base64.b64decode(base64_video_info.encode('utf-8')).decode('utf-8') video_info_dict = json.loads(decoded_video_info) # get video information from dict From 878563c847fa5248eedbd44187536dec04643eaf Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 May 2015 00:06:10 +0600 Subject: [PATCH 0175/2145] [aes] Fix for python 3.2 --- youtube_dl/aes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index 07224d508..7817adcfd 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -152,7 +152,7 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(base64.b64decode(data)) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) password = bytes_to_intlist(password.encode('utf-8')) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) From afe8b594be53161f68189e15a65b4e9c6eba0b35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 May 2015 00:09:15 +0600 Subject: [PATCH 0176/2145] [rtve.es:alacarta] Fix for python 3.2 --- youtube_dl/extractor/rtve.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 849300140..82cd98ac7 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -17,7 +17,7 @@ from ..utils import ( def _decrypt_url(png): - encrypted_data = base64.b64decode(png) + encrypted_data = base64.b64decode(png.encode('utf-8')) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] length = struct_unpack('!I', text_chunk[:4])[0] From 43150d7ac36efda7bc60c694b8a18e1f720da04b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 May 2015 00:10:05 +0600 Subject: [PATCH 0177/2145] [shared] Fix for python 3.2 --- youtube_dl/extractor/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 26ced716e..9f3e944e7 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -47,7 +47,7 @@ class SharedIE(InfoExtractor): video_url = 
self._html_search_regex( r'data-url="([^"]+)"', video_page, 'video URL') title = base64.b64decode(self._html_search_meta( - 'full:title', webpage, 'title')).decode('utf-8') + 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( From 0459432d962bf358566340eed00f6c1c56b7b732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 May 2015 00:10:53 +0600 Subject: [PATCH 0178/2145] [shared] Fix for python 3.2 --- youtube_dl/extractor/tutv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index 4de0aac52..fad720b68 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -26,7 +26,7 @@ class TutvIE(InfoExtractor): data_content = self._download_webpage( 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info') - video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8') + video_url = base64.b64decode(compat_parse_qs(data_content)['kpt'][0].encode('utf-8')).decode('utf-8') return { 'id': internal_id, From 77d9cb2f04462677f2a36f487c20e7a7992a0a32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 22 May 2015 00:45:33 +0600 Subject: [PATCH 0179/2145] [sportbox] Fix extraction --- youtube_dl/extractor/sportbox.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 8686f9d11..2ab3489e4 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -7,7 +7,7 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( parse_duration, - parse_iso8601, + unified_strdate, ) @@ -20,11 +20,9 @@ class SportBoxIE(InfoExtractor): 'id': '80822', 'ext': 'mp4', 'title': 
'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:81715fa9c4ea3d9e7915dc8180c778ed', + 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1411896237, 'upload_date': '20140928', - 'duration': 4846, }, 'params': { # m3u8 download @@ -48,17 +46,13 @@ class SportBoxIE(InfoExtractor): r'src="/?(vdl/player/[^"]+)"', webpage, 'player') title = self._html_search_regex( - r'

([^<]+)

', webpage, 'title') - description = self._html_search_regex( - r'(?s)
(.+?)
', - webpage, 'description', fatal=False) + [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], + webpage, 'title') + description = self._og_search_description(webpage) or self._html_search_meta( + 'description', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - timestamp = parse_iso8601(self._search_regex( - r'([^<]+)', - webpage, 'timestamp', fatal=False)) - duration = parse_duration(self._html_search_regex( - r'', - webpage, 'duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'dateCreated', webpage, 'upload date')) return { '_type': 'url_transparent', @@ -67,8 +61,7 @@ class SportBoxIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, + 'upload_date': upload_date, } From 8a278a1d7ef6134a5ac6b7dd31e3458d05f71225 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 22 May 2015 13:26:50 +0800 Subject: [PATCH 0180/2145] [nba] Fix duration extraction (fixes #5777) --- youtube_dl/extractor/nba.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 862b706bf..944096e1c 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -22,6 +22,18 @@ class NBAIE(InfoExtractor): }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, + }, { + 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'info_dict': { + 'id': '0041400301-cle-atl-recap.nba', + 'ext': 'mp4', + 'title': 'NBA GAME TIME | Video: Hawks vs. 
Cavaliers Game 1', + 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', + 'duration': 228, + }, + 'params': { + 'skip_download': True, + } }] def _real_extract(self, url): @@ -35,8 +47,12 @@ class NBAIE(InfoExtractor): self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') description = self._og_search_description(webpage) - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration')) + duration_str = self._html_search_meta( + 'duration', webpage, 'duration', default=None) + if not duration_str: + duration_str = self._html_search_regex( + r'Duration:\s*(\d+:\d+)', webpage, 'duration', fatal=False) + duration = parse_duration(duration_str) return { 'id': shortened_video_id, From ed5a637d62e8ede4a8cef75df4e5f341e3c667a1 Mon Sep 17 00:00:00 2001 From: frenchy1983 Date: Fri, 22 May 2015 09:29:35 +0200 Subject: [PATCH 0181/2145] [TNAFlix] Restore test See dstftw's comment in #5772 --- youtube_dl/extractor/empflix.py | 26 ++++++++++++++------------ youtube_dl/extractor/tnaflix.py | 32 +++++++++++++++++++------------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 70f8efe27..0dc947c1d 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -10,16 +10,18 @@ class EMPFlixIE(TNAFlixIE): _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 
'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + } } - } + ] diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 725edd3c7..79496039d 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -16,20 +16,26 @@ class TNAFlixIE(InfoExtractor): _DESCRIPTION_REGEX = r'

([^<]+)

' _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - _TEST = { - 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', - 'md5': '6c431ea56756497e227fb3f01a687869', - 'info_dict': { - 'id': '358632', - 'display_id': 'bunzHD-Ms.Donk', - 'ext': 'mp4', - 'title': 'bunzHD Ms.Donk', - 'description': 'bubble booty ebony teen goddess Ms.Donk has a firm ass and acts like she is shy but really she is a freak in the sheets watch her 20 min XX rated vid at bunzHD.com click on the catalog link', - 'thumbnail': 're:https?://.*\.jpg$', - 'duration': 394, - 'age_limit': 18, + _TESTS = [ + { + 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', + 'md5': 'ecf3498417d09216374fc5907f9c6ec0', + 'info_dict': { + 'id': '553878', + 'display_id': 'Carmella-Decesare-striptease', + 'ext': 'mp4', + 'title': 'Carmella Decesare - striptease', + 'description': '', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 91, + 'age_limit': 18, + } + }, + { + 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', + 'matching_only': True, } - } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From ba6454761687e099f960b50cb50a9b87f4ec6d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 22 May 2015 11:35:09 +0200 Subject: [PATCH 0182/2145] [sportbox] Remove unused import --- youtube_dl/extractor/sportbox.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 2ab3489e4..86d509ae5 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - parse_duration, unified_strdate, ) From 79979c689713fd28e8fdf08bd71eecb6798f23d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 22 May 2015 16:14:55 +0200 
Subject: [PATCH 0183/2145] Clarify that --dump-pages encodes the pages using base64 (#5781) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index dd07266b7..5a2315bd9 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None): verbosity.add_option( '--dump-pages', '--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, - help='Print downloaded pages to debug problems (very verbose)') + help='Print downloaded pages encoded using base64 to debug problems (very verbose)') verbosity.add_option( '--write-pages', action='store_true', dest='write_pages', default=False, From 69e0f1b445388e4b6f45868d53780d6f8937f56e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 May 2015 00:08:10 +0600 Subject: [PATCH 0184/2145] Credit @ping for viki:channel, qqmusic:toplist --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 267b8da1e..ebed7ebb3 100644 --- a/AUTHORS +++ b/AUTHORS @@ -124,3 +124,4 @@ Mohammad Teimori Pabandi Roman Le Négrate Matthias Küch Julian Richen +Ping O. 
From 685c74d315a54154c5a1d9ecee8b212dbee94bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 May 2015 01:01:47 +0600 Subject: [PATCH 0185/2145] [rutv] Extend embed URL (Closes #5782) --- youtube_dl/extractor/rutv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py index 55604637d..d9df06861 100644 --- a/youtube_dl/extractor/rutv.py +++ b/youtube_dl/extractor/rutv.py @@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor): @classmethod def _extract_url(cls, webpage): mobj = re.search( - r']+?src=(["\'])(?Phttps?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) + r']+?src=(["\'])(?Phttps?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) if mobj: return mobj.group('url') From d386878af96f368ba4c2fc8bc9b078a69b79fdf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 23 May 2015 21:25:53 +0600 Subject: [PATCH 0186/2145] [prosiebensat1] Add support for .at domain names (Closes #5786) --- youtube_dl/extractor/prosiebensat1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc799664..255d4abc1 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -17,7 +17,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P.+)' _TESTS = [ { From abca34cbc04693662d913e19634c06c214a237f6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 24 May 2015 02:04:02 +0800 Subject: [PATCH 
0187/2145] [cnn] Relax _VALID_URL again (fixes #5737) The problem is the same as test:CNN_1, so I didn't add the test case --- youtube_dl/extractor/cnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index 5efc5f4fe..3b1bd4033 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -12,7 +12,7 @@ from ..utils import ( class CNNIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ - (?P.+?/(?P[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))''' + (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ 'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn', From 9bf87ae3aaac81df3efb92fd0a3247ccb522de2a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 24 May 2015 02:36:47 +0800 Subject: [PATCH 0188/2145] [nextmedia] Merge AppleDailyRealtimeNewsIE and AppleDailyAnimationNewsIE --- youtube_dl/extractor/__init__.py | 3 +-- youtube_dl/extractor/nextmedia.py | 37 +++++++++++++------------------ 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 24efb7ce5..8bb3926a0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -338,8 +338,7 @@ from .newstube import NewstubeIE from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, - AppleDailyRealtimeNewsIE, - AppleDailyAnimationNewsIE + AppleDailyIE, ) from .nfb import NFBIE from .nfl import NFLIE diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 02dba4ef6..c75ccafc7 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE): return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyRealtimeNewsIE(NextMediaIE): - _VALID_URL = 
r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' +class AppleDailyIE(NextMediaIE): + _VALID_URL = r'http://(www|ent).appledaily.com.tw/(animation|realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:b23787119933404ce515c6356a8c355c', + 'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4', 'upload_date': '20150128', } }, { @@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE): 'ext': 'mp4', 'title': '不滿被踩腳 山東兩大媽一路打下車', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d', + 'description': 'md5:175b4260c1d7c085993474217e4ab1b4', 'upload_date': '20150128', } - }] - - _URL_PATTERN = r'\{url: \'(.+)\'\}' - - def _fetch_title(self, page): - return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title') - - def _fetch_thumbnail(self, page): - return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) - - def _fetch_timestamp(self, page): - return None - - -class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): - _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' 
- _TESTS = [{ + }, { 'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671', 'md5': '03df296d95dedc2d5886debbb80cb43f', 'info_dict': { @@ -156,8 +140,17 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE): ] }] + _URL_PATTERN = r'\{url: \'(.+)\'\}' + def _fetch_title(self, page): - return self._html_search_meta('description', page, 'news title') + return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or + self._html_search_meta('description', page, 'news title')) + + def _fetch_thumbnail(self, page): + return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False) + + def _fetch_timestamp(self, page): + return None def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') From 30455ce2554d00489901a398d457fac89456fe49 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 24 May 2015 02:42:01 +0800 Subject: [PATCH 0189/2145] [nextmedia] Extend and reorder _VALID_URL --- youtube_dl/extractor/nextmedia.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index c75ccafc7..d1b7cff4c 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -90,7 +90,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): - _VALID_URL = r'http://(www|ent).appledaily.com.tw/(animation|realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' + _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' 
_TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -138,6 +138,9 @@ class AppleDailyIE(NextMediaIE): 'expected_warnings': [ 'video thumbnail', ] + }, { + 'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/', + 'only_matching': True, }] _URL_PATTERN = r'\{url: \'(.+)\'\}' From 1335c3aca8f0cbddc0c521c73579eec2b9a5643c Mon Sep 17 00:00:00 2001 From: Sergey M? <dstftw@gmail.com> Date: Sun, 24 May 2015 01:21:18 +0600 Subject: [PATCH 0190/2145] [drtv] Improve extraction (Closes #5792) --- youtube_dl/extractor/drtv.py | 37 +++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index f25ab319e..baa24c6d1 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor, ExtractorError -from ..utils import parse_iso8601 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, +) class DRTVIE(InfoExtractor): @@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor): restricted_to_denmark = asset['RestrictedToDenmark'] spoken_subtitles = asset['Target'] == 'SpokenSubtitles' for link in asset['Links']: - target = link['Target'] uri = link['Uri'] + target = link['Target'] format_id = target - preference = -1 if target == 'HDS' else -2 + preference = None if spoken_subtitles: - preference -= 2 + preference = -1 format_id += '-spoken-subtitles' - formats.append({ - 'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri, - 'format_id': format_id, - 'ext': link['FileFormat'], - 'preference': preference, - }) + if target == 'HDS': + formats.extend(self._extract_f4m_formats( + uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', + video_id, preference, f4m_id=format_id)) + elif target == 'HLS': 
+ formats.extend(self._extract_m3u8_formats( + uri, video_id, 'mp4', preference=preference, + m3u8_id=format_id)) + else: + bitrate = link.get('Bitrate') + if bitrate: + format_id += '-%s' % bitrate + formats.append({ + 'url': uri, + 'format_id': format_id, + 'tbr': bitrate, + 'ext': link.get('FileFormat'), + }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): LANGS = { From 71646e465348b25962a15f9a567f134514bde30a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 May 2015 04:14:01 +0600 Subject: [PATCH 0191/2145] [YoutubeDL] Initialize `files_to_delete` (Closes #5797) --- youtube_dl/YoutubeDL.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 58b34e087..d1953c18f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1527,6 +1527,7 @@ class YoutubeDL(object): pps_chain.extend(ie_info['__postprocessors']) pps_chain.extend(self._pps) for pp in pps_chain: + files_to_delete = [] try: files_to_delete, info = pp.run(info) except PostProcessingError as e: From 1807ae22dd93646ea4d7ba4bd28087bf1ef4857c Mon Sep 17 00:00:00 2001 From: WassimAttar <wassim.attar@free.fr> Date: Sun, 24 May 2015 10:37:05 +0200 Subject: [PATCH 0192/2145] chmod error After installing youtube-dl with this method sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl sudo chmod a+xr /usr/local/bin/youtube-dl When i try to use it, i get this error python: can't open file '/usr/local/bin/youtube-dl': [Errno 13] Permission denied The correct chmod is a+xr --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3d9436456..a29cccb3f 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl 
https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+xr /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+x /usr/local/bin/youtube-dl + sudo chmod a+xr /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). From 23905927e19280d9217ecad377ef26ea9d5793fe Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Sun, 24 May 2015 18:32:04 +0600 Subject: [PATCH 0193/2145] [README.md] Keep more idiomatic rwx order --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a29cccb3f..e51bb5343 100644 --- a/README.md +++ b/README.md @@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms To install it right away for all UNIX users (Linux, OS X, etc.), type: sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl - sudo chmod a+xr /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl - sudo chmod a+xr /usr/local/bin/youtube-dl + sudo chmod a+rx /usr/local/bin/youtube-dl Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). 
From abac15f3c6915d176c37f7aa748b8a0f03db82a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 May 2015 19:08:22 +0600 Subject: [PATCH 0194/2145] [tnaflix] Do not capture `cat_id` --- youtube_dl/extractor/tnaflix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 79496039d..59af9aba0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -10,7 +10,7 @@ from ..utils import ( class TNAFlixIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[^/]+)/(?P<display_id>[^/]+)/video(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos' _DESCRIPTION_REGEX = r'

([^<]+)

' From 34fb7e46ad3fa1a04635fa4876401aac881bb39b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 24 May 2015 19:10:03 +0600 Subject: [PATCH 0195/2145] [empflix] Relax _VALID_URL --- youtube_dl/extractor/empflix.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 0dc947c1d..9a5a8f4bb 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -4,7 +4,7 @@ from .tnaflix import TNAFlixIE class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P[0-9a-zA-Z-]+)-(?P[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P.+?)-(?P[0-9]+)\.html' _TITLE_REGEX = r'name="title" value="(?P[^"]*)"' _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' @@ -23,5 +23,9 @@ class EMPFlixIE(TNAFlixIE): 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, } + }, + { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'matching_only': True, } ] From d78c834ead934e5532d2f5bc221bb11eedaef0e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 May 2015 20:04:13 +0600 Subject: [PATCH 0196/2145] [karrierevideos] Improve and simplify --- youtube_dl/extractor/karrierevideos.py | 96 +++++++++++++++++++------- 1 file changed, 71 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index a05e8ab76..bed94bc93 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -1,50 +1,96 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + fix_xml_ampersands, + float_or_none, + xpath_with_ns, + xpath_text, +) class KarriereVideosIE(InfoExtractor): - _VALID_URL = 
r'http://(?:www\.)?karrierevideos\.at/berufsvideos/([a-z-]+)/(?P<id>[a-z-]+)' - _TEST = { + _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)' + _TESTS = [{ 'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin', 'info_dict': { - 'id': 'altenpflegerin', - 'ext': 'mp4', + 'id': '32c91', + 'ext': 'flv', 'title': 'AltenpflegerIn', - 'thumbnail': 're:^http://.*\.png\?v=[0-9]+', - 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2' + 'description': 'md5:dbadd1259fde2159a9b28667cb664ae2', + 'thumbnail': 're:^http://.*\.png', }, 'params': { - 'skip_download': 'requires rtmpdump' + # rtmp download + 'skip_download': True, } - } + }, { + # broken ampersands + 'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun', + 'info_dict': { + 'id': '5sniu', + 'ext': 'flv', + 'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"', + 'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33', + 'thumbnail': 're:^http://.*\.png', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = (self._html_search_meta('title', webpage, default=None) or + self._search_regex(r'<h1 class="title">([^<]+)</h1>')) + + video_id = self._search_regex( + r'/config/video/(.+?)\.xml', webpage, 'video id') + playlist = self._download_xml( + 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, + video_id, transform_source=fix_xml_ampersands) + + NS_MAP = { + 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' + } + + def ns(path): + return xpath_with_ns(path, NS_MAP) + + item = playlist.find('./tracklist/item') + video_file = xpath_text( + item, ns('./jwplayer:file'), 'video url', fatal=True) + streamer = xpath_text( + item, ns('./jwplayer:streamer'), 'streamer', fatal=True) + + uploader = xpath_text( + item, 
ns('./jwplayer:author'), 'uploader') + duration = float_or_none( + xpath_text(item, ns('./jwplayer:duration'), 'duration')) + description = self._html_search_regex( - r'<div class="leadtext">\n{0,}?\s{0,}<p>(.*?)</p>', + r'(?s)<div class="leadtext">(.+?)</div>', webpage, 'description') - playlist = self._html_search_regex(r'/config/video/(.*?)\.xml', webpage, 'playlist') - playlist = self._download_xml( - 'http://www.karrierevideos.at/player-playlist.xml.php?p=' + playlist, - video_id) - - namespace = 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' - - item = playlist.find('tracklist/item') - streamer = item.find('{%s}streamer' % namespace).text + thumbnail = self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + if thumbnail: + thumbnail = compat_urlparse.urljoin(url, thumbnail) return { 'id': video_id, - 'title': self._html_search_meta('title', webpage), + 'url': streamer.replace('rtmpt', 'rtmp'), + 'play_path': 'mp4:%s' % video_file, + 'ext': 'flv', + 'title': title, 'description': description, - 'thumbnail': 'http://www.karrierevideos.at' + self._html_search_meta('thumbnail', webpage), - 'protocol': 'rtmp', - 'url': streamer.replace('rtmpt', 'http'), - 'play_path': 'mp4:' + item.find('{%s}file' % namespace).text, - 'tc_url': streamer, - 'ext': 'mp4' + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': duration, } From 63f3cab4aee5da45dc9b91a9661d5d52b5a72ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 May 2015 21:09:08 +0600 Subject: [PATCH 0197/2145] [rtbf] Fix extraction (Closes #5803) --- youtube_dl/extractor/rtbf.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index dce64e151..5a381d9ce 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -1,10 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import json - 
from .common import InfoExtractor +from ..utils import ( + int_or_none, + unescapeHTML, +) class RTBFIE(InfoExtractor): @@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor): 'id': '1921274', 'ext': 'mp4', 'title': 'Les Diables au coeur (épisode 2)', - 'description': 'Football - Diables Rouges', 'duration': 3099, - 'timestamp': 1398456336, - 'upload_date': '20140425', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id) + webpage = self._download_webpage( + 'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id) - data = json.loads(self._html_search_regex( - r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data'] + data = self._parse_json( + unescapeHTML(self._search_regex( + r'data-video="([^"]+)"', webpage, 'data video')), + video_id) video_url = data.get('downloadUrl') or data.get('url') - if data['provider'].lower() == 'youtube': + if data.get('provider').lower() == 'youtube': return self.url_result(video_url, 'Youtube') return { @@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor): 'url': video_url, 'title': data['title'], 'description': data.get('description') or data.get('subtitle'), - 'thumbnail': data['thumbnail']['large'], + 'thumbnail': data.get('thumbnail'), 'duration': data.get('duration') or data.get('realDuration'), - 'timestamp': data['created'], - 'view_count': data['viewCount'], + 'timestamp': int_or_none(data.get('created')), + 'view_count': int_or_none(data.get('viewCount')), } From 2ad5708c43a8672da547fa279e71b20c327793d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 24 May 2015 21:25:00 +0600 Subject: [PATCH 0198/2145] [arte:future] Switch to `search_regex` for now (Closes #5801) --- youtube_dl/extractor/arte.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 8273bd6c9..fce38248d 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -195,7 +195,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): def _real_extract(self, url): anchor_id, lang = self._extract_url_info(url) webpage = self._download_webpage(url, anchor_id) - row = get_element_by_id(anchor_id, webpage) + row = self._search_regex( + r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id, + webpage, 'row') return self._extract_from_webpage(row, anchor_id, lang) From 04b3b3df05a26a361441754afeb7ff24d0c1f559 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 May 2015 11:58:52 +0200 Subject: [PATCH 0199/2145] [youtube] Remove the nondash formats (fixes #5774) Since we use fixed values for some fields like width and height they can be wrong, and would get picked by some formats filters. For example for https://www.youtube.com/watch?v=EQCrhbBxsjA the biggest height is 720 and for nondash formats it's set to 1440, so -f 'bestvideo[height>=1200]+bestaudio' would incorrectly pick the nondash format, instead it should report that the requested format is not available. --- youtube_dl/extractor/youtube.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 1f9940cf5..0301682b8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1126,12 +1126,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) else: - # Hide the formats we found through non-DASH + # Remove the formats we found through non-DASH, they + # contain less info and it can be wrong, because we use + # fixed values (for example the resolution). See + # https://github.com/rg3/youtube-dl/issues/5774 for an + # example. 
dash_keys = set(df['format_id'] for df in dash_formats) - for f in formats: - if f['format_id'] in dash_keys: - f['format_id'] = 'nondash-%s' % f['format_id'] - f['preference'] = f.get('preference', 0) - 10000 + formats = [f for f in formats if f['format_id'] not in dash_keys] formats.extend(dash_formats) # Check for malformed aspect ratio From 4b4e1af059c0922da9770a79a68a471277303f21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 May 2015 18:46:29 +0200 Subject: [PATCH 0200/2145] [arte] Remove unused import --- youtube_dl/extractor/arte.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index fce38248d..76de24477 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( find_xpath_attr, unified_strdate, - get_element_by_id, get_element_by_attribute, int_or_none, qualities, From d41ebe146b693011eb1020ca9cd935e7db892d0b Mon Sep 17 00:00:00 2001 From: Sergey M? 
<dstftw@gmail.com> Date: Sun, 24 May 2015 23:57:47 +0600 Subject: [PATCH 0201/2145] [tenplay] Fix formats and modernize (Closes #5806) --- youtube_dl/extractor/tenplay.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py index 466155ef8..f6694149b 100644 --- a/youtube_dl/extractor/tenplay.py +++ b/youtube_dl/extractor/tenplay.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + int_or_none, + float_or_none, +) class TenPlayIE(InfoExtractor): @@ -49,18 +53,23 @@ class TenPlayIE(InfoExtractor): if protocol == 'rtmp': url = url.replace('&mp4:', '') + tbr = int_or_none(rendition.get('encodingRate'), 1000) + formats.append({ - 'format_id': '_'.join(['rtmp', rendition['videoContainer'].lower(), rendition['videoCodec'].lower()]), - 'width': rendition['frameWidth'], - 'height': rendition['frameHeight'], - 'tbr': rendition['encodingRate'] / 1024, - 'filesize': rendition['size'], + 'format_id': '_'.join( + ['rtmp', rendition['videoContainer'].lower(), + rendition['videoCodec'].lower(), '%sk' % tbr]), + 'width': int_or_none(rendition['frameWidth']), + 'height': int_or_none(rendition['frameHeight']), + 'tbr': tbr, + 'filesize': int_or_none(rendition['size']), 'protocol': protocol, 'ext': ext, 'vcodec': rendition['videoCodec'].lower(), 'container': rendition['videoContainer'].lower(), 'url': url, }) + self._sort_formats(formats) return { 'id': video_id, @@ -74,8 +83,8 @@ class TenPlayIE(InfoExtractor): 'url': json['thumbnailURL'] }], 'thumbnail': json['videoStillURL'], - 'duration': json['length'] / 1000, - 'timestamp': float(json['creationDate']) / 1000, - 'uploader': json['customFields']['production_company_distributor'] if 'production_company_distributor' in json['customFields'] else 'TENplay', - 'view_count': json['playsTotal'] + 'duration': float_or_none(json.get('length'), 1000), + 'timestamp': 
float_or_none(json.get('creationDate'), 1000), + 'uploader': json.get('customFields', {}).get('production_company_distributor') or 'TENplay', + 'view_count': int_or_none(json.get('playsTotal')), } From 7198063d96003050eccb0ea59cc938f0388c0606 Mon Sep 17 00:00:00 2001 From: Mister Hat <misterhat144@gmail.com> Date: Sun, 24 May 2015 15:26:59 -0500 Subject: [PATCH 0202/2145] [pinkbike] new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/pinkbike.py | 78 ++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/pinkbike.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 79bcd9106..80bec39da 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -394,6 +394,7 @@ from .pbs import PBSIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE from .planetaplay import PlanetaPlayIE from .pladform import PladformIE from .played import PlayedIE diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py new file mode 100644 index 000000000..4a15c1835 --- /dev/null +++ b/youtube_dl/extractor/pinkbike.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class PinkbikeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pinkbike\.com/video/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.pinkbike.com/video/402811/', + 'md5': '4814b8ca7651034cd87e3361d5c2155a', + 'info_dict': { + 'id': '402811', + 'ext': 'mp4', + 'title': 'Brandon Semenuk - RAW 100', + 'thumbnail': 're:^https?://.*\.jpg$', + 'location': 'Victoria, British Columbia, Canada', + 'uploader_id': 'revelco', + 'upload_date': '20150406', + 'description': 'Official release: www.redbull.ca/rupertwalker', + 'duration': '100' + } + }, { + 'url': 
'http://www.pinkbike.com/video/406629/', + 'md5': 'c7a3e19a2bd5cde5a1cda6b2b46caa74', + 'info_dict': { + 'id': '406629', + 'ext': 'mp4', + 'title': 'Chromag: Reece Wallace in Utah', + 'thumbnail': 're:^https?://.*\.jpg$', + 'location': 'Whistler, British Columbia, Canada', + 'uploader_id': 'Chromagbikes', + 'upload_date': '20150505', + 'description': 'Reece Wallace shredding Virgin, Utah. Video by Virtu Media.', + 'duration': '180' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title>(.*?)', webpage, 'title') + title = title[:-len(' Video - Pinkbike')] + + description = self._html_search_meta('description', webpage, 'description') + description = description[len(title + '. '):] + + uploader_id = self._html_search_regex(r'un:\s*"(.*?)"', webpage, 'uploader_id') + + upload_date = self._html_search_regex( + r'class="fullTime"\s*title="([0-9]{4}(?:-[0-9]{2}){2})"', + webpage, 'upload_date') + upload_date = upload_date.replace('-', '') + + location = self._html_search_regex( + r'
Location
\n?\s*
\n?(.*?)\s*', + webpage) + + formats = [{'url': fmt[1], 'height': fmt[0]} for fmt in formats] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': self._html_search_meta('video:duration', webpage, 'duration'), + 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'location': location, + 'formats': formats + } From 2c935c0c7224a3332ff9f0fd83e8c074cfbe2c9d Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sun, 24 May 2015 16:30:03 -0500 Subject: [PATCH 0203/2145] [pinkbike] converted duration to int --- youtube_dl/extractor/pinkbike.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 4a15c1835..66605ddbe 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -20,7 +20,7 @@ class PinkbikeIE(InfoExtractor): 'uploader_id': 'revelco', 'upload_date': '20150406', 'description': 'Official release: www.redbull.ca/rupertwalker', - 'duration': '100' + 'duration': 100 } }, { 'url': 'http://www.pinkbike.com/video/406629/', @@ -34,7 +34,7 @@ class PinkbikeIE(InfoExtractor): 'uploader_id': 'Chromagbikes', 'upload_date': '20150505', 'description': 'Reece Wallace shredding Virgin, Utah. 
Video by Virtu Media.', - 'duration': '180' + 'duration': 180 } }] @@ -69,7 +69,7 @@ class PinkbikeIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'duration': self._html_search_meta('video:duration', webpage, 'duration'), + 'duration': int(self._html_search_meta('video:duration', webpage, 'duration')), 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), 'uploader_id': uploader_id, 'upload_date': upload_date, From 680f9744c4e010ad5111c7711c58c341d5ba24dd Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sun, 24 May 2015 16:45:10 -0500 Subject: [PATCH 0204/2145] [pinkbike] used proper conversion methods --- youtube_dl/extractor/pinkbike.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 66605ddbe..45c0b1377 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -4,6 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + remove_start +) class PinkbikeIE(InfoExtractor): @@ -43,10 +48,13 @@ class PinkbikeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'(.*?)', webpage, 'title') - title = title[:-len(' Video - Pinkbike')] + title = remove_end(title, ' Video - Pinkbike') description = self._html_search_meta('description', webpage, 'description') - description = description[len(title + '. '):] + description = remove_start(description, title + '. 
') + + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration')) uploader_id = self._html_search_regex(r'un:\s*"(.*?)"', webpage, 'uploader_id') @@ -63,13 +71,13 @@ class PinkbikeIE(InfoExtractor): r'', webpage) - formats = [{'url': fmt[1], 'height': fmt[0]} for fmt in formats] + formats = [{'url': fmt[1], 'height': int_or_none(fmt[0])} for fmt in formats] return { 'id': video_id, 'title': title, 'description': description, - 'duration': int(self._html_search_meta('video:duration', webpage, 'duration')), + 'duration': duration, 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), 'uploader_id': uploader_id, 'upload_date': upload_date, From b885bae6340b2aa9406501250fdebfbeea54e5e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 25 May 2015 04:53:53 +0600 Subject: [PATCH 0205/2145] Credit @misterhat for karrierevideos (#5729) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index ebed7ebb3..3410e1fb9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -125,3 +125,4 @@ Roman Le Négrate Matthias Küch Julian Richen Ping O. 
+Mister Hat From c6bbdadd79fac001cde15e8fd118b9535427474d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 25 May 2015 21:22:13 +0600 Subject: [PATCH 0206/2145] [odnoklassniki] Support extraction from metadata URL (Closes #5813) --- youtube_dl/extractor/odnoklassniki.py | 33 ++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index fbc521d1a..691f503f5 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( unified_strdate, int_or_none, @@ -11,8 +12,9 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P\d+)' + _VALID_URL = r'https?://(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ + # metadata in JSON 'url': 'http://ok.ru/video/20079905452', 'md5': '8e24ad2da6f387948e7a7d44eb8668fe', 'info_dict': { @@ -20,11 +22,22 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Культура меняет нас (прекрасный ролик!))', 'duration': 100, - 'upload_date': '20141207', 'uploader_id': '330537914540', 'uploader': 'Виталий Добровольский', 'like_count': int, - 'age_limit': 0, + }, + }, { + # metadataUrl + 'url': 'http://ok.ru/video/63567059965189-0', + 'md5': '9676cf86eff5391d35dea675d224e131', + 'info_dict': { + 'id': '63567059965189-0', + 'ext': 'mp4', + 'title': 'Девушка без комплексов ...', + 'duration': 191, + 'uploader_id': '534380003155', + 'uploader': 'Андрей Мещанинов', + 'like_count': int, }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', @@ -41,7 +54,15 @@ class OdnoklassnikiIE(InfoExtractor): r'data-attributes="([^"]+)"', webpage, 'player')), video_id) - metadata = 
self._parse_json(player['flashvars']['metadata'], video_id) + flashvars = player['flashvars'] + + metadata = flashvars.get('metadata') + if metadata: + metadata = self._parse_json(metadata, video_id) + else: + metadata = self._download_json( + compat_urllib_parse.unquote(flashvars['metadataUrl']), + video_id, 'Downloading metadata JSON') movie = metadata['movie'] title = movie['title'] @@ -53,11 +74,11 @@ class OdnoklassnikiIE(InfoExtractor): uploader = author.get('name') upload_date = unified_strdate(self._html_search_meta( - 'ya:ovs:upload_date', webpage, 'upload date')) + 'ya:ovs:upload_date', webpage, 'upload date', default=None)) age_limit = None adult = self._html_search_meta( - 'ya:ovs:adult', webpage, 'age limit') + 'ya:ovs:adult', webpage, 'age limit', default=None) if adult: age_limit = 18 if adult == 'true' else 0 From ba2df04b41b62d08e1fd0efaaaf104467133e9a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 25 May 2015 21:27:43 +0600 Subject: [PATCH 0207/2145] [odnoklassniki] Make URL explicit --- youtube_dl/extractor/odnoklassniki.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 691f503f5..6c7149fe3 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -47,7 +47,8 @@ class OdnoklassnikiIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://ok.ru/video/%s' % video_id, video_id) player = self._parse_json( unescapeHTML(self._search_regex( From 5d0a33eebcae821ac5d1124043a5ad77a58fa291 Mon Sep 17 00:00:00 2001 From: Alexander Kirk Date: Mon, 25 May 2015 20:12:18 +0200 Subject: [PATCH 0208/2145] rtlnow is now hosted at nowtv.de --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/nowtv.py | 90 ++++++++++++++++ youtube_dl/extractor/rtlnow.py | 174 
------------------------------- 3 files changed, 91 insertions(+), 175 deletions(-) create mode 100644 youtube_dl/extractor/nowtv.py delete mode 100644 youtube_dl/extractor/rtlnow.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 79bcd9106..bfd07392e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -355,6 +355,7 @@ from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .novamov import NovaMovIE from .nowness import NownessIE +from .nowtv import NowTvIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, @@ -438,7 +439,6 @@ from .roxwel import RoxwelIE from .rtbf import RTBFIE from .rte import RteIE from .rtlnl import RtlNlIE -from .rtlnow import RTLnowIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py new file mode 100644 index 000000000..bf97fe7f4 --- /dev/null +++ b/youtube_dl/extractor/nowtv.py @@ -0,0 +1,90 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, + unified_strdate, + int_or_none, +) + +class NowTvIE(InfoExtractor): + """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" + _VALID_URL = r'''(?x) + (?:https?://)? 
+ ( + (?:www\.)?nowtv\.de + /(rtl|rtl2|rtlnitro||superrtl|ntv|vox)(?P/.*?)/player + )''' + + _TESTS = [ + { + 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', + 'info_dict': { + 'id': '128953', + 'ext': 'mp4', + 'title': 'B\u00fcro-Fall \/ Chihuahua \'Joel\'', + 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', + 'upload_date': '2015-05-23 19:10:00', + 'duration': '00:51:32', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Only works from Germany', + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + info_url = 'https://api.nowtv.de/v3/movies' + mobj.group('path') + '?fields=*,format,files,breakpoints,paymentPaytypes,trailers' + info = self._download_json(info_url, None) + + video_id = info['id'] + title = info['title'] + description = info['articleShort'] + duration = info['duration'] + upload_date = unified_strdate(info['broadcastStartDate']) + free = info['free'] + station = info['format']['station'] + thumbnail = info['format']['defaultImage169Logo'] + + if station == 'rtl': + base_url = 'http://hls.fra.rtlnow.de/hls-vod-enc/' + elif station == 'rtl2': + base_url = 'http://hls.fra.rtl2now.de/hls-vod-enc/' + elif station == 'vox': + base_url = 'http://hls.fra.voxnow.de/hls-vod-enc/' + elif station == 'nitro': + base_url = 'http://hls.fra.rtlnitronow.de/hls-vod-enc/' + elif station == 'ntv': + base_url = 'http://hls.fra.n-tvnow.de/hls-vod-enc/' + elif station == 'superrtl': + base_url = 'http://hls.fra.superrtlnow.de/hls-vod-enc/' + + formats = [] + for item in info['files']['items']: + if item['type'] != 'video/x-abr': + continue + + fmt = { + 'url': base_url + item['path'] + '.m3u8', + 'tbr': int_or_none(item['bitrate']), + 'ext': 'mp4', + 'format_id': int_or_none(item['id']), + } + formats.append(fmt) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': 
duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py deleted file mode 100644 index 785a8045e..000000000 --- a/youtube_dl/extractor/rtlnow.py +++ /dev/null @@ -1,174 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - unified_strdate, - int_or_none, -) - - -class RTLnowIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'''(?x) - (?:https?://)? - (?P - (?P - rtl-now\.rtl\.de| - rtl2now\.rtl2\.de| - (?:www\.)?voxnow\.de| - (?:www\.)?rtlnitronow\.de| - (?:www\.)?superrtlnow\.de| - (?:www\.)?n-tvnow\.de) - /+[a-zA-Z0-9-]+/[a-zA-Z0-9-]+\.php\? - (?:container_id|film_id)=(?P[0-9]+)& - player=1(?:&season=[0-9]+)?(?:&.*)? - )''' - - _TESTS = [ - { - 'url': 'http://rtl-now.rtl.de/ahornallee/folge-1.php?film_id=90419&player=1&season=1', - 'info_dict': { - 'id': '90419', - 'ext': 'flv', - 'title': 'Ahornallee - Folge 1 - Der Einzug', - 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', - 'upload_date': '20070416', - 'duration': 1685, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://rtl2now.rtl2.de/aerger-im-revier/episode-15-teil-1.php?film_id=69756&player=1&season=2&index=5', - 'info_dict': { - 'id': '69756', - 'ext': 'flv', - 'title': 'Ärger im Revier - Ein junger Ladendieb, ein handfester Streit u.a.', - 'description': 'md5:3fb247005ed21a935ffc82b7dfa70cf0', - 'thumbnail': 'http://autoimg.static-fra.de/rtl2now/219850/1500x1500/image2.jpg', - 'upload_date': '20120519', - 'duration': 1245, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', - }, - { - 'url': 'http://www.voxnow.de/voxtours/suedafrika-reporter-ii.php?film_id=13883&player=1&season=17', - 'info_dict': { - 'id': '13883', - 'ext': 'flv', - 'title': 
'Voxtours - Südafrika-Reporter II', - 'description': 'md5:de7f8d56be6fd4fed10f10f57786db00', - 'upload_date': '20090627', - 'duration': 1800, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://superrtlnow.de/medicopter-117/angst.php?film_id=99205&player=1', - 'info_dict': { - 'id': '99205', - 'ext': 'flv', - 'title': 'Medicopter 117 - Angst!', - 'description': 're:^Im Therapiezentrum \'Sonnalm\' kommen durch eine Unachtsamkeit die für die B.handlung mit Phobikern gehaltenen Voglespinnen frei\. Eine Ausreißerin', - 'thumbnail': 'http://autoimg.static-fra.de/superrtlnow/287529/1500x1500/image2.jpg', - 'upload_date': '20080928', - 'duration': 2691, - }, - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://rtl-now.rtl.de/der-bachelor/folge-4.php?film_id=188729&player=1&season=5', - 'info_dict': { - 'id': '188729', - 'ext': 'flv', - 'upload_date': '20150204', - 'description': 'md5:5e1ce23095e61a79c166d134b683cecc', - 'title': 'Der Bachelor - Folge 4', - } - }, { - 'url': 'http://www.n-tvnow.de/deluxe-alles-was-spass-macht/thema-ua-luxushotel-fuer-vierbeiner.php?container_id=153819&player=1&season=0', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_page_url = 'http://%s/' % mobj.group('domain') - video_id = mobj.group('video_id') - - webpage = self._download_webpage('http://' + mobj.group('url'), video_id) - - mobj = re.search(r'(?s)
(.*?)
', webpage) - if mobj: - raise ExtractorError(clean_html(mobj.group(1)), expected=True) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage, default=None) - - upload_date = unified_strdate(self._html_search_meta('uploadDate', webpage, 'upload date')) - - mobj = re.search(r'', webpage) - duration = int(mobj.group('seconds')) if mobj else None - - playerdata_url = self._html_search_regex( - r"'playerdata': '(?P[^']+)'", webpage, 'playerdata_url') - - playerdata = self._download_xml(playerdata_url, video_id, 'Downloading player data XML') - - videoinfo = playerdata.find('./playlist/videoinfo') - - formats = [] - for filename in videoinfo.findall('filename'): - mobj = re.search(r'(?Prtmpe://(?:[^/]+/){2})(?P.+)', filename.text) - if mobj: - fmt = { - 'url': mobj.group('url'), - 'play_path': 'mp4:' + mobj.group('play_path'), - 'page_url': video_page_url, - 'player_url': video_page_url + 'includes/vodplayer.swf', - } - else: - mobj = re.search(r'.*/(?P[^/]+)/videos/(?P.+)\.f4m', filename.text) - if mobj: - fmt = { - 'url': 'rtmpe://fms.rtl.de/' + mobj.group('hoster'), - 'play_path': 'mp4:' + mobj.group('play_path'), - 'page_url': url, - 'player_url': video_page_url + 'includes/vodplayer.swf', - } - else: - fmt = { - 'url': filename.text, - } - fmt.update({ - 'width': int_or_none(filename.get('width')), - 'height': int_or_none(filename.get('height')), - 'vbr': int_or_none(filename.get('bitrate')), - 'ext': 'flv', - }) - formats.append(fmt) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - } From 42833b44b5e3810a2875dfb130aefbf5db057c1e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 26 May 2015 13:32:43 +0800 Subject: [PATCH 0209/2145] [tf1] Extend _VALID_URL (fixes #5819) --- youtube_dl/extractor/tf1.py | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 025d0877c..656410528 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,8 +6,8 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://(?:videos\.tf1|www\.tfou)\.fr/.*?-(?P\d+)(?:-\d+)?\.html' - _TESTS = { + _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P\d+)(?:-\d+)?\.html' + _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { 'id': '10635995', @@ -32,7 +32,10 @@ class TF1IE(InfoExtractor): # Sometimes wat serves the whole file with the --test option 'skip_download': True, }, - } + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From db3ca36403930063b7df3b228a3f297bf278b43c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 26 May 2015 13:37:15 +0800 Subject: [PATCH 0210/2145] [facebook] Move the title extraction warning below (fixes #5820) --- youtube_dl/extractor/facebook.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 937b28fcc..e8d682716 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -50,7 +50,10 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', - } + }, + 'expected_warnings': [ + 'title' + ] }, { 'url': 'https://www.facebook.com/video.php?v=10204634152394104', 'only_matching': True, @@ -150,11 +153,11 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r'

([^<]*)

', webpage, 'title', - fatal=False) + default=None) if not video_title: video_title = self._html_search_regex( r'(?s)(.*?)', - webpage, 'alternative title', default=None) + webpage, 'alternative title', fatal=False) video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id From efec4358b9b8ed5726c1f7d7939e8bce49f9100c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 26 May 2015 13:54:41 +0800 Subject: [PATCH 0211/2145] [cinemassacre] Support an alternative form of screenwavemedia URL fixes #5821 --- youtube_dl/extractor/cinemassacre.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index cf0a7551b..c949a4814 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -60,6 +60,17 @@ class CinemassacreIE(InfoExtractor): 'uploader_id': 'Cinemassacre', 'title': 'AVGN: McKids', } + }, + { + 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', + 'md5': '1376908e49572389e7b06251a53cdd08', + 'info_dict': { + 'id': 'Cinemassacre-555779690c440', + 'ext': 'mp4', + 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', + 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', + 'upload_date': '20150525', + } } ] @@ -72,7 +83,7 @@ class CinemassacreIE(InfoExtractor): playerdata_url = self._search_regex( [ - r'src="(http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', + r'src="(http://(?:player2\.screenwavemedia\.com|player\.screenwavemedia\.com/play)/[a-zA-Z]+\.php\?[^"]*\bid=.+?)"', r']+src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None) From ff305edd645d6f4307faa9307dea91694a1d217d Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Tue, 26 May 2015 13:43:00 +0300 Subject: [PATCH 0212/2145] [sockshare] Remove extractor Haywire since last October. --- youtube_dl/extractor/sockshare.py | 83 ------------------------------- 1 file changed, 83 deletions(-) delete mode 100644 youtube_dl/extractor/sockshare.py diff --git a/youtube_dl/extractor/sockshare.py b/youtube_dl/extractor/sockshare.py deleted file mode 100644 index b5fa6f1da..000000000 --- a/youtube_dl/extractor/sockshare.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - determine_ext, - ExtractorError, -) - -from .common import InfoExtractor - - -class SockshareIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sockshare\.com/file/(?P[0-9A-Za-z]+)' - _FILE_DELETED_REGEX = r'This file doesn\'t exist, or has been removed\.
' - _TEST = { - 'url': 'http://www.sockshare.com/file/437BE28B89D799D7', - 'md5': '9d0bf1cfb6dbeaa8d562f6c97506c5bd', - 'info_dict': { - 'id': '437BE28B89D799D7', - 'title': 'big_buck_bunny_720p_surround.avi', - 'ext': 'avi', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://sockshare.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - confirm_hash = self._html_search_regex(r'''(?x)(.+)', - r'var name = "([^"]+)";'), - webpage, 'title', default=None) - thumbnail = self._html_search_regex( - r' Date: Tue, 26 May 2015 13:44:46 +0300 Subject: [PATCH 0213/2145] [firedrive] Remove extractor (Closes #3870) Haywire since last October. --- youtube_dl/extractor/firedrive.py | 80 ------------------------------- 1 file changed, 80 deletions(-) delete mode 100644 youtube_dl/extractor/firedrive.py diff --git a/youtube_dl/extractor/firedrive.py b/youtube_dl/extractor/firedrive.py deleted file mode 100644 index 3191116d9..000000000 --- a/youtube_dl/extractor/firedrive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) - - -class FiredriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?firedrive\.com/' + \ - '(?:file|embed)/(?P[0-9a-zA-Z]+)' - _FILE_DELETED_REGEX = r'
' - - _TESTS = [{ - 'url': 'https://www.firedrive.com/file/FEB892FA160EBD01', - 'md5': 'd5d4252f80ebeab4dc2d5ceaed1b7970', - 'info_dict': { - 'id': 'FEB892FA160EBD01', - 'ext': 'flv', - 'title': 'bbb_theora_486kbit.flv', - 'thumbnail': 're:^http://.*\.jpg$', - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = 'http://firedrive.com/file/%s' % video_id - webpage = self._download_webpage(url, video_id) - - if re.search(self._FILE_DELETED_REGEX, webpage) is not None: - raise ExtractorError('Video %s does not exist' % video_id, - expected=True) - - fields = dict(re.findall(r'''(?x)(.+)
', - webpage, 'title') - thumbnail = self._search_regex(r'image:\s?"(//[^\"]+)', webpage, - 'thumbnail', fatal=False) - if thumbnail is not None: - thumbnail = 'http:' + thumbnail - - ext = self._search_regex(r'type:\s?\'([^\']+)\',', - webpage, 'extension', fatal=False) - video_url = self._search_regex( - r'file:\s?loadURL\(\'(http[^\']+)\'\),', webpage, 'file url') - - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'ext': ext, - }] - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'formats': formats, - } From 544a8693b7d9321d776987a5104889056955daa2 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Tue, 26 May 2015 13:53:14 +0300 Subject: [PATCH 0214/2145] Remove Firedrive and Sockshare imports Oops --- youtube_dl/extractor/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 79bcd9106..80c9cb107 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -149,7 +149,6 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE -from .firedrive import FiredriveIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -480,7 +479,6 @@ from .smotri import ( SmotriBroadcastIE, ) from .snotr import SnotrIE -from .sockshare import SockshareIE from .sohu import SohuIE from .soundcloud import ( SoundcloudIE, From 7d65242dc3c0c3306b775f0663d325ba55b62379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 26 May 2015 22:12:26 +0600 Subject: [PATCH 0215/2145] [dailymotion:user] Process user home as user (Closes #5823) --- youtube_dl/extractor/dailymotion.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index db10b8d00..d8f9eb13f 100644 --- a/youtube_dl/extractor/dailymotion.py +++ 
b/youtube_dl/extractor/dailymotion.py @@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:old/)?user/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P[^/]+)' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -239,7 +239,8 @@ class DailymotionUserIE(DailymotionPlaylistIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user') - webpage = self._download_webpage(url, user) + webpage = self._download_webpage( + 'https://www.dailymotion.com/user/%s' % user, user) full_user = unescapeHTML(self._html_search_regex( r'' % re.escape(user), webpage, 'user')) From 5406af92bc3f88b9fe4f26fe20bdaaf0b4968c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 26 May 2015 22:16:47 +0600 Subject: [PATCH 0216/2145] [dailymotion:user] Fix _VALID_URL --- youtube_dl/extractor/dailymotion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index d8f9eb13f..70aa4333c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -225,7 +225,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): class DailymotionUserIE(DailymotionPlaylistIE): IE_NAME = 'dailymotion:user' - _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?:(?:old/)?user/)?(?P[^/]+)$' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', From 7a372b64dfa69d5b2cfd1514b89e8fc0ab7e5874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 May 2015 01:41:00 +0600 Subject: [PATCH 
0217/2145] [pornhub] Do not modify aes key string (Closes #5824) --- youtube_dl/extractor/pornhub.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0c8b731cf..daa284ea2 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -71,7 +71,8 @@ class PornHubIE(InfoExtractor): video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password')) + password = compat_urllib_parse.unquote_plus( + self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) formats = [] From 1434184c577953ff6fe558ccc6751697791f4076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 May 2015 01:42:53 +0600 Subject: [PATCH 0218/2145] [spankwire] Do not modify aes key string --- youtube_dl/extractor/spankwire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index b936202f6..06d6e6640 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -71,7 +71,7 @@ class SpankwireIE(InfoExtractor): compat_urllib_parse.unquote, re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: - password = self._html_search_regex( + password = self._search_regex( r'flashvars\.video_title = "([^"]+)', webpage, 'password').replace('+', ' ') video_urls = list(map( From b535170b218131afd1165776e611691479627ce8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 27 May 2015 04:14:24 +0800 Subject: [PATCH 0219/2145] [bilibili] Skip assertion if HQ videos not available --- 
youtube_dl/extractor/bilibili.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 7ca835e31..a8bea2c10 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -80,9 +80,11 @@ class BiliBiliIE(InfoExtractor): note='Downloading HQ video info', fatal=False, ) - hq_durls = hq_doc.findall('./durl') if hq_doc is not False else itertools.repeat(None) - - assert len(lq_durls) == len(hq_durls) + if hq_doc is not False: + hq_durls = hq_doc.findall('./durl') + assert len(lq_durls) == len(hq_durls) + else: + hq_durls = itertools.repeat(None) i = 1 for lq_durl, hq_durl in zip(lq_durls, hq_durls): From 6d00a2dcd110f12a0aa110f5479df76613792fbd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 27 May 2015 04:23:21 +0800 Subject: [PATCH 0220/2145] [bilibili] Catch API call failures JSON are returned in a failed API call --- youtube_dl/extractor/bilibili.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index a8bea2c10..2103ed73a 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import re import itertools +import json +import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( @@ -67,11 +69,19 @@ class BiliBiliIE(InfoExtractor): entries = [] - lq_doc = self._download_xml( + lq_page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, video_id, note='Downloading LQ video info' ) + try: + err_info = json.loads(lq_page) + raise ExtractorError( + 'BiliBili said: ' + err_info['error_text'], expected=True) + except ValueError: + pass + + lq_doc = ET.fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( From c23848b3c5244f8ef1501adfd04a32111b12d7ff Mon Sep 17 00:00:00 
2001 From: Yen Chi Hsuan Date: Wed, 27 May 2015 14:20:29 +0800 Subject: [PATCH 0221/2145] [naver] Enhanced error detection --- youtube_dl/extractor/naver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index c10405f04..fdd825784 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -35,7 +35,7 @@ class NaverIE(InfoExtractor): webpage) if m_id is None: m_error = re.search( - r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', + r'(?s)
\s*(?:)?\s*

(?P.+?)

\s*
', webpage) if m_error: raise ExtractorError(clean_html(m_error.group('msg')), expected=True) From f8d5e1cfb5d9a8c946a966452d9b86c45182a952 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 27 May 2015 14:44:08 +0800 Subject: [PATCH 0222/2145] [naver] Fix video url (fixes #5809) RTMP urls in test:naver does not work. Need more investigation. --- youtube_dl/extractor/naver.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index fdd825784..925967753 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -16,7 +17,7 @@ from ..utils import ( class NaverIE(InfoExtractor): _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://tvcast.naver.com/v/81652', 'info_dict': { 'id': '81652', @@ -25,7 +26,18 @@ class NaverIE(InfoExtractor): 'description': '합격불변의 법칙 메가스터디 | 메가스터디 수학 김상희 선생님이 9월 모의고사 수학A형 16번에서 20번까지 해설강의를 공개합니다.', 'upload_date': '20130903', }, - } + }, { + 'url': 'http://tvcast.naver.com/v/395837', + 'md5': '638ed4c12012c458fefcddfd01f173cd', + 'info_dict': { + 'id': '395837', + 'ext': 'mp4', + 'title': '9년이 지나도 아픈 기억, 전효성의 아버지', + 'description': 'md5:5bf200dcbf4b66eb1b350d1eb9c753f7', + 'upload_date': '20150519', + }, + 'skip': 'Georestricted', + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -58,14 +70,18 @@ class NaverIE(InfoExtractor): formats = [] for format_el in urls.findall('EncodingOptions/EncodingOption'): domain = format_el.find('Domain').text + uri = format_el.find('uri').text f = { - 'url': domain + format_el.find('uri').text, + 'url': compat_urlparse.urljoin(domain, uri), 'ext': 'mp4', 'width': int(format_el.find('width').text), 'height': int(format_el.find('height').text), } if 
domain.startswith('rtmp'): + # urlparse does not support custom schemes + # https://bugs.python.org/issue18828 f.update({ + 'url': domain + uri, 'ext': 'flv', 'rtmp_protocol': '1', # rtmpt }) From f9f3e3df9a9fff1b00c7184234c4f607ea3cec81 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 27 May 2015 14:51:18 +0800 Subject: [PATCH 0223/2145] [teamcoco] Use determine_ext to determine the video type Some videos does not contain a 'type' field (#5798) --- youtube_dl/extractor/teamcoco.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 56be52638..b2a4b1fc0 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, qualities, + determine_ext, ) from ..compat import compat_ord @@ -108,7 +109,7 @@ class TeamcocoIE(InfoExtractor): formats = [] get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) for filed in data['files']: - if filed['type'] == 'hls': + if determine_ext(filed['url']) == 'm3u8': formats.extend(self._extract_m3u8_formats( filed['url'], video_id, ext='mp4')) else: From f0bfaa2d7d9563975f1f6effa75d28dcdb1c23ce Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 27 May 2015 15:23:34 +0800 Subject: [PATCH 0224/2145] [nrk] Update subtitles test Subtitle conversion routine is removed, so the subtitles are TTML now. 
See 1c7e2e64f6328024711d5fa999d4498396f4cb5c --- test/test_subtitles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 891ee620b..c4e3adb67 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -266,7 +266,7 @@ class TestNRKSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['no'])) - self.assertEqual(md5(subtitles['no']), '1d221e6458c95c5494dcd38e6a1f129a') + self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2') class TestRaiSubtitles(BaseTestSubtitles): From bf24c3d01798fad0a8344a642eb5d46231fd78c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 May 2015 21:25:07 +0600 Subject: [PATCH 0225/2145] [facebook] Improve title regex (Closes #5816) --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e8d682716..82dc27bc6 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -152,7 +152,7 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot find video formats') video_title = self._html_search_regex( - r'

([^<]*)

', webpage, 'title', + r']*class="uiHeaderTitle"[^>]*>([^<]*)', webpage, 'title', default=None) if not video_title: video_title = self._html_search_regex( From d90b3854ca9e8602f440cc9439e1cba240192286 Mon Sep 17 00:00:00 2001 From: PeterDing Date: Thu, 28 May 2015 00:37:00 +0800 Subject: [PATCH 0226/2145] [porn91] Add new extractor for 91porn.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/porn91.py | 62 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 youtube_dl/extractor/porn91.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 80c9cb107..d20ad286d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -400,6 +400,7 @@ from .playfm import PlayFMIE from .playvid import PlayvidIE from .playwire import PlaywireIE from .podomatic import PodomaticIE +from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py new file mode 100644 index 000000000..af06af2b7 --- /dev/null +++ b/youtube_dl/extractor/porn91.py @@ -0,0 +1,62 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from ..compat import compat_urllib_parse +from .common import InfoExtractor +from ..utils import ExtractorError + + +class Porn91IE(InfoExtractor): + IE_NAME = '91porn' + _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P[\w\d]+)' + + _TEST = { + 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', + 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', + 'info_dict': { + 'id': '7e42283b4f5ab36da134', + 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'ext': 'mp4' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id + self._set_cookie('91porn.com', 'language', 'cn_CN') + webpage = 
self._download_webpage(url, video_id, "get HTML content") + title = re.search( + r'
(.+?)
', + webpage, + re.DOTALL) + assert title + title = title.group(1).replace('\n', '') + + # get real url + n1 = re.search(r'so.addVariable\(\'file\',\'(\d+)\'', webpage) + n2 = re.search(r'so.addVariable\(\'seccode\',\'(.+?)\'', webpage) + n3 = re.search(r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage) + if not (n1 and n2 and n3): + raise ExtractorError("You are Blocked by Server.") + + url_params = compat_urllib_parse.urlencode({ + 'VID': n1.group(1), + 'mp4': '1', + 'seccode': n2.group(1), + 'max_vid': n3.group(1), + }) + t_url = 'http://91porn.com/getfile.php?' + url_params + info_cn = self._download_webpage(t_url, video_id, "get real video_url") + video_url = re.search(r'file=(http.+?)&', info_cn).group(1) + + info = { + 'id': video_id, + 'title': title, + 'url': video_url, + } + + return info From b25b645d5106e5b2bf33c640813fe744b63f4730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 27 May 2015 23:20:32 +0600 Subject: [PATCH 0227/2145] [nowtv] Improve and simplify --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/nowtv.py | 197 ++++++++++++++++++++++--------- 2 files changed, 139 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bfd07392e..17248ccea 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -355,7 +355,7 @@ from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .novamov import NovaMovIE from .nowness import NownessIE -from .nowtv import NowTvIE +from .nowtv import NowTVIE from .nowvideo import NowVideoIE from .npo import ( NPOIE, diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index bf97fe7f4..5c91acec6 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -1,90 +1,169 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from 
..utils import ( ExtractorError, - qualities, - unified_strdate, int_or_none, + parse_iso8601, + parse_duration, + remove_start, ) -class NowTvIE(InfoExtractor): - """Information Extractor for RTL NOW, RTL2 NOW, RTL NITRO, SUPER RTL NOW, VOX NOW and n-tv NOW""" - _VALID_URL = r'''(?x) - (?:https?://)? - ( - (?:www\.)?nowtv\.de - /(rtl|rtl2|rtlnitro||superrtl|ntv|vox)(?P/.*?)/player - )''' - _TESTS = [ - { - 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', - 'info_dict': { - 'id': '128953', - 'ext': 'mp4', - 'title': 'B\u00fcro-Fall \/ Chihuahua \'Joel\'', - 'description': 'md5:ce843b6b5901d9a7f7d04d1bbcdb12de', - 'upload_date': '2015-05-23 19:10:00', - 'duration': '00:51:32', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Only works from Germany', +class NowTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?Prtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P.+?)/player' + + _TESTS = [{ + # rtl + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/player', + 'info_dict': { + 'id': '203519', + 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'ext': 'mp4', + 'title': 'Die neuen Bauern und eine Hochzeit', + 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432580700, + 'upload_date': '20150525', + 'duration': 2786, }, - ] + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # rtl2 + 'url': 'http://www.nowtv.de/rtl2/berlin-tag-nacht/berlin-tag-nacht-folge-934/player', + 'info_dict': { + 'id': '203481', + 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', + 'ext': 'mp4', + 'title': 'Berlin - Tag & Nacht (Folge 934)', + 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432666800, + 'upload_date': '20150526', + 'duration': 2641, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # superrtl + 
'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', + 'info_dict': { + 'id': '99205', + 'display_id': 'medicopter-117/angst', + 'ext': 'mp4', + 'title': 'Angst!', + 'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1222632900, + 'upload_date': '20080928', + 'duration': 3025, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # ntv + 'url': 'http://www.nowtv.de/ntv/ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch/player', + 'info_dict': { + 'id': '203521', + 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', + 'ext': 'mp4', + 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', + 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432751700, + 'upload_date': '20150527', + 'duration': 1083, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # vox + 'url': 'http://www.nowtv.de/vox/der-hundeprofi/buero-fall-chihuahua-joel/player', + 'info_dict': { + 'id': '128953', + 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', + 'ext': 'mp4', + 'title': "Büro-Fall / Chihuahua 'Joel'", + 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432408200, + 'upload_date': '20150523', + 'duration': 3092, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - info_url = 'https://api.nowtv.de/v3/movies' + mobj.group('path') + '?fields=*,format,files,breakpoints,paymentPaytypes,trailers' - info = self._download_json(info_url, None) + display_id = mobj.group('id') + station = mobj.group('station') - video_id = info['id'] - title = info['title'] - description = info['articleShort'] - duration = info['duration'] - upload_date = unified_strdate(info['broadcastStartDate']) - free = info['free'] - station = info['format']['station'] - 
thumbnail = info['format']['defaultImage169Logo'] + info = self._download_json( + 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files,breakpoints,paymentPaytypes,trailers,pictures' % display_id, + display_id) - if station == 'rtl': - base_url = 'http://hls.fra.rtlnow.de/hls-vod-enc/' - elif station == 'rtl2': - base_url = 'http://hls.fra.rtl2now.de/hls-vod-enc/' - elif station == 'vox': - base_url = 'http://hls.fra.voxnow.de/hls-vod-enc/' - elif station == 'nitro': - base_url = 'http://hls.fra.rtlnitronow.de/hls-vod-enc/' - elif station == 'ntv': - base_url = 'http://hls.fra.n-tvnow.de/hls-vod-enc/' - elif station == 'superrtl': - base_url = 'http://hls.fra.superrtlnow.de/hls-vod-enc/' + video_id = compat_str(info['id']) + + if info.get('geoblocked'): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + + f = info.get('format', {}) + station = f.get('station') or station + + STATIONS = { + 'rtl': 'rtlnow', + 'rtl2': 'rtl2now', + 'vox': 'voxnow', + 'nitro': 'rtlnitronow', + 'ntv': 'n-tvnow', + 'superrtl': 'superrtlnow' + } formats = [] for item in info['files']['items']: - if item['type'] != 'video/x-abr': - continue - - fmt = { - 'url': base_url + item['path'] + '.m3u8', - 'tbr': int_or_none(item['bitrate']), + item_path = remove_start(item['path'], '/') + tbr = int_or_none(item['bitrate']) + m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) + m3u8_url = m3u8_url.replace('now/', 'now/videos/') + formats.append({ + 'url': m3u8_url, + 'format_id': '%s-%sk' % (item['id'], tbr), 'ext': 'mp4', - 'format_id': int_or_none(item['id']), - } - formats.append(fmt) + 'tbr': tbr, + }) self._sort_formats(formats) + title = info['title'] + description = info.get('articleLong') or info.get('articleShort') + timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') + duration = parse_duration(info.get('duration')) + thumbnail = f.get('defaultImage169Format') or 
f.get('defaultImage169Logo') + return { 'id': video_id, + 'display_id': display_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'timestamp': timestamp, 'duration': duration, 'formats': formats, } From 703d78bbf5edf73f60447ac273c0d303d28cc340 Mon Sep 17 00:00:00 2001 From: PeterDing Date: Thu, 28 May 2015 01:37:24 +0800 Subject: [PATCH 0228/2145] [porn91] change re to _search_regex --- youtube_dl/extractor/porn91.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index af06af2b7..f3a97df64 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json from ..compat import compat_urllib_parse from .common import InfoExtractor @@ -29,30 +28,32 @@ class Porn91IE(InfoExtractor): url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') webpage = self._download_webpage(url, video_id, "get HTML content") - title = re.search( - r'
(.+?)
', - webpage, - re.DOTALL) + title = self._search_regex( + r'
(?P.+?)</div>', + webpage, 'title', flags=re.DOTALL) assert title - title = title.group(1).replace('\n', '') + title = title.replace('\n', '') # get real url - n1 = re.search(r'so.addVariable\(\'file\',\'(\d+)\'', webpage) - n2 = re.search(r'so.addVariable\(\'seccode\',\'(.+?)\'', webpage) - n3 = re.search(r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage) + n1 = self._search_regex( + r'so.addVariable\(\'file\',\'(?P<n1>\d+)\'', webpage, 'n1') + n2 = self._search_regex( + r'so.addVariable\(\'seccode\',\'(?P<n2>.+?)\'', webpage, 'n2') + n3 = self._search_regex( + r'so.addVariable\(\'max_vid\',\'(?P<n3>\d+)\'', webpage, 'n3') if not (n1 and n2 and n3): raise ExtractorError("You are Blocked by Server.") - url_params = compat_urllib_parse.urlencode({ - 'VID': n1.group(1), + 'VID': n1, 'mp4': '1', - 'seccode': n2.group(1), - 'max_vid': n3.group(1), + 'seccode': n2, + 'max_vid': n3, }) t_url = 'http://91porn.com/getfile.php?' + url_params info_cn = self._download_webpage(t_url, video_id, "get real video_url") - video_url = re.search(r'file=(http.+?)&', info_cn).group(1) + video_url = self._search_regex(r'file=(?P<url>http.+?)&', info_cn, 'url') + # construct info info = { 'id': video_id, 'title': title, From 9b254aa177d58b7b4c4f44dce8c38fa7978c7df6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 27 May 2015 23:41:43 +0600 Subject: [PATCH 0229/2145] [nowtv] Add non-free video check --- youtube_dl/extractor/nowtv.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 5c91acec6..295168432 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -120,10 +120,15 @@ class NowTVIE(InfoExtractor): video_id = compat_str(info['id']) - if info.get('geoblocked'): - raise ExtractorError( - 'Video %s is not available from your location due to geo restriction' % video_id, - expected=True) + files = info['files'] + if 
not files: + if info.get('geoblocked', False): + raise ExtractorError( + 'Video %s is not available from your location due to geo restriction' % video_id, + expected=True) + if not info.get('free', True): + raise ExtractorError( + 'Video %s is not available for free' % video_id, expected=True) f = info.get('format', {}) station = f.get('station') or station @@ -138,7 +143,7 @@ class NowTVIE(InfoExtractor): } formats = [] - for item in info['files']['items']: + for item in files['items']: item_path = remove_start(item['path'], '/') tbr = int_or_none(item['bitrate']) m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) From ff4a1279f2d40fdba3287d4e7949bd8caa89eb04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 May 2015 01:15:04 +0600 Subject: [PATCH 0230/2145] [nowtv] Do not request unnecessary metadata --- youtube_dl/extractor/nowtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 295168432..d39bbde99 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -115,7 +115,7 @@ class NowTVIE(InfoExtractor): station = mobj.group('station') info = self._download_json( - 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files,breakpoints,paymentPaytypes,trailers,pictures' % display_id, + 'https://api.nowtv.de/v3/movies/%s?fields=*,format,files' % display_id, display_id) video_id = compat_str(info['id']) From 9e0b5791281c68e5773555688928184064396011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 May 2015 01:26:14 +0600 Subject: [PATCH 0231/2145] [nowtv] Add test for rtlnitro --- youtube_dl/extractor/nowtv.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index d39bbde99..173e46cd8 100644 --- a/youtube_dl/extractor/nowtv.py +++ 
b/youtube_dl/extractor/nowtv.py @@ -53,6 +53,24 @@ class NowTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + # rtlnitro + 'url': 'http://www.nowtv.de/rtlnitro/alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00/player', + 'info_dict': { + 'id': '165780', + 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', + 'ext': 'mp4', + 'title': 'Hals- und Beinbruch', + 'description': 'md5:b50d248efffe244e6f56737f0911ca57', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1432415400, + 'upload_date': '20150523', + 'duration': 2742, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # superrtl 'url': 'http://www.nowtv.de/superrtl/medicopter-117/angst/player', From f9355dc989362f31d8e21ccb8fa765546a2360f2 Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Thu, 28 May 2015 17:00:09 +0800 Subject: [PATCH 0232/2145] [youku] update youku --- youtube_dl/extractor/youku.py | 248 ++++++++++++++++++++++------------ 1 file changed, 162 insertions(+), 86 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 97b98bbe8..8d86c3f45 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,123 +1,199 @@ # coding: utf-8 - from __future__ import unicode_literals -import math -import random import re -import time +import base64 from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) - +from ..utils import ExtractorError class YoukuIE(InfoExtractor): + IE_NAME = 'youku' _VALID_URL = r'''(?x) (?: http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| youku:) (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) ''' + _TEST = { - 'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html', - 'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b', - 'params': { - 'test': False - }, - 'info_dict': { - 'id': 'XNDgyMDQ2NTQw_part00', - 'ext': 'flv', - 'title': 'youtube-dl test video "\'/\\ä↭𝕐' - } + 
'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', + 'md5': '5f3af4192eabacc4501508d54a8cabd7', + 'info_dict': { + 'id': 'XMTc1ODE5Njcy', + 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', + 'ext': 'flv' + } } - def _gen_sid(self): - nowTime = int(time.time() * 1000) - random1 = random.randint(1000, 1998) - random2 = random.randint(1000, 9999) + def construct_video_urls(self, data1, data2): + # get sid, token + def yk_t(s1, s2): + ls = list(range(256)) + t = 0 + for i in range(256): + t = (t + ls[i] + ord(s1[i%len(s1)])) % 256 + ls[i], ls[t] = ls[t], ls[i] + s, x, y = '', 0, 0 + for i in range(len(s2)): + y = (y + 1) % 256 + x = (x + ls[y]) % 256 + ls[x], ls[y] = ls[y], ls[x] + s += chr((s2[i] ^ ls[(ls[x]+ls[y]) % 256])) + return s - return "%d%d%d" % (nowTime, random1, random2) + sid, token = yk_t( + 'becaf9be', base64.b64decode(bytes(data2['ep'], 'ascii')) + ).split('_') - def _get_file_ID_mix_string(self, seed): - mixed = [] - source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890") - seed = float(seed) - for i in range(len(source)): - seed = (seed * 211 + 30031) % 65536 - index = math.floor(seed / 65536 * len(source)) - mixed.append(source[int(index)]) - source.remove(source[int(index)]) - # return ''.join(mixed) - return mixed + # get oip + oip = data2['ip'] - def _get_file_id(self, fileId, seed): - mixed = self._get_file_ID_mix_string(seed) - ids = fileId.split('*') - realId = [] - for ch in ids: - if ch: - realId.append(mixed[int(ch)]) - return ''.join(realId) + # get fileid + string_ls = list( + 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890') + shuffled_string_ls = [] + seed = data1['seed'] + N = len(string_ls) + for ii in range(N): + seed = (seed * 0xd3 + 0x754f) % 0x10000 + idx = seed * len(string_ls) // 0x10000 + shuffled_string_ls.append(string_ls[idx]) + del string_ls[idx] + + fileid_dict = {} + for format in data1['streamtypes']: + streamfileid = [ + int(i) for i in 
data1['streamfileids'][format].strip('*').split('*')] + fileid = ''.join( + [shuffled_string_ls[i] for i in streamfileid]) + fileid_dict[format] = fileid[:8] + '%s' + fileid[10:] + + def get_fileid(format, n): + fileid = fileid_dict[format] % hex(int(n))[2:].upper().zfill(2) + return fileid + + # get ep + def generate_ep(format, n): + fileid = get_fileid(format, n) + ep_t = yk_t( + 'bf7e5f01', + bytes('%s_%s_%s' % (sid, fileid, token), 'ascii')) + ep = base64.b64encode(bytes(ep_t, 'latin')).decode() + ep = ep.replace('+', '%2B') + ep = ep.replace('/', '%2F') + ep = ep.replace('=', '%2D') + return ep + + # generate video_urls + video_urls_dict = {} + for format in data1['streamtypes']: + video_urls = [] + for dt in data1['segs'][format]: + n = str(int(dt['no'])) + video_url = \ + 'http://k.youku.com/player/getFlvPath/' + \ + 'sid/' + sid + \ + '_' + str(int(n)+1).zfill(2) + \ + '/st/' + self.parse_ext_l(format) + \ + '/fileid/' + get_fileid(format, n) + '?' + \ + 'K=' + str(dt['k']) + \ + '&hd=' + self.get_hd(format) + \ + '&myp=0' + \ + '&ts=' + str(dt['seconds']) + \ + '&ypp=0&ctype=12&ev=1' + \ + '&token=' + str(token) + \ + '&oip=' + str(oip) + \ + '&ep=' + generate_ep(format, n) + video_urls.append(video_url) + video_urls_dict[format] = video_urls + + return video_urls_dict + + def get_hd(self, fm): + hd_id_dict = { + 'flv': '0', + 'mp4': '1', + 'hd2': '2', + 'hd3': '3', + '3gp': '0', + '3gphd': '1' + } + return hd_id_dict[fm] + + def parse_ext_l(self, fm): + ext_dict = { + 'flv': 'flv', + 'mp4': 'mp4', + 'hd2': 'flv', + 'hd3': 'flv', + '3gp': 'flv', + '3gphd': 'mp4', + } + return ext_dict[fm] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id + # request basic data + data1_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id + data2_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id - config 
= self._download_json(info_url, video_id) + raw_data1 = self._download_json(data1_url, video_id) + raw_data2 = self._download_json(data2_url, video_id) + data1 = raw_data1['data'][0] + data2 = raw_data2['data'][0] - error_code = config['data'][0].get('error_code') + error_code = data1.get('error_code') if error_code: # -8 means blocked outside China. - error = config['data'][0].get('error') # Chinese and English, separated by newline. - raise ExtractorError(error or 'Server reported error %i' % error_code, - expected=True) + # Chinese and English, separated by newline. + error = data1.get('error') + raise ExtractorError( + error or 'Server reported error %i' % + error_code, + expected=True) - video_title = config['data'][0]['title'] - seed = config['data'][0]['seed'] + title = data1['title'] - format = self._downloader.params.get('format', None) - supported_format = list(config['data'][0]['streamfileids'].keys()) + # generate video_urls_dict + video_urls_dict = self.construct_video_urls(data1, data2) - # TODO proper format selection - if format is None or format == 'best': - if 'hd2' in supported_format: - format = 'hd2' - else: - format = 'flv' - ext = 'flv' - elif format == 'worst': - format = 'mp4' - ext = 'mp4' - else: - format = 'flv' - ext = 'flv' + # construct info + entries = [] + for fm in data1['streamtypes']: + #formats = [] + video_urls = video_urls_dict[fm] + for i in range(len(video_urls)): + if len(entries) < i+1: + entries.append({'formats': []}) + entries[i]['formats'].append( + { + 'url': video_urls[i], + 'format_id': fm, + 'ext': self.parse_ext_l(fm), + 'filesize': int(data1['segs'][fm][i]['size']) + } + ) - fileid = config['data'][0]['streamfileids'][format] - keys = [s['k'] for s in config['data'][0]['segs'][format]] - # segs is usually a dictionary, but an empty *list* if an error occured. 
- - files_info = [] - sid = self._gen_sid() - fileid = self._get_file_id(fileid, seed) - - # column 8,9 of fileid represent the segment number - # fileid[7:9] should be changed - for index, key in enumerate(keys): - temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:]) - download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) + for i in range(len(entries)): + entries[i].update( + { + 'id': '_part%d' % (i+1), + 'title': title, + } + ) + if len(entries) > 1: info = { - 'id': '%s_part%02d' % (video_id, index), - 'url': download_url, - 'uploader': None, - 'upload_date': None, - 'title': video_title, - 'ext': ext, + '_type': 'multi_video', + 'id': video_id, + 'title': title, + 'entries': entries, } - files_info.append(info) + else: + info = entries[0] + info['id'] = video_id - return files_info + return info From ca45246627f5a67a7c82cd40a11e5c4ff5f68871 Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Thu, 28 May 2015 21:04:58 +0800 Subject: [PATCH 0233/2145] [youku] compatible for python > 3.3 or > 2.7 --- youtube_dl/extractor/youku.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 8d86c3f45..7a07c8a5f 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import sys +pyvs = sys.version_info[0] import re import base64 @@ -34,16 +36,23 @@ class YoukuIE(InfoExtractor): for i in range(256): t = (t + ls[i] + ord(s1[i%len(s1)])) % 256 ls[i], ls[t] = ls[t], ls[i] - s, x, y = '', 0, 0 + s = '' if pyvs == 3 else b'' + x, y = 0, 0 for i in range(len(s2)): y = (y + 1) % 256 x = (x + ls[y]) % 256 ls[x], ls[y] = ls[y], ls[x] - s += chr((s2[i] ^ ls[(ls[x]+ls[y]) % 256])) + if isinstance(s2[i], int): + s += chr(s2[i] ^ ls[(ls[x]+ls[y]) % 256]) + else: + s += chr(ord(s2[i]) ^ 
ls[(ls[x]+ls[y]) % 256]) return s sid, token = yk_t( - 'becaf9be', base64.b64decode(bytes(data2['ep'], 'ascii')) + 'becaf9be', + base64.b64decode(bytes(data2['ep'], 'ascii')) \ + if pyvs == 3 \ + else base64.b64decode(data2['ep']) ).split('_') # get oip @@ -78,8 +87,15 @@ class YoukuIE(InfoExtractor): fileid = get_fileid(format, n) ep_t = yk_t( 'bf7e5f01', - bytes('%s_%s_%s' % (sid, fileid, token), 'ascii')) - ep = base64.b64encode(bytes(ep_t, 'latin')).decode() + bytes('%s_%s_%s' % (sid, fileid, token), 'ascii') \ + if pyvs == 3 \ + else ('%s_%s_%s' % (sid, fileid, token)) + ) + ep = base64.b64encode( + bytes(ep_t, 'latin') \ + if pyvs == 3 \ + else ep_t + ).decode() ep = ep.replace('+', '%2B') ep = ep.replace('/', '%2F') ep = ep.replace('=', '%2D') From 806598b94dec1268566ae71d671116060f7971d6 Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Fri, 29 May 2015 08:21:24 +0800 Subject: [PATCH 0234/2145] [porn91] the one that _search_regex returns not needs to be checked --- youtube_dl/extractor/porn91.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index f3a97df64..b62eec92d 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -31,7 +31,6 @@ class Porn91IE(InfoExtractor): title = self._search_regex( r'<div id="viewvideo-title">(?P<title>.+?)</div>', webpage, 'title', flags=re.DOTALL) - assert title title = title.replace('\n', '') # get real url @@ -41,8 +40,6 @@ class Porn91IE(InfoExtractor): r'so.addVariable\(\'seccode\',\'(?P<n2>.+?)\'', webpage, 'n2') n3 = self._search_regex( r'so.addVariable\(\'max_vid\',\'(?P<n3>\d+)\'', webpage, 'n3') - if not (n1 and n2 and n3): - raise ExtractorError("You are Blocked by Server.") url_params = compat_urllib_parse.urlencode({ 'VID': n1, 'mp4': '1', From 1498940b10a3f43490c05045ebe7a517267a2bff Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Fri, 29 May 2015 10:13:09 +0800 Subject: [PATCH 
0235/2145] [youku] compare bytes and str for compatible; use compat_urllib_parse for making video_url --- youtube_dl/extractor/youku.py | 38 +++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 7a07c8a5f..063f2e10e 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,14 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import sys -pyvs = sys.version_info[0] import re import base64 from .common import InfoExtractor from ..utils import ExtractorError +from ..compat import compat_urllib_parse + +bytes_is_str = (bytes == str) # for compatible + class YoukuIE(InfoExtractor): IE_NAME = 'youku' _VALID_URL = r'''(?x) @@ -36,7 +38,7 @@ class YoukuIE(InfoExtractor): for i in range(256): t = (t + ls[i] + ord(s1[i%len(s1)])) % 256 ls[i], ls[t] = ls[t], ls[i] - s = '' if pyvs == 3 else b'' + s = '' if not bytes_is_str else b'' x, y = 0, 0 for i in range(len(s2)): y = (y + 1) % 256 @@ -51,7 +53,7 @@ class YoukuIE(InfoExtractor): sid, token = yk_t( 'becaf9be', base64.b64decode(bytes(data2['ep'], 'ascii')) \ - if pyvs == 3 \ + if not bytes_is_str \ else base64.b64decode(data2['ep']) ).split('_') @@ -88,17 +90,14 @@ class YoukuIE(InfoExtractor): ep_t = yk_t( 'bf7e5f01', bytes('%s_%s_%s' % (sid, fileid, token), 'ascii') \ - if pyvs == 3 \ + if not bytes_is_str \ else ('%s_%s_%s' % (sid, fileid, token)) ) ep = base64.b64encode( bytes(ep_t, 'latin') \ - if pyvs == 3 \ + if not bytes_is_str \ else ep_t ).decode() - ep = ep.replace('+', '%2B') - ep = ep.replace('/', '%2F') - ep = ep.replace('=', '%2D') return ep # generate video_urls @@ -107,20 +106,25 @@ class YoukuIE(InfoExtractor): video_urls = [] for dt in data1['segs'][format]: n = str(int(dt['no'])) + param = { + 'K': dt['k'], + 'hd': self.get_hd(format), + 'myp': 0, + 'ts': dt['seconds'], + 'ypp': 0, + 'ctype': 12, + 'ev': 1, + 'token': token, + 'oip': oip, + 'ep': 
generate_ep(format, n) + } video_url = \ 'http://k.youku.com/player/getFlvPath/' + \ 'sid/' + sid + \ '_' + str(int(n)+1).zfill(2) + \ '/st/' + self.parse_ext_l(format) + \ '/fileid/' + get_fileid(format, n) + '?' + \ - 'K=' + str(dt['k']) + \ - '&hd=' + self.get_hd(format) + \ - '&myp=0' + \ - '&ts=' + str(dt['seconds']) + \ - '&ypp=0&ctype=12&ev=1' + \ - '&token=' + str(token) + \ - '&oip=' + str(oip) + \ - '&ep=' + generate_ep(format, n) + compat_urllib_parse.urlencode(param) video_urls.append(video_url) video_urls_dict[format] = video_urls From 84e1e036c2cb7311cdea14763bec3322403a8d54 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 May 2015 12:44:31 +0800 Subject: [PATCH 0236/2145] [senate] Extend _VALID_URL (fixes #5836) --- youtube_dl/extractor/senateisvp.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index d3b8a1be4..9c53704ea 100644 --- a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor): ["arch", "", "http://ussenate-f.akamaihd.net/"] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'http://www\.senate\.gov/isvp/\?(?P<qs>.+)' + _VALID_URL = r'http://www\.senate\.gov/isvp/?\?(?P<qs>.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { @@ -72,12 +72,16 @@ class SenateISVPIE(InfoExtractor): 'ext': 'mp4', 'title': 'Integrated Senate Video Player' } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, }] @staticmethod def _search_iframe_url(webpage): mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/\?[^'\"]+)['\"]", 
+ r"<iframe[^>]+src=['\"](?P<url>http://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", webpage) if mobj: return mobj.group('url') From eb6cb9fbe934fe99a35af22065cf91063d416c12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 29 May 2015 07:52:17 +0200 Subject: [PATCH 0237/2145] release 2015.05.29 --- README.md | 4 ++-- docs/supportedsites.md | 9 ++++----- youtube_dl/version.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index e51bb5343..f3d83c89f 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ which means you can modify it, redistribute it or use it however you like. --no-progress Do not print progress bar --console-title Display progress in console titlebar -v, --verbose Print various debugging information - --dump-pages Print downloaded pages to debug problems (very verbose) + --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) --write-pages Write downloaded intermediary pages to files in the current directory to debug problems --print-traffic Display sent and read HTTP traffic -C, --call-home Contact the youtube-dl server for debugging @@ -220,7 +220,7 @@ which means you can modify it, redistribute it or use it however you like. --embed-thumbnail Embed thumbnail in the audio as cover art --add-metadata Write metadata to the video file --metadata-from-title FORMAT Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed - parameters replace existing values. Additional templates: %(album), %(artist). Example: --metadata-from-title "%(artist)s - + parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) --fixup POLICY Automatically correct known faults of the file. 
One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a4879bd9a..a421ae62b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -26,8 +26,7 @@ - **anitube.se** - **AnySex** - **Aparat** - - **AppleDailyAnimationNews** - - **AppleDailyRealtimeNews** + - **AppleDaily** - **AppleTrailers** - **archive.org**: archive.org videos - **ARD** @@ -152,7 +151,6 @@ - **fc2** - **fernsehkritik.tv** - **fernsehkritik.tv:postecke** - - **Firedrive** - **Firstpost** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) @@ -230,6 +228,7 @@ - **KanalPlay**: Kanal 5/9/11 Play - **Kankan** - **Karaoketv** + - **KarriereVideos** - **keek** - **KeezMovies** - **KhanAcademy** @@ -322,6 +321,7 @@ - **NosVideo** - **novamov**: NovaMov - **Nowness** + - **NowTV** - **nowvideo**: NowVideo - **npo.nl** - **npo.nl:live** @@ -393,7 +393,6 @@ - **Rte** - **rtl.nl**: rtl.nl and rtlxl.nl - **RTL2** - - **RTLnow** - **RTP** - **RTS**: RTS.ch - **rtve.es:alacarta**: RTVE a la carta @@ -431,7 +430,6 @@ - **smotri:community**: Smotri.com community videos - **smotri:user**: Smotri.com user videos - **Snotr** - - **Sockshare** - **Sohu** - **soundcloud** - **soundcloud:playlist** @@ -564,6 +562,7 @@ - **vier:videos** - **Viewster** - **viki** + - **viki:channel** - **vimeo** - **vimeo:album** - **vimeo:channel** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b33385153..653710131 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.05.20' +__version__ = '2015.05.29' From d6aa68ce75ef4f4d27fbf7103edfda1f92ba70b2 Mon Sep 17 00:00:00 2001 From: pulpe <Pulpan3@gmail.com> Date: Fri, 29 May 2015 12:47:20 +0200 Subject: [PATCH 0238/2145] [postprocessor/embedthumbnail] embed mp4 too (fixes #5840) --- youtube_dl/postprocessor/embedthumbnail.py | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index 8f825f785..774494efd 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -49,7 +49,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) - elif info['ext'] == 'm4a': + elif info['ext'] in ['m4a', 'mp4']: if not check_executable('AtomicParsley', ['-v']): raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') @@ -82,6 +82,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.remove(encodeFilename(filename)) os.rename(encodeFilename(temp_filename), encodeFilename(filename)) else: - raise EmbedThumbnailPPError('Only mp3 and m4a are supported for thumbnail embedding for now.') + raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.') return [], info From 605ec701b7b4cd120a9acb33bfcc4306719b59b4 Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Fri, 29 May 2015 23:32:04 +0800 Subject: [PATCH 0239/2145] [iqiyi] Add new extractor for iqiyi.com --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/iqiyi.py | 214 +++++++++++++++++++++++++++++++ 2 files changed, 215 insertions(+) create mode 100644 youtube_dl/extractor/iqiyi.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 80c9cb107..85c1b1a3a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -229,6 +229,7 @@ from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE +from .iqiyi import IqiyiIE from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py new file mode 100644 index 000000000..d96d13225 --- /dev/null +++ 
b/youtube_dl/extractor/iqiyi.py @@ -0,0 +1,214 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..compat import ( + compat_chr, + compat_parse_qs, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, + compat_str, +) + +from ..utils import ExtractorError + +import re +import time +import json +import uuid +import math +import random +import zlib +import hashlib + +class IqiyiIE(InfoExtractor): + IE_NAME = 'iqiyi' + + _VALID_URL = r'http://(?:www\.)iqiyi.com/.+?\.html' + + _TEST = { + 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', + 'md5': '260f0f59686e65e886995d0ba791ab83', + 'info_dict': { + 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'title': '美国德州空中惊现奇异云团 酷似UFO', + 'ext': 'f4v' + } + } + + def construct_video_urls(self, data, video_id, _uuid): + def do_xor(x, y): + a = y % 3 + if a == 1: + return x ^ 121 + if a == 2: + return x ^ 72 + return x ^ 103 + + def get_encode_code(l): + a = 0 + b = l.split('-') + c = len(b) + s = '' + for i in range(c - 1, -1, -1): + a = do_xor(int(b[c-i-1], 16), i) + s += chr(a) + return s[::-1] + + def get_path_key(x): + mg = ')(*&^flash@#$%a' + tm = self._download_json( + 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id)['t'] + t = str(int(math.floor(int(tm)/(600.0)))) + return hashlib.md5( + (t+mg+x).encode('utf8')).hexdigest() + + video_urls_dict = {} + for i in data['vp']['tkl'][0]['vs']: + if 0 < int(i['bid']) <= 10: + format_id = self.get_format(i['bid']) + + video_urls_info = i['fs'] + if not i['fs'][0]['l'].startswith('/'): + t = get_encode_code(i['fs'][0]['l']) + if t.endswith('mp4'): + video_urls_info = i['flvs'] + + video_urls = [] + for ii in video_urls_info: + vl = ii['l'] + if not vl.startswith('/'): + vl = get_encode_code(vl) + key = get_path_key( + vl.split('/')[-1].split('.')[0]) + filesize = ii['b'] + base_url = data['vp']['du'].split('/') + base_url.insert(-1, key) + base_url = '/'.join(base_url) + param = { + 'su': _uuid, 
+ 'qyid': uuid.uuid4().hex, + 'client': '', + 'z': '', + 'bt': '', + 'ct': '', + 'tn': str(int(time.time())) + } + api_video_url = base_url + vl + '?' + \ + compat_urllib_parse.urlencode(param) + js = self._download_json(api_video_url, video_id) + video_url = js['l'] + video_urls.append( + (video_url, filesize)) + + video_urls_dict[format_id] = video_urls + return video_urls_dict + + def get_format(self, bid): + bid_dict = { + '1': 'standard', + '2': 'high', + '3': 'super', + '4': 'suprt-high', + '5': 'fullhd', + '10': '4k' + } + return bid_dict[str(bid)] + + def get_raw_data(self, tvid, video_id, enc_key, _uuid): + tm = str(int(time.time())) + param = { + 'key': 'fvip', + 'src': hashlib.md5(b'youtube-dl').hexdigest(), + 'tvId': tvid, + 'vid': video_id, + 'vinfo': 1, + 'tm': tm, + 'enc': hashlib.md5( + (enc_key + tm + tvid).encode('utf8')).hexdigest(), + 'qyid': _uuid, + 'tn': random.random(), + 'um': 0, + 'authkey': hashlib.md5( + (tm + tvid).encode('utf8')).hexdigest() + } + + api_url = 'http://cache.video.qiyi.com/vms' + '?' 
+ \ + compat_urllib_parse.urlencode(param) + raw_data = self._download_json(api_url, video_id) + return raw_data + + def get_enc_key(self, swf_url, video_id): + req = self._request_webpage( + swf_url, video_id, note='download swf content') + cn = req.read() + cn = zlib.decompress(cn[8:]) + pt = re.compile(b'MixerRemote\x08(?P<enc_key>.+?)\$&vv') + enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8') + return enc_key + + def _real_extract(self, url): + webpage = self._download_webpage( + url, 'temp_id', note='download video page') + tvid = self._search_regex( + r'tvId ?= ?(\'|\")(?P<tvid>\d+)', webpage, 'tvid', flags=re.I, group='tvid') + video_id = self._search_regex( + r'videoId ?= ?(\'|\")(?P<video_id>[a-z\d]+)', + webpage, 'video_id', flags=re.I, group='video_id') + swf_url = self._search_regex( + r'(?P<swf>http://.+?MainPlayer.+?\.swf)', webpage, 'swf') + _uuid = uuid.uuid4().hex + + enc_key = self.get_enc_key(swf_url, video_id) + + raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) + assert raw_data['code'] == 'A000000' + if not raw_data['data']['vp']['tkl']: + raise ExtractorError('No support iQiqy VIP video') + + data = raw_data['data'] + + title = data['vi']['vn'] + + # generate video_urls_dict + video_urls_dict = self.construct_video_urls(data, video_id, _uuid) + + # construct info + entries = [] + for format_id in video_urls_dict: + video_urls = video_urls_dict[format_id] + for i, video_url_info in enumerate(video_urls): + if len(entries) < i+1: + entries.append({'formats': []}) + entries[i]['formats'].append( + { + 'url': video_url_info[0], + 'filesize': video_url_info[-1], + 'format_id': format_id, + } + ) + + for i in range(len(entries)): + entries[i].update( + { + 'id': '_part%d' % (i+1), + 'title': title, + } + ) + + if len(entries) > 1: + info = { + '_type': 'multi_video', + 'id': video_id, + 'title': title, + 'entries': entries, + } + else: + info = entries[0] + info['id'] = video_id + info['title'] = title + + return info From 
08f7db20c16743a2bd3040eb7dac11d675011eef Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Sat, 30 May 2015 10:03:32 +0800 Subject: [PATCH 0240/2145] [youku] change format_id --- youtube_dl/extractor/youku.py | 37 +++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 063f2e10e..aed6b960a 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -132,26 +132,37 @@ class YoukuIE(InfoExtractor): def get_hd(self, fm): hd_id_dict = { - 'flv': '0', - 'mp4': '1', - 'hd2': '2', - 'hd3': '3', - '3gp': '0', - '3gphd': '1' + 'flv' : '0', + 'mp4' : '1', + 'hd2' : '2', + 'hd3' : '3', + '3gp' : '0', + '3gphd' : '1' } return hd_id_dict[fm] def parse_ext_l(self, fm): ext_dict = { - 'flv': 'flv', - 'mp4': 'mp4', - 'hd2': 'flv', - 'hd3': 'flv', - '3gp': 'flv', - '3gphd': 'mp4', + 'flv' : 'flv', + 'mp4' : 'mp4', + 'hd2' : 'flv', + 'hd3' : 'flv', + '3gp' : 'flv', + '3gphd' : 'mp4' } return ext_dict[fm] + def get_format_name(self, fm): + _dict = { + '3gp' : 'h6', + '3gphd' : 'h5', + 'flv' : 'h4', + 'mp4' : 'h3', + 'hd2' : 'h2', + 'hd3' : 'h1' + } + return _dict[fm] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -191,7 +202,7 @@ class YoukuIE(InfoExtractor): entries[i]['formats'].append( { 'url': video_urls[i], - 'format_id': fm, + 'format_id': self.get_format_name(fm), 'ext': self.parse_ext_l(fm), 'filesize': int(data1['segs'][fm][i]['size']) } From 670861bd206ab4063baeb6b80d06a054ce4e1d62 Mon Sep 17 00:00:00 2001 From: PeterDing <dfhayst@gmail.com> Date: Sat, 30 May 2015 10:37:54 +0800 Subject: [PATCH 0241/2145] [iqiyi] Do not request for unneeded formats --- youtube_dl/extractor/iqiyi.py | 72 ++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index d96d13225..747f3f902 
100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -4,20 +4,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_chr, - compat_parse_qs, - compat_urllib_parse, - compat_urllib_request, - compat_urlparse, - compat_str, -) +from ..compat import compat_urllib_parse from ..utils import ExtractorError import re import time -import json import uuid import math import random @@ -31,15 +23,15 @@ class IqiyiIE(InfoExtractor): _TEST = { 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '260f0f59686e65e886995d0ba791ab83', + 'md5': '2cb594dc2781e6c941a110d8f358118b', 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v' + 'ext': 'f4v', } } - def construct_video_urls(self, data, video_id, _uuid): + def construct_video_urls(self, data, video_id, _uuid, bid): def do_xor(x, y): a = y % 3 if a == 1: @@ -66,10 +58,21 @@ class IqiyiIE(InfoExtractor): return hashlib.md5( (t+mg+x).encode('utf8')).hexdigest() + # get accept format + # getting all format will spend minutes for a big video. 
+ if bid == 'best': + bids = [int(i['bid']) for i in data['vp']['tkl'][0]['vs'] \ + if 0 < int(i['bid']) <= 10] + bid = str(max(bids)) + video_urls_dict = {} for i in data['vp']['tkl'][0]['vs']: if 0 < int(i['bid']) <= 10: format_id = self.get_format(i['bid']) + else: + continue + + video_urls = [] video_urls_info = i['fs'] if not i['fs'][0]['l'].startswith('/'): @@ -77,7 +80,12 @@ class IqiyiIE(InfoExtractor): if t.endswith('mp4'): video_urls_info = i['flvs'] - video_urls = [] + if int(i['bid']) != int(bid): # ignore missing match format + video_urls.extend( + [('http://example.com/v.flv', ii['b']) for ii in video_urls_info]) + video_urls_dict[format_id] = video_urls + continue + for ii in video_urls_info: vl = ii['l'] if not vl.startswith('/'): @@ -108,15 +116,27 @@ class IqiyiIE(InfoExtractor): return video_urls_dict def get_format(self, bid): - bid_dict = { - '1': 'standard', - '2': 'high', - '3': 'super', - '4': 'suprt-high', - '5': 'fullhd', - '10': '4k' + _dict = { + '1' : 'h6', + '2' : 'h5', + '3' : 'h4', + '4' : 'h3', + '5' : 'h2', + '10' : 'h1' } - return bid_dict[str(bid)] + return _dict.get(str(bid), None) + + def get_bid(self, format_id): + _dict = { + 'h6' : '1', + 'h5' : '2', + 'h4' : '3', + 'h3' : '4', + 'h2' : '5', + 'h1' : '10', + 'best' : 'best' + } + return _dict.get(format_id, None) def get_raw_data(self, tvid, video_id, enc_key, _uuid): tm = str(int(time.time())) @@ -173,8 +193,14 @@ class IqiyiIE(InfoExtractor): title = data['vi']['vn'] + format = self._downloader.params.get('format', None) + bid = self.get_bid(format) if format else 'best' + if not bid: + raise ExtractorError('Can\'t get format.') + # generate video_urls_dict - video_urls_dict = self.construct_video_urls(data, video_id, _uuid) + video_urls_dict = self.construct_video_urls( + data, video_id, _uuid, bid) # construct info entries = [] @@ -188,10 +214,12 @@ class IqiyiIE(InfoExtractor): 'url': video_url_info[0], 'filesize': video_url_info[-1], 'format_id': format_id, + 
'preference': int(self.get_bid(format_id)) } ) for i in range(len(entries)): + self._sort_formats(entries[i]['formats']) entries[i].update( { 'id': '_part%d' % (i+1), From fafec39d4177f4873bb2393749a46873c4ffda4a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 30 May 2015 13:23:09 +0800 Subject: [PATCH 0242/2145] [spiegeltv] Changed RTMP server (fixes #5788 and fixes #5843) Thanks to @brickleroux for finding out the problem --- youtube_dl/extractor/spiegeltv.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 98cf92d89..359722ad6 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -51,9 +51,9 @@ class SpiegeltvIE(InfoExtractor): is_wide = media_json['is_wide'] server_json = self._download_json( - 'http://www.spiegel.tv/streaming_servers/', video_id, - note='Downloading server information') - server = server_json[0]['endpoint'] + 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', + video_id, note='Downloading server information') + server = server_json['streamingserver'][0]['endpoint'] thumbnails = [] for image in media_json['images']: @@ -76,5 +76,6 @@ class SpiegeltvIE(InfoExtractor): 'ext': 'm4v', 'description': description, 'duration': duration, - 'thumbnails': thumbnails + 'thumbnails': thumbnails, + 'rtmp_live': True, } From 6ebdfe43e439239df051f6071a23c51705c150cf Mon Sep 17 00:00:00 2001 From: pulpe <Pulpan3@gmail.com> Date: Sat, 30 May 2015 09:30:14 +0200 Subject: [PATCH 0243/2145] [tube8] fix extractor (fixes #5846) --- youtube_dl/extractor/tube8.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index d73ad3762..6ca8840b0 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor): webpage = 
self._download_webpage(req, display_id) flashvars = json.loads(self._html_search_regex( - r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars')) + r'flashvars\s*=\s*({.+?})', webpage, 'flashvars')) video_url = flashvars['video_url'] if flashvars.get('encrypted') is True: @@ -58,19 +58,19 @@ class Tube8IE(InfoExtractor): thumbnail = flashvars.get('image_url') title = self._html_search_regex( - r'videotitle\s*=\s*"([^"]+)', webpage, 'title') + r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( - r'>Description:</strong>(.+?)<', webpage, 'description', fatal=False) + r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False) uploader = self._html_search_regex( - r'<strong class="video-username">(?:<a href="[^"]+">)?([^<]+)(?:</a>)?</strong>', + r'<span class="username">\s*(.+?)\s*<', webpage, 'uploader', fatal=False) like_count = int_or_none(self._html_search_regex( - r"rupVar\s*=\s*'(\d+)'", webpage, 'like count', fatal=False)) + r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) dislike_count = int_or_none(self._html_search_regex( - r"rdownVar\s*=\s*'(\d+)'", webpage, 'dislike count', fatal=False)) + r'rdownVar\s*=\s*"(\d+)"', webpage, 'dislike count', fatal=False)) view_count = self._html_search_regex( - r'<strong>Views: </strong>([\d,\.]+)</li>', webpage, 'view count', fatal=False) + r'<strong>Views: </strong>([\d,\.]+)\s*</li>', webpage, 'view count', fatal=False) if view_count: view_count = str_to_int(view_count) comment_count = self._html_search_regex( From 0385d642232ba4e8b455d0c4eb95c7985f22f276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 14:12:58 +0600 Subject: [PATCH 0244/2145] [crunchyroll] Extract subtitles extraction routine --- youtube_dl/extractor/crunchyroll.py | 30 +++++++++++++---------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py 
b/youtube_dl/extractor/crunchyroll.py index 1c77df47e..4ac537a6d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor): self._login() def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(data) - iv = bytes_to_intlist(iv) + data = bytes_to_intlist(base64.b64decode(data)) + iv = bytes_to_intlist(base64.b64decode(iv)) id = int(id) def obfuscate_key_aux(count, modulo, start): @@ -179,6 +179,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output + def _extract_subtitles(self, subtitle): + sub_root = xml.etree.ElementTree.fromstring(subtitle) + return [{ + 'ext': 'srt', + 'data': self._convert_subtitles_to_srt(sub_root), + }, { + 'ext': 'ass', + 'data': self._convert_subtitles_to_ass(sub_root), + }] + def _get_subtitles(self, video_id, webpage): subtitles = {} for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage): @@ -190,25 +200,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False) if not id or not iv or not data: continue - id = int(id) - iv = base64.b64decode(iv) - data = base64.b64decode(data) - subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8') lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) if not lang_code: continue - sub_root = xml.etree.ElementTree.fromstring(subtitle) - subtitles[lang_code] = [ - { - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, - { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }, - ] + subtitles[lang_code] = self._extract_subtitles(subtitle) return subtitles def _real_extract(self, url): From b2cf6543b21bbe0954c45b35b1402eaca5187c0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 
14:30:04 +0600 Subject: [PATCH 0245/2145] [soompi] Improve and simplify --- youtube_dl/extractor/soompi.py | 146 ++++++++++++++++++--------------- 1 file changed, 81 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py index 4726872dc..5da66ca9e 100644 --- a/youtube_dl/extractor/soompi.py +++ b/youtube_dl/extractor/soompi.py @@ -2,17 +2,31 @@ from __future__ import unicode_literals import re -import json -import base64 -import xml.etree.ElementTree -# Soompi uses the same subtitle encryption as crunchyroll from .crunchyroll import CrunchyrollIE +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + remove_start, + xpath_text, +) -class SoompiIE(CrunchyrollIE): + +class SoompiBaseIE(InfoExtractor): + def _get_episodes(self, webpage, episode_filter=None): + episodes = self._parse_json( + self._search_regex( + r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), + None) + return list(filter(episode_filter, episodes)) + + +class SoompiIE(SoompiBaseIE, CrunchyrollIE): IE_NAME = 'soompi' - _VALID_URL = r'^https?://tv\.soompi\.com/en/watch/(?P<id>[0-9]+)' + _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://tv.soompi.com/en/watch/29235', 'info_dict': { @@ -26,84 +40,86 @@ class SoompiIE(CrunchyrollIE): }, }] - def _get_episodes(self, webpage, episode_filter=None): - episodes = json.loads( - self._search_regex(r'\s+VIDEOS\s+= (\[.+?\]);', webpage, "episodes meta")) - return [ep for ep in episodes if episode_filter is None or episode_filter(ep)] - - def _get_subtitles(self, video_id, show_format_xml): - subtitles = {} - subtitle_info_nodes = show_format_xml.findall('./{default}preload/subtitles/subtitle') - subtitle_nodes = show_format_xml.findall('./{default}preload/subtitle') + def _get_episode(self, webpage, video_id): + return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] + def 
_get_subtitles(self, config, video_id): sub_langs = {} - for i in subtitle_info_nodes: - sub_langs[i.attrib["id"]] = i.attrib["title"] + for subtitle in config.findall('./{default}preload/subtitles/subtitle'): + sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - for s in subtitle_nodes: - lang_code = sub_langs.get(s.attrib["id"], None) - if lang_code is None: + subtitles = {} + for s in config.findall('./{default}preload/subtitle'): + lang_code = sub_langs.get(s.attrib['id']) + if not lang_code: + continue + sub_id = s.get('id') + data = xpath_text(s, './data', 'data') + iv = xpath_text(s, './iv', 'iv') + if not id or not iv or not data: continue - - sub_id = int(s.attrib["id"]) - iv = base64.b64decode(s.find("iv").text) - data = base64.b64decode(s.find("data").text) subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') - sub_root = xml.etree.ElementTree.fromstring(subtitle) - - subtitles[lang_code] = [{ - 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root) - }, { - 'ext': 'ass', 'data': self._convert_subtitles_to_ass(sub_root) - }] + subtitles[lang_code] = self._extract_subtitles(subtitle) return subtitles def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note="Downloading episode page", - errnote="Video may not be available for your location") - vid_formats = re.findall(r"\?quality=q([0-9]+)", webpage) - - show_meta = json.loads( - self._search_regex(r'\s+var show = (\{.+?\});', webpage, "show meta")) - episodes = self._get_episodes( - webpage, episode_filter=lambda x: x['id'] == video_id) - - title = episodes[0]["name"] - description = episodes[0]["description"] - duration = int(episodes[0]["duration"]) - slug = show_meta["slug"] + try: + webpage = self._download_webpage( + url, video_id, 'Downloading episode page') + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + webpage = ee.cause.read() + block_message = 
self._html_search_regex( + r'(?s)<div class="block-message">(.+?)</div>', webpage, + 'block message', default=None) + if block_message: + raise ExtractorError(block_message, expected=True) + raise formats = [] - show_format_xml = None - for vf in vid_formats: - show_format_url = "http://tv.soompi.com/en/show/%s/%s-config.xml?mode=hls&quality=q%s" \ - % (slug, video_id, vf) - show_format_xml = self._download_xml( - show_format_url, video_id, note="Downloading q%s show xml" % vf) - avail_formats = self._extract_m3u8_formats( - show_format_xml.find('./{default}preload/stream_info/file').text, - video_id, ext="mp4", m3u8_id=vf, preference=int(vf)) - formats.extend(avail_formats) + config = None + for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): + config = self._download_xml( + 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), + video_id, 'Downloading %s XML' % format_id) + m3u8_url = xpath_text( + config, './{default}preload/stream_info/file', + '%s m3u8 URL' % format_id) + if not m3u8_url: + continue + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id=format_id)) self._sort_formats(formats) - subtitles = self.extract_subtitles(video_id, show_format_xml) + episode = self._get_episode(webpage, video_id) + + title = episode['name'] + description = episode.get('description') + duration = int_or_none(episode.get('duration')) + + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail_url, + } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] + + subtitles = self.extract_subtitles(config, video_id) return { 'id': video_id, 'title': title, 'description': description, + 'thumbnails': thumbnails, 'duration': duration, 'formats': formats, 'subtitles': subtitles } -class SoompiShowIE(SoompiIE): +class SoompiShowIE(SoompiBaseIE): IE_NAME = 'soompi:show' - _VALID_URL = r'^https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' + _VALID_URL = 
r'https?://tv\.soompi\.com/en/shows/(?P<id>[0-9a-zA-Z\-_]+)' _TESTS = [{ 'url': 'http://tv.soompi.com/en/shows/liar-game', 'info_dict': { @@ -117,14 +133,14 @@ class SoompiShowIE(SoompiIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id, note="Downloading show page") - title = self._og_search_title(webpage).replace("SoompiTV | ", "") + webpage = self._download_webpage( + url, show_id, 'Downloading show page') + + title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') description = self._og_search_description(webpage) - episodes = self._get_episodes(webpage) - entries = [] - for ep in episodes: - entries.append(self.url_result( - 'http://tv.soompi.com/en/watch/%s' % ep['id'], 'Soompi', ep['id'])) + entries = [ + self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') + for episode in self._get_episodes(webpage)] return self.playlist_result(entries, show_id, title, description) From 1a5b77dc21384c462e0be86a1638cafd15a6e236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 14:36:45 +0600 Subject: [PATCH 0246/2145] [crunchyroll] Fix python 3.2 --- youtube_dl/extractor/crunchyroll.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4ac537a6d..41f0c736d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -76,8 +76,8 @@ class CrunchyrollIE(InfoExtractor): self._login() def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(base64.b64decode(data)) - iv = bytes_to_intlist(base64.b64decode(iv)) + data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) + iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) id = int(id) def obfuscate_key_aux(count, modulo, start): From 5c2191a6053cb5b1210cef68406e8a52e86fd9fd Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 15:14:10 +0600 Subject: [PATCH 0247/2145] [vgtv] Skip wasLive hds (Closes #5835) --- youtube_dl/extractor/vgtv.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index e6ee1e471..654298431 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -107,7 +107,8 @@ class VGTVIE(InfoExtractor): hls_url, video_id, 'mp4', m3u8_id='hls')) hds_url = streams.get('hds') - if hds_url: + # wasLive hds are always 404 + if hds_url and data.get('streamType') != 'wasLive': formats.extend(self._extract_f4m_formats( hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds')) From 4d454c5e4b7ecfae97ff109e05453f43d7cea0a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 15:15:42 +0600 Subject: [PATCH 0248/2145] [vgtv] Check for inactive videos --- youtube_dl/extractor/vgtv.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 654298431..472feb7f0 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import float_or_none +from ..utils import ( + ExtractorError, + float_or_none, +) class VGTVIE(InfoExtractor): @@ -97,6 +100,10 @@ class VGTVIE(InfoExtractor): % (host, video_id, HOST_WEBSITES[host]), video_id, 'Downloading media JSON') + if data.get('status') == 'inactive': + raise ExtractorError( + 'Video %s is no longer available' % video_id, expected=True) + streams = data['streamUrls'] formats = [] From 181c7053e377700c1615bdff2b0fb19235762c57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 16:04:44 +0600 Subject: [PATCH 0249/2145] [YoutubeDL] Make sure all 
formats have unique format_id --- youtube_dl/YoutubeDL.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index d1953c18f..21d247f23 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1047,6 +1047,8 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + formats_dict = {} + # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): if 'url' not in format: @@ -1054,6 +1056,18 @@ class YoutubeDL(object): if format.get('format_id') is None: format['format_id'] = compat_str(i) + format_id = format['format_id'] + if format_id not in formats_dict: + formats_dict[format_id] = [] + formats_dict[format_id].append(format) + + # Make sure all formats have unique format_id + for format_id, ambiguous_formats in formats_dict.items(): + if len(ambiguous_formats) > 1: + for i, format in enumerate(ambiguous_formats): + format['format_id'] = '%s-%d' % (format_id, i) + + for i, format in enumerate(formats): if format.get('format') is None: format['format'] = '{id} - {res}{note}'.format( id=format['format_id'], From b4dd98358f0a68650f6154e8de4e12b8881248aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 16:12:07 +0600 Subject: [PATCH 0250/2145] [vgtv] Properly handle lives --- youtube_dl/extractor/vgtv.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 472feb7f0..f38a72fde 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -62,16 +62,16 @@ class VGTVIE(InfoExtractor): }, { # streamType: live - 'url': 'http://www.vgtv.no/#!/live/100015/direkte-her-kan-du-se-laksen-live-fra-suldalslaagen', + 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', 'info_dict': { - 'id': '100015', + 'id': '113063', 'ext': 'flv', - 
'title': 'DIREKTE: Her kan du se laksen live fra Suldalslågen!', - 'description': 'md5:9a60cc23fa349f761628924e56eeec2d', + 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 0, - 'timestamp': 1407423348, - 'upload_date': '20140807', + 'timestamp': 1432975582, + 'upload_date': '20150530', 'view_count': int, }, 'params': { @@ -105,6 +105,7 @@ class VGTVIE(InfoExtractor): 'Video %s is no longer available' % video_id, expected=True) streams = data['streamUrls'] + stream_type = data.get('streamType') formats = [] @@ -115,7 +116,7 @@ class VGTVIE(InfoExtractor): hds_url = streams.get('hds') # wasLive hds are always 404 - if hds_url and data.get('streamType') != 'wasLive': + if hds_url and stream_type != 'wasLive': formats.extend(self._extract_f4m_formats( hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds')) @@ -143,13 +144,14 @@ class VGTVIE(InfoExtractor): return { 'id': video_id, - 'title': data['title'], + 'title': self._live_title(data['title']), 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], 'formats': formats, + 'is_live': True if stream_type == 'live' else False, } From e6e63e91a70a9c2e4ab92b8afad6fac3b8bede18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 16:18:11 +0600 Subject: [PATCH 0251/2145] [tf1] Extend _VALID_URL (Closes #5848) --- youtube_dl/extractor/tf1.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 656410528..3a68eaa80 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class TF1IE(InfoExtractor): """TF1 uses the 
wat.tv player.""" - _VALID_URL = r'http://(?:videos\.tf1|www\.tfou|www\.tf1)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' + _VALID_URL = r'http://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/.*?-(?P<id>\d+)(?:-\d+)?\.html' _TESTS = [{ 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'info_dict': { @@ -35,6 +35,9 @@ class TF1IE(InfoExtractor): }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', 'only_matching': True, + }, { + 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', + 'only_matching': True, }] def _real_extract(self, url): From 5196b988971716b9e9c5884d33c757a41aa4548a Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 30 May 2015 14:16:18 +0300 Subject: [PATCH 0252/2145] [tubitv] Add new extractor (Closes #5524) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tubitv.py | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/tubitv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f73bf646b..e7e0a55f2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -569,6 +569,7 @@ from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE from .trutube import TruTubeIE from .tube8 import Tube8IE +from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE from .tunein import TuneInIE diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py new file mode 100644 index 000000000..03e971e5e --- /dev/null +++ b/youtube_dl/extractor/tubitv.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import codecs +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse, + compat_urllib_request +) +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class 
TubiTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tubitv\.com/video\?id=(?P<id>[0-9]+)' + _LOGIN_URL = 'http://tubitv.com/login' + _NETRC_MACHINE = 'tubitv' + _TEST = { + 'url': 'http://tubitv.com/video?id=54411&title=The_Kitchen_Musical_-_EP01', + 'info_dict': { + 'id': '54411', + 'ext': 'mp4', + 'title': 'The Kitchen Musical - EP01', + 'thumbnail': 're:^https?://.*\.png$', + 'description': 'md5:37532716166069b353e8866e71fefae7', + 'duration': 2407, + }, + 'params': { + 'skip_download': 'HLS download', + }, + } + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + form_data = { + 'username': username, + 'password': password, + } + payload = compat_urllib_parse.urlencode(form_data).encode('utf-8') + request = compat_urllib_request.Request(self._LOGIN_URL, payload) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + login_page = self._download_webpage( + request, None, False, 'Wrong login info') + if not re.search(r'id="tubi-logout"', login_page): + raise ExtractorError( + 'Login failed (invalid username/password)', expected=True) + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration')) + + apu = self._search_regex(r"apu='([^']+)'", webpage, 'apu') + m3u8_url = codecs.decode(apu, 'rot_13')[::-1] + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'description': description, + 'duration': duration, + } From 1ae7ff771b17d16540aa446aef4f10971465a249 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis 
<njonaitis@gmail.com> Date: Sat, 30 May 2015 14:33:27 +0300 Subject: [PATCH 0253/2145] [tubitv] Add error message for videos that require login (#5524) --- youtube_dl/extractor/tubitv.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 03e971e5e..2c4b21807 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -59,6 +59,11 @@ class TubiTvIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): + raise ExtractorError( + 'This video requires login, use --username and --password ' + 'options to provide account credentials.', expected=True) + title = self._og_search_title(webpage) description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) From 386bdfa698a7f06c43df91913677db3732e29900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 18:29:16 +0600 Subject: [PATCH 0254/2145] [youtube:user] Workaround 35 pages limitation (Closes #5778) --- youtube_dl/extractor/youtube.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0301682b8..fcdbfe0bc 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1399,6 +1399,26 @@ class YoutubeChannelIE(InfoExtractor): channel_id = self._match_id(url) url = self._TEMPLATE_URL % channel_id + + # Channel by page listing is restricted to 35 pages of 30 items, i.e. 
1050 videos total (see #5778) + # Workaround by extracting as a playlist if managed to obtain channel playlist URL + # otherwise fallback on channel by page extraction + channel_page = self._download_webpage( + url + '?view=57', channel_id, + 'Downloading channel page', fatal=False) + channel_playlist_id = self._search_regex( + [r'<meta itemprop="channelId" content="([^"]+)">', + r'data-channel-external-id="([^"]+)"'], + channel_page, 'channel id', default=None) + if channel_playlist_id and channel_playlist_id.startswith('UC'): + playlist_id = 'UU' + channel_playlist_id[2:] + channel_playlist = unescapeHTML(self._search_regex( + r'href="/?(watch\?v=[0-9A-Za-z_-]{11}&list=%s)"' % playlist_id, + channel_page, 'channel playlist URL', default=None)) + if channel_playlist: + return self.url_result( + compat_urlparse.urljoin(url, '/%s' % channel_playlist), 'YoutubePlaylist') + channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) class="[^"]*?(?: From 9ff811c5cddbf3481fdcd44e97cf3683a925b33f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 30 May 2015 23:35:55 +0800 Subject: [PATCH 0255/2145] [porn91] PEP8 --- youtube_dl/extractor/porn91.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index b62eec92d..cdf308f3d 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -5,7 +5,6 @@ import re from ..compat import compat_urllib_parse from .common import InfoExtractor -from ..utils import ExtractorError class Porn91IE(InfoExtractor): @@ -13,13 +12,13 @@ class Porn91IE(InfoExtractor): _VALID_URL = r'(?:https?://)(?:www\.|)91porn\.com/.+?\?viewkey=(?P<id>[\w\d]+)' _TEST = { - 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', - 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', - 'info_dict': { - 'id': '7e42283b4f5ab36da134', - 'title': 
'18岁大一漂亮学妹,水嫩性感,再爽一次!', - 'ext': 'mp4' - } + 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', + 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', + 'info_dict': { + 'id': '7e42283b4f5ab36da134', + 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', + 'ext': 'mp4' + } } def _real_extract(self, url): From 1c2223875664f99325b73fe7765677db9b87e105 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 31 May 2015 00:03:19 +0800 Subject: [PATCH 0256/2145] [porn91] Simplify --- youtube_dl/extractor/porn91.py | 38 ++++++++++++++-------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index cdf308f3d..377ca2c77 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,8 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from ..compat import compat_urllib_parse from .common import InfoExtractor @@ -22,38 +20,34 @@ class Porn91IE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') webpage = self._download_webpage(url, video_id, "get HTML content") title = self._search_regex( - r'<div id="viewvideo-title">(?P<title>.+?)</div>', - webpage, 'title', flags=re.DOTALL) + r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title') title = title.replace('\n', '') # get real url - n1 = self._search_regex( - r'so.addVariable\(\'file\',\'(?P<n1>\d+)\'', webpage, 'n1') - n2 = self._search_regex( - r'so.addVariable\(\'seccode\',\'(?P<n2>.+?)\'', webpage, 'n2') - n3 = self._search_regex( - r'so.addVariable\(\'max_vid\',\'(?P<n3>\d+)\'', webpage, 'n3') + file_id = self._search_regex( + r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') + sec_code = self._search_regex( + 
r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') + max_vid = self._search_regex( + r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') url_params = compat_urllib_parse.urlencode({ - 'VID': n1, + 'VID': file_id, 'mp4': '1', - 'seccode': n2, - 'max_vid': n3, + 'seccode': sec_code, + 'max_vid': max_vid, }) - t_url = 'http://91porn.com/getfile.php?' + url_params - info_cn = self._download_webpage(t_url, video_id, "get real video_url") - video_url = self._search_regex(r'file=(?P<url>http.+?)&', info_cn, 'url') + info_cn = self._download_webpage( + 'http://91porn.com/getfile.php?' + url_params, video_id, + "get real video url") + video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') - # construct info - info = { + return { 'id': video_id, 'title': title, 'url': video_url, } - - return info From a80601f8d9789e27c0a916e63d7192c3f398d5d5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 31 May 2015 00:20:37 +0800 Subject: [PATCH 0257/2145] [porn91] Extract more info --- youtube_dl/extractor/porn91.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 377ca2c77..ea1efc71b 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals from ..compat import compat_urllib_parse from .common import InfoExtractor +from ..utils import ( + parse_duration, + int_or_none, +) class Porn91IE(InfoExtractor): @@ -15,7 +19,8 @@ class Porn91IE(InfoExtractor): 'info_dict': { 'id': '7e42283b4f5ab36da134', 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', - 'ext': 'mp4' + 'ext': 'mp4', + 'duration': 431, } } @@ -46,8 +51,16 @@ class Porn91IE(InfoExtractor): "get real video url") video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') + duration = parse_duration(self._search_regex( + r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) + + comment_count 
= int_or_none(self._search_regex( + r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) + return { 'id': video_id, 'title': title, 'url': video_url, + 'duration': duration, + 'comment_count': comment_count, } From d05a1dbe7013d6314ec477b50d864726e509a872 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 31 May 2015 00:26:12 +0800 Subject: [PATCH 0258/2145] [porn91] Catch daily limit error --- youtube_dl/extractor/porn91.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index ea1efc71b..c119c7e94 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( parse_duration, int_or_none, + ExtractorError, ) @@ -29,6 +30,10 @@ class Porn91IE(InfoExtractor): url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') webpage = self._download_webpage(url, video_id, "get HTML content") + + if '作为游客,你每天只可观看10个视频' in webpage: + raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) + title = self._search_regex( r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title') title = title.replace('\n', '') From a2d971309b75c79f3f688a0c381707d828cb1026 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 31 May 2015 00:31:18 +0800 Subject: [PATCH 0259/2145] [porn91] Use single quotes --- youtube_dl/extractor/porn91.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index c119c7e94..72d1b2718 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -29,7 +29,7 @@ class Porn91IE(InfoExtractor): video_id = self._match_id(url) url = 'http://91porn.com/view_video.php?viewkey=%s' % video_id self._set_cookie('91porn.com', 'language', 'cn_CN') - webpage = 
self._download_webpage(url, video_id, "get HTML content") + webpage = self._download_webpage(url, video_id, 'get HTML content') if '作为游客,你每天只可观看10个视频' in webpage: raise ExtractorError('91 Porn says: Daily limit 10 videos exceeded', expected=True) @@ -53,7 +53,7 @@ class Porn91IE(InfoExtractor): }) info_cn = self._download_webpage( 'http://91porn.com/getfile.php?' + url_params, video_id, - "get real video url") + 'get real video url') video_url = self._search_regex(r'file=([^&]+)&', info_cn, 'url') duration = parse_duration(self._search_regex( From 931bc3c3a719fe33101c05b9fdc4e6ad8eb08bdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 22:52:02 +0600 Subject: [PATCH 0260/2145] [YoutubeDL] Do not loose request method information --- youtube_dl/YoutubeDL.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 21d247f23..5fc8754c6 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -49,6 +49,7 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, + HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -1720,7 +1721,8 @@ class YoutubeDL(object): if req_is_string: req = url_escaped else: - req = compat_urllib_request.Request( + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req = req_type( url_escaped, data=req.data, headers=req.headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) From 339516072be6865bf7e9316be81704ae69296c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 30 May 2015 23:16:14 +0600 Subject: [PATCH 0261/2145] [extractor/generic] Unescape video_id and title extracted from URL --- youtube_dl/extractor/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 
9a7b0d25d..c9c92d686 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -894,7 +894,7 @@ class GenericIE(InfoExtractor): force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + video_id = os.path.splitext(compat_urllib_parse.unquote(url.rstrip('/').split('/')[-1]))[0] self.to_screen('%s: Requesting header' % video_id) @@ -927,7 +927,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0], 'direct': True, 'formats': [{ 'format_id': m.group('format_id'), @@ -953,7 +953,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(url_basename(url))[0], + 'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0], 'direct': True, 'url': url, 'upload_date': upload_date, From 58bde34a236ff98f25fc109a94b3d393f0bbc9ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 00:44:54 +0600 Subject: [PATCH 0262/2145] [extractor/generic] Force Accept-Encoding to any for extraction pass --- youtube_dl/extractor/generic.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c9c92d686..ec1d9abbe 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,6 +11,7 @@ from ..compat import ( compat_urllib_parse, compat_urlparse, compat_xml_parse_error, + compat_urllib_request, ) from ..utils import ( determine_ext, @@ -916,7 +917,9 @@ class GenericIE(InfoExtractor): full_response = None if head_response is False: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + 
request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) head_response = full_response # Check for direct link to a video @@ -941,7 +944,17 @@ class GenericIE(InfoExtractor): self._downloader.report_warning('Falling back on generic information extractor.') if not full_response: - full_response = self._request_webpage(url, video_id) + request = compat_urllib_request.Request(url) + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to youtube-dl default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. + # It may probably better to solve this by checking Content-Type for application/octet-stream + # after HEAD request finishes, but not sure if we can rely on this. + request.add_header('Accept-Encoding', '*') + full_response = self._request_webpage(request, video_id) # Maybe it's a direct link to a video? # Be careful not to download the whole thing! 
From 1ddb9456c4a63a207ec40bd74cdf0b36d8c68409 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 01:23:58 +0600 Subject: [PATCH 0263/2145] [extractor/generic] Use compat_urllib_parse_unquote for unquoting video_id and title from URL --- youtube_dl/extractor/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec1d9abbe..d9116ce10 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,9 +9,10 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_request, compat_urlparse, compat_xml_parse_error, - compat_urllib_request, ) from ..utils import ( determine_ext, @@ -895,7 +896,7 @@ class GenericIE(InfoExtractor): force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: - video_id = os.path.splitext(compat_urllib_parse.unquote(url.rstrip('/').split('/')[-1]))[0] + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) self.to_screen('%s: Requesting header' % video_id) @@ -930,7 +931,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'formats': [{ 'format_id': m.group('format_id'), @@ -966,7 +967,7 @@ class GenericIE(InfoExtractor): head_response.headers.get('Last-Modified')) return { 'id': video_id, - 'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0], + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, 'url': url, 'upload_date': upload_date, From a074e922967fa571d4f1abb1773c711747060f00 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 02:13:24 +0600 Subject: [PATCH 0264/2145] [extractor/generic] Add test for large compressed media --- youtube_dl/extractor/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d9116ce10..737141f95 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -138,6 +138,20 @@ class GenericIE(InfoExtractor): 'upload_date': '20100513', } }, + # Direct link to a media delivered compressed (requires Accept-Encoding == *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', From c5fa81fe81ce05cd81c20ff4ea6dac3dccdcbf9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 02:22:29 +0600 Subject: [PATCH 0265/2145] [extractor/generic] Put all direct link tests near to each other for better navigation --- youtube_dl/extractor/generic.py | 182 ++++++++++++++++---------------- 1 file changed, 91 insertions(+), 91 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 737141f95..8f2e53063 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,6 +48,97 @@ class GenericIE(InfoExtractor): _VALID_URL = r'.*' IE_NAME = 'generic' _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 
'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to a media delivered compressed (requires Accept-Encoding == *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented' + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' 
+ ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': 're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -127,31 +218,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, - # Direct link to a video - { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', - 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } - }, - # Direct link to a media delivered compressed (requires Accept-Encoding == *) - { - 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', - 'md5': '128c42e68b13950268b648275386fc74', - 'info_dict': { - 'id': 'FictionJunction-Parallel_Hearts', - 'ext': 'flac', - 'title': 'FictionJunction-Parallel_Hearts', - 'upload_date': '20140522', - }, - 'expected_warnings': [ - 'URL 
could be a direct video link, returning it as such.' - ] - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -176,22 +242,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, - # google redirect - { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', - 'info_dict': { - 'id': 'cmQHVoWB5FY', - 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', - }, - 'params': { - 'skip_download': False, - } - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -423,16 +473,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # RSS feed - { - 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', - 'title': 'Zero Punctuation', - 'description': 're:.*groundbreaking video review series.*' - }, - 'playlist_mincount': 11, - }, # Multiple brightcove videos # https://github.com/rg3/youtube-dl/issues/2283 { @@ -486,21 +526,6 @@ class GenericIE(InfoExtractor): 'uploader': 'thoughtworks.wistia.com', }, }, - # Direct download with broken HEAD - { - 'url': 'http://ai-radio.org:8000/radio.opus', - 'info_dict': { - 'id': 'radio', - 'ext': 'opus', - 'title': 'radio', - }, - 'params': { - 'skip_download': True, # infinite live stream - }, - 'expected_warnings': [ - r'501.*Not Implemented' - ], - }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', @@ -532,21 +557,6 @@ class 
GenericIE(InfoExtractor): }, 'playlist_mincount': 2, }, - # Direct link with incorrect MIME type - { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'md5': '4ccbebe5f36706d85221f204d7eb5913', - 'info_dict': { - 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', - 'id': '5_Lennart_Poettering_-_Systemd', - 'ext': 'webm', - 'title': '5_Lennart_Poettering_-_Systemd', - 'upload_date': '20141120', - }, - 'expected_warnings': [ - 'URL could be a direct video link, returning it as such.' - ] - }, # Cinchcast embed { 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', @@ -705,16 +715,6 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - # RSS feed with enclosure - { - 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', - 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', From c5138a7ce49db19b64adc11d81384595b966a7a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 02:36:20 +0600 Subject: [PATCH 0266/2145] [extractor/generic] Clarify test comment --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8f2e53063..96ca398de 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -59,7 +59,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20100513', } }, - # Direct link to a media delivered compressed (requires Accept-Encoding == *) + # Direct link to media delivered compressed (until Accept-Encoding is *) { 'url': 
'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', 'md5': '128c42e68b13950268b648275386fc74', From d2a9de78dfc629aaaaf8a2a30432d5f02c949e9a Mon Sep 17 00:00:00 2001 From: Ivan Kozik <ivan@ludios.org> Date: Sat, 30 May 2015 20:50:22 +0000 Subject: [PATCH 0267/2145] [youtube] Construct a playlist URL in case the page is missing one This fixes jumping from user/channel -> playlist for some users like https://www.youtube.com/user/BitcoinFoundation This also removes the superfluous log message "add --no-playlist to just download video VIDEOID" when downloading a user/channel. --- youtube_dl/extractor/youtube.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fcdbfe0bc..aacb999ce 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1412,12 +1412,8 @@ class YoutubeChannelIE(InfoExtractor): channel_page, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] - channel_playlist = unescapeHTML(self._search_regex( - r'href="/?(watch\?v=[0-9A-Za-z_-]{11}&list=%s)"' % playlist_id, - channel_page, 'channel playlist URL', default=None)) - if channel_playlist: - return self.url_result( - compat_urlparse.urljoin(url, '/%s' % channel_playlist), 'YoutubePlaylist') + return self.url_result( + compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') autogenerated = re.search(r'''(?x) From eb47569f8a6017190d73429b3ef54c1ffaf201dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 03:00:13 +0600 Subject: [PATCH 0268/2145] [tvigle] Add support for m3u8 --- youtube_dl/extractor/tvigle.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/tvigle.py 
b/youtube_dl/extractor/tvigle.py index 102362b29..4e95bd30f 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, parse_age_limit, ) @@ -24,17 +25,17 @@ class TvigleIE(InfoExtractor): 'display_id': 'sokrat', 'ext': 'flv', 'title': 'Сократ', - 'description': 'md5:a05bd01be310074d5833efc6743be95e', + 'description': 'md5:d6b92ffb7217b4b8ebad2e7665253c17', 'duration': 6586, - 'age_limit': 0, + 'age_limit': 12, }, }, { 'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/', - 'md5': 'd9012d7c7c598fe7a11d7fb46dc1f574', + 'md5': 'e7efe5350dd5011d0de6550b53c3ba7b', 'info_dict': { 'id': '5142516', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Ведущий телепрограммы «60 минут» (США) о Владимире Высоцком', 'description': 'md5:027f7dc872948f14c96d19b4178428a4', 'duration': 186.080, @@ -54,7 +55,7 @@ class TvigleIE(InfoExtractor): if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._html_search_regex( - r'<li class="video-preview current_playing" id="(\d+)">', + r'class="video-preview current_playing" id="(\d+)">', webpage, 'video id') video_data = self._download_json( @@ -70,13 +71,19 @@ class TvigleIE(InfoExtractor): formats = [] for vcodec, fmts in item['videos'].items(): - for quality, video_url in fmts.items(): + for format_id, video_url in fmts.items(): + if format_id == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=vcodec)) + continue + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) formats.append({ 'url': video_url, - 'format_id': '%s-%s' % (vcodec, quality), + 'format_id': '%s-%s' % (vcodec, format_id), 'vcodec': vcodec, - 'height': int(quality[:-1]), - 'filesize': item['video_files_size'][vcodec][quality], + 'height': int_or_none(height), + 'filesize': 
item['video_files_size'][vcodec][format_id], }) self._sort_formats(formats) From 7584e38ce4e98e0e9abca146a513d215701308e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 03:01:41 +0600 Subject: [PATCH 0269/2145] [tvigle] Modernize --- youtube_dl/extractor/tvigle.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index 4e95bd30f..a85693888 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -64,8 +64,8 @@ class TvigleIE(InfoExtractor): item = video_data['playlist']['items'][0] title = item['title'] - description = item['description'] - thumbnail = item['thumbnail'] + description = item.get('description') + thumbnail = item.get('thumbnail') duration = float_or_none(item.get('durationMilliseconds'), 1000) age_limit = parse_age_limit(item.get('ageRestrictions')) @@ -83,7 +83,7 @@ class TvigleIE(InfoExtractor): 'format_id': '%s-%s' % (vcodec, format_id), 'vcodec': vcodec, 'height': int_or_none(height), - 'filesize': item['video_files_size'][vcodec][format_id], + 'filesize': int_or_none(item.get('video_files_size', {}).get(vcodec, {}).get(format_id)), }) self._sort_formats(formats) From df15ef8dab6df79eb076b3d06b3948917763ac3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 04:05:09 +0600 Subject: [PATCH 0270/2145] [YoutubeDL] Tweak select_format for video only media --- youtube_dl/YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5fc8754c6..aa6ec9d9a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -924,8 +924,9 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: return audiovideo_formats[format_idx] - # for audio only urls, select the best/worst audio format - elif 
all(f.get('acodec') != 'none' for f in available_formats): + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in available_formats) or + all(f.get('vcodec') != 'none' for f in available_formats)): return available_formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ From 96b9690985e9b9f4e50fde10bbc92e1a72df64e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 31 May 2015 04:05:26 +0600 Subject: [PATCH 0271/2145] [imgur] Improve extraction --- youtube_dl/extractor/imgur.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index fe5d95e2c..d692ea79a 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( int_or_none, js_to_json, @@ -12,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)(?:\.mp4|\.gifv)?' 
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -34,7 +35,8 @@ class ImgurIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + compat_urlparse.urljoin(url, video_id), video_id) width = int_or_none(self._search_regex( r'<param name="width" value="([0-9]+)"', From 47fd8c2f761c2073744cb041f9eccb7ed10f2470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 1 Jun 2015 00:04:36 +0600 Subject: [PATCH 0272/2145] [patreon] Fix embeds extraction (Closes #5862) --- youtube_dl/extractor/patreon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index f179ea200..6cdc2638b 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -87,7 +87,7 @@ class PatreonIE(InfoExtractor): r'<div class="attach"><a target="_blank" href="([^"]+)">', webpage, 'attachment URL', default=None) embed = self._html_search_regex( - r'<div id="watchCreation">\s*<iframe class="embedly-embed" src="([^"]+)"', + r'<div[^>]+id="watchCreation"[^>]*>\s*<iframe[^>]+src="([^"]+)"', webpage, 'embedded URL', default=None) if attach_fn is not None: From 4053ee9104cd7669f749a267dc2c2a1725ca188b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 1 Jun 2015 14:43:20 +0800 Subject: [PATCH 0273/2145] Credit @PeterDing for 91porn extractor (#5830) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 3410e1fb9..bf2a25cb8 100644 --- a/AUTHORS +++ b/AUTHORS @@ -126,3 +126,4 @@ Matthias Küch Julian Richen Ping O. 
Mister Hat +Peter Ding From 866b296d0f156831cceccc967c34382a90b77422 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Mon, 1 Jun 2015 16:11:19 +0300 Subject: [PATCH 0274/2145] [aftonbladet] Fix extraction and update _VALID_URL (Fixes #5863) --- youtube_dl/extractor/aftonbladet.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index a117502bc..e0518cf26 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -6,11 +6,11 @@ from ..utils import int_or_none class AftonbladetIE(InfoExtractor): - _VALID_URL = r'http://tv\.aftonbladet\.se/webbtv.+?(?P<id>article[0-9]+)\.ab(?:$|[?#])' + _VALID_URL = r'http://tv\.aftonbladet\.se/abtv/articles/(?P<id>[0-9]+)' _TEST = { - 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'url': 'http://tv.aftonbladet.se/abtv/articles/36015', 'info_dict': { - 'id': 'article36015', + 'id': '36015', 'ext': 'mp4', 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', 'description': 'Jupiters måne mest aktiv av alla himlakroppar', @@ -25,8 +25,9 @@ class AftonbladetIE(InfoExtractor): # find internal video meta data meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' - internal_meta_id = self._html_search_regex( - r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + player_config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) + internal_meta_id = player_config['videoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') From 923e79e2e4d9cc0c24496614aab520737cdc89ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 2 Jun 2015 00:53:04 +0600 Subject: [PATCH 0275/2145] [nova] Add extractor --- 
youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nova.py | 135 +++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 youtube_dl/extractor/nova.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4dc07efe0..67eb96057 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -352,6 +352,7 @@ from .ninegag import NineGagIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE +from .nova import NovaIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowtv import NowTVIE diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py new file mode 100644 index 000000000..e93a7ffa8 --- /dev/null +++ b/youtube_dl/extractor/nova.py @@ -0,0 +1,135 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class NovaIE(InfoExtractor): + IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' + _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+)(?:\.html|/?)' + _TESTS = [{ + 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html', + 'info_dict': { + 'id': '1608920', + 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', + 'ext': 'flv', + 'title': 'Duel: Michal Hrdlička a Petr Suchoň', + 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html', + 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', + 'info_dict': { + 'id': '1757139', + 'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', + 'ext': 'mp4', + 
'title': 'Podzemní nemocnice v pražské Krči', + 'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + } + }, { + 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove/', + 'info_dict': { + 'id': '1756825', + 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', + 'ext': 'flv', + 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', + 'description': 'md5:d804ba6b30bc7da2705b1fea961bddfe', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', + 'only_matching': True, + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'only_matching': True, + }, { + 'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', + 'only_matching': True, + }, { + 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + site = mobj.group('site') + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + [r"(?:media|video_id)\s*:\s*'(\d+)'", + r'media=(\d+)', + r'id="article_video_(\d+)"', + r'id="player_(\d+)"'], + webpage, 'video id') + + config_url = self._search_regex( + r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', + webpage, 'config url', default=None) + + if not config_url: + DEFAULT_SITE_ID = '23000' + SITES = { + 'tvnoviny': DEFAULT_SITE_ID, + 'novaplus': DEFAULT_SITE_ID, + 'vymena': DEFAULT_SITE_ID, + 'krasna': DEFAULT_SITE_ID, + 'fanda': '30', + 'tn': '30', + 'doma': '30', + } + + site_id = self._search_regex( + 
r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID) + + config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig' + % (site_id, video_id)) + + config = self._download_json( + config_url, display_id, + 'Downloading config JSON', + transform_source=lambda s: re.sub(r'var\s+[\da-zA-Z_]+\s*=\s*({.+?});', r'\1', s)) + + mediafile = config['mediafile'] + video_url = mediafile['src'] + + m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) + if m: + formats = [{ + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', + 'ext': 'flv', + }] + else: + formats = [{ + 'url': video_url, + }] + self._sort_formats(formats) + + title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = config.get('poster') + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } From 60158217ef8da5f44ef316e50c8a5e2ac2e202c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 2 Jun 2015 00:57:08 +0600 Subject: [PATCH 0276/2145] [nova] Add tv test --- youtube_dl/extractor/nova.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index e93a7ffa8..4e999b237 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -60,6 +60,9 @@ class NovaIE(InfoExtractor): }, { 'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', 'only_matching': True, + }, { + 'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', + 'only_matching': True, }] def _real_extract(self, url): From 9f4b9118ccaef5cd7c414a78c5622968e8c3343f Mon 
Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 12:47:52 +0300 Subject: [PATCH 0277/2145] [nova] Fix display_id extraction bug Make id group non-greedy so that .html is not included in it. --- youtube_dl/extractor/nova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 4e999b237..1dd18511e 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+)(?:\.html|/?)' + _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/?)$' _TESTS = [{ 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html', 'info_dict': { From 9464a194dbf48989c486fa2de9e1aebc59e28ed4 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 12:52:39 +0300 Subject: [PATCH 0278/2145] [nova] Fix extension extraction bug Replace the hardcoded flv with determine_ext. Let rtmpdump parse the url. 
--- youtube_dl/extractor/nova.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 1dd18511e..fd5f9cb0e 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import determine_ext class NovaIE(InfoExtractor): @@ -39,7 +40,7 @@ class NovaIE(InfoExtractor): 'info_dict': { 'id': '1756825', 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', 'description': 'md5:d804ba6b30bc7da2705b1fea961bddfe', 'thumbnail': 're:^https?://.*\.(?:jpg)', @@ -108,21 +109,8 @@ class NovaIE(InfoExtractor): mediafile = config['mediafile'] video_url = mediafile['src'] - - m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) - if m: - formats = [{ - 'url': m.group('url'), - 'app': m.group('app'), - 'play_path': m.group('playpath'), - 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', - 'ext': 'flv', - }] - else: - formats = [{ - 'url': video_url, - }] - self._sort_formats(formats) + ext = determine_ext(video_url) + video_url = video_url.replace('&{}:'.format(ext), '') title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) description = self._og_search_description(webpage) @@ -134,5 +122,6 @@ class NovaIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, - 'formats': formats, + 'url': video_url, + 'ext': ext, } From fcb04bcaca1b83cd3f13f494d7d775e35e0b6182 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 12:55:41 +0300 Subject: [PATCH 0279/2145] [nova] Extract upload_date in some cases --- youtube_dl/extractor/nova.py | 39 ++++++++++++++++++++++++++++++++++++ 1 file changed, 39 
insertions(+) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index fd5f9cb0e..30c64aaf8 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -49,6 +49,33 @@ class NovaIE(InfoExtractor): # rtmp download 'skip_download': True, } + }, { + 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', + 'info_dict': { + 'id': '1756858', + 'ext': 'mp4', + 'title': 'Televizní noviny - 30. 5. 2015', + 'thumbnail': 're:^https?://.*\.(?:jpg)', + 'upload_date': '20150530', + }, + 'params': { + # rtmp download + 'skip_download': True, + } + }, { + 'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', + 'info_dict': { + 'id': '1753621', + 'ext': 'mp4', + 'title': 'Zaklínač 3: Divoký hon', + 'description': 're:.*Pokud se stejně jako my nemůžete.*', + 'thumbnail': 're:https?://.*\.jpg(\?.*)?', + 'upload_date': '20150521', + }, + 'params': { + # rtmp download + 'skip_download': True, + } }, { 'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 'only_matching': True, @@ -116,11 +143,23 @@ class NovaIE(InfoExtractor): description = self._og_search_description(webpage) thumbnail = config.get('poster') + mobj = None + if site == 'novaplus': + mobj = re.search(r'(?P<day>\d{1,2})-(?P<month>\d{1,2})-(?P<year>\d{4})$', display_id) + if site == 'fanda': + mobj = re.search( + r'<span class="date_time">(?P<day>\d{1,2})\.(?P<month>\d{1,2})\.(?P<year>\d{4})\b', webpage) + if mobj: + upload_date = '{}{:02d}{:02d}'.format(mobj.group('year'), int(mobj.group('month')), int(mobj.group('day'))) + else: + upload_date = None + return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, + 'upload_date': upload_date, 'thumbnail': thumbnail, 'url': video_url, 'ext': ext, From 34c0f95db273ac5e7a7f8a6d23a3f90ceadf4695 Mon Sep 17 00:00:00 2001 From: Slava 
Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 12:56:36 +0300 Subject: [PATCH 0280/2145] [nova] Remove html tags from description --- youtube_dl/extractor/nova.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 30c64aaf8..140312f9c 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import clean_html, determine_ext class NovaIE(InfoExtractor): @@ -42,7 +42,7 @@ class NovaIE(InfoExtractor): 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', 'ext': 'mp4', 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', - 'description': 'md5:d804ba6b30bc7da2705b1fea961bddfe', + 'description': 'md5:dc24e50be5908df83348e50d1431295e', 'thumbnail': 're:^https?://.*\.(?:jpg)', }, 'params': { @@ -140,7 +140,7 @@ class NovaIE(InfoExtractor): video_url = video_url.replace('&{}:'.format(ext), '') title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) - description = self._og_search_description(webpage) + description = clean_html(self._og_search_description(webpage, default=None)) thumbnail = config.get('poster') mobj = None From a00234f1c517d077a237da576be638fef980d79e Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 12:57:03 +0300 Subject: [PATCH 0281/2145] [nova] Minor style improvement --- youtube_dl/extractor/nova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 140312f9c..10957e5fa 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -132,7 +132,7 @@ class NovaIE(InfoExtractor): config = self._download_json( config_url, display_id, 'Downloading config JSON', - transform_source=lambda s: 
re.sub(r'var\s+[\da-zA-Z_]+\s*=\s*({.+?});', r'\1', s)) + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) mediafile = config['mediafile'] video_url = mediafile['src'] From bc03e58565d99677a643e0a058d25c7ee9b265d6 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 13:16:58 +0300 Subject: [PATCH 0282/2145] [iprima] Update --- youtube_dl/extractor/iprima.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 8529bedfc..f3536893a 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -15,7 +15,7 @@ from ..utils import ( class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://play\.iprima\.cz/[^?#]+/(?P<id>[^?#]+)' + _VALID_URL = r'https?://play\.iprima\.cz/([^/]+/)*(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -23,7 +23,7 @@ class IPrimaIE(InfoExtractor): 'id': '39152', 'ext': 'flv', 'title': 'Partička (92)', - 'description': 'md5:3740fda51464da35a2d4d0670b8e4fd6', + 'description': 'md5:db00b9bc10ffd26fb148fa6a3a67c40b', 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { @@ -35,13 +35,16 @@ class IPrimaIE(InfoExtractor): 'id': '9718337', 'ext': 'flv', 'title': 'Tchibo Partička - Jarní móda', - 'description': 'md5:589f8f59f414220621ff8882eb3ce7be', + 'description': 'md5:db00b9bc10ffd26fb148fa6a3a67c40b', 'thumbnail': 're:^http:.*\.jpg$', }, 'params': { 'skip_download': True, # requires rtmpdump }, 'skip': 'Do not have permission to access this page', + }, { + 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015', + 'only_matching': True, }] def _real_extract(self, url): @@ -102,7 +105,7 @@ class IPrimaIE(InfoExtractor): return { 'id': real_id, - 'title': self._og_search_title(webpage), + 'title': self._og_search_title(webpage).replace(' | Prima PLAY', ''), 'thumbnail': 
self._og_search_thumbnail(webpage), 'formats': formats, 'description': self._og_search_description(webpage), From b5597738d4de35fd6f2be7bf1cb6a32c754d873f Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 17:28:14 +0300 Subject: [PATCH 0283/2145] [iprima] Comply with review --- youtube_dl/extractor/iprima.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index f3536893a..502507551 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -11,11 +11,12 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + remove_end, ) class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://play\.iprima\.cz/([^/]+/)*(?P<id>[^?#]+)' + _VALID_URL = r'https?://play\.iprima\.cz/(?:[^/]+/)*(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://play.iprima.cz/particka/particka-92', @@ -105,7 +106,7 @@ class IPrimaIE(InfoExtractor): return { 'id': real_id, - 'title': self._og_search_title(webpage).replace(' | Prima PLAY', ''), + 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'description': self._og_search_description(webpage), From d23da75b32e02963f988bad962b3f5259e4a6d31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 2 Jun 2015 21:10:18 +0600 Subject: [PATCH 0284/2145] [iprima] Fix description extraction `og:description` does not contain actual description anymore. 
--- youtube_dl/extractor/iprima.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index 502507551..821c8ec10 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -24,7 +24,7 @@ class IPrimaIE(InfoExtractor): 'id': '39152', 'ext': 'flv', 'title': 'Partička (92)', - 'description': 'md5:db00b9bc10ffd26fb148fa6a3a67c40b', + 'description': 'md5:74e9617e51bca67c3ecfb2c6f9766f45', 'thumbnail': 'http://play.iprima.cz/sites/default/files/image_crops/image_620x349/3/491483_particka-92_image_620x349.jpg', }, 'params': { @@ -36,13 +36,11 @@ class IPrimaIE(InfoExtractor): 'id': '9718337', 'ext': 'flv', 'title': 'Tchibo Partička - Jarní móda', - 'description': 'md5:db00b9bc10ffd26fb148fa6a3a67c40b', 'thumbnail': 're:^http:.*\.jpg$', }, 'params': { 'skip_download': True, # requires rtmpdump }, - 'skip': 'Do not have permission to access this page', }, { 'url': 'http://play.iprima.cz/zpravy-ftv-prima-2752015', 'only_matching': True, @@ -109,5 +107,7 @@ class IPrimaIE(InfoExtractor): 'title': remove_end(self._og_search_title(webpage), ' | Prima PLAY'), 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, - 'description': self._og_search_description(webpage), + 'description': self._search_regex( + r'<p[^>]+itemprop="description"[^>]*>([^<]+)', + webpage, 'description', default=None), } From 4b5fe1349f5568f3b9b939520db0a1ddc598b4b3 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 18:15:05 +0300 Subject: [PATCH 0285/2145] [nova] Comply with review --- youtube_dl/extractor/nova.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 10957e5fa..4a2d76506 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common 
import InfoExtractor -from ..utils import clean_html, determine_ext +from ..utils import ( + clean_html, + determine_ext, + unified_strdate, +) class NovaIE(InfoExtractor): @@ -143,14 +147,12 @@ class NovaIE(InfoExtractor): description = clean_html(self._og_search_description(webpage, default=None)) thumbnail = config.get('poster') - mobj = None if site == 'novaplus': - mobj = re.search(r'(?P<day>\d{1,2})-(?P<month>\d{1,2})-(?P<year>\d{4})$', display_id) - if site == 'fanda': - mobj = re.search( - r'<span class="date_time">(?P<day>\d{1,2})\.(?P<month>\d{1,2})\.(?P<year>\d{4})\b', webpage) - if mobj: - upload_date = '{}{:02d}{:02d}'.format(mobj.group('year'), int(mobj.group('month')), int(mobj.group('day'))) + upload_date = unified_strdate(self._search_regex( + r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) + elif site == 'fanda': + upload_date = unified_strdate(self._search_regex( + r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) else: upload_date = None From 08b7968e2873b45dafe465ec04541db8fcd4967d Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 17:49:15 +0300 Subject: [PATCH 0286/2145] [nova] Fix display_id extraction bug --- youtube_dl/extractor/nova.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 4a2d76506..8360a65d9 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -13,9 +13,9 @@ from ..utils import ( class NovaIE(InfoExtractor): IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' - _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/?)$' + _VALID_URL = 'http://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' _TESTS = [{ - 
'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html', + 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus', 'info_dict': { 'id': '1608920', 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou', @@ -29,7 +29,7 @@ class NovaIE(InfoExtractor): 'skip_download': True, } }, { - 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html', + 'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', 'md5': '1dd7b9d5ea27bc361f110cd855a19bd3', 'info_dict': { 'id': '1757139', @@ -40,7 +40,7 @@ class NovaIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.(?:jpg)', } }, { - 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove/', + 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove', 'info_dict': { 'id': '1756825', 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', From b0cda32f726443d464a68a34b22a2e02ef8b29b0 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 18:17:33 +0300 Subject: [PATCH 0287/2145] [nova] Fix Python 2.6 compatability issue --- youtube_dl/extractor/nova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 8360a65d9..7e3498eea 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -141,7 +141,7 @@ class NovaIE(InfoExtractor): mediafile = config['mediafile'] video_url = mediafile['src'] ext = determine_ext(video_url) - video_url = video_url.replace('&{}:'.format(ext), '') + video_url = video_url.replace('&%s:' % ext, '') title = mediafile.get('meta', {}).get('title') 
or self._og_search_title(webpage) description = clean_html(self._og_search_description(webpage, default=None)) From fa971259e69a8031c384754b6238cfff71bea773 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Tue, 2 Jun 2015 19:09:47 +0300 Subject: [PATCH 0288/2145] [nova] Add a comment about html in description --- youtube_dl/extractor/nova.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 7e3498eea..85253b6ed 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -46,7 +46,7 @@ class NovaIE(InfoExtractor): 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', 'ext': 'mp4', 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově', - 'description': 'md5:dc24e50be5908df83348e50d1431295e', + 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags 'thumbnail': 're:^https?://.*\.(?:jpg)', }, 'params': { From 23dd1fc74c36329ed40855301ac499a0ad2a0009 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 3 Jun 2015 10:21:03 +0800 Subject: [PATCH 0289/2145] [vidme] Always use the non-embedded page For example, https://vid.me/Wmur contains more information than https://vid.me/e/Wmur --- youtube_dl/extractor/vidme.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index bd953fb4c..e0b55078b 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -10,7 +10,7 @@ from ..utils import ( class VidmeIE(InfoExtractor): _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)' - _TEST = { + _TESTS = [{ 'url': 'https://vid.me/QNB', 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', 'info_dict': { @@ -23,9 +23,14 @@ class VidmeIE(InfoExtractor): 'upload_date': '20140725', 'thumbnail': 're:^https?://.*\.jpg', }, - } + }, { + # From 
http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching + 'url': 'https://vid.me/e/Wmur', + 'only_matching': True, + }] def _real_extract(self, url): + url = url.replace('vid.me/e/', 'vid.me/') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) From 8f9478412424b87e4fb77be53d239c13932b078a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 3 Jun 2015 10:26:39 +0800 Subject: [PATCH 0290/2145] [tumblr] Detect vid.me embeds (fixes #5883) --- youtube_dl/extractor/tumblr.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 828c808a6..e6218808f 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -28,6 +28,17 @@ class TumblrIE(InfoExtractor): 'description': 'md5:dba62ac8639482759c8eb10ce474586a', 'thumbnail': 're:http://.*\.jpg', } + }, { + 'url': 'http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching', + 'md5': 'de07e5211d60d4f3a2c3df757ea9f6ab', + 'info_dict': { + 'id': 'Wmur', + 'ext': 'mp4', + 'title': 'naked smoking & stretching', + 'upload_date': '20150506', + 'timestamp': 1430931613, + }, + 'add_ie': ['Vidme'], }] def _real_extract(self, url): @@ -38,6 +49,12 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage = self._download_webpage(url, video_id) + vid_me_embed_url = self._search_regex( + r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', + webpage, 'vid.me embed', default=None) + if vid_me_embed_url is not None: + return self.url_result(vid_me_embed_url, 'Vidme') + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url') From 687cb3ad35ac49f1053c1ea52e3b6db18b3aa1cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 3 Jun 2015 20:47:11 +0600 Subject: [PATCH 0291/2145] [24video] Fix uploader extraction --- 
youtube_dl/extractor/twentyfourvideo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 67e8bfea0..c1ee1decc 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -15,7 +15,7 @@ class TwentyFourVideoIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.24video.net/video/view/1044982', - 'md5': '48dd7646775690a80447a8dca6a2df76', + 'md5': 'd041af8b5b4246ea466226a0d6693345', 'info_dict': { 'id': '1044982', 'ext': 'mp4', @@ -54,7 +54,7 @@ class TwentyFourVideoIE(InfoExtractor): webpage, 'upload date')) uploader = self._html_search_regex( - r'Загрузил\s*<a href="/jsecUser/movies/[^"]+" class="link">([^<]+)</a>', + r'class="video-uploaded"[^>]*>\s*<a href="/jsecUser/movies/[^"]+"[^>]*>([^<]+)</a>', webpage, 'uploader', fatal=False) view_count = int_or_none(self._html_search_regex( From 15b74b94beb8720a4f6d7ee076c123dd8ae05309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 3 Jun 2015 20:52:47 +0600 Subject: [PATCH 0292/2145] [tvigle] Capture error message --- youtube_dl/extractor/tvigle.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index a85693888..aa07c8251 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, float_or_none, int_or_none, parse_age_limit, @@ -63,6 +64,13 @@ class TvigleIE(InfoExtractor): item = video_data['playlist']['items'][0] + videos = item.get('videos') + + error_message = item.get('errorMessage') + if not videos and error_message: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + title = item['title'] description = item.get('description') thumbnail = item.get('thumbnail') From 
3153a2c98d7201b1ae8104c346db58e19f322cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 3 Jun 2015 20:53:54 +0600 Subject: [PATCH 0293/2145] [tvigle] Skip tests --- youtube_dl/extractor/tvigle.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index aa07c8251..dc3a8334a 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -30,6 +30,7 @@ class TvigleIE(InfoExtractor): 'duration': 6586, 'age_limit': 12, }, + 'skip': 'georestricted', }, { 'url': 'http://www.tvigle.ru/video/vladimir-vysotskii/vedushchii-teleprogrammy-60-minut-ssha-o-vladimire-vysotskom/', @@ -42,6 +43,7 @@ class TvigleIE(InfoExtractor): 'duration': 186.080, 'age_limit': 0, }, + 'skip': 'georestricted', }, { 'url': 'https://cloud.tvigle.ru/video/5267604/', 'only_matching': True, From 6800d3372f35e08dcc4d34d06601815bf0cb0a3d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 3 Jun 2015 23:10:18 +0800 Subject: [PATCH 0294/2145] [YoutubeDL] Support DASH manifest downloading --- youtube_dl/downloader/dash.py | 50 +++++++++++++++++++++++++++++++++ youtube_dl/downloader/http.py | 4 +++ youtube_dl/extractor/youtube.py | 6 ++++ 3 files changed, 60 insertions(+) create mode 100644 youtube_dl/downloader/dash.py diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py new file mode 100644 index 000000000..18eca2c04 --- /dev/null +++ b/youtube_dl/downloader/dash.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals +from .common import FileDownloader +from ..compat import compat_urllib_request + +import re + + +class DashSegmentsFD(FileDownloader): + """ + Download segments in a DASH manifest + """ + def real_download(self, filename, info_dict): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + base_url = info_dict['url'] + segment_urls = info_dict['segment_urls'] + + self.byte_counter = 0 + + 
def append_url_to_file(outf, target_url, target_name): + self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) + req = compat_urllib_request.Request(target_url) + data = self.ydl.urlopen(req).read() + outf.write(data) + self.byte_counter += len(data) + + def combine_url(base_url, target_url): + if re.match(r'^https?://', target_url): + return target_url + return '%s/%s' % (base_url, target_url) + + with open(tmpfilename, 'wb') as outf: + append_url_to_file( + outf, combine_url(base_url, info_dict['initialization_url']), + 'initialization segment') + for i, segment_url in enumerate(segment_urls): + append_url_to_file( + outf, combine_url(base_url, segment_url), + 'segment %d / %d' % (i + 1, len(segment_urls))) + + self.try_rename(tmpfilename, filename) + + self._hook_progress({ + 'downloaded_bytes': self.byte_counter, + 'total_bytes': self.byte_counter, + 'filename': filename, + 'status': 'finished', + }) + + return True diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..ceacb8522 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -6,6 +6,7 @@ import socket import time from .common import FileDownloader +from .dash import DashSegmentsFD from ..compat import ( compat_urllib_request, compat_urllib_error, @@ -19,6 +20,9 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): + if info_dict.get('initialization_url') and list(filter(None, info_dict.get('segment_urls', []))): + return DashSegmentsFD(self.ydl, self.params).real_download(filename, info_dict) + url = info_dict['url'] tmpfilename = self.temp_name(filename) stream = None diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index aacb999ce..5d1297e0d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -802,6 +802,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # TODO implement WebVTT downloading pass elif 
mime_type.startswith('audio/') or mime_type.startswith('video/'): + segment_list = r.find('{urn:mpeg:DASH:schema:MPD:2011}SegmentList') format_id = r.attrib['id'] video_url = url_el.text filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength')) @@ -815,6 +816,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } + if segment_list: + f.update({ + 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], + 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')] + }) try: existing_format = next( fo for fo in formats From e4ac7bb1e598b0317742737ea06c162fa7f22cd4 Mon Sep 17 00:00:00 2001 From: Slava Shklyaev <shk.slava@gmail.com> Date: Wed, 3 Jun 2015 19:18:41 +0300 Subject: [PATCH 0295/2145] [nova] Revert "Fix extension extraction bug" This reverts commit 9464a194dbf48989c486fa2de9e1aebc59e28ed4. --- youtube_dl/extractor/nova.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nova.py b/youtube_dl/extractor/nova.py index 85253b6ed..3f9c776ef 100644 --- a/youtube_dl/extractor/nova.py +++ b/youtube_dl/extractor/nova.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, - determine_ext, unified_strdate, ) @@ -44,7 +43,7 @@ class NovaIE(InfoExtractor): 'info_dict': { 'id': '1756825', 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Policie Modrava - 15. 
díl - Blondýnka na hřbitově', 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags 'thumbnail': 're:^https?://.*\.(?:jpg)', @@ -57,7 +56,7 @@ class NovaIE(InfoExtractor): 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/', 'info_dict': { 'id': '1756858', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Televizní noviny - 30. 5. 2015', 'thumbnail': 're:^https?://.*\.(?:jpg)', 'upload_date': '20150530', @@ -140,8 +139,21 @@ class NovaIE(InfoExtractor): mediafile = config['mediafile'] video_url = mediafile['src'] - ext = determine_ext(video_url) - video_url = video_url.replace('&%s:' % ext, '') + + m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) + if m: + formats = [{ + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', + 'ext': 'flv', + }] + else: + formats = [{ + 'url': video_url, + }] + self._sort_formats(formats) title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) description = clean_html(self._og_search_description(webpage, default=None)) @@ -163,6 +175,5 @@ class NovaIE(InfoExtractor): 'description': description, 'upload_date': upload_date, 'thumbnail': thumbnail, - 'url': video_url, - 'ext': ext, + 'formats': formats, } From 4c8fea92f350d5a3f33d505980ac750b05a9cd34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 3 Jun 2015 23:50:38 +0200 Subject: [PATCH 0296/2145] [test/aes] Fix on python 3.3 and higher Since 878563c847fa5248eedbd44187536dec04643eaf the aes functions only accepts the base64 data as a unicode string. 
--- test/test_aes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_aes.py b/test/test_aes.py index 4dc7de7b5..315a3f5ae 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -39,7 +39,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x17\x15\x93\xab\x8d\x80V\xcdV\xe0\t\xcdo\xc2\xa5\xd8ksM\r\xe27N\xae' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 16)) self.assertEqual(decrypted, self.secret_msg) @@ -47,7 +47,7 @@ class TestAES(unittest.TestCase): encrypted = base64.b64encode( intlist_to_bytes(self.iv[:8]) + b'\x0b\xe6\xa4\xd9z\x0e\xb8\xb9\xd0\xd4i_\x85\x1d\x99\x98_\xe5\x80\xe7.\xbf\xa5\x83' - ) + ).decode('utf-8') decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) From eedda32e6bc4620d704eabab7a2c8f4b1f1a9169 Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Thu, 4 Jun 2015 11:27:18 +0800 Subject: [PATCH 0297/2145] [qqmusic] Fix toplist --- youtube_dl/extractor/qqmusic.py | 58 +++++++++++++-------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index b540033e2..48f28ffe9 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -181,60 +181,48 @@ class QQMusicToplistIE(QQPlaylistBaseIE): _VALID_URL = r'http://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)' _TESTS = [{ - 'url': 'http://y.qq.com/#type=toplist&p=global_12', + 'url': 'http://y.qq.com/#type=toplist&p=global_123', 'info_dict': { - 'id': 'global_12', - 'title': 'itunes榜', + 'id': 'global_123', + 'title': '美国iTunes榜', }, 'playlist_count': 10, }, { - 'url': 'http://y.qq.com/#type=toplist&p=top_6', + 'url': 'http://y.qq.com/#type=toplist&p=top_3', 'info_dict': { - 'id': 'top_6', + 'id': 'top_3', 'title': 'QQ音乐巅峰榜·欧美', + 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统' + 
'计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据' + '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:' + '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放' }, 'playlist_count': 100, }, { - 'url': 'http://y.qq.com/#type=toplist&p=global_5', + 'url': 'http://y.qq.com/#type=toplist&p=global_106', 'info_dict': { - 'id': 'global_5', - 'title': '韩国mnet排行榜', + 'id': 'global_106', + 'title': '韩国Mnet榜', }, 'playlist_count': 50, }] - @staticmethod - def strip_qq_jsonp(code): - return js_to_json(re.sub(r'^MusicJsonCallback\((.*?)\)/\*.+?\*/$', r'\1', code)) - def _real_extract(self, url): list_id = self._match_id(url) list_type, num_id = list_id.split("_") - list_page = self._download_webpage( - "http://y.qq.com/y/static/toplist/index/%s.html" % list_id, + toplist_json = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json' + % (list_type, num_id), list_id, 'Download toplist page') - entries = [] - if list_type == 'top': - jsonp_url = "http://y.qq.com/y/static/toplist/json/top/%s/1.js" % num_id - else: - jsonp_url = "http://y.qq.com/y/static/toplist/json/global/%s/1_1.js" % num_id + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid'] + ) for song in toplist_json['songlist'] + ] - toplist_json = self._download_json( - jsonp_url, list_id, note='Retrieve toplist json', - errnote='Unable to get toplist json', transform_source=self.strip_qq_jsonp) - - for song in toplist_json['l']: - s = song['s'] - song_mid = s.split("|")[20] - entries.append(self.url_result( - 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic', - song_mid)) - - list_name = self._html_search_regex( - r'<h2 id="top_name">([^\']+)</h2>', list_page, 'top list name', - default=None) - - return self.playlist_result(entries, list_id, list_name) + list_name = toplist_json['topinfo']['ListName'] + list_description = toplist_json['topinfo']['info'] + return self.playlist_result(entries, 
list_id, list_name, list_description) From ed15e9ba02382fc7db22e6176068d2220c00a32e Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Thu, 4 Jun 2015 17:32:06 +0800 Subject: [PATCH 0298/2145] [qqmusic] Remove unused import --- youtube_dl/extractor/qqmusic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 48f28ffe9..9943fcddb 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,7 +9,6 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, - js_to_json, ) from ..compat import compat_urllib_request From 55e5841f14131ab61359535fdcc44e1564d555b8 Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Thu, 4 Jun 2015 17:41:29 +0800 Subject: [PATCH 0299/2145] [qqmusic] Extract additional formats (mp3-128, mp3-320) --- youtube_dl/extractor/qqmusic.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index b540033e2..4b4ef4993 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -19,10 +19,10 @@ class QQMusicIE(InfoExtractor): _VALID_URL = r'http://y.qq.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', - 'md5': 'bed90b6db2a7a7a7e11bc585f471f63a', + 'md5': '9ce1c1c8445f561506d2e3cfb0255705', 'info_dict': { 'id': '004295Et37taLD', - 'ext': 'm4a', + 'ext': 'mp3', 'title': '可惜没如果', 'upload_date': '20141227', 'creator': '林俊杰', @@ -30,6 +30,12 @@ class QQMusicIE(InfoExtractor): } }] + _FORMATS = { + 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40}, + 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30}, + 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} + } + # Reference: m_r_GetRUin() in top_player.js # http://imgcache.gtimg.cn/music/portal_v3/y/top_player.js @staticmethod @@ -69,11 +75,19 @@ 
class QQMusicIE(InfoExtractor): 'http://base.music.qq.com/fcgi-bin/fcg_musicexpress.fcg?json=3&guid=%s' % guid, mid, note='Retrieve vkey', errnote='Unable to get vkey', transform_source=strip_jsonp)['key'] - song_url = 'http://cc.stream.qqmusic.qq.com/C200%s.m4a?vkey=%s&guid=%s&fromtag=0' % (mid, vkey, guid) + + formats = [] + for k, sf in self._FORMATS.items(): + formats.append({ + 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' + % (sf['prefix'], mid, sf['ext'], vkey, guid), + 'format': k, 'format_id': k, 'preference': sf['preference'] + }) + self._sort_formats(formats) return { 'id': mid, - 'url': song_url, + 'formats': formats, 'title': song_name, 'upload_date': publish_time, 'creator': singer, From b9258c61789388b49792ebdceb5d804217a36da5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 4 Jun 2015 22:05:33 +0800 Subject: [PATCH 0300/2145] [YoutubeDL] Change how DashSegmentsFD is selected --- youtube_dl/downloader/__init__.py | 2 ++ youtube_dl/downloader/http.py | 4 ---- youtube_dl/extractor/youtube.py | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index f110830c4..1b618ab54 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -8,6 +8,7 @@ from .hls import NativeHlsFD from .http import HttpFD from .rtsp import RtspFD from .rtmp import RtmpFD +from .dash import DashSegmentsFD from ..utils import ( determine_protocol, @@ -20,6 +21,7 @@ PROTOCOL_MAP = { 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, + 'dash_segments': DashSegmentsFD, } diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index ceacb8522..b7f144af9 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -6,7 +6,6 @@ import socket import time from .common import FileDownloader -from .dash import DashSegmentsFD from ..compat import ( compat_urllib_request, 
compat_urllib_error, @@ -20,9 +19,6 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): - if info_dict.get('initialization_url') and list(filter(None, info_dict.get('segment_urls', []))): - return DashSegmentsFD(self.ydl, self.params).real_download(filename, info_dict) - url = info_dict['url'] tmpfilename = self.temp_name(filename) stream = None diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5d1297e0d..692d4d8db 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -819,7 +819,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if segment_list: f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], - 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')] + 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], + 'protocol': 'dash_segments', }) try: existing_format = next( From 453a1617aac6e8000ed947cad7d88817c5740ede Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 4 Jun 2015 22:12:05 +0800 Subject: [PATCH 0301/2145] [downloader/dash] Reorder imports --- youtube_dl/downloader/dash.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 18eca2c04..5f14658ba 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -1,9 +1,10 @@ from __future__ import unicode_literals -from .common import FileDownloader -from ..compat import compat_urllib_request import re +from .common import FileDownloader +from ..compat import compat_urllib_request + class DashSegmentsFD(FileDownloader): """ From 423d2be5f8c5e70d202ddfa63f3e5365e6afe823 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 4 Jun 2015 
22:27:29 +0800 Subject: [PATCH 0302/2145] [downloader/dash] Rename the protocol 'http_dash_segments' looks more like a protocol name than 'dash_segments' --- youtube_dl/downloader/__init__.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 1b618ab54..dccc59212 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -21,7 +21,7 @@ PROTOCOL_MAP = { 'mms': RtspFD, 'rtsp': RtspFD, 'f4m': F4mFD, - 'dash_segments': DashSegmentsFD, + 'http_dash_segments': DashSegmentsFD, } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 692d4d8db..6d288e848 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -820,7 +820,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], - 'protocol': 'dash_segments', + 'protocol': 'http_dash_segments', }) try: existing_format = next( From 56c837ccb75b639d362397095f33300229c4bd1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Jun 2015 20:34:48 +0600 Subject: [PATCH 0303/2145] [tnaflix] Fix typo --- youtube_dl/extractor/tnaflix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 59af9aba0..bc51bae37 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -33,7 +33,7 @@ class TNAFlixIE(InfoExtractor): }, { 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', - 'matching_only': True, + 'only_matching': True, } ] From e52c0bd0eb853d4d242872e1d9ff5426a35dd30c Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Jun 2015 20:37:05 +0600 Subject: [PATCH 0304/2145] [tnaflix] Modernize --- youtube_dl/extractor/tnaflix.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index bc51bae37..3e335d653 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -51,9 +51,8 @@ class TNAFlixIE(InfoExtractor): age_limit = self._rta_search(webpage) - duration = self._html_search_meta('duration', webpage, 'duration', default=None) - if duration: - duration = parse_duration(duration[1:]) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration', default=None)) cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') From 3ce9bc712acd88df8499dd0982277c8f64b0d15a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Jun 2015 20:39:03 +0600 Subject: [PATCH 0305/2145] [empflix] Fix typo --- youtube_dl/extractor/empflix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py index 9a5a8f4bb..4827022e0 100644 --- a/youtube_dl/extractor/empflix.py +++ b/youtube_dl/extractor/empflix.py @@ -26,6 +26,6 @@ class EMPFlixIE(TNAFlixIE): }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', - 'matching_only': True, + 'only_matching': True, } ] From 3d6388e34ea41d937f39e561b7731f1389971a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 4 Jun 2015 20:42:37 +0600 Subject: [PATCH 0306/2145] [tnaflix] Fix relative URLs (empflix) --- youtube_dl/extractor/tnaflix.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 3e335d653..c282865b2 100644 --- 
a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -61,14 +61,15 @@ class TNAFlixIE(InfoExtractor): cfg_url, display_id, note='Downloading metadata', transform_source=fix_xml_ampersands) - thumbnail = cfg_xml.find('./startThumb').text + thumbnail = self._proto_relative_url( + cfg_xml.find('./startThumb').text, 'http:') formats = [] for item in cfg_xml.findall('./quality/item'): video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text) format_id = item.find('res').text fmt = { - 'url': video_url, + 'url': self._proto_relative_url(video_url, 'http:'), 'format_id': format_id, } m = re.search(r'^(\d+)', format_id) From 9d4f213f90d6024c7748f4defdc7b45f2351b0da Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 5 Jun 2015 00:52:18 +0800 Subject: [PATCH 0307/2145] [qqmusic:toplist] List name and description are optional --- youtube_dl/extractor/qqmusic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 9943fcddb..f773332a8 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -222,6 +222,7 @@ class QQMusicToplistIE(QQPlaylistBaseIE): ) for song in toplist_json['songlist'] ] - list_name = toplist_json['topinfo']['ListName'] - list_description = toplist_json['topinfo']['info'] + topinfo = toplist_json.get('topinfo', {}) + list_name = topinfo.get('ListName') + list_description = topinfo.get('info') return self.playlist_result(entries, list_id, list_name, list_description) From f5c78d118ba2d7e5e4a1ccd40c97fc1bf85a8dcf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 4 Jun 2015 21:49:02 +0200 Subject: [PATCH 0308/2145] release 2015.06.04 --- docs/supportedsites.md | 5 +++++ youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a421ae62b..d147b53fe 100644 --- 
a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -10,6 +10,7 @@ - **56.com** - **5min** - **8tracks** + - **91porn** - **9gag** - **abc.net.au** - **Abc7News** @@ -319,6 +320,7 @@ - **Noco** - **Normalboots** - **NosVideo** + - **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz - **novamov**: NovaMov - **Nowness** - **NowTV** @@ -431,6 +433,8 @@ - **smotri:user**: Smotri.com user videos - **Snotr** - **Sohu** + - **soompi** + - **soompi:show** - **soundcloud** - **soundcloud:playlist** - **soundcloud:set** @@ -505,6 +509,7 @@ - **Trilulilu** - **TruTube** - **Tube8** + - **TubiTv** - **Tudou** - **Tumblr** - **TuneIn** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 653710131..84224b7a7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.05.29' +__version__ = '2015.06.04' From 0e805e782bd05951ca3f420cf2a050e2ac3ae846 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 4 Jun 2015 21:54:33 +0200 Subject: [PATCH 0309/2145] release 2015.06.04.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 84224b7a7..9cf84ff71 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.06.04' +__version__ = '2015.06.04.1' From 8b8cde21406b53f5aeb6586dab03a9d78d62e631 Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Fri, 5 Jun 2015 06:04:26 +0800 Subject: [PATCH 0310/2145] [qqmusic] Set abr for mp3 formats --- youtube_dl/extractor/qqmusic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 4b4ef4993..dc300e189 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -31,8 +31,8 @@ class 
QQMusicIE(InfoExtractor): }] _FORMATS = { - 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40}, - 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30}, + 'mp3-320': {'prefix': 'M800', 'ext': 'mp3', 'preference': 40, 'abr': 320}, + 'mp3-128': {'prefix': 'M500', 'ext': 'mp3', 'preference': 30, 'abr': 128}, 'm4a': {'prefix': 'C200', 'ext': 'm4a', 'preference': 10} } @@ -77,11 +77,12 @@ class QQMusicIE(InfoExtractor): transform_source=strip_jsonp)['key'] formats = [] - for k, sf in self._FORMATS.items(): + for k, f in self._FORMATS.items(): formats.append({ 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' - % (sf['prefix'], mid, sf['ext'], vkey, guid), - 'format': k, 'format_id': k, 'preference': sf['preference'] + % (f['prefix'], mid, f['ext'], vkey, guid), + 'format': k, 'format_id': k, 'preference': f['preference'], + 'abr': f.get('abr') }) self._sort_formats(formats) From d31573fa37c8db7133492baf0a6be3ece643f8ff Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 5 Jun 2015 22:55:29 +0800 Subject: [PATCH 0311/2145] [teamcoco] Handle incomplete m3u8 URLs (fixes #5798) There are 2 TODOs. I don't know how to handle these cases correctly. --- youtube_dl/extractor/teamcoco.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index b2a4b1fc0..d1b7264b4 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -51,6 +51,17 @@ class TeamcocoIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 downloads } + }, { + 'url': 'http://teamcoco.com/video/full-episode-mon-6-1-joel-mchale-jake-tapper-and-musical-guest-courtney-barnett?playlist=x;eyJ0eXBlIjoidGFnIiwiaWQiOjl9', + 'info_dict': { + 'id': '89341', + 'ext': 'mp4', + 'title': 'Full Episode - Mon. 
6/1 - Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett', + 'description': 'Guests: Joel McHale, Jake Tapper, And Musical Guest Courtney Barnett', + }, + 'params': { + 'skip_download': True, # m3u8 downloads + } } ] _VIDEO_ID_REGEXES = ( @@ -110,9 +121,23 @@ class TeamcocoIE(InfoExtractor): get_quality = qualities(['500k', '480p', '1000k', '720p', '1080p']) for filed in data['files']: if determine_ext(filed['url']) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - filed['url'], video_id, ext='mp4')) + # compat_urllib_parse.urljoin does not work here + if filed['url'].startswith('/'): + m3u8_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + filed['url'] + else: + m3u8_url = filed['url'] + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4') + for m3u8_format in m3u8_formats: + if m3u8_format not in formats: + formats.append(m3u8_format) + elif determine_ext(filed['url']) == 'f4m': + # TODO Correct f4m extraction + continue else: + if filed['url'].startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue m_format = re.search(r'(\d+(k|p))\.mp4', filed['url']) if m_format is not None: format_id = m_format.group(1) From f00a650705a5e5b4f2b540ea8133a1752e63dd81 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 5 Jun 2015 23:16:34 +0800 Subject: [PATCH 0312/2145] [qqmusic] Rearrange codes --- youtube_dl/extractor/qqmusic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index e24ddaefe..c903bee58 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -80,7 +80,9 @@ class QQMusicIE(InfoExtractor): formats.append({ 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' % (f['prefix'], mid, f['ext'], vkey, guid), - 'format': k, 'format_id': k, 'preference': f['preference'], + 'format': k, + 'format_id': k, + 'preference': f['preference'], 
'abr': f.get('abr') }) self._sort_formats(formats) From e8ac61e840b5c02e406b910f5f0eed3d8b331969 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 5 Jun 2015 23:19:25 +0800 Subject: [PATCH 0313/2145] [qqmusic] Use meaningful variable names --- youtube_dl/extractor/qqmusic.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index c903bee58..bafa81c21 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -76,14 +76,14 @@ class QQMusicIE(InfoExtractor): transform_source=strip_jsonp)['key'] formats = [] - for k, f in self._FORMATS.items(): + for format_id, details in self._FORMATS.items(): formats.append({ 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' - % (f['prefix'], mid, f['ext'], vkey, guid), - 'format': k, - 'format_id': k, - 'preference': f['preference'], - 'abr': f.get('abr') + % (details['prefix'], mid, details['ext'], vkey, guid), + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), }) self._sort_formats(formats) From dfe7dd9bdb45ec765c9b335c149e9913cf7e413f Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 6 Jun 2015 02:54:57 +0300 Subject: [PATCH 0314/2145] [izlesene] Unquote video URLs and simplify --- youtube_dl/extractor/izlesene.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 99a1361f8..753cb98ea 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, float_or_none, @@ -67,9 +68,9 @@ class IzleseneIE(InfoExtractor): uploader = self._html_search_regex( 
r"adduserUsername\s*=\s*'([^']+)';", - webpage, 'uploader', fatal=False, default='') + webpage, 'uploader', fatal=False) timestamp = parse_iso8601(self._html_search_meta( - 'uploadDate', webpage, 'upload date', fatal=False)) + 'uploadDate', webpage, 'upload date')) duration = float_or_none(self._html_search_regex( r'"videoduration"\s*:\s*"([^"]+)"', @@ -86,8 +87,7 @@ class IzleseneIE(InfoExtractor): # Might be empty for some videos. streams = self._html_search_regex( - r'"qualitylevel"\s*:\s*"([^"]+)"', - webpage, 'streams', fatal=False, default='') + r'"qualitylevel"\s*:\s*"([^"]+)"', webpage, 'streams', default='') formats = [] if streams: @@ -95,15 +95,15 @@ class IzleseneIE(InfoExtractor): quality, url = re.search(r'\[(\w+)\](.+)', stream).groups() formats.append({ 'format_id': '%sp' % quality if quality else 'sd', - 'url': url, + 'url': compat_urllib_parse_unquote(url), 'ext': ext, }) else: stream_url = self._search_regex( - r'"streamurl"\s?:\s?"([^"]+)"', webpage, 'stream URL') + r'"streamurl"\s*:\s*"([^"]+)"', webpage, 'stream URL') formats.append({ 'format_id': 'sd', - 'url': stream_url, + 'url': compat_urllib_parse_unquote(stream_url), 'ext': ext, }) From c33c547d66b20064f83932cdaa04823b17a96b70 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 6 Jun 2015 02:57:12 +0300 Subject: [PATCH 0315/2145] [izlesene] Avoid timestamp differences in tests due to DST --- youtube_dl/extractor/izlesene.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/izlesene.py b/youtube_dl/extractor/izlesene.py index 753cb98ea..bc226fa67 100644 --- a/youtube_dl/extractor/izlesene.py +++ b/youtube_dl/extractor/izlesene.py @@ -31,7 +31,7 @@ class IzleseneIE(InfoExtractor): 'description': 'md5:253753e2655dde93f59f74b572454f6d', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'pelikzzle', - 'timestamp': 1404302298, + 'timestamp': int, 'upload_date': '20140702', 'duration': 95.395, 'age_limit': 0, @@ -47,7 +47,7 @@ 
class IzleseneIE(InfoExtractor): 'description': 'Tarkan Dortmund 2006 Konseri', 'thumbnail': 're:^http://.*\.jpg', 'uploader_id': 'parlayankiz', - 'timestamp': 1163322193, + 'timestamp': int, 'upload_date': '20061112', 'duration': 253.666, 'age_limit': 0, From 54eb81a087516e9d040bc1ad274c0a64b51dd1d1 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 6 Jun 2015 03:11:43 +0300 Subject: [PATCH 0316/2145] [pornovoisines] Improve average_rating extraction and update test case --- youtube_dl/extractor/pornovoisines.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 9688ed948..eba4dfbb3 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -34,7 +34,7 @@ class PornoVoisinesIE(InfoExtractor): 'duration': 120, 'view_count': int, 'average_rating': float, - 'categories': ['Débutante', 'Scénario', 'Sodomie'], + 'categories': ['Débutantes', 'Scénario', 'Sodomie'], 'age_limit': 18, } } @@ -71,7 +71,7 @@ class PornoVoisinesIE(InfoExtractor): view_count = int_or_none(self._search_regex( r'(\d+) vues', webpage, 'view count', fatal=False)) average_rating = self._search_regex( - r'Note : (\d+,\d+)', webpage, 'average rating', fatal=False) + r'Note\s*:\s*(\d+(?:,\d+)?)', webpage, 'average rating', fatal=False) if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) From 3d8e9573a470594df6fa471dc33c4c4b938b668a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 6 Jun 2015 06:25:37 +0600 Subject: [PATCH 0317/2145] [youtube:channel] Improve channel id extraction (#5904) --- youtube_dl/extractor/youtube.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index aacb999ce..419f7b019 100644 --- a/youtube_dl/extractor/youtube.py +++ 
b/youtube_dl/extractor/youtube.py @@ -1406,10 +1406,12 @@ class YoutubeChannelIE(InfoExtractor): channel_page = self._download_webpage( url + '?view=57', channel_id, 'Downloading channel page', fatal=False) - channel_playlist_id = self._search_regex( - [r'<meta itemprop="channelId" content="([^"]+)">', - r'data-channel-external-id="([^"]+)"'], - channel_page, 'channel id', default=None) + channel_playlist_id = self._html_search_meta( + 'channelId', channel_page, 'channel id', default=None) + if not channel_playlist_id: + channel_playlist_id = self._search_regex( + r'data-channel-external-id="([^"]+)"', + channel_page, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( From 223544552fcfec0c5c6a83326520c614e4489cbb Mon Sep 17 00:00:00 2001 From: Hannu Lintala <hannu.lintala@gmail.com> Date: Sat, 9 May 2015 03:53:43 +0300 Subject: [PATCH 0318/2145] [Ruutu] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ruutu.py | 88 ++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 youtube_dl/extractor/ruutu.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8ec0c1032..860023d14 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -448,6 +448,7 @@ from .rutube import ( RutubePersonIE, ) from .rutv import RUTVIE +from .ruutu import RuutuIE from .sandia import SandiaIE from .safari import ( SafariIE, diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py new file mode 100644 index 000000000..e346434f9 --- /dev/null +++ b/youtube_dl/extractor/ruutu.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +import re + + +class RuutuIE(InfoExtractor): + _VALID_URL = 
r'http://(www\.)?ruutu\.fi/ohjelmat/(?:[^/]+/)?(?P<id>.*)$' + _TESTS = [ + { + 'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', + 'md5': 'ab2093f39be1ca8581963451b3c0234f', + 'info_dict': { + 'id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', + 'ext': 'mp4', + 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki ennen lähetystä? - Nyt se selvisi!', + 'description': 'Toinen toistaan huikeampia ohjelmaideoita ja täysin päätöntä sekoilua? No sitä juuri nimenomaan. Metro Helsingin Iltapäivän vieraaksi saapui Tuomas Kauhanen ja he Petra Kalliomaan kanssa keskustelivat hieman ennen lähetyksen alkua, mutta kamerat olivatkin jo päällä.', + }, + 'params': { + 'format': 'http-1000', + } + }, + { + 'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa', + 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', + 'info_dict': { + 'id': 'superpesis-katso-koko-kausi-ruudussa', + 'ext': 'mp4', + 'title': 'Superpesis: katso koko kausi Ruudussa', + 'description': 'Huippujännittävän Superpesiksen suoria ottelulähetyksiä seurataan Ruudussa kauden alusta viimeiseen finaaliin asti. 
Katso lisätiedot osoitteesta ruutu.fi/superpesis.', + }, + 'params': { + 'format': 'http-1000', + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + media_id = self._html_search_regex(r'data-media-id="(\d+)"', webpage, 'media_id') + media_json = self._parse_json(self._search_regex(r'jQuery.extend\([^,]+, (.*)\);', webpage, 'media_data'), video_id) + xml_url = media_json['ruutuplayer']['xmlUrl'].replace('{ID}', media_id) + media_xml = self._download_xml(xml_url, media_id) + + formats = [] + parsed_urls = [] + for fmt in media_xml.findall('.//Clip//'): + url = fmt.text + if not fmt.tag.endswith('File') or url in parsed_urls or \ + 'NOT_USED' in url: + continue + + if url.endswith('m3u8'): + formats.extend(self._extract_m3u8_formats(url, media_id, m3u8_id='hls')) + parsed_urls.append(url) + elif url.endswith('f4m'): + formats.extend(self._extract_f4m_formats(url, media_id, f4m_id='hds')) + parsed_urls.append(url) + else: + proto = compat_urllib_parse_urlparse(url).scheme + width_str, height_str = fmt.get('resolution').split('x') + tbr = int(fmt.get('bitrate', 0)) + formats.append({ + 'format_id': '%s-%d' % (proto, tbr), + 'url': url, + 'width': int(width_str), + 'height': int(height_str), + 'tbr': tbr, + 'ext': url.rsplit('.', 1)[-1], + 'live': True, + 'protocol': proto, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int(media_xml.find('.//Runtime').text), + 'age_limit': int(media_xml.find('.//AgeLimit').text), + } From a9e58ecd3fdfa93cdc8a7f9fc852dbbd0814d6a4 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 6 Jun 2015 13:56:46 +0300 Subject: [PATCH 0319/2145] [turbo] Improve description extraction `og:description` is 
empty for some videos. --- youtube_dl/extractor/turbo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/turbo.py b/youtube_dl/extractor/turbo.py index 29703a8a9..7ae63a499 100644 --- a/youtube_dl/extractor/turbo.py +++ b/youtube_dl/extractor/turbo.py @@ -23,7 +23,7 @@ class TurboIE(InfoExtractor): 'ext': 'mp4', 'duration': 3715, 'title': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ', - 'description': 'Retrouvez dans cette rubrique toutes les vidéos de l\'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia... ', + 'description': 'Turbo du 07/09/2014 : Renault Twingo 3, Bentley Continental GT Speed, CES, Guide Achat Dacia...', 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -42,7 +42,7 @@ class TurboIE(InfoExtractor): title = xpath_text(item, './title', 'title') duration = int_or_none(xpath_text(item, './durate', 'duration')) thumbnail = xpath_text(item, './visuel_clip', 'thumbnail') - description = self._og_search_description(webpage) + description = self._html_search_meta('description', webpage) formats = [] get_quality = qualities(['3g', 'sd', 'hq']) From 05aa9c82d90644af406519e5e25fefb0884d504e Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 6 Jun 2015 13:58:20 +0300 Subject: [PATCH 0320/2145] [sunporno] Fix view_count extraction --- youtube_dl/extractor/sunporno.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index 854d01bee..e527aa971 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -44,7 +44,7 @@ class SunPornoIE(InfoExtractor): webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( - r'class="views">\s*(\d+)\s*<', + r'class="views">(?:<noscript>)?\s*(\d+)\s*<', webpage, 'view count', fatal=False)) comment_count = 
int_or_none(self._html_search_regex( r'(\d+)</b> Comments?', From 4da31bd56629054497634d041035e4bd6fcfacbb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 6 Jun 2015 22:22:26 +0800 Subject: [PATCH 0321/2145] [youtube] Fix a FutureWarning from xml.etree.ElementTree --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6d288e848..2424ac2c0 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -816,7 +816,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } - if segment_list: + if len(segment_list): f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], From f1da861018924e6f442ffedd9a5682055c79aea6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 00:37:29 +0800 Subject: [PATCH 0322/2145] [iqiyi] PEP8 --- youtube_dl/extractor/iqiyi.py | 56 +++++++++++++++++------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 747f3f902..597441baf 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -16,19 +16,20 @@ import random import zlib import hashlib + class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' _VALID_URL = r'http://(?:www\.)iqiyi.com/.+?\.html' _TEST = { - 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '2cb594dc2781e6c941a110d8f358118b', - 'info_dict': { - 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', - 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v', - } + 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', + 'md5': '2cb594dc2781e6c941a110d8f358118b', + 'info_dict': { + 'id': 
'9c1fb1b99d192b21c559e5a1a2cb3c73', + 'title': '美国德州空中惊现奇异云团 酷似UFO', + 'ext': 'f4v', + } } def construct_video_urls(self, data, video_id, _uuid, bid): @@ -46,7 +47,7 @@ class IqiyiIE(InfoExtractor): c = len(b) s = '' for i in range(c - 1, -1, -1): - a = do_xor(int(b[c-i-1], 16), i) + a = do_xor(int(b[c - i - 1], 16), i) s += chr(a) return s[::-1] @@ -54,15 +55,14 @@ class IqiyiIE(InfoExtractor): mg = ')(*&^flash@#$%a' tm = self._download_json( 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id)['t'] - t = str(int(math.floor(int(tm)/(600.0)))) - return hashlib.md5( - (t+mg+x).encode('utf8')).hexdigest() + t = str(int(math.floor(int(tm) / (600.0)))) + return hashlib.md5((t + mg + x).encode('utf8')).hexdigest() # get accept format # getting all format will spend minutes for a big video. if bid == 'best': - bids = [int(i['bid']) for i in data['vp']['tkl'][0]['vs'] \ - if 0 < int(i['bid']) <= 10] + bids = [int(i['bid']) for i in data['vp']['tkl'][0]['vs'] + if 0 < int(i['bid']) <= 10] bid = str(max(bids)) video_urls_dict = {} @@ -117,24 +117,24 @@ class IqiyiIE(InfoExtractor): def get_format(self, bid): _dict = { - '1' : 'h6', - '2' : 'h5', - '3' : 'h4', - '4' : 'h3', - '5' : 'h2', - '10' : 'h1' + '1': 'h6', + '2': 'h5', + '3': 'h4', + '4': 'h3', + '5': 'h2', + '10': 'h1' } return _dict.get(str(bid), None) def get_bid(self, format_id): _dict = { - 'h6' : '1', - 'h5' : '2', - 'h4' : '3', - 'h3' : '4', - 'h2' : '5', - 'h1' : '10', - 'best' : 'best' + 'h6': '1', + 'h5': '2', + 'h4': '3', + 'h3': '4', + 'h2': '5', + 'h1': '10', + 'best': 'best' } return _dict.get(format_id, None) @@ -207,7 +207,7 @@ class IqiyiIE(InfoExtractor): for format_id in video_urls_dict: video_urls = video_urls_dict[format_id] for i, video_url_info in enumerate(video_urls): - if len(entries) < i+1: + if len(entries) < i + 1: entries.append({'formats': []}) entries[i]['formats'].append( { @@ -222,7 +222,7 @@ class IqiyiIE(InfoExtractor): self._sort_formats(entries[i]['formats']) 
entries[i].update( { - 'id': '_part%d' % (i+1), + 'id': '_part%d' % (i + 1), 'title': title, } ) From 7012620e2b9355d25ddfc855fc5990af938f04d8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 00:44:54 +0800 Subject: [PATCH 0323/2145] [iqiyi] Remove format selection codes --- youtube_dl/extractor/iqiyi.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 597441baf..5645fb6ee 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -32,7 +32,7 @@ class IqiyiIE(InfoExtractor): } } - def construct_video_urls(self, data, video_id, _uuid, bid): + def construct_video_urls(self, data, video_id, _uuid): def do_xor(x, y): a = y % 3 if a == 1: @@ -58,13 +58,6 @@ class IqiyiIE(InfoExtractor): t = str(int(math.floor(int(tm) / (600.0)))) return hashlib.md5((t + mg + x).encode('utf8')).hexdigest() - # get accept format - # getting all format will spend minutes for a big video. 
- if bid == 'best': - bids = [int(i['bid']) for i in data['vp']['tkl'][0]['vs'] - if 0 < int(i['bid']) <= 10] - bid = str(max(bids)) - video_urls_dict = {} for i in data['vp']['tkl'][0]['vs']: if 0 < int(i['bid']) <= 10: @@ -80,12 +73,6 @@ class IqiyiIE(InfoExtractor): if t.endswith('mp4'): video_urls_info = i['flvs'] - if int(i['bid']) != int(bid): # ignore missing match format - video_urls.extend( - [('http://example.com/v.flv', ii['b']) for ii in video_urls_info]) - video_urls_dict[format_id] = video_urls - continue - for ii in video_urls_info: vl = ii['l'] if not vl.startswith('/'): @@ -193,14 +180,9 @@ class IqiyiIE(InfoExtractor): title = data['vi']['vn'] - format = self._downloader.params.get('format', None) - bid = self.get_bid(format) if format else 'best' - if not bid: - raise ExtractorError('Can\'t get format.') - # generate video_urls_dict video_urls_dict = self.construct_video_urls( - data, video_id, _uuid, bid) + data, video_id, _uuid) # construct info entries = [] From 29e7e0781b1b8e276c28a079bc5b18e1b0db2d5e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 00:56:08 +0800 Subject: [PATCH 0324/2145] [iqiyi] Simplify and improve regex patterns See the comments in #5849 --- youtube_dl/extractor/iqiyi.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 5645fb6ee..18a7587a2 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -161,12 +161,11 @@ class IqiyiIE(InfoExtractor): webpage = self._download_webpage( url, 'temp_id', note='download video page') tvid = self._search_regex( - r'tvId ?= ?(\'|\")(?P<tvid>\d+)', webpage, 'tvid', flags=re.I, group='tvid') + r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( - r'videoId ?= ?(\'|\")(?P<video_id>[a-z\d]+)', - webpage, 'video_id', flags=re.I, group='video_id') + r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 
'video_id') swf_url = self._search_regex( - r'(?P<swf>http://.+?MainPlayer.+?\.swf)', webpage, 'swf') + r'(http://.+?MainPlayer.+?\.swf)', webpage, 'swf player URL') _uuid = uuid.uuid4().hex enc_key = self.get_enc_key(swf_url, video_id) From aacda28b28c1804866d634c5c5086b3d53cb2b2f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 01:32:03 +0800 Subject: [PATCH 0325/2145] [iqiyi] Give error message for assertion failures --- youtube_dl/extractor/iqiyi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 18a7587a2..dc35c3380 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -171,7 +171,10 @@ class IqiyiIE(InfoExtractor): enc_key = self.get_enc_key(swf_url, video_id) raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) - assert raw_data['code'] == 'A000000' + + if raw_data['code'] != 'A000000': + raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) + if not raw_data['data']['vp']['tkl']: raise ExtractorError('No support iQiqy VIP video') From 958d0b659b80d4493d045d4da82074ed68ed6c4e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 01:35:09 +0800 Subject: [PATCH 0326/2145] [iqiyi] Reorder imports --- youtube_dl/extractor/iqiyi.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index dc35c3380..36029361a 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -1,20 +1,17 @@ # coding: utf-8 - from __future__ import unicode_literals -from .common import InfoExtractor - -from ..compat import compat_urllib_parse - -from ..utils import ExtractorError - +import hashlib +import math +import random import re import time import uuid -import math -import random import zlib -import hashlib + +from .common import InfoExtractor +from ..compat import compat_urllib_parse +from ..utils import ExtractorError class IqiyiIE(InfoExtractor): From ffba4edb067238b593b98c71f4293e9b60ba95ce Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 01:52:51 +0800 Subject: [PATCH 0327/2145] [iqiyi] Improve some variable names and add download notes --- youtube_dl/extractor/iqiyi.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 36029361a..c17e1fde4 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -48,35 +48,37 @@ class IqiyiIE(InfoExtractor): s += chr(a) return s[::-1] - def get_path_key(x): + def get_path_key(x, format_id, segment_index): mg = ')(*&^flash@#$%a' tm = self._download_json( - 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id)['t'] + 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, + note='Download path key of 
segment %d for format %s' % (segment_index + 1, format_id) + )['t'] t = str(int(math.floor(int(tm) / (600.0)))) return hashlib.md5((t + mg + x).encode('utf8')).hexdigest() video_urls_dict = {} - for i in data['vp']['tkl'][0]['vs']: - if 0 < int(i['bid']) <= 10: - format_id = self.get_format(i['bid']) + for format_item in data['vp']['tkl'][0]['vs']: + if 0 < int(format_item['bid']) <= 10: + format_id = self.get_format(format_item['bid']) else: continue video_urls = [] - video_urls_info = i['fs'] - if not i['fs'][0]['l'].startswith('/'): - t = get_encode_code(i['fs'][0]['l']) + video_urls_info = format_item['fs'] + if not format_item['fs'][0]['l'].startswith('/'): + t = get_encode_code(format_item['fs'][0]['l']) if t.endswith('mp4'): - video_urls_info = i['flvs'] + video_urls_info = format_item['flvs'] - for ii in video_urls_info: - vl = ii['l'] + for segment_index, segment in enumerate(video_urls_info): + vl = segment['l'] if not vl.startswith('/'): vl = get_encode_code(vl) key = get_path_key( - vl.split('/')[-1].split('.')[0]) - filesize = ii['b'] + vl.split('/')[-1].split('.')[0], format_id, segment_index) + filesize = segment['b'] base_url = data['vp']['du'].split('/') base_url.insert(-1, key) base_url = '/'.join(base_url) @@ -91,7 +93,9 @@ class IqiyiIE(InfoExtractor): } api_video_url = base_url + vl + '?' 
+ \ compat_urllib_parse.urlencode(param) - js = self._download_json(api_video_url, video_id) + js = self._download_json( + api_video_url, video_id, + note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) video_url = js['l'] video_urls.append( (video_url, filesize)) From c4ee87022bd18863fc3f22f80064453e272d956f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 01:57:05 +0800 Subject: [PATCH 0328/2145] [iqiyi] Change id for multipart videos --- youtube_dl/extractor/iqiyi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index c17e1fde4..840cc9a4d 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -207,7 +207,7 @@ class IqiyiIE(InfoExtractor): self._sort_formats(entries[i]['formats']) entries[i].update( { - 'id': '_part%d' % (i + 1), + 'id': '%s_part%d' % (video_id, i + 1), 'title': title, } ) From 99481135907b5fa3558d4f176fd02acbdafccdb6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 02:09:33 +0800 Subject: [PATCH 0329/2145] [iqiyi] Add a multipart test case --- youtube_dl/extractor/iqiyi.py | 67 +++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 840cc9a4d..d73687d88 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -19,7 +19,7 @@ class IqiyiIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)iqiyi.com/.+?\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', 'md5': '2cb594dc2781e6c941a110d8f358118b', 'info_dict': { @@ -27,7 +27,70 @@ class IqiyiIE(InfoExtractor): 'title': '美国德州空中惊现奇异云团 酷似UFO', 'ext': 'f4v', } - } + }, { + 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb', + 'title': '名侦探柯南第752集', + }, + 'playlist': 
[{ + 'md5': '7e49376fecaffa115d951634917fe105', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '41b75ba13bb7ac0e411131f92bc4f6ca', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '0cee1dd0a3d46a83e71e2badeae2aab0', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '4f8ad72373b0c491b582e7c196b0b1f9', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': 'd89ad028bcfad282918e8098e811711d', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '9cb1e5c95da25dff0660c32ae50903b7', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '155116e0ff1867bbc9b98df294faabc9', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }, { + 'md5': '53f5db77622ae14fa493ed2a278a082b', + 'info_dict': { + 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', + 'ext': 'f4v', + 'title': '名侦探柯南第752集', + }, + }], + }] def construct_video_urls(self, data, video_id, _uuid): def do_xor(x, y): From 865ab62f43eb94a9f4f757a464df147e983cb439 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 02:13:22 +0800 Subject: [PATCH 0330/2145] [iqiyi] Make _VALID_URL more accurate v_* urls are individual videos, while a_* urls are playlists, which are not supported yet. 
--- youtube_dl/extractor/iqiyi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index d73687d88..f0d423331 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -17,7 +17,7 @@ from ..utils import ExtractorError class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' - _VALID_URL = r'http://(?:www\.)iqiyi.com/.+?\.html' + _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', From 08bb8ef2011d795948d8e89478bf3afe4b99405f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 02:25:00 +0800 Subject: [PATCH 0331/2145] [iqiyi] Unify get_format() and get_bid() --- youtube_dl/extractor/iqiyi.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index f0d423331..122f33692 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -92,6 +92,15 @@ class IqiyiIE(InfoExtractor): }], }] + _FORMATS_MAP = [ + ('1', 'h6'), + ('2', 'h5'), + ('3', 'h4'), + ('4', 'h3'), + ('5', 'h2'), + ('10', 'h1'), + ] + def construct_video_urls(self, data, video_id, _uuid): def do_xor(x, y): a = y % 3 @@ -167,27 +176,12 @@ class IqiyiIE(InfoExtractor): return video_urls_dict def get_format(self, bid): - _dict = { - '1': 'h6', - '2': 'h5', - '3': 'h4', - '4': 'h3', - '5': 'h2', - '10': 'h1' - } - return _dict.get(str(bid), None) + matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] + return matched_format_ids[0] if len(matched_format_ids) else None def get_bid(self, format_id): - _dict = { - 'h6': '1', - 'h5': '2', - 'h4': '3', - 'h3': '4', - 'h2': '5', - 'h1': '10', - 'best': 'best' - } - return _dict.get(format_id, None) + matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] + return 
matched_bids[0] if len(matched_bids) else None def get_raw_data(self, tvid, video_id, enc_key, _uuid): tm = str(int(time.time())) From 9c5f685ef14a8b44d17b897ba8ae2da051011c35 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 02:39:03 +0800 Subject: [PATCH 0332/2145] [iqiyi] Improve regex pattern again --- youtube_dl/extractor/iqiyi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 122f33692..15481b84b 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -223,7 +223,7 @@ class IqiyiIE(InfoExtractor): video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') swf_url = self._search_regex( - r'(http://.+?MainPlayer.+?\.swf)', webpage, 'swf player URL') + r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL') _uuid = uuid.uuid4().hex enc_key = self.get_enc_key(swf_url, video_id) From b5a3c7f10927c9d55f6fdad5f5c002e02338642e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 02:47:36 +0800 Subject: [PATCH 0333/2145] [iqiyi] Cache encryption keys --- youtube_dl/extractor/iqiyi.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 15481b84b..9106dd074 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import hashlib import math +import os.path import random import re import time @@ -11,7 +12,10 @@ import zlib from .common import InfoExtractor from ..compat import compat_urllib_parse -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + url_basename, +) class IqiyiIE(InfoExtractor): @@ -207,12 +211,20 @@ class IqiyiIE(InfoExtractor): return raw_data def get_enc_key(self, swf_url, video_id): + filename, _ = 
os.path.splitext(url_basename(swf_url)) + enc_key_json = self._downloader.cache.load('iqiyi-enc-key', filename) + if enc_key_json is not None: + return enc_key_json[0] + req = self._request_webpage( swf_url, video_id, note='download swf content') cn = req.read() cn = zlib.decompress(cn[8:]) pt = re.compile(b'MixerRemote\x08(?P<enc_key>.+?)\$&vv') enc_key = self._search_regex(pt, cn, 'enc_key').decode('utf8') + + self._downloader.cache.store('iqiyi-enc-key', filename, [enc_key]) + return enc_key def _real_extract(self, url): From d00735a0c5aabd38b37bfea76a93ae8c47a8d419 Mon Sep 17 00:00:00 2001 From: Hannu Lintala <hannu.lintala@gmail.com> Date: Sat, 6 Jun 2015 23:01:23 +0300 Subject: [PATCH 0334/2145] [ruutu] Don't use fallback for DASH and other non-HTTP urls --- youtube_dl/extractor/ruutu.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index e346434f9..59e0b12fd 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -62,6 +62,8 @@ class RuutuIE(InfoExtractor): formats.extend(self._extract_f4m_formats(url, media_id, f4m_id='hds')) parsed_urls.append(url) else: + if not fmt.tag.startswith('HTTP'): + continue proto = compat_urllib_parse_urlparse(url).scheme width_str, height_str = fmt.get('resolution').split('x') tbr = int(fmt.get('bitrate', 0)) From de390ea0771a0e35c0c2970bc00f5fa2dd9d3eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 7 Jun 2015 00:19:45 +0200 Subject: [PATCH 0335/2145] update: Use https for getting the version info (fixes #5909) --- youtube_dl/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index de3169eef..fc7ac8305 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -50,7 +50,7 @@ def rsa_verify(message, signature, key): def update_self(to_screen, verbose): """Update the program 
file with the latest version from the repository""" - UPDATE_URL = "http://rg3.github.io/youtube-dl/update/" + UPDATE_URL = "https://rg3.github.io/youtube-dl/update/" VERSION_URL = UPDATE_URL + 'LATEST_VERSION' JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) From 9414338a48ca815fd666aad496ebabd6d0c76e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jun 2015 05:37:29 +0600 Subject: [PATCH 0336/2145] [ruutu] Improve, make more robust and fix python 2.6 support --- youtube_dl/extractor/ruutu.py | 129 +++++++++++++++++++++------------- 1 file changed, 79 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 59e0b12fd..4e22628d0 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -3,88 +3,117 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse -import re +from ..utils import ( + determine_ext, + int_or_none, + xpath_text, +) class RuutuIE(InfoExtractor): - _VALID_URL = r'http://(www\.)?ruutu\.fi/ohjelmat/(?:[^/]+/)?(?P<id>.*)$' + _VALID_URL = r'http://(?:www\.)?ruutu\.fi/ohjelmat/(?:[^/?#]+/)*(?P<id>[^/?#]+)' _TESTS = [ { 'url': 'http://www.ruutu.fi/ohjelmat/oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', 'md5': 'ab2093f39be1ca8581963451b3c0234f', 'info_dict': { - 'id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', + 'id': '2058907', + 'display_id': 'oletko-aina-halunnut-tietaa-mita-tapahtuu-vain-hetki-ennen-lahetysta-nyt-se-selvisi', 'ext': 'mp4', 'title': 'Oletko aina halunnut tietää mitä tapahtuu vain hetki 
ennen lähetystä? - Nyt se selvisi!', - 'description': 'Toinen toistaan huikeampia ohjelmaideoita ja täysin päätöntä sekoilua? No sitä juuri nimenomaan. Metro Helsingin Iltapäivän vieraaksi saapui Tuomas Kauhanen ja he Petra Kalliomaan kanssa keskustelivat hieman ennen lähetyksen alkua, mutta kamerat olivatkin jo päällä.', + 'description': 'md5:cfc6ccf0e57a814360df464a91ff67d6', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 114, + 'age_limit': 0, }, - 'params': { - 'format': 'http-1000', - } }, { 'url': 'http://www.ruutu.fi/ohjelmat/superpesis/superpesis-katso-koko-kausi-ruudussa', 'md5': '065a10ae4d5b8cfd9d0c3d332465e3d9', 'info_dict': { - 'id': 'superpesis-katso-koko-kausi-ruudussa', + 'id': '2057306', + 'display_id': 'superpesis-katso-koko-kausi-ruudussa', 'ext': 'mp4', 'title': 'Superpesis: katso koko kausi Ruudussa', - 'description': 'Huippujännittävän Superpesiksen suoria ottelulähetyksiä seurataan Ruudussa kauden alusta viimeiseen finaaliin asti. Katso lisätiedot osoitteesta ruutu.fi/superpesis.', + 'description': 'md5:44c44a99fdbe5b380ab74ebd75f0af77', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 40, + 'age_limit': 0, }, - 'params': { - 'format': 'http-1000', - } }, ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + display_id = self._match_id(url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - media_id = self._html_search_regex(r'data-media-id="(\d+)"', webpage, 'media_id') - media_json = self._parse_json(self._search_regex(r'jQuery.extend\([^,]+, (.*)\);', webpage, 'media_data'), video_id) - xml_url = media_json['ruutuplayer']['xmlUrl'].replace('{ID}', media_id) - media_xml = self._download_xml(xml_url, media_id) + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'data-media-id="(\d+)"', webpage, 'media id') + + video_xml_url = None + + media_data = self._search_regex( + r'jQuery\.extend\([^,]+,\s*(.+?)\);', webpage, + 'media data', 
default=None) + if media_data: + media_json = self._parse_json(media_data, display_id, fatal=False) + if media_json: + xml_url = media_json.get('ruutuplayer', {}).get('xmlUrl') + if xml_url: + video_xml_url = xml_url.replace('{ID}', video_id) + + if not video_xml_url: + video_xml_url = 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id + + video_xml = self._download_xml(video_xml_url, video_id) formats = [] - parsed_urls = [] - for fmt in media_xml.findall('.//Clip//'): - url = fmt.text - if not fmt.tag.endswith('File') or url in parsed_urls or \ - 'NOT_USED' in url: - continue + processed_urls = [] - if url.endswith('m3u8'): - formats.extend(self._extract_m3u8_formats(url, media_id, m3u8_id='hls')) - parsed_urls.append(url) - elif url.endswith('f4m'): - formats.extend(self._extract_f4m_formats(url, media_id, f4m_id='hds')) - parsed_urls.append(url) - else: - if not fmt.tag.startswith('HTTP'): - continue - proto = compat_urllib_parse_urlparse(url).scheme - width_str, height_str = fmt.get('resolution').split('x') - tbr = int(fmt.get('bitrate', 0)) - formats.append({ - 'format_id': '%s-%d' % (proto, tbr), - 'url': url, - 'width': int(width_str), - 'height': int(height_str), - 'tbr': tbr, - 'ext': url.rsplit('.', 1)[-1], - 'live': True, - 'protocol': proto, - }) + def extract_formats(node): + for child in node: + if child.tag.endswith('Files'): + extract_formats(child) + elif child.tag.endswith('File'): + video_url = child.text + if not video_url or video_url in processed_urls or 'NOT_USED' in video_url: + return + processed_urls.append(video_url) + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls')) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id='hds')) + else: + proto = compat_urllib_parse_urlparse(video_url).scheme + if not child.tag.startswith('HTTP') and proto != 'rtmp': + continue + preference = -1 if proto == 'rtmp' 
else 1 + label = child.get('label') + tbr = int_or_none(child.get('bitrate')) + width, height = [int_or_none(x) for x in child.get('resolution', '').split('x')] + formats.append({ + 'format_id': '%s-%s' % (proto, label if label else tbr), + 'url': video_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'preference': preference, + }) + + extract_formats(video_xml.find('./Clip')) self._sort_formats(formats) return { 'id': video_id, + 'display_id': display_id, 'title': self._og_search_title(webpage), - 'formats': formats, 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), - 'duration': int(media_xml.find('.//Runtime').text), - 'age_limit': int(media_xml.find('.//AgeLimit').text), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'formats': formats, } From 9836cfb8d682c91036ce417fa31200673b52115b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 7 Jun 2015 08:12:21 +0600 Subject: [PATCH 0337/2145] [options] Clarify `--list-extractors` (Closes #5916) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5a2315bd9..689fa7595 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -145,7 +145,7 @@ def parseOpts(overrideArguments=None): general.add_option( '--list-extractors', action='store_true', dest='list_extractors', default=False, - help='List all supported extractors and the URLs they would handle') + help='List all supported extractors') general.add_option( '--extractor-descriptions', action='store_true', dest='list_extractor_descriptions', default=False, From b26733ba7f376f8c9285ac7928534286622bbc7c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 15:29:17 +0800 Subject: [PATCH 0338/2145] [brightcove] Allow single quotes 
in Brightcove URLs (fixes #5901) --- youtube_dl/extractor/brightcove.py | 2 +- youtube_dl/extractor/generic.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4f60d5366..c1d4320e1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -172,7 +172,7 @@ class BrightcoveIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r'<meta\s+property="og:video"\s+content="(https?://(?:secure|c)\.brightcove.com/[^"]+)"', + r'<meta\s+property=[\'"]og:video[\'"]\s+content=[\'"](https?://(?:secure|c)\.brightcove.com/[^\'"]+)[\'"]', webpage) if url_m: url = unescapeHTML(url_m.group(1)) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 96ca398de..759691365 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -789,6 +789,18 @@ class GenericIE(InfoExtractor): # rtmpe downloads 'skip_download': True, } + }, + # Brightcove URL in single quotes + { + 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', + 'md5': '4ae374f1f8b91c889c4b9203c8c752af', + 'info_dict': { + 'id': '4255764656001', + 'ext': 'mp4', + 'title': 'SN Presents: Russell Martin, World Citizen', + 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. 
Written and narrated by Stephen Brunt.', + 'uploader': 'Rogers Sportsnet', + }, } ] From 621ed9f5f4d9d82659272ebe01e740e9196fad61 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 16:33:22 +0800 Subject: [PATCH 0339/2145] [common] Add note and errnote field for _extract_m3u8_formats --- youtube_dl/extractor/common.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index cecf917ff..49e4dc710 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -846,7 +846,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None): + m3u8_id=None, note=None, errnote=None): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -865,8 +865,8 @@ class InfoExtractor(object): m3u8_doc = self._download_webpage( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information') last_info = None last_media = None kv_rex = re.compile( From 65ba8b23f471b96e6f937f2754c729e22bf2cf0a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 16:34:19 +0800 Subject: [PATCH 0340/2145] [discovery] Rewrite DiscoveryIE (fixes #5898) Discovery.com now uses a completely different approach for serving videos. At least in both test cases brightcove are involved. However, AMF support is necessary for these brightcove videos. As a result, I try to extract videos from the info page ('?flat=1'). The downloaded file can be different from the one in browsers. 
--- youtube_dl/extractor/discovery.py | 52 ++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index d3e667528..d6723ecf2 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -2,19 +2,19 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + parse_duration, parse_iso8601, - int_or_none, ) +from ..compat import compat_str class DiscoveryIE(InfoExtractor): _VALID_URL = r'http://www\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9_\-]*)(?:\.htm)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'md5': '3c69d77d9b0d82bfd5e5932a60f26504', 'info_dict': { - 'id': 'mission-impossible-outtakes', - 'ext': 'flv', + 'id': '20769', + 'ext': 'mp4', 'title': 'Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' @@ -24,22 +24,36 @@ class DiscoveryIE(InfoExtractor): 'timestamp': 1303099200, 'upload_date': '20110418', }, - } + 'params': { + 'skip_download': True, # requires ffmpeg + } + }, { + 'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons', + 'info_dict': { + 'id': 'mythbusters-the-simpsons', + 'title': 'MythBusters: The Simpsons', + }, + 'playlist_count': 9, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + info = self._download_json(url + '?flat=1', video_id) - info = self._parse_json(self._search_regex( - r'(?s)<script type="application/ld\+json">(.*?)</script>', - webpage, 'video info'), video_id) + video_title = info.get('playlist_title') or info.get('video_title') - return { - 'id': video_id, - 'title': info['name'], - 'url': info['contentURL'], - 'description': 
info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'timestamp': parse_iso8601(info.get('uploadDate')), - 'duration': int_or_none(info.get('duration')), - } + entries = [{ + 'id': compat_str(video_info['id']), + 'formats': self._extract_m3u8_formats( + video_info['src'], video_id, ext='mp4', + note='Download m3u8 information for video %d' % (idx + 1)), + 'title': video_info['title'], + 'description': video_info.get('description'), + 'duration': parse_duration(video_info.get('video_length')), + 'webpage_url': video_info.get('href'), + 'thumbnail': video_info.get('thumbnailURL'), + 'alt_title': video_info.get('secondary_title'), + 'timestamp': parse_iso8601(video_info.get('publishedDate')), + } for idx, video_info in enumerate(info['playlist'])] + + return self.playlist_result(entries, video_id, video_title) From 68477c3dab97733eb7a2feb8fcc90f648c29c2b4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 7 Jun 2015 16:38:39 +0800 Subject: [PATCH 0341/2145] [tlc] Fix test failure due to DiscoveryIE changes --- youtube_dl/extractor/tlc.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index 9f9e388c5..13263614c 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -12,17 +12,22 @@ class TlcIE(DiscoveryIE): IE_NAME = 'tlc.com' _VALID_URL = r'http://www\.tlc\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P<id>[a-zA-Z0-9\-]*)(.htm)?' 
- _TEST = { + # DiscoveryIE has _TESTS + _TESTS = [{ 'url': 'http://www.tlc.com/tv-shows/cake-boss/videos/too-big-to-fly.htm', - 'md5': 'c4038f4a9b44d0b5d74caaa64ed2a01a', 'info_dict': { - 'id': '853232', + 'id': '104493', 'ext': 'mp4', - 'title': 'Cake Boss: Too Big to Fly', + 'title': 'Too Big to Fly', 'description': 'Buddy has taken on a high flying task.', 'duration': 119, + 'timestamp': 1393365060, + 'upload_date': '20140225', }, - } + 'params': { + 'skip_download': True, # requires ffmpef + }, + }] class TlcDeIE(InfoExtractor): From edb99d4c18475ba27fae4f7d0ec6e3db9b574885 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 8 Jun 2015 01:17:21 +0800 Subject: [PATCH 0342/2145] [instagram] Handling null values (fixes #5919) I didn't add the test case here because it takes too much time. (7 minutes on my machine) --- youtube_dl/extractor/instagram.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index b10755788..b92367a9d 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -100,7 +100,9 @@ class InstagramUserIE(InfoExtractor): thumbnails_el = it.get('images', {}) thumbnail = thumbnails_el.get('thumbnail', {}).get('url') - title = it.get('caption', {}).get('text', it['id']) + # In some cases caption is null, which corresponds to None + # in python. As a result, it.get('caption', {}) gives None + title = (it.get('caption') or {}).get('text', it['id']) entries.append({ 'id': it['id'], From e1ec93304dfcf385380feb95a3777c796cc49420 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 8 Jun 2015 01:46:33 +0800 Subject: [PATCH 0343/2145] [instagram:user] Truncate title to 80 characters (#5919) This is a workaround. Currently YoutubeDL.process_info() truncates info_dict['title'] to 200 characters, but the implementation can't handle wide characters. 
--- youtube_dl/extractor/instagram.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index b92367a9d..3d78f78c4 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + limit_length, +) class InstagramIE(InfoExtractor): @@ -106,7 +109,7 @@ class InstagramUserIE(InfoExtractor): entries.append({ 'id': it['id'], - 'title': title, + 'title': limit_length(title, 80), 'formats': formats, 'thumbnail': thumbnail, 'webpage_url': it.get('link'), From 788be3313df7ad020dc0a98bd5ed43a60120fb3b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 8 Jun 2015 13:32:04 +0800 Subject: [PATCH 0344/2145] [cnet] Fix theplatform vid extraction (fixes #5924) --- youtube_dl/extractor/cnet.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3145b3051..5dd69bff7 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -11,7 +11,7 @@ from ..utils import ( class CNETIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', @@ -25,7 +25,20 @@ class CNETIE(InfoExtractor): 'params': { 'skip_download': 'requires rtmpdump', } - } + }, { + 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'info_dict': { + 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', + 'ext': 'flv', + 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR 
camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', + 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', + 'uploader': 'Ashley Esqueda', + 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -42,7 +55,7 @@ class CNETIE(InfoExtractor): raise ExtractorError('Cannot find video data') mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files']['rtmp'] + vid = vdata['files'].get('rtmp', vdata['files']['hds']) tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) video_id = vdata['id'] From 01e21b89eefc32bcc4a92c3a82658cee139b6b2c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 8 Jun 2015 17:39:55 +0800 Subject: [PATCH 0345/2145] [noco] Skip invalid timestamps (closes #5826) --- youtube_dl/extractor/noco.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 664dc81d4..5bbd2dcf6 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -166,6 +166,10 @@ class NocoIE(InfoExtractor): self._sort_formats(formats) timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ') + + if timestamp is not None and timestamp < 0: + timestamp = None + uploader = show.get('partner_name') uploader_id = show.get('partner_key') duration = float_or_none(show.get('duration_ms'), 1000) From a55e36f48d1f0dc5454b144c7373361f284b9236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 8 Jun 2015 21:05:17 +0600 Subject: [PATCH 0346/2145] [YoutubeDL] Handle out-of-range timestamps (#5826) --- youtube_dl/YoutubeDL.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index aa6ec9d9a..b1f792d4e 
100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1016,13 +1016,13 @@ class YoutubeDL(object): info_dict['display_id'] = info_dict['id'] if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around negative timestamps in Windows - # (see http://bugs.python.org/issue1646728) - if info_dict['timestamp'] < 0 and os.name == 'nt': - info_dict['timestamp'] = 0 - upload_date = datetime.datetime.utcfromtimestamp( - info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: From 627b96482567c1525dddfcceae2c16ff53c18b6a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 9 Jun 2015 11:41:17 +0800 Subject: [PATCH 0347/2145] [kickstarted] Extract thumbnails in embedded videos (#5929) --- youtube_dl/extractor/kickstarter.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 7d4b57056..1d391e69f 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -28,6 +28,14 @@ class KickStarterIE(InfoExtractor): 'uploader': 'Pebble Technology', 'title': 'Pebble iOS Notifications', } + }, { + 'url': 'https://www.kickstarter.com/projects/1420158244/power-drive-2000/widget/video.html', + 'info_dict': { + 'id': '1420158244', + 'ext': 'mp4', + 'title': 'Power Drive 2000', + }, + 'expected_warnings': ['OpenGraph description'], }] def _real_extract(self, url): @@ -48,10 +56,15 @@ class KickStarterIE(InfoExtractor): 'title': title, } + 
thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail is None: + thumbnail = self._html_search_regex( + r'<img[^>]+class="[^"]+\s*poster\s*[^"]+"[^>]+src="([^"]+)"', + webpage, 'thumbnail image', fatal=False) return { 'id': video_id, 'url': video_url, 'title': title, 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': thumbnail, } From e1b9322b091122b6f6832c70e3a845a84ee1764e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 9 Jun 2015 14:48:18 +0800 Subject: [PATCH 0348/2145] [youtube] Restricter DASH signature pattern A problematic DASH url is: https://manifest.googlevideo.com/api/manifest/dash/mm/35/key/yt5/ip/140.112.247.145/ms/pm/mv/s/mt/1433794435/id/o-AD2Od_dsOlAUYPu03ZsVWKSbGEbCJJrMp9vnXGhnyRhd/mn/sn-aigllm7r/sparams/as%2Chfr%2Cid%2Cip%2Cipbits%2Citag%2Cmm%2Cmn%2Cms%2Cmv%2Cnh%2Cpl%2Cplayback_host%2Crequiressl%2Csource%2Cexpire/fexp/9406009%2C9406821%2C9407575%2C9408142%2C9408420%2C9408710%2C9409121%2C9409208%2C9412514%2C9412780%2C9413208%2C9413426%2C9413476%2C9413503%2C9415304%2C9415753/upn/viDQrs8SnmE/as/fmp4_audio_clear%2Cwebm_audio_clear%2Cfmp4_sd_hd_clear%2Cwebm_sd_hd_clear%2Cwebm2_sd_hd_clear/playback_host/r4---sn-aigllm7r.googlevideo.com/ipbits/0/requiressl/yes/pl/20/itag/0/source/youtube/expire/1433824806/nh/EAQ/signature/81ABE6391E351BA495F5B041B00FF1257A353318.1A6E48ABB74E8F4AE73CA2CB1F963FC34E33DEE7/sver/3/hfr/1 --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 419f7b019..083da777d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -785,7 +785,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): s = mobj.group(1) dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) return '/signature/%s' % dec_s - dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, 
dash_manifest_url) + dash_manifest_url = re.sub(r'/s/([a-fA-F0-9\.]+)', decrypt_sig, dash_manifest_url) dash_doc = self._download_xml( dash_manifest_url, video_id, note='Downloading DASH manifest', From d9cf48e81e38f4bf151a8648c48d6e5233325b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 9 Jun 2015 20:36:08 +0600 Subject: [PATCH 0349/2145] [spiegeltv] Extract all formats and prefer hls (Closes #5843) --- youtube_dl/extractor/spiegeltv.py | 45 ++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py index 359722ad6..08a5c4314 100644 --- a/youtube_dl/extractor/spiegeltv.py +++ b/youtube_dl/extractor/spiegeltv.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import float_or_none +from ..compat import compat_urllib_parse_urlparse +from ..utils import ( + determine_ext, + float_or_none, +) class SpiegeltvIE(InfoExtractor): @@ -17,7 +21,7 @@ class SpiegeltvIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg$', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, } }, { @@ -53,7 +57,35 @@ class SpiegeltvIE(InfoExtractor): server_json = self._download_json( 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json', video_id, note='Downloading server information') - server = server_json['streamingserver'][0]['endpoint'] + + format = '16x9' if is_wide else '4x3' + + formats = [] + for streamingserver in server_json['streamingserver']: + endpoint = streamingserver.get('endpoint') + if not endpoint: + continue + play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format) + if endpoint.startswith('rtmp'): + formats.append({ + 'url': endpoint, + 'format_id': 'rtmp', + 'app': compat_urllib_parse_urlparse(endpoint).path[1:], + 'play_path': play_path, + 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf', + 'ext': 
'flv', + 'rtmp_live': True, + }) + elif determine_ext(endpoint) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + endpoint.replace('[video]', play_path), + video_id, 'm4v', + preference=1, # Prefer hls since it allows to workaround georestriction + m3u8_id='hls')) + else: + formats.append({ + 'url': endpoint, + }) thumbnails = [] for image in media_json['images']: @@ -65,17 +97,12 @@ class SpiegeltvIE(InfoExtractor): description = media_json['subtitle'] duration = float_or_none(media_json.get('duration_in_ms'), scale=1000) - format = '16x9' if is_wide else '4x3' - - url = server + 'mp4:' + uuid + '_spiegeltv_0500_' + format + '.m4v' return { 'id': video_id, 'title': title, - 'url': url, - 'ext': 'm4v', 'description': description, 'duration': duration, 'thumbnails': thumbnails, - 'rtmp_live': True, + 'formats': formats, } From 9bf99891d08d166ac1d81b652dc487bb940fa685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 9 Jun 2015 21:23:53 +0600 Subject: [PATCH 0350/2145] [cbs] Add support for colbertlateshow (Closes #5888) --- youtube_dl/extractor/cbs.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 1ceb9d8d9..89614a3c9 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)/(?P<id>[^/]+)/.*' + _VALID_URL = r'https?://(?:(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/(?P<id>[^/]+)/.*' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -34,12 +34,18 @@ class CBSIE(InfoExtractor): 'skip_download': True, }, '_skip': 'Blocked outside the US', + }, { + 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', 
+ 'only_matching': True, + }, { + 'url': 'http://colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) real_id = self._search_regex( - r"video\.settings\.pid\s*=\s*'([^']+)';", + [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], webpage, 'real video ID') return self.url_result('theplatform:%s' % real_id) From 9d581f3d5224140ca35ebd06d614b929e9252cd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 9 Jun 2015 21:39:45 +0600 Subject: [PATCH 0351/2145] [cbs] Extract display_id --- youtube_dl/extractor/cbs.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 89614a3c9..75fffb156 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,12 +4,13 @@ from .common import InfoExtractor class CBSIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/(?P<id>[^/]+)/.*' + _VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '4JUVEwq3wUT7', + 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'flv', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. 
Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -24,6 +25,7 @@ class CBSIE(InfoExtractor): 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', 'info_dict': { 'id': 'WWF_5KqY3PK1', + 'display_id': 'st-vincent', 'ext': 'flv', 'title': 'Live on Letterman - St. Vincent', 'description': 'Live On Letterman: St. Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', @@ -38,14 +40,19 @@ class CBSIE(InfoExtractor): 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', 'only_matching': True, }, { - 'url': 'http://colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', + 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) real_id = self._search_regex( [r"video\.settings\.pid\s*=\s*'([^']+)';", r"cbsplayer\.pid\s*=\s*'([^']+)';"], webpage, 'real video ID') - return self.url_result('theplatform:%s' % real_id) + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': 'theplatform:%s' % real_id, + 'display_id': display_id, + } From 6e054aacca2ac44413ed37ee8b1d63a09c8b4ae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 9 Jun 2015 23:07:22 +0600 Subject: [PATCH 0352/2145] [theplatform] Take care of /select/media URLs (Closes #5746) --- youtube_dl/extractor/theplatform.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 92731ad3d..48c6ff03f 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -26,7 
+26,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)? + (?:(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)|(?P<media>(?:[^/]+/)+select/media/))? |theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ @@ -56,6 +56,17 @@ class ThePlatformIE(InfoExtractor): # rtmp download 'skip_download': True, } + }, { + 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', + 'info_dict': { + 'id': 'yMBg9E8KFxZD', + 'ext': 'mp4', + 'description': 'md5:644ad9188d655b742f942bf2e06b002d', + 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', + } + }, { + 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', + 'only_matching': True, }] @staticmethod @@ -85,6 +96,11 @@ class ThePlatformIE(InfoExtractor): if not provider_id: provider_id = 'dJ5BDC' + path = provider_id + if mobj.group('media'): + path += '/media' + path += '/' + video_id + if smuggled_data.get('force_smil_url', False): smil_url = url elif mobj.group('config'): @@ -94,8 +110,7 @@ class ThePlatformIE(InfoExtractor): config = self._download_json(config_url, video_id, 'Downloading config') smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' else: - smil_url = ('http://link.theplatform.com/s/{0}/{1}/meta.smil?' 
- 'format=smil&mbr=true'.format(provider_id, video_id)) + smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path sig = smuggled_data.get('sig') if sig: @@ -112,7 +127,7 @@ class ThePlatformIE(InfoExtractor): else: raise ExtractorError(error_msg, expected=True) - info_url = 'http://link.theplatform.com/s/{0}/{1}?format=preview'.format(provider_id, video_id) + info_url = 'http://link.theplatform.com/s/%s?format=preview' % path info_json = self._download_webpage(info_url, video_id) info = json.loads(info_json) From bd5bc0cd5af257abf7a1a4c14a9dd39c4f97e622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 9 Jun 2015 23:12:13 +0600 Subject: [PATCH 0353/2145] [theplatform] Check for /select/media URLs first (#5746) --- youtube_dl/extractor/theplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 48c6ff03f..83d833e30 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -26,7 +26,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language class ThePlatformIE(InfoExtractor): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?:(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/)|(?P<media>(?:[^/]+/)+select/media/))? + (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ From 70219b0f4371fe54cc72d025ce06fc4691ba12fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 9 Jun 2015 23:49:11 +0200 Subject: [PATCH 0354/2145] [youtube:playlist] Use an iterator for the entries (closes #5935) So that '--playlist-end' downloads only the required pages. 
--- youtube_dl/extractor/youtube.py | 47 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 083da777d..3448bec4f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1290,7 +1290,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _extract_playlist(self, playlist_id): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - more_widget_html = content_html = page for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): match = match.strip() @@ -1310,36 +1309,36 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): self.report_warning('Youtube gives an alert message: ' + match) # Extract the video ids from the playlist pages - ids = [] + def _entries(): + more_widget_html = content_html = page + for page_num in itertools.count(1): + matches = re.finditer(self._VIDEO_RE, content_html) + # We remove the duplicates and the link with index 0 + # (it's not the first video of the playlist) + new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') + for vid_id in new_ids: + yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - ids.extend(new_ids) + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: + break - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - 
if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', page, 'title') - url_results = self._ids_to_results(ids) - return self.playlist_result(url_results, playlist_id, playlist_title) + return self.playlist_result(_entries(), playlist_id, playlist_title) def _real_extract(self, url): # Extract playlist id From d84f1d14b526c4a5359117a58f25691a3da4c97e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lio=20A=2E=20Heckert?= <aurelio@colivre.coop.br> Date: Tue, 9 Jun 2015 22:08:16 -0300 Subject: [PATCH 0355/2145] Adds support for XviD output with extra parametrization As the "LG Time Machine" (a (not so) smart TV) has a limitation for video dimensions (as for codecs), I take to implement an extra parameter `--pp-params` where we can send extra parameterization for the video converter (post-processor). Example: ``` $ youtube-dl --recode-video=xvid --pp-params='-s 720x480' -c https://www.youtube.com/watch?v=BE7Qoe2ZiXE ``` That works fine on a 4yo LG Time Machine. Closes #5733 --- README.md | 3 ++- youtube_dl/__init__.py | 5 ++++- youtube_dl/options.py | 6 +++++- youtube_dl/postprocessor/ffmpeg.py | 14 ++++++++++---- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f3d83c89f..726ec9cf2 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,8 @@ which means you can modify it, redistribute it or use it however you like. 
--audio-format FORMAT Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "best" by default --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) - --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv) + --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid) + --pp-params Extra parameters for video post-processor. The params will be splited on spaces. -k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ace17857c..5b28e4817 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -169,8 +169,10 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error('invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'xvid']: parser.error('invalid video recode format specified') + if opts.pp_params is not None: + opts.pp_params = opts.pp_params.split() if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: parser.error('invalid subtitle format specified') @@ -227,6 +229,7 @@ def _real_main(argv=None): postprocessors.append({ 'key': 'FFmpegVideoConvertor', 'preferedformat': opts.recodevideo, + 'extra_params': opts.pp_params }) if opts.convertsubtitles: postprocessors.append({ diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 689fa7595..ceb4b5f38 100644 --- a/youtube_dl/options.py +++ 
b/youtube_dl/options.py @@ -686,7 +686,11 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, - help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv)') + help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid)') + postproc.add_option( + '--pp-params', + dest='pp_params', default=None, + help='Extra parameters for video post-processor. The params will be splited on spaces.') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index cc65b34e7..a696b12b4 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -287,22 +287,28 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): class FFmpegVideoConvertorPP(FFmpegPostProcessor): - def __init__(self, downloader=None, preferedformat=None): + def __init__(self, downloader=None, preferedformat=None, extra_params=[]): super(FFmpegVideoConvertorPP, self).__init__(downloader) self._preferedformat = preferedformat + self._extra_params = extra_params def run(self, information): path = information['filepath'] prefix, sep, ext = path.rpartition('.') - outpath = prefix + sep + self._preferedformat + ext = self._preferedformat + options = self._extra_params + if self._preferedformat == 'xvid': + ext = 'avi' + options.extend(['-c:v', 'libxvid', '-vtag', 'XVID']) + outpath = prefix + sep + ext if information['ext'] == self._preferedformat: self._downloader.to_screen('[ffmpeg] Not converting video file %s - already is in target format %s' % (path, self._preferedformat)) return [], information self._downloader.to_screen('[' + 'ffmpeg' + '] Converting video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath) - self.run_ffmpeg(path, outpath, []) + self.run_ffmpeg(path, 
outpath, options) information['filepath'] = outpath information['format'] = self._preferedformat - information['ext'] = self._preferedformat + information['ext'] = ext return [path], information From 0c8662d2b6f033ad42f1cc97989d4975629b524b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 10 Jun 2015 13:40:41 +0800 Subject: [PATCH 0356/2145] [youtube] Fix a TypeError caused by 4da31bd56629054497634d041035e4bd6fcfacbb --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2424ac2c0..a1906eef6 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -816,7 +816,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'filesize': filesize, 'fps': int_or_none(r.attrib.get('frameRate')), } - if len(segment_list): + if segment_list is not None: f.update({ 'initialization_url': segment_list.find('{urn:mpeg:DASH:schema:MPD:2011}Initialization').attrib['sourceURL'], 'segment_urls': [segment.attrib.get('media') for segment in segment_list.findall('{urn:mpeg:DASH:schema:MPD:2011}SegmentURL')], From 93dfcb9357b400b4d7e353d0a9db0e0194135b19 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 10 Jun 2015 13:44:54 +0800 Subject: [PATCH 0357/2145] [downloader/dash] Do not pollute ```self``` --- youtube_dl/downloader/dash.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 5f14658ba..cd84e0b07 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -16,14 +16,14 @@ class DashSegmentsFD(FileDownloader): base_url = info_dict['url'] segment_urls = info_dict['segment_urls'] - self.byte_counter = 0 + byte_counter = 0 def append_url_to_file(outf, target_url, target_name): self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) req = 
compat_urllib_request.Request(target_url) data = self.ydl.urlopen(req).read() outf.write(data) - self.byte_counter += len(data) + return len(data) def combine_url(base_url, target_url): if re.match(r'^https?://', target_url): @@ -35,15 +35,16 @@ class DashSegmentsFD(FileDownloader): outf, combine_url(base_url, info_dict['initialization_url']), 'initialization segment') for i, segment_url in enumerate(segment_urls): - append_url_to_file( + segment_len = append_url_to_file( outf, combine_url(base_url, segment_url), 'segment %d / %d' % (i + 1, len(segment_urls))) + byte_counter += segment_len self.try_rename(tmpfilename, filename) self._hook_progress({ - 'downloaded_bytes': self.byte_counter, - 'total_bytes': self.byte_counter, + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, 'filename': filename, 'status': 'finished', }) From 7ebd5376feb493edd0bc04abd07bba89397b7307 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 10 Jun 2015 14:15:20 +0800 Subject: [PATCH 0358/2145] [nfl] Relax _VALID_URL (fixes #5940) --- youtube_dl/extractor/nfl.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 2684dd250..dc54634a5 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -19,7 +19,7 @@ class NFLIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ (?:.+?/)* - (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' + (?P<id>(?:[a-z0-9]{16}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' _TESTS = [ { 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', @@ -58,6 +58,10 @@ class NFLIE(InfoExtractor): 'upload_date': '20150202', }, }, + { + 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', + 'only_matching': True, + } ] @staticmethod From 
5bf3276e8d6ee7d017c8be04414398752cd9cdf3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 10 Jun 2015 14:45:54 +0800 Subject: [PATCH 0359/2145] [downloader/dash] Add testing facility --- youtube_dl/downloader/dash.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index cd84e0b07..a4685d307 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -16,12 +16,21 @@ class DashSegmentsFD(FileDownloader): base_url = info_dict['url'] segment_urls = info_dict['segment_urls'] + is_test = self.params.get('test', False) + remaining_bytes = self._TEST_FILE_SIZE if is_test else None byte_counter = 0 - def append_url_to_file(outf, target_url, target_name): + def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) req = compat_urllib_request.Request(target_url) + if remaining_bytes is not None: + req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) + data = self.ydl.urlopen(req).read() + + if remaining_bytes is not None: + data = data[:remaining_bytes] + outf.write(data) return len(data) @@ -37,8 +46,13 @@ class DashSegmentsFD(FileDownloader): for i, segment_url in enumerate(segment_urls): segment_len = append_url_to_file( outf, combine_url(base_url, segment_url), - 'segment %d / %d' % (i + 1, len(segment_urls))) + 'segment %d / %d' % (i + 1, len(segment_urls)), + remaining_bytes) byte_counter += segment_len + if remaining_bytes is not None: + remaining_bytes -= segment_len + if remaining_bytes <= 0: + break self.try_rename(tmpfilename, filename) From 8a1a26ce4c64d7a2c142718fc56f46d9a1c2c4f2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 10 Jun 2015 14:47:02 +0800 Subject: [PATCH 0360/2145] [youtube] Add a test for the DASH segment downloader --- youtube_dl/extractor/youtube.py | 18 
++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a1906eef6..939f5e61f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -516,6 +516,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': 'requires avconv', } }, + # DASH manifest with segment_list + { + 'url': 'https://www.youtube.com/embed/CsmdDsKjzN8', + 'md5': '8ce563a1d667b599d21064e982ab9e31', + 'info_dict': { + 'id': 'CsmdDsKjzN8', + 'ext': 'mp4', + 'upload_date': '20150510', + 'uploader': 'Airtek', + 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', + 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', + 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '135', # bestvideo + } + } ] def __init__(self, *args, **kwargs): From eb8be1fe76a9fbc285e6c957b3fdd5c05135ae3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 10 Jun 2015 14:12:43 +0200 Subject: [PATCH 0361/2145] [rtbf] Extract all formats (closes #5947) --- youtube_dl/extractor/rtbf.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 5a381d9ce..e4215d546 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -21,6 +21,13 @@ class RTBFIE(InfoExtractor): } } + _QUALITIES = [ + ('mobile', 'mobile'), + ('web', 'SD'), + ('url', 'MD'), + ('high', 'HD'), + ] + def _real_extract(self, url): video_id = self._match_id(url) @@ -32,14 +39,21 @@ class RTBFIE(InfoExtractor): r'data-video="([^"]+)"', webpage, 'data video')), video_id) - video_url = data.get('downloadUrl') or data.get('url') - if data.get('provider').lower() == 'youtube': + video_url = data.get('downloadUrl') or data.get('url') return 
self.url_result(video_url, 'Youtube') + formats = [] + for key, format_id in self._QUALITIES: + format_url = data['sources'].get(key) + if format_url: + formats.append({ + 'format_id': format_id, + 'url': format_url, + }) return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': data['title'], 'description': data.get('description') or data.get('subtitle'), 'thumbnail': data.get('thumbnail'), From f98470df690d053e45691ede2751ab6a4063082b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 10 Jun 2015 23:01:12 +0600 Subject: [PATCH 0362/2145] [bilibili] Fix FutureWarning --- youtube_dl/extractor/bilibili.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 2103ed73a..bf60450c2 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -105,7 +105,7 @@ class BiliBiliIE(InfoExtractor): 'filesize': int_or_none( lq_durl.find('./size'), get_attr='text'), }] - if hq_durl: + if hq_durl is not None: formats.append({ 'format_id': 'hq', 'quality': 2, From a9d56c684319eaf8b9494bd8d2dc9d0f40485254 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jun 2015 19:03:22 +0600 Subject: [PATCH 0363/2145] [rtlnl] Improve _VALID_URL (#5950) --- youtube_dl/extractor/rtlnl.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index cfce4550a..41d202c28 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -12,10 +12,10 @@ class RtlNlIE(InfoExtractor): IE_NAME = 'rtl.nl' IE_DESC = 'rtl.nl and rtlxl.nl' _VALID_URL = r'''(?x) - https?://(www\.)? + https?://(?:www\.)? 
(?: rtlxl\.nl/\#!/[^/]+/| - rtl\.nl/system/videoplayer/[^?#]+?/video_embed\.html\#uuid= + rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= ) (?P<id>[0-9a-f-]+)''' @@ -43,6 +43,9 @@ class RtlNlIE(InfoExtractor): 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } + }, { + 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', + 'only_matching': True, }] def _real_extract(self, url): From 97b570a94cc2387153af525f781e144bb4bb791e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jun 2015 19:04:12 +0600 Subject: [PATCH 0364/2145] [generic] Improve rtl.nl embeds detection (Closes #5950) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 759691365..75526384f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1073,7 +1073,7 @@ class GenericIE(InfoExtractor): # Look for embedded rtl.nl player matches = re.findall( - r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"', + r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: return _playlist_from_matches(matches, ie='RtlNl') From ff0f0b9172e432ebbfca88da91278554eb47c307 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 11 Jun 2015 22:18:08 +0600 Subject: [PATCH 0365/2145] [tube8] Fix extraction (Closes #5952) --- youtube_dl/extractor/tube8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tube8.py 
b/youtube_dl/extractor/tube8.py index 6ca8840b0..c9cb69333 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -47,7 +47,7 @@ class Tube8IE(InfoExtractor): webpage = self._download_webpage(req, display_id) flashvars = json.loads(self._html_search_regex( - r'flashvars\s*=\s*({.+?})', webpage, 'flashvars')) + r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars')) video_url = flashvars['video_url'] if flashvars.get('encrypted') is True: From 99ac0390f559aa6dd09ffd8a15b9b562fda5f363 Mon Sep 17 00:00:00 2001 From: Hannu Lintala <hannu.lintala@gmail.com> Date: Mon, 8 Jun 2015 05:58:41 +0300 Subject: [PATCH 0366/2145] [fivetv] Add extractor (Closes #5794) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/fivetv.py | 67 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 youtube_dl/extractor/fivetv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 67eb96057..d10275d03 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -152,6 +152,7 @@ from .fc2 import FC2IE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE +from .fivetv import FiveTVIE from .fktv import ( FKTVIE, FKTVPosteckeIE, diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py new file mode 100644 index 000000000..e47383b39 --- /dev/null +++ b/youtube_dl/extractor/fivetv.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, +) + + +class FiveTVIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?5-tv\.ru/[^/]*/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://5-tv.ru/news/96814/', + 'md5': 'bbff554ad415ecf5416a2f48c22d9283', + 'info_dict': { + 'id': '96814', + 'ext': 'mp4', + 'title': 'Россияне выбрали имя для общенациональной платежной системы', + 'description': 
'md5:a8aa13e2b7ad36789e9f77a74b6de660', + 'thumbnail': 're:^https?://.*\.jpg$', + 'width': 480, + 'height': 360, + 'duration': 180, + }, + }, + { + 'url': 'http://5-tv.ru/video/1021729/', + 'md5': '299c8b72960efc9990acd2c784dc2296', + 'info_dict': { + 'id': '1021729', + 'ext': 'mp4', + 'title': '3D принтер', + 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', + 'thumbnail': 're:^https?://.*\.jpg$', + 'width': 480, + 'height': 360, + 'duration': 180, + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_link = self._search_regex( + r'(<a.*?class="videoplayer">)', webpage, 'video link') + + url = self._search_regex(r'href="([^"]+)"', video_link, 'video url') + width = int_or_none(self._search_regex( + r'width:(\d+)px', video_link, 'width', default=None, fatal=False)) + height = int_or_none(self._search_regex( + r'height:(\d+)px', video_link, 'height', default=None, fatal=False)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, 'duration')) + return { + 'id': video_id, + 'url': url, + 'width': width, + 'height': height, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + } From 87446dc6186c7e4247fd7f9bc1046ef41f5d1a0f Mon Sep 17 00:00:00 2001 From: Hannu Lintala <hannu.lintala@gmail.com> Date: Sun, 7 Jun 2015 17:25:30 +0300 Subject: [PATCH 0367/2145] [tvc] Add extractor (Closes #5795) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tvc.py | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/tvc.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 67eb96057..8c4e12904 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -582,6 +582,7 @@ from .tv2 import ( TV2ArticleIE, ) from .tv4 
import TV4IE +from .tvc import TVCIE from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py new file mode 100644 index 000000000..b62ab857c --- /dev/null +++ b/youtube_dl/extractor/tvc.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + str_or_none, +) + + +class TVCIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?tvc\.ru/.*/show/.*id/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', + 'md5': 'aa6fb3cf384e18a0ad3b30ee2898beba', + 'info_dict': { + 'id': '74622', + 'display_id': '39702', + 'ext': 'mp4', + 'title': 'События. "События". Эфир от 22.05.2015 14:30', + 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1122, + }, + }, + { + 'url': 'http://www.tvc.ru/news/show/id/69944', + 'md5': 'b173128ee7b88b5b06c84e5f7880909f', + 'info_dict': { + 'id': '75399', + 'display_id': '69944', + 'ext': 'mp4', + 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках', + 'description': 'md5:f675c8eaf23aab9df542d31773ed6518', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 278, + }, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_url = self._og_search_video_url(webpage) + + video_id = self._search_regex( + r'video/iframe/id/(\d+)/', video_url, 'video id') + + video_json_url = 'http://www.tvc.ru/video/json/id/%s' % (video_id) + + video_json = self._download_json(video_json_url, video_id) + + formats = [] + for info in video_json.get('path', {}).get('quality', []): + format_id = self._search_regex( + r'cdnvideo/([^-]+)-[^/]+/', info.get('url'), 'format id', + fatal=False) + formats.append({ + 'format_id': str_or_none(format_id), + 
'url': info.get('url'), + 'width': int_or_none(info.get('width')), + 'height': int_or_none(info.get('height')), + 'tbr': int_or_none(info.get('bitrate')), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(video_json.get('duration')), + 'formats': formats, + } From 9f15bdabc85add582d78a6dd57cfbb56cb33baff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:13:36 +0600 Subject: [PATCH 0368/2145] [tvc] Separate embed extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/tvc.py | 125 ++++++++++++++++++------------- 2 files changed, 77 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6dc3cbff4..a8d3a8928 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -584,7 +584,10 @@ from .tv2 import ( TV2ArticleIE, ) from .tv4 import TV4IE -from .tvc import TVCIE +from .tvc import ( + TVCIE, + TVCEmbedIE, +) from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index b62ab857c..0055f9598 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -3,77 +3,98 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + clean_html, int_or_none, - str_or_none, ) -class TVCIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?tvc\.ru/.*/show/.*id/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', - 'md5': 'aa6fb3cf384e18a0ad3b30ee2898beba', - 'info_dict': { - 'id': '74622', - 'display_id': '39702', - 'ext': 'mp4', - 'title': 'События. "События". 
Эфир от 22.05.2015 14:30', - 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 1122, - }, +class TVCEmbedIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', + 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', + 'info_dict': { + 'id': '74622', + 'ext': 'mp4', + 'title': 'События. "События". Эфир от 22.05.2015 14:30', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1122, }, - { - 'url': 'http://www.tvc.ru/news/show/id/69944', - 'md5': 'b173128ee7b88b5b06c84e5f7880909f', - 'info_dict': { - 'id': '75399', - 'display_id': '69944', - 'ext': 'mp4', - 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках', - 'description': 'md5:f675c8eaf23aab9df542d31773ed6518', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 278, - }, - }, - ] + } def _real_extract(self, url): - display_id = self._match_id(url) + video_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - video_url = self._og_search_video_url(webpage) - - video_id = self._search_regex( - r'video/iframe/id/(\d+)/', video_url, 'video id') - - video_json_url = 'http://www.tvc.ru/video/json/id/%s' % (video_id) - - video_json = self._download_json(video_json_url, video_id) + video = self._download_json( + 'http://www.tvc.ru/video/json/id/%s' % video_id, video_id) formats = [] - for info in video_json.get('path', {}).get('quality', []): + for info in video.get('path', {}).get('quality', []): + video_url = info.get('url') + if not video_url: + continue format_id = self._search_regex( - r'cdnvideo/([^-]+)-[^/]+/', info.get('url'), 'format id', - fatal=False) + r'cdnvideo/([^/]+?)(?:-[^/]+?)?/', video_url, + 'format id', default=None) formats.append({ - 'format_id': str_or_none(format_id), - 'url': info.get('url'), + 'url': 
video_url, + 'format_id': format_id, 'width': int_or_none(info.get('width')), 'height': int_or_none(info.get('height')), 'tbr': int_or_none(info.get('bitrate')), }) - self._sort_formats(formats) return { 'id': video_id, - 'display_id': display_id, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'duration': int_or_none(video_json.get('duration')), + 'title': video['title'], + 'thumbnail': video.get('picture'), + 'duration': int_or_none(video.get('duration')), 'formats': formats, } + + +class TVCIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' + _TESTS = [{ + 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', + 'info_dict': { + 'id': '74622', + 'ext': 'mp4', + 'title': 'События. "События". Эфир от 22.05.2015 14:30', + 'description': 'md5:ad7aa7db22903f983e687b8a3e98c6dd', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1122, + }, + }, { + 'url': 'http://www.tvc.ru/news/show/id/69944', + 'info_dict': { + 'id': '75399', + 'ext': 'mp4', + 'title': 'Эксперты: в столице встал вопрос о максимально безопасных остановках', + 'description': 'md5:f2098f71e21f309e89f69b525fd9846e', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 278, + }, + }, { + 'url': 'http://www.tvc.ru/channel/brand/id/47/show/episodes#', + 'info_dict': { + 'id': '2185', + 'ext': 'mp4', + 'title': 'Ещё не поздно. 
Эфир от 03.08.2013', + 'description': 'md5:51fae9f3f8cfe67abce014e428e5b027', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 3316, + }, + }] + + def _real_extract(self, url): + webpage = self._download_webpage(url, self._match_id(url)) + return { + '_type': 'url_transparent', + 'ie_key': 'TVCEmbed', + 'url': self._og_search_video_url(webpage), + 'title': clean_html(self._og_search_title(webpage)), + 'description': clean_html(self._og_search_description(webpage)), + 'thumbnail': self._og_search_thumbnail(webpage), + } From 29902c8ec016a7128557d47a7413e82d4e022f01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:22:23 +0600 Subject: [PATCH 0369/2145] [tvc:embed] Add embed extraction routine --- youtube_dl/extractor/tvc.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 0055f9598..756fec732 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( clean_html, @@ -22,6 +24,13 @@ class TVCEmbedIE(InfoExtractor): }, } + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:http://)?(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): video_id = self._match_id(url) From 494f20cbdca8e76e3cb452bb0feabcb855d9b4a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:22:46 +0600 Subject: [PATCH 0370/2145] [extractor/generic] Add support for tvc embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 75526384f..c797c4b52 100644 --- a/youtube_dl/extractor/generic.py +++ 
b/youtube_dl/extractor/generic.py @@ -34,6 +34,7 @@ from .brightcove import BrightcoveIE from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .tvc import TVCEmbedIE from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE from .condenast import CondeNastIE @@ -1301,6 +1302,11 @@ class GenericIE(InfoExtractor): if rutv_url: return self.url_result(rutv_url, 'RUTV') + # Look for embedded TVC player + rutv_url = TVCEmbedIE._extract_url(webpage) + if rutv_url: + return self.url_result(rutv_url, 'TVCEmbed') + # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) if sportbox_urls: From 954c1d05299ae7c6a51db46c1ac33ddf150266c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:24:13 +0600 Subject: [PATCH 0371/2145] [tvc] Refactor extractor names --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/extractor/tvc.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a8d3a8928..18b1c5e54 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -586,7 +586,7 @@ from .tv2 import ( from .tv4 import TV4IE from .tvc import ( TVCIE, - TVCEmbedIE, + TVCArticleIE, ) from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c797c4b52..507e4a571 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -34,7 +34,7 @@ from .brightcove import BrightcoveIE from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE -from .tvc import TVCEmbedIE +from .tvc import TVCIE from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE from .condenast import CondeNastIE @@ -1303,7 +1303,7 @@ class GenericIE(InfoExtractor): return 
self.url_result(rutv_url, 'RUTV') # Look for embedded TVC player - rutv_url = TVCEmbedIE._extract_url(webpage) + rutv_url = TVCIE._extract_url(webpage) if rutv_url: return self.url_result(rutv_url, 'TVCEmbed') diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 756fec732..36c2a3196 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -10,7 +10,7 @@ from ..utils import ( ) -class TVCEmbedIE(InfoExtractor): +class TVCIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', @@ -63,7 +63,7 @@ class TVCEmbedIE(InfoExtractor): } -class TVCIE(InfoExtractor): +class TVCArticleIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?tvc\.ru/(?!video/iframe/id/)(?P<id>[^?#]+)' _TESTS = [{ 'url': 'http://www.tvc.ru/channel/brand/id/29/show/episodes/episode_id/39702/', From 5ccddb7ecfb1015038f2616dd7e0da78a4365c89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:25:26 +0600 Subject: [PATCH 0372/2145] [tvc] Fix ie_key --- youtube_dl/extractor/tvc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 36c2a3196..6b5d80aee 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -101,7 +101,7 @@ class TVCArticleIE(InfoExtractor): webpage = self._download_webpage(url, self._match_id(url)) return { '_type': 'url_transparent', - 'ie_key': 'TVCEmbed', + 'ie_key': 'TVC', 'url': self._og_search_video_url(webpage), 'title': clean_html(self._og_search_title(webpage)), 'description': clean_html(self._og_search_description(webpage)), From 2da09ff8b0de3c27a16d9096f5d28d03f44fcf70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:26:31 +0600 Subject: 
[PATCH 0373/2145] [extractor/generic] Fix tvc ie_key --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 507e4a571..66aceefb8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1305,7 +1305,7 @@ class GenericIE(InfoExtractor): # Look for embedded TVC player rutv_url = TVCIE._extract_url(webpage) if rutv_url: - return self.url_result(rutv_url, 'TVCEmbed') + return self.url_result(rutv_url, 'TVC') # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) From f37bdbe537134f1ece0819e2aa677b1fec0c1cc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 16:28:45 +0600 Subject: [PATCH 0374/2145] [extractor/generic] Add test for tvc embed --- youtube_dl/extractor/generic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 66aceefb8..6be9e6329 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -292,6 +292,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # TVC embed + { + 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', + 'info_dict': { + 'id': '55304', + 'ext': 'mp4', + 'title': 'Дошкольное воспитание', + }, + }, # SportBox embed { 'url': 'http://www.vestifinance.ru/articles/25753', From 499a077761b1577857952dc3b541c9f61a8bcade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 12 Jun 2015 17:48:42 +0600 Subject: [PATCH 0375/2145] [5tv] Improve --- youtube_dl/extractor/fivetv.py | 115 +++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 47 deletions(-) diff --git 
a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py index e47383b39..13fbc4da2 100644 --- a/youtube_dl/extractor/fivetv.py +++ b/youtube_dl/extractor/fivetv.py @@ -1,67 +1,88 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..utils import ( - int_or_none, -) +from ..utils import int_or_none class FiveTVIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?5-tv\.ru/[^/]*/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://5-tv.ru/news/96814/', - 'md5': 'bbff554ad415ecf5416a2f48c22d9283', - 'info_dict': { - 'id': '96814', - 'ext': 'mp4', - 'title': 'Россияне выбрали имя для общенациональной платежной системы', - 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', - 'thumbnail': 're:^https?://.*\.jpg$', - 'width': 480, - 'height': 360, - 'duration': 180, - }, + _VALID_URL = r'''(?x) + http:// + (?:www\.)?5-tv\.ru/ + (?: + (?:[^/]+/)+(?P<id>\d+)| + (?P<path>[^/?#]+)(?:[/?#])? + ) + ''' + + _TESTS = [{ + 'url': 'http://5-tv.ru/news/96814/', + 'md5': 'bbff554ad415ecf5416a2f48c22d9283', + 'info_dict': { + 'id': '96814', + 'ext': 'mp4', + 'title': 'Россияне выбрали имя для общенациональной платежной системы', + 'description': 'md5:a8aa13e2b7ad36789e9f77a74b6de660', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, }, - { - 'url': 'http://5-tv.ru/video/1021729/', - 'md5': '299c8b72960efc9990acd2c784dc2296', - 'info_dict': { - 'id': '1021729', - 'ext': 'mp4', - 'title': '3D принтер', - 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', - 'thumbnail': 're:^https?://.*\.jpg$', - 'width': 480, - 'height': 360, - 'duration': 180, - }, + }, { + 'url': 'http://5-tv.ru/video/1021729/', + 'info_dict': { + 'id': '1021729', + 'ext': 'mp4', + 'title': '3D принтер', + 'description': 'md5:d76c736d29ef7ec5c0cf7d7c65ffcb41', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 180, }, - ] + }, { + 'url': 'http://www.5-tv.ru/glavnoe/#itemDetails', + 'info_dict': { + 'id': 'glavnoe', + 'ext': 'mp4', + 
'title': 'Итоги недели с 8 по 14 июня 2015 года', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.5-tv.ru/glavnoe/broadcasts/508645/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/films/1507502/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/programs/broadcast/508713/', + 'only_matching': True, + }, { + 'url': 'http://5-tv.ru/angel/', + 'only_matching': True, + }, { + 'url': 'http://www.5-tv.ru/schedule/?iframe=true&width=900&height=450', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('path') webpage = self._download_webpage(url, video_id) - video_link = self._search_regex( - r'(<a.*?class="videoplayer">)', webpage, 'video link') + video_url = self._search_regex( + r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"', + webpage, 'video url') - url = self._search_regex(r'href="([^"]+)"', video_link, 'video url') - width = int_or_none(self._search_regex( - r'width:(\d+)px', video_link, 'width', default=None, fatal=False)) - height = int_or_none(self._search_regex( - r'height:(\d+)px', video_link, 'height', default=None, fatal=False)) + title = self._og_search_title(webpage, default=None) or self._search_regex( + r'<title>([^<]+)', webpage, 'title') duration = int_or_none(self._og_search_property( - 'video:duration', webpage, 'duration')) + 'video:duration', webpage, 'duration', default=None)) + return { 'id': video_id, - 'url': url, - 'width': width, - 'height': height, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'url': video_url, + 'title': title, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'duration': duration, } From b859971873915df55668a59a18ccfd259c20800e Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 18:15:30 +0600 Subject: [PATCH 0376/2145] [extractor/generic] Rename tvc embed url variable --- youtube_dl/extractor/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6be9e6329..357d58cea 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1312,9 +1312,9 @@ class GenericIE(InfoExtractor): return self.url_result(rutv_url, 'RUTV') # Look for embedded TVC player - rutv_url = TVCIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'TVC') + tvc_url = TVCIE._extract_url(webpage) + if tvc_url: + return self.url_result(tvc_url, 'TVC') # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) From 9872d3110c0d3027dac856e005299f3682ef23ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 18:37:09 +0600 Subject: [PATCH 0377/2145] [extractor/generic] Add support for tvigle embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 357d58cea..40d869c53 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1321,6 +1321,12 @@ class GenericIE(InfoExtractor): if sportbox_urls: return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + # Look for embedded Tvigle player + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'Tvigle') + # Look for embedded TED player mobj = re.search( r']+?src=(["\'])(?Phttps?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage) From d22dec74ffa2a53a1c04770af37d39f384f3d56c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 19:20:12 +0600 Subject: [PATCH 0378/2145] Add 
`--force-generic-extractor` For some extractors that are hard to workout a good _VALID_URL we use very vague and unrestrictive ones, e.g. just allowing anything after hostname and capturing part of URL as id. If some of these extractors happen to have an video embed of some different hoster or platform and this scenario was not handled in extractor itself we end up with inability to download this embed until extractor is fixed to support embed of this kind. Forcing downloader to use the generic extractor can be a neat temporary solution for this problem. Example: FiveTV extractor with Tvigle embed - http://www.5-tv.ru/rabota/broadcasts/48/ --- youtube_dl/YoutubeDL.py | 6 ++++++ youtube_dl/__init__.py | 1 + youtube_dl/extractor/generic.py | 4 +++- youtube_dl/options.py | 4 ++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b1f792d4e..4b801a917 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -139,6 +139,7 @@ class YoutubeDL(object): outtmpl: Template for output names. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. + force_generic_extractor: Force downloader to use the generic extractor nooverwrites: Prevent overwriting files. playliststart: Playlist item to start at. playlistend: Playlist item to end at. 
@@ -282,6 +283,7 @@ class YoutubeDL(object): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr + self._force_generic_extractor_required = params.get('force_generic_extractor', False) self.params = params self.cache = Cache(self) @@ -633,6 +635,10 @@ class YoutubeDL(object): extra_info is a dict containing the extra values to add to each result ''' + if not ie_key and self._force_generic_extractor_required: + self._force_generic_extractor_required = False + ie_key = 'Generic' + if ie_key: ies = [self.get_info_extractor(ie_key)] else: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ace17857c..215b616de 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -293,6 +293,7 @@ def _real_main(argv=None): 'autonumber_size': opts.autonumber_size, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, + 'force_generic_extractor': opts.force_generic_extractor, 'ratelimit': opts.ratelimit, 'nooverwrites': opts.nooverwrites, 'retries': opts_retries, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40d869c53..3d672197c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -977,7 +977,9 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } - if not self._downloader.params.get('test', False) and not is_intentional: + if (not self._downloader.params.get('test', False) and + not is_intentional and + not self._downloader.params.get('force_generic_extractor', False)): self._downloader.report_warning('Falling back on generic information extractor.') if not full_response: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 689fa7595..096ab6137 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -150,6 +150,10 @@ def parseOpts(overrideArguments=None): '--extractor-descriptions', action='store_true', dest='list_extractor_descriptions', 
default=False, help='Output descriptions of all supported extractors') + general.add_option( + '--force-generic-extractor', + action='store_true', dest='force_generic_extractor', default=False, + help='Force extraction to use the generic extractor') general.add_option( '--default-search', dest='default_search', metavar='PREFIX', From 3d535e047162af021b3df6086f9a90d0cb0b6100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 19:31:52 +0600 Subject: [PATCH 0379/2145] [tvc] Fix embed regex --- youtube_dl/extractor/tvc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvc.py b/youtube_dl/extractor/tvc.py index 6b5d80aee..3a4f393fc 100644 --- a/youtube_dl/extractor/tvc.py +++ b/youtube_dl/extractor/tvc.py @@ -27,7 +27,7 @@ class TVCIE(InfoExtractor): @classmethod def _extract_url(cls, webpage): mobj = re.search( - r']+?src=(["\'])(?P(?:http://)?(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage) + r']+?src=(["\'])(?P(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage) if mobj: return mobj.group('url') From 185dbc49749ca81fbb0a61a78c6dd35f2c32b15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 21:13:14 +0600 Subject: [PATCH 0380/2145] [prosiebensat1] Fix rtmp extraction (Closes #5962) --- youtube_dl/extractor/prosiebensat1.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 255d4abc1..6b13eb605 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -206,8 +206,8 @@ class ProSiebenSat1IE(InfoExtractor): def _extract_clip(self, url, webpage): clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') - access_token = 'testclient' - client_name = 'kolibri-1.2.5' + access_token = 'prosieben' + client_name = 'kolibri-1.12.6' client_location = url videos_api_url = 
'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse.urlencode({ @@ -275,13 +275,17 @@ class ProSiebenSat1IE(InfoExtractor): for source in urls_sources: protocol = source['protocol'] if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?Prtmpe?://[^/]+/(?P[^/]+))/(?P.+)$', source['url']) + mobj = re.search(r'^(?Prtmpe?://[^/]+)/(?P.+)$', source['url']) if not mobj: continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] formats.append({ - 'url': mobj.group('url'), - 'app': mobj.group('app'), - 'play_path': mobj.group('playpath'), + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', 'page_url': 'http://www.prosieben.de', 'vbr': fix_bitrate(source['bitrate']), From 8b6c896c4b60fe13b30227071aba2783222132a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 21:18:13 +0600 Subject: [PATCH 0381/2145] [prosiebensat1] Add title regex --- youtube_dl/extractor/prosiebensat1.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 6b13eb605..536a42dc8 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -177,6 +177,7 @@ class ProSiebenSat1IE(InfoExtractor): r'
\s*

(.+?)

', r'\s*

(.+?)

', r'

\s*(.+?)

', + r'
\s*

([^<]+)

\s*
', ] _DESCRIPTION_REGEXES = [ r'

\s*(.+?)

', From 9f4323252abade4f10b0884682f92cedc78b4d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 12 Jun 2015 21:56:50 +0600 Subject: [PATCH 0382/2145] [YoutubeDL] Fix for multiple URLs --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4b801a917..8dbad7cf8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -283,7 +283,6 @@ class YoutubeDL(object): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr - self._force_generic_extractor_required = params.get('force_generic_extractor', False) self.params = params self.cache = Cache(self) @@ -1504,6 +1503,7 @@ class YoutubeDL(object): for url in url_list: try: + self._force_generic_extractor_required = self.params.get('force_generic_extractor', False) # It also downloads the videos res = self.extract_info(url) except UnavailableVideoError: From 61aa5ba36eea3b7cf8c3570ab33604dd2c13b855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 02:05:21 +0600 Subject: [PATCH 0383/2145] [YoutubeDL] Remove global state for force_generic_extractor flag in favor of passing argument --- youtube_dl/YoutubeDL.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8dbad7cf8..dd2d8cb3c 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -627,15 +627,14 @@ class YoutubeDL(object): info_dict.setdefault(key, value) def extract_info(self, url, download=True, ie_key=None, extra_info={}, - process=True): + process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. If 'download', also downloads the videos. 
extra_info is a dict containing the extra values to add to each result ''' - if not ie_key and self._force_generic_extractor_required: - self._force_generic_extractor_required = False + if not ie_key and force_generic_extractor: ie_key = 'Generic' if ie_key: @@ -663,7 +662,7 @@ class YoutubeDL(object): } self.add_default_extra_info(ie_result, ie, url) if process: - return self.process_ie_result(ie_result, download, extra_info) + return self.process_ie_result(ie_result, download, extra_info, force_generic_extractor=False) else: return ie_result except ExtractorError as de: # An error we somewhat expected @@ -688,7 +687,7 @@ class YoutubeDL(object): 'extractor_key': ie.ie_key(), }) - def process_ie_result(self, ie_result, download=True, extra_info={}): + def process_ie_result(self, ie_result, download=True, extra_info={}, force_generic_extractor=False): """ Take the result of the ie(may be modified) and resolve all unresolved references (URLs, playlist items). @@ -716,7 +715,8 @@ class YoutubeDL(object): return self.extract_info(ie_result['url'], download, ie_key=ie_result.get('ie_key'), - extra_info=extra_info) + extra_info=extra_info, + force_generic_extractor=force_generic_extractor) elif result_type == 'url_transparent': # Use the information from the embedding page info = self.extract_info( @@ -1503,9 +1503,9 @@ class YoutubeDL(object): for url in url_list: try: - self._force_generic_extractor_required = self.params.get('force_generic_extractor', False) # It also downloads the videos - res = self.extract_info(url) + res = self.extract_info( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) except UnavailableVideoError: self.report_error('unable to download video') except MaxDownloadsReached: From 0072afca8e02052c77dc3b7009e51114887e31b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 02:21:29 +0600 Subject: [PATCH 0384/2145] [YoutubeDL] Remove force_generic_extractor arg from process_ie_result 
--- youtube_dl/YoutubeDL.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index dd2d8cb3c..a7d3a1c01 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -662,7 +662,7 @@ class YoutubeDL(object): } self.add_default_extra_info(ie_result, ie, url) if process: - return self.process_ie_result(ie_result, download, extra_info, force_generic_extractor=False) + return self.process_ie_result(ie_result, download, extra_info) else: return ie_result except ExtractorError as de: # An error we somewhat expected @@ -687,7 +687,7 @@ class YoutubeDL(object): 'extractor_key': ie.ie_key(), }) - def process_ie_result(self, ie_result, download=True, extra_info={}, force_generic_extractor=False): + def process_ie_result(self, ie_result, download=True, extra_info={}): """ Take the result of the ie(may be modified) and resolve all unresolved references (URLs, playlist items). @@ -715,8 +715,7 @@ class YoutubeDL(object): return self.extract_info(ie_result['url'], download, ie_key=ie_result.get('ie_key'), - extra_info=extra_info, - force_generic_extractor=force_generic_extractor) + extra_info=extra_info) elif result_type == 'url_transparent': # Use the information from the embedding page info = self.extract_info( From 4f3bf679f5a764f7a26d3c45c82be43e34a3cc4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 03:09:35 +0600 Subject: [PATCH 0385/2145] [vk] Fix authentication for non-ASCII login/password --- youtube_dl/extractor/vk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index cc384adbf..d0e772108 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -119,8 +119,8 @@ class VKIE(InfoExtractor): 'act': 'login', 'role': 'al_frame', 'expire': '1', - 'email': username, - 'pass': password, + 'email': username.encode('cp1251'), + 'pass': password.encode('cp1251'), 
} request = compat_urllib_request.Request('https://login.vk.com/?act=login', From 9fcbd5db2abb5a56beefe4a64486da692705ad12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 03:24:36 +0600 Subject: [PATCH 0386/2145] [pornhub] Add support for embeds --- youtube_dl/extractor/pornhub.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index daa284ea2..3c99b4def 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -19,7 +19,7 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/view_video\.php\?viewkey=(?P[0-9a-f]+)' + _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P[0-9a-f]+)' _TEST = { 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '882f488fa1f0026f023f33576004a2ed', @@ -39,7 +39,8 @@ class PornHubIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = compat_urllib_request.Request(url) + req = compat_urllib_request.Request( + 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) From 65d161c480e9964026e618a2e95f9fc9eb8119e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 03:36:16 +0600 Subject: [PATCH 0387/2145] [extractor/generic] Add support for pornhub embeds --- youtube_dl/extractor/generic.py | 5 +++++ youtube_dl/extractor/pornhub.py | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40d869c53..f683760e4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -42,6 +42,7 @@ from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE from .bliptv import BlipTVIE from .svt import SVTIE +from .pornhub import PornHubIE 
class GenericIE(InfoExtractor): @@ -1321,6 +1322,10 @@ class GenericIE(InfoExtractor): if sportbox_urls: return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + pornhub_url = PornHubIE._extract_url(webpage) + if pornhub_url: + return self.url_result(pornhub_url, 'PornHub') + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3c99b4def..8565d7551 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -32,6 +32,13 @@ class PornHubIE(InfoExtractor): } } + @classmethod + def _extract_url(cls, webpage): + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage) + if mobj: + return mobj.group('url') + def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex( pattern, webpage, '%s count' % name, fatal=False)) From 78e2b74bb945dc7f1724f7486405dd523486d634 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 03:39:14 +0600 Subject: [PATCH 0388/2145] [tumblr] Add support for pornhub embeds (Closes #5963) --- youtube_dl/extractor/generic.py | 1 + youtube_dl/extractor/tumblr.py | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f683760e4..f6b984300 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1322,6 +1322,7 @@ class GenericIE(InfoExtractor): if sportbox_urls: return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + # Look for embedded PornHub player pornhub_url = PornHubIE._extract_url(webpage) if pornhub_url: return self.url_result(pornhub_url, 'PornHub') diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index e6218808f..63c20310d 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,6 
+4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .pornhub import PornHubIE class TumblrIE(InfoExtractor): @@ -55,6 +56,10 @@ class TumblrIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') + pornhub_url = PornHubIE._extract_url(webpage) + if pornhub_url: + return self.url_result(pornhub_url, 'PornHub') + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url') From b4e1576aee7cf18f5842714c87985ae0b72f1546 Mon Sep 17 00:00:00 2001 From: fnord Date: Sat, 13 Jun 2015 06:09:44 -0500 Subject: [PATCH 0389/2145] Brightcove extractor: support customBC.createVideo(...); method found in http://www.americanbar.org/groups/family_law.html and http://america.aljazeera.com/watch/shows/america-tonight/2015/6/exclusive-hunting-isil-with-the-pkk.html --- youtube_dl/extractor/brightcove.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c1d4320e1..20a6ed965 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -188,7 +188,19 @@ class BrightcoveIE(InfoExtractor): [^>]*?>\s*\s*''', webpage) - return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + if matches: + return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) + + custombcs = re.findall(r'customBC.\createVideo\((.+?)\);',webpage) + if custombcs: + urls = [] + for match in custombcs: + # brightcove playerkey begins with AQ and is 50 characters in length, + # however it's appended to itself in places, so truncate. 
+ f = re.search(r'["\'](AQ[^"\']{48}).*?["\'](\d+)["\']', match) + if f: + urls.append('brightcove:playerKey='+f.group(1)+'&%40videoPlayer='+f.group(2)) + return urls def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) From af9cdee9cba610aa3924f90a8a3fcd7dd43c65eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 13 Jun 2015 19:53:32 +0600 Subject: [PATCH 0390/2145] [brightcove] Improve and generalize brightcove URL extraction from JS --- youtube_dl/extractor/brightcove.py | 35 +++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 20a6ed965..d768f99e6 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -156,6 +156,28 @@ class BrightcoveIE(InfoExtractor): linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase + return cls._make_brightcove_url(params) + + @classmethod + def _build_brighcove_url_from_js(cls, object_js): + # The layout of JS is as follows: + # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) { + # // build Brightcove XML + # } + m = re.search( + r'''(?x)customBC.\createVideo\( + .*? 
# skipping width and height + ["\'](?P\d+)["\']\s*,\s* # playerID + ["\'](?PAQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters + # in length, however it's appended to itself + # in places, so truncate + ["\'](?P\d+)["\'] # @videoPlayer + ''', object_js) + if m: + return cls._make_brightcove_url(m.groupdict()) + + @classmethod + def _make_brightcove_url(cls, params): data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data @@ -191,16 +213,9 @@ class BrightcoveIE(InfoExtractor): if matches: return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) - custombcs = re.findall(r'customBC.\createVideo\((.+?)\);',webpage) - if custombcs: - urls = [] - for match in custombcs: - # brightcove playerkey begins with AQ and is 50 characters in length, - # however it's appended to itself in places, so truncate. - f = re.search(r'["\'](AQ[^"\']{48}).*?["\'](\d+)["\']', match) - if f: - urls.append('brightcove:playerKey='+f.group(1)+'&%40videoPlayer='+f.group(2)) - return urls + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) From 0029071adbdc0e1469d76cdc7e058c2f77299610 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Jun 2015 07:43:14 +0600 Subject: [PATCH 0391/2145] [dramefever] Improve and simplify --- youtube_dl/extractor/dramafever.py | 172 +++++++++++++++++------------ 1 file changed, 101 insertions(+), 71 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 40787ffcd..0f33a61a3 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -1,104 +1,111 @@ # encoding: utf-8 from __future__ import unicode_literals -import re +import itertools from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + 
compat_urlparse, +) +from ..utils import ( + ExtractorError, + clean_html, + determine_ext, + int_or_none, + parse_iso8601, +) class DramaFeverIE(InfoExtractor): IE_NAME = 'dramafever' - _VALID_URL = r'^https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)/' - _TESTS = [{ + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)' + _TEST = { 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { 'id': '4512.1', 'ext': 'flv', 'title': 'Cooking with Shin 4512.1', + 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', + 'thumbnail': 're:^https?://.*\.jpg', + 'timestamp': 1404336058, 'upload_date': '20140702', - 'description': 'Served at all special occasions and featured in the hit drama Heirs, Shin cooks Red Bean Rice.', + 'duration': 343, } - }] + } def _real_extract(self, url): - video_id = self._match_id(url).replace("/", ".") + video_id = self._match_id(url).replace('/', '.') - consumer_secret = self._get_consumer_secret(video_id) + try: + feed = self._download_json( + 'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, + video_id, 'Downloading episode JSON')['channel']['item'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + raise ExtractorError( + 'Currently unavailable in your country.', expected=True) + raise - ep_json = self._download_json( - "http://www.dramafever.com/amp/episode/feed.json?guid=%s" % video_id, - video_id, note='Downloading episode metadata', - errnote="Video may not be available for your location")["channel"]["item"] - - title = ep_json["media-group"]["media-title"] - description = ep_json["media-group"]["media-description"] - thumbnail = ep_json["media-group"]["media-thumbnail"]["@attributes"]["url"] - duration = int(ep_json["media-group"]["media-content"][0]["@attributes"]["duration"]) - mobj = re.match(r"([0-9]{4})-([0-9]{2})-([0-9]{2})", ep_json["pubDate"]) - upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3) if mobj is not None 
else None + media_group = feed.get('media-group', {}) formats = [] - for vid_format in ep_json["media-group"]["media-content"]: - src = vid_format["@attributes"]["url"] - if '.f4m' in src: - formats.extend(self._extract_f4m_formats(src, video_id)) - + for media_content in media_group['media-content']: + src = media_content.get('@attributes', {}).get('url') + if not src: + continue + ext = determine_ext(src) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + src, video_id, f4m_id='hds')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', m3u8_id='hls')) + else: + formats.append({ + 'url': src, + }) self._sort_formats(formats) - video_subtitles = self.extract_subtitles(video_id, consumer_secret) + + title = media_group.get('media-title') + description = media_group.get('media-description') + duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) + thumbnail = self._proto_relative_url( + media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) + timestamp = parse_iso8601(feed.get('pubDate'), ' ') + + subtitles = {} + for media_subtitle in media_group.get('media-subTitle', []): + lang = media_subtitle.get('@attributes', {}).get('lang') + href = media_subtitle.get('@attributes', {}).get('href') + if not lang or not href: + continue + subtitles[lang] = [{ + 'ext': 'ttml', + 'url': href, + }] return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'timestamp': timestamp, 'duration': duration, 'formats': formats, - 'subtitles': video_subtitles, + 'subtitles': subtitles, } - def _get_consumer_secret(self, video_id): - df_js = self._download_webpage( - "http://www.dramafever.com/static/126960d/v2/js/plugins/jquery.threadedcomments.js", video_id) - return self._search_regex(r"'cs': '([0-9a-zA-Z]+)'", df_js, "cs") - def _get_episodes(self, series_id, consumer_secret, episode_filter=None): - 
_PAGE_SIZE = 60 - - curr_page = 1 - max_pages = curr_page + 1 - results = [] - while max_pages >= curr_page: - page_url = "http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d" % \ - (consumer_secret, series_id, _PAGE_SIZE, curr_page) - series = self._download_json( - page_url, series_id, note="Downloading series json page #%d" % curr_page) - max_pages = series['num_pages'] - results.extend([ep for ep in series['value'] if episode_filter is None or episode_filter(ep)]) - curr_page += 1 - return results - - def _get_subtitles(self, video_id, consumer_secret): - - res = None - info = self._get_episodes( - video_id.split(".")[0], consumer_secret, - episode_filter=lambda x: x['guid'] == video_id) - - if len(info) == 1 and info[0]['subfile'] != '': - res = {'en': [{'url': info[0]['subfile'], 'ext': 'srt'}]} - return res - - -class DramaFeverSeriesIE(DramaFeverIE): +class DramaFeverSeriesIE(InfoExtractor): IE_NAME = 'dramafever:series' - _VALID_URL = r'^https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)/\d*[a-zA-Z_][a-zA-Z0-9_]*/' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)(?:/(?:(?!\d).+)?)?$' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', 'info_dict': { 'id': '4512', 'title': 'Cooking with Shin', - 'description': 'Professional chef and cooking instructor Shin Kim takes some of the delicious dishes featured in your favorite dramas and shows you how to make them right at home.', + 'description': 'md5:84a3f26e3cdc3fb7f500211b3593b5c1', }, 'playlist_count': 4, }, { @@ -106,25 +113,48 @@ class DramaFeverSeriesIE(DramaFeverIE): 'info_dict': { 'id': '124', 'title': 'IRIS', - 'description': 'Lee Byung Hun and Kim Tae Hee star in this powerhouse drama and ratings megahit of action, intrigue and romance.', + 'description': 'md5:b3a30e587cf20c59bd1c01ec0ee1b862', }, 'playlist_count': 20, }] + _CONSUMER_SECRET = 'DA59dtVXYLxajktV' + _PAGE_SIZE = 5 # max is 60 (see 
http://api.drama9.com/#get--api-4-episode-series-) + + def _get_consumer_secret(self, video_id): + mainjs = self._download_webpage( + 'http://www.dramafever.com/static/51afe95/df2014/scripts/main.js', + video_id, 'Downloading main.js', fatal=False) + if not mainjs: + return self._CONSUMER_SECRET + return self._search_regex( + r"var\s+cs\s*=\s*'([^']+)'", mainjs, + 'consumer secret', default=self._CONSUMER_SECRET) + def _real_extract(self, url): series_id = self._match_id(url) + consumer_secret = self._get_consumer_secret(series_id) - series_json = self._download_json( - "http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s" % (consumer_secret, series_id), - series_id, note='Downloading series metadata')["series"][series_id] + series = self._download_json( + 'http://www.dramafever.com/api/4/series/query/?cs=%s&series_id=%s' + % (consumer_secret, series_id), + series_id, 'Downloading series JSON')['series'][series_id] - title = series_json["name"] - description = series_json["description_short"] + title = clean_html(series['name']) + description = clean_html(series.get('description') or series.get('description_short')) - episodes = self._get_episodes(series_id, consumer_secret) entries = [] - for ep in episodes: - entries.append(self.url_result( - 'http://www.dramafever.com%s' % ep['episode_url'], 'DramaFever', ep['guid'])) + for page_num in itertools.count(1): + episodes = self._download_json( + 'http://www.dramafever.com/api/4/episode/series/?cs=%s&series_id=%s&page_size=%d&page_number=%d' + % (consumer_secret, series_id, self._PAGE_SIZE, page_num), + series_id, 'Downloading episodes JSON page #%d' % page_num) + for episode in episodes.get('value', []): + entries.append(self.url_result( + compat_urlparse.urljoin(url, episode['episode_url']), + 'DramaFever', episode.get('guid'))) + if page_num == episodes['num_pages']: + break + return self.playlist_result(entries, series_id, title, description) From 70a2002399b46aa0cde2879d856d1bb68e2c6f67 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Jun 2015 09:50:23 +0600 Subject: [PATCH 0392/2145] [dramafever:series] Fix _VALID_URL (Closes #5973) --- youtube_dl/extractor/dramafever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 0f33a61a3..42e0df24e 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -99,7 +99,7 @@ class DramaFeverIE(InfoExtractor): class DramaFeverSeriesIE(InfoExtractor): IE_NAME = 'dramafever:series' - _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)(?:/(?:(?!\d).+)?)?$' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', 'info_dict': { From 463b2e5542a85d5cd41b255a71833fec7b4f51e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Jun 2015 09:51:07 +0600 Subject: [PATCH 0393/2145] [dramafever:series] Rollback _PAGE_SIZE to max possible --- youtube_dl/extractor/dramafever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 42e0df24e..c4b7c0b68 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -119,7 +119,7 @@ class DramaFeverSeriesIE(InfoExtractor): }] _CONSUMER_SECRET = 'DA59dtVXYLxajktV' - _PAGE_SIZE = 5 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-) + _PAGE_SIZE = 60 # max is 60 (see http://api.drama9.com/#get--api-4-episode-series-) def _get_consumer_secret(self, video_id): mainjs = self._download_webpage( From 450d89ddc12d80a500a2429632d35a0045cf630b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 14 Jun 2015 09:58:26 +0600 Subject: [PATCH 0394/2145] [dramafever] Improve _VALID_URL --- youtube_dl/extractor/dramafever.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index c4b7c0b68..a34aad486 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -19,7 +19,7 @@ from ..utils import ( class DramaFeverIE(InfoExtractor): IE_NAME = 'dramafever' - _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)(?:/|$)' _TEST = { 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { From 976b03c56bddf20c978820474e307457523f4c05 Mon Sep 17 00:00:00 2001 From: chaoskagami Date: Sun, 14 Jun 2015 00:18:40 -0400 Subject: [PATCH 0395/2145] Quality note for niconico - at least notify whether you'll get low or src --- youtube_dl/extractor/niconico.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 3cecebf95..e10348004 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -184,6 +184,11 @@ class NiconicoIE(InfoExtractor): extension = determine_ext(video_real_url) video_format = extension.upper() + if video_real_url.endswith('low'): + format_note = 'low' + else: + format_note = 'src' + thumbnail = ( xpath_text(video_info, './/thumbnail_url') or self._html_search_meta('image', webpage, 'thumbnail', default=None) or @@ -242,6 +247,7 @@ class NiconicoIE(InfoExtractor): 'title': title, 'ext': extension, 'format': video_format, + 'format_note' : format_note, 'thumbnail': thumbnail, 'description': description, 'uploader': uploader, From 180940e02df60129bce36035b4a2fd79c0c60995 Mon Sep 17 00:00:00 2001 From: Shrimadhav U K Date: Sun, 14 Jun 2015 11:19:42 +0530 Subject: [PATCH 0396/2145] spelling mistake corrected acces changed to accessing --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py 
b/youtube_dl/YoutubeDL.py index b1f792d4e..aacec2958 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -119,7 +119,7 @@ class YoutubeDL(object): username: Username for authentication purposes. password: Password for authentication purposes. - videopassword: Password for acces a video. + videopassword: Password for accessing a video. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. From 755a9d3d1a8f99b061c8d29525d629b8ad6061a4 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Sun, 14 Jun 2015 20:58:15 +0300 Subject: [PATCH 0397/2145] [tvplay] Add support for NovaTv --- youtube_dl/extractor/tvplay.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index e83e31a31..79863e781 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -26,6 +26,7 @@ class TVPlayIE(InfoExtractor): viasat4play\.no/programmer| tv6play\.no/programmer| tv3play\.dk/programmer| + play\.novatv\.bg/programi )/[^/]+/(?P\d+) ''' _TESTS = [ @@ -173,6 +174,22 @@ class TVPlayIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://play.novatv.bg/programi/zdravei-bulgariya/624952?autostart=true', + 'info_dict': { + 'id': '624952', + 'ext': 'flv', + 'title': 'Здравей, България (12.06.2015 г.) ', + 'description': 'md5:99f3700451ac5bb71a260268b8daefd7', + 'duration': 8838, + 'timestamp': 1434100372, + 'upload_date': '20150612', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, ] def _real_extract(self, url): From 9fd24e3a227a059eed07b679dac858e5bd747123 Mon Sep 17 00:00:00 2001 From: jomo Date: Sun, 14 Jun 2015 21:50:03 +0200 Subject: [PATCH 0398/2145] LiveLeak: support more original videos some (old?) videos use ...mp4.h264_270p.mp4... instead of ...mp4.h264_base.mp4... 
This is an addition to #4768 --- youtube_dl/extractor/liveleak.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 35822067f..431f2e85d 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -85,7 +85,7 @@ class LiveLeakIE(InfoExtractor): 'url': s['file'], } for i, s in enumerate(sources)] for i, s in enumerate(sources): - orig_url = s['file'].replace('.h264_base.mp4', '') + orig_url = re.sub(r'.h264_.+\.mp4', '', s['file']) if s['file'] != orig_url: formats.append({ 'format_id': 'original-%s' % i, From 8f75761f24f9f2599efe100b5a094182af6403d0 Mon Sep 17 00:00:00 2001 From: jomo Date: Sun, 14 Jun 2015 22:41:44 +0200 Subject: [PATCH 0399/2145] LiveLak: add test for URLs with 'h264_270p' --- youtube_dl/extractor/liveleak.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 431f2e85d..c658cc92b 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -40,6 +40,16 @@ class LiveLeakIE(InfoExtractor): 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', 'age_limit': 18, } + }, { + 'url': 'http://www.liveleak.com/view?i=801_1409392012', + 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', + 'info_dict': { + 'id': '801_1409392012', + 'ext': 'mp4', + 'description': "Happened on 27.7.2014. 
\r\nAt 0:53 you can see people still swimming at near beach.", + 'uploader': 'bony333', + 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' + } }] def _real_extract(self, url): From 00ac23e6e06bf6de59d5d5b3f42ff64ce039fee2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 02:51:21 +0600 Subject: [PATCH 0400/2145] [liveleak] Improve regex for restoring original video URL --- youtube_dl/extractor/liveleak.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index c658cc92b..e82f21ea7 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -95,7 +95,7 @@ class LiveLeakIE(InfoExtractor): 'url': s['file'], } for i, s in enumerate(sources)] for i, s in enumerate(sources): - orig_url = re.sub(r'.h264_.+\.mp4', '', s['file']) + orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) if s['file'] != orig_url: formats.append({ 'format_id': 'original-%s' % i, From afa1ded425ffe12a5b90bcd4d316c43941a5dc1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 02:54:05 +0600 Subject: [PATCH 0401/2145] [liveleak] Clarify rationale for restoring raw video --- youtube_dl/extractor/liveleak.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index e82f21ea7..0a4e473d6 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -95,6 +95,9 @@ class LiveLeakIE(InfoExtractor): 'url': s['file'], } for i, s in enumerate(sources)] for i, s in enumerate(sources): + # Removing '.h264_*.mp4' gives the raw video, which is essentially + # the same video without the LiveLeak logo at the top (see + # https://github.com/rg3/youtube-dl/pull/4768) orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) if s['file'] != orig_url: formats.append({ From b95cfa917025750805fb873fc4e2eb161241b22b Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 02:54:49 +0600 Subject: [PATCH 0402/2145] [liveleak] Clarify test --- youtube_dl/extractor/liveleak.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 0a4e473d6..857edfde2 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -41,6 +41,7 @@ class LiveLeakIE(InfoExtractor): 'age_limit': 18, } }, { + # Covers https://github.com/rg3/youtube-dl/pull/5983 'url': 'http://www.liveleak.com/view?i=801_1409392012', 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', 'info_dict': { From 5774ef35c4d167f7c959041bf4efc5581a98f0a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 02:57:07 +0600 Subject: [PATCH 0403/2145] [options] Add missing whitespace for --fixup description --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 689fa7595..740458e51 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -725,7 +725,7 @@ def parseOpts(overrideArguments=None): metavar='POLICY', dest='fixup', default='detect_or_warn', help='Automatically correct known faults of the file. 
' 'One of never (do nothing), warn (only emit a warning), ' - 'detect_or_warn(the default; fix file if we can, warn otherwise)') + 'detect_or_warn (the default; fix file if we can, warn otherwise)') postproc.add_option( '--prefer-avconv', action='store_false', dest='prefer_ffmpeg', From 67d95f177c7ffedfc8f8b086535013a1a7a48b29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 03:43:33 +0600 Subject: [PATCH 0404/2145] [niconico] Simplify format info --- youtube_dl/extractor/niconico.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index e10348004..0f8aa5ada 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -182,12 +182,6 @@ class NiconicoIE(InfoExtractor): extension = xpath_text(video_info, './/movie_type') if not extension: extension = determine_ext(video_real_url) - video_format = extension.upper() - - if video_real_url.endswith('low'): - format_note = 'low' - else: - format_note = 'src' thumbnail = ( xpath_text(video_info, './/thumbnail_url') or @@ -246,8 +240,7 @@ class NiconicoIE(InfoExtractor): 'url': video_real_url, 'title': title, 'ext': extension, - 'format': video_format, - 'format_note' : format_note, + 'format_id': 'economy' if video_real_url.endswith('low') else 'normal', 'thumbnail': thumbnail, 'description': description, 'uploader': uploader, From 2a0fcf6113c3f0c8d0510167fd7017cc0fdfa622 Mon Sep 17 00:00:00 2001 From: zx8 Date: Mon, 15 Jun 2015 00:27:43 +0100 Subject: [PATCH 0405/2145] [safari] make url regex more lenient --- youtube_dl/extractor/safari.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 10251f29e..20ba6fa33 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -83,7 +83,7 @@ class SafariIE(SafariBaseIE): library/view/[^/]+| api/v1/book )/ - 
(?P\d+)/ + (?P[^/]+)/ (?:chapter(?:-content)?/)? (?Ppart\d+)\.html ''' @@ -122,7 +122,7 @@ class SafariCourseIE(SafariBaseIE): IE_NAME = 'safari:course' IE_DESC = 'safaribooksonline.com online courses' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P\d+)/?(?:[#?]|$)' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P[^/]+)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/', From 4b9f9010b0d744969189c383e98e8729f9fe9623 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 15 Jun 2015 01:35:50 +0200 Subject: [PATCH 0406/2145] release 2015.06.15 --- README.md | 4 ++-- docs/supportedsites.md | 7 +++++++ youtube_dl/version.py | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f3d83c89f..5f3a08f5a 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ which means you can modify it, redistribute it or use it however you like. -i, --ignore-errors Continue on download errors, for example to skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs --dump-user-agent Display the current browser identification - --list-extractors List all supported extractors and the URLs they would handle + --list-extractors List all supported extractors --extractor-descriptions Output descriptions of all supported extractors --default-search PREFIX Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". Use the value "auto" to let youtube-dl guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. The @@ -223,7 +223,7 @@ which means you can modify it, redistribute it or use it however you like. parameters replace existing values. 
Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise" --xattrs Write metadata to the video file's xattrs (using dublin core and xdg standards) - --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn(the default; + --fixup POLICY Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise) --prefer-avconv Prefer avconv over ffmpeg for running the postprocessors (default) --prefer-ffmpeg Prefer ffmpeg over avconv for running the postprocessors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d147b53fe..220e52b98 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -120,6 +120,8 @@ - **divxstage**: DivxStage - **Dotsub** - **DouyuTV** + - **dramafever** + - **dramafever:series** - **DRBonanza** - **Dropbox** - **DrTuber** @@ -153,6 +155,7 @@ - **fernsehkritik.tv** - **fernsehkritik.tv:postecke** - **Firstpost** + - **FiveTV** - **Flickr** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** @@ -217,6 +220,7 @@ - **instagram:user**: Instagram user profile - **InternetVideoArchive** - **IPrima** + - **iqiyi** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **Izlesene** @@ -407,6 +411,7 @@ - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos - **RUTV**: RUTV.RU + - **Ruutu** - **safari**: safaribooksonline.com online video - **safari:course**: safaribooksonline.com online courses - **Sandia**: Sandia National Laboratories @@ -519,6 +524,8 @@ - **TV2** - **TV2Article** - **TV4**: tv4.se and tv4play.se + - **TVC** + - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvp.pl** - **tvp.pl:Series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9cf84ff71..34a13cb81 100644 --- 
a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.06.04.1' +__version__ = '2015.06.15' From 4af98ecdfb896c16e73cb9f7306908cc686782e8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 18:49:27 +0800 Subject: [PATCH 0407/2145] [vbox7] Fix extraction (fixes #5967) --- youtube_dl/extractor/vbox7.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index dd026748d..722eb5236 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -26,11 +27,21 @@ class Vbox7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - redirect_page, urlh = self._download_webpage_handle(url, video_id) - new_location = self._search_regex(r'window\.location = \'(.*)\';', - redirect_page, 'redirect location') - redirect_url = urlh.geturl() + new_location - webpage = self._download_webpage(redirect_url, video_id, + # need to get the page 3 times for the correct jsSecretToken cookie + # which is necessary for the correct title + def get_session_id(): + redirect_page = self._download_webpage(url, video_id) + session_id_url = self._search_regex( + r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page, + 'session id url') + self._download_webpage( + compat_urlparse.urljoin(url, session_id_url), video_id, + 'Getting session id') + + get_session_id() + get_session_id() + + webpage = self._download_webpage(url, video_id, 'Downloading redirect page') title = self._html_search_regex(r'(.*)', From aed473ccf9d9da73b1b80ee8b06d00ee66a3769d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 22:41:24 +0800 Subject: [PATCH 0408/2145] [youku] PEP8 --- youtube_dl/extractor/youku.py | 
80 +++++++++++++++++------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index aed6b960a..4e47fca8a 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -11,6 +11,7 @@ from ..compat import compat_urllib_parse bytes_is_str = (bytes == str) # for compatible + class YoukuIE(InfoExtractor): IE_NAME = 'youku' _VALID_URL = r'''(?x) @@ -21,13 +22,13 @@ class YoukuIE(InfoExtractor): ''' _TEST = { - 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', - 'md5': '5f3af4192eabacc4501508d54a8cabd7', - 'info_dict': { - 'id': 'XMTc1ODE5Njcy', - 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', - 'ext': 'flv' - } + 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', + 'md5': '5f3af4192eabacc4501508d54a8cabd7', + 'info_dict': { + 'id': 'XMTc1ODE5Njcy', + 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', + 'ext': 'flv' + } } def construct_video_urls(self, data1, data2): @@ -36,7 +37,7 @@ class YoukuIE(InfoExtractor): ls = list(range(256)) t = 0 for i in range(256): - t = (t + ls[i] + ord(s1[i%len(s1)])) % 256 + t = (t + ls[i] + ord(s1[i % len(s1)])) % 256 ls[i], ls[t] = ls[t], ls[i] s = '' if not bytes_is_str else b'' x, y = 0, 0 @@ -45,16 +46,16 @@ class YoukuIE(InfoExtractor): x = (x + ls[y]) % 256 ls[x], ls[y] = ls[y], ls[x] if isinstance(s2[i], int): - s += chr(s2[i] ^ ls[(ls[x]+ls[y]) % 256]) + s += chr(s2[i] ^ ls[(ls[x] + ls[y]) % 256]) else: - s += chr(ord(s2[i]) ^ ls[(ls[x]+ls[y]) % 256]) + s += chr(ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256]) return s sid, token = yk_t( 'becaf9be', - base64.b64decode(bytes(data2['ep'], 'ascii')) \ - if not bytes_is_str \ - else base64.b64decode(data2['ep']) + base64.b64decode(bytes(data2['ep'], 'ascii')) + if not bytes_is_str + else base64.b64decode(data2['ep']) ).split('_') # get oip @@ -89,13 +90,13 @@ class YoukuIE(InfoExtractor): fileid = get_fileid(format, n) ep_t = yk_t( 'bf7e5f01', - bytes('%s_%s_%s' % (sid, fileid, 
token), 'ascii') \ - if not bytes_is_str \ + bytes('%s_%s_%s' % (sid, fileid, token), 'ascii') + if not bytes_is_str else ('%s_%s_%s' % (sid, fileid, token)) ) ep = base64.b64encode( - bytes(ep_t, 'latin') \ - if not bytes_is_str \ + bytes(ep_t, 'latin') + if not bytes_is_str else ep_t ).decode() return ep @@ -121,9 +122,9 @@ class YoukuIE(InfoExtractor): video_url = \ 'http://k.youku.com/player/getFlvPath/' + \ 'sid/' + sid + \ - '_' + str(int(n)+1).zfill(2) + \ + '_' + str(int(n) + 1).zfill(2) + \ '/st/' + self.parse_ext_l(format) + \ - '/fileid/' + get_fileid(format, n) + '?' + \ + '/fileid/' + get_fileid(format, n) + '?' + \ compat_urllib_parse.urlencode(param) video_urls.append(video_url) video_urls_dict[format] = video_urls @@ -132,34 +133,34 @@ class YoukuIE(InfoExtractor): def get_hd(self, fm): hd_id_dict = { - 'flv' : '0', - 'mp4' : '1', - 'hd2' : '2', - 'hd3' : '3', - '3gp' : '0', - '3gphd' : '1' + 'flv': '0', + 'mp4': '1', + 'hd2': '2', + 'hd3': '3', + '3gp': '0', + '3gphd': '1' } return hd_id_dict[fm] def parse_ext_l(self, fm): ext_dict = { - 'flv' : 'flv', - 'mp4' : 'mp4', - 'hd2' : 'flv', - 'hd3' : 'flv', - '3gp' : 'flv', - '3gphd' : 'mp4' + 'flv': 'flv', + 'mp4': 'mp4', + 'hd2': 'flv', + 'hd3': 'flv', + '3gp': 'flv', + '3gphd': 'mp4' } return ext_dict[fm] def get_format_name(self, fm): _dict = { - '3gp' : 'h6', - '3gphd' : 'h5', - 'flv' : 'h4', - 'mp4' : 'h3', - 'hd2' : 'h2', - 'hd3' : 'h1' + '3gp': 'h6', + '3gphd': 'h5', + 'flv': 'h4', + 'mp4': 'h3', + 'hd2': 'h2', + 'hd3': 'h1' } return _dict[fm] @@ -194,10 +195,9 @@ class YoukuIE(InfoExtractor): # construct info entries = [] for fm in data1['streamtypes']: - #formats = [] video_urls = video_urls_dict[fm] for i in range(len(video_urls)): - if len(entries) < i+1: + if len(entries) < i + 1: entries.append({'formats': []}) entries[i]['formats'].append( { @@ -211,7 +211,7 @@ class YoukuIE(InfoExtractor): for i in range(len(entries)): entries[i].update( { - 'id': '_part%d' % (i+1), + 'id': '_part%d' % 
(i + 1), 'title': title, } ) From 054932f4035d606946f0c054c02cf87496b753f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 20:46:10 +0600 Subject: [PATCH 0409/2145] [vk] Fix extraction (Closes #5987) --- youtube_dl/extractor/vk.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d0e772108..6aeba109d 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -175,16 +175,16 @@ class VKIE(InfoExtractor): m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) - m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.*?});', info_page) + m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: - m_opts_url = re.search(r"url\s*:\s*'([^']+)", m_opts.group(1)) + m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1)) if m_opts_url: opts_url = m_opts_url.group(1) if opts_url.startswith('//'): opts_url = 'http:' + opts_url return self.url_result(opts_url) - data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') + data_json = self._search_regex(r'var\s+vars\s*=\s*({.+?});', info_page, 'vars') data = json.loads(data_json) # Extract upload date From 7c7dd9dc7fe8139196c7bd1c512301a61f9f362b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 20:47:01 +0600 Subject: [PATCH 0410/2145] [vk] Fix upload date extraction --- youtube_dl/extractor/vk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 6aeba109d..f974f8fef 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -189,7 +189,7 @@ class VKIE(InfoExtractor): # Extract upload date upload_date = None - mobj = re.search(r'id="mv_date_wrap".*?Added ([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) + mobj = re.search(r'id="mv_date(?:_views)?_wrap"[^>]*>([a-zA-Z]+ [0-9]+), ([0-9]+) at', info_page) if mobj is not 
None: mobj.group(1) + ' ' + mobj.group(2) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) From 8117df4cd9e49a3c7369db3cab6c0b94365c7786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 20:55:25 +0600 Subject: [PATCH 0411/2145] [vk] Extract view count --- youtube_dl/extractor/vk.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index f974f8fef..38ff3c1a9 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -13,6 +13,7 @@ from ..compat import ( from ..utils import ( ExtractorError, orderedSet, + str_to_int, unescapeHTML, unified_strdate, ) @@ -34,6 +35,7 @@ class VKIE(InfoExtractor): 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, 'upload_date': '20120212', + 'view_count': int, }, }, { @@ -45,7 +47,8 @@ class VKIE(InfoExtractor): 'uploader': 'Tom Cruise', 'title': 'No name', 'duration': 9, - 'upload_date': '20130721' + 'upload_date': '20130721', + 'view_count': int, } }, { @@ -59,6 +62,7 @@ class VKIE(InfoExtractor): 'title': 'Lin Dan', 'duration': 101, 'upload_date': '20120730', + 'view_count': int, } }, { @@ -73,7 +77,8 @@ class VKIE(InfoExtractor): 'uploader': 'Триллеры', 'title': '► Бойцовский клуб / Fight Club 1999 [HD 720]', 'duration': 8352, - 'upload_date': '20121218' + 'upload_date': '20121218', + 'view_count': int, }, 'skip': 'Requires vk account credentials', }, @@ -100,6 +105,7 @@ class VKIE(InfoExtractor): 'title': 'Книга Илая', 'duration': 6771, 'upload_date': '20140626', + 'view_count': int, }, 'skip': 'Only works from Russia', }, @@ -194,6 +200,10 @@ class VKIE(InfoExtractor): mobj.group(1) + ' ' + mobj.group(2) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) + view_count = str_to_int(self._search_regex( + r'"mv_views_count_number"[^>]*>([\d,.]+) views<', + info_page, 'view count', fatal=False)) + formats = [{ 'format_id': k, 'url': v, @@ 
-210,6 +220,7 @@ class VKIE(InfoExtractor): 'uploader': data.get('md_author'), 'duration': data.get('duration'), 'upload_date': upload_date, + 'view_count': view_count, } From 02175a7986c4223a0ed27a872c1ca16926913e05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 21:01:26 +0600 Subject: [PATCH 0412/2145] [youtube:search] Fix search query (Closes #5988) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3448bec4f..9e2671192 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1504,7 +1504,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): for pagenum in itertools.count(1): url_query = { - 'search_query': query, + 'search_query': query.encode('utf-8'), 'page': pagenum, 'spf': 'navigate', } From c203be3fb4f00388c81564dc0c85ff8a10ff4553 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 23:28:59 +0800 Subject: [PATCH 0413/2145] [youku] Better handling for Python 2/3 compatibility --- youtube_dl/extractor/youku.py | 37 ++++++++++++----------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 4e47fca8a..26e5baadc 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -7,9 +7,10 @@ import base64 from .common import InfoExtractor from ..utils import ExtractorError -from ..compat import compat_urllib_parse - -bytes_is_str = (bytes == str) # for compatible +from ..compat import ( + compat_urllib_parse, + compat_ord, +) class YoukuIE(InfoExtractor): @@ -37,26 +38,20 @@ class YoukuIE(InfoExtractor): ls = list(range(256)) t = 0 for i in range(256): - t = (t + ls[i] + ord(s1[i % len(s1)])) % 256 + t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256 ls[i], ls[t] = ls[t], ls[i] - s = '' if not bytes_is_str else b'' + s = bytearray() x, y 
= 0, 0 for i in range(len(s2)): y = (y + 1) % 256 x = (x + ls[y]) % 256 ls[x], ls[y] = ls[y], ls[x] - if isinstance(s2[i], int): - s += chr(s2[i] ^ ls[(ls[x] + ls[y]) % 256]) - else: - s += chr(ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256]) - return s + s.append(compat_ord(s2[i]) ^ ls[(ls[x] + ls[y]) % 256]) + return bytes(s) sid, token = yk_t( - 'becaf9be', - base64.b64decode(bytes(data2['ep'], 'ascii')) - if not bytes_is_str - else base64.b64decode(data2['ep']) - ).split('_') + b'becaf9be', base64.b64decode(data2['ep'].encode('ascii')) + ).decode('ascii').split('_') # get oip oip = data2['ip'] @@ -89,16 +84,10 @@ class YoukuIE(InfoExtractor): def generate_ep(format, n): fileid = get_fileid(format, n) ep_t = yk_t( - 'bf7e5f01', - bytes('%s_%s_%s' % (sid, fileid, token), 'ascii') - if not bytes_is_str - else ('%s_%s_%s' % (sid, fileid, token)) + b'bf7e5f01', + ('%s_%s_%s' % (sid, fileid, token)).encode('ascii') ) - ep = base64.b64encode( - bytes(ep_t, 'latin') - if not bytes_is_str - else ep_t - ).decode() + ep = base64.b64encode(ep_t).decode('ascii') return ep # generate video_urls From 99e6833c85868b78df7c810603ffdccdaeb4eaf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 21:30:27 +0600 Subject: [PATCH 0414/2145] [francetv] Update f4m manifest token URL (Closes #5981, Closes #5989) --- youtube_dl/extractor/francetv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index edf555b29..db0bbec1e 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -60,7 +60,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): continue video_url_parsed = compat_urllib_parse_urlparse(video_url) f4m_url = self._download_webpage( - 'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path, + 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url_parsed.path, video_id, 'Downloading f4m manifest token', fatal=False) 
if f4m_url: formats.extend(self._extract_f4m_formats(f4m_url, video_id, 1, format_id)) From 9383e66f9475eca0e64c09972c1392d92d17570c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 23:31:30 +0800 Subject: [PATCH 0415/2145] [youku] Use _match_id --- youtube_dl/extractor/youku.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 26e5baadc..e41b48369 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re import base64 from .common import InfoExtractor @@ -154,8 +153,7 @@ class YoukuIE(InfoExtractor): return _dict[fm] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # request basic data data1_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id From ee69799262e8344742b9d8b492fe792b4d586f6a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 23:36:28 +0800 Subject: [PATCH 0416/2145] [youku] Add a v.swf test case --- youtube_dl/extractor/youku.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index e41b48369..d8162a0c5 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -21,7 +21,7 @@ class YoukuIE(InfoExtractor): (?P[A-Za-z0-9]+)(?:\.html|/v\.swf|) ''' - _TEST = { + _TESTS = [{ 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', 'md5': '5f3af4192eabacc4501508d54a8cabd7', 'info_dict': { @@ -29,7 +29,10 @@ class YoukuIE(InfoExtractor): 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', 'ext': 'flv' } - } + }, { + 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', + 'only_matching': True, + }] def construct_video_urls(self, data1, data2): # get sid, token From 4fd35ee072a39654f11e794db5c50ee375c9a7c6 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 21:36:30 +0600 Subject: [PATCH 0417/2145] [safari] Add test for #5985 --- youtube_dl/extractor/safari.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 20ba6fa33..f3c80708c 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -100,6 +100,10 @@ class SafariIE(SafariBaseIE): }, { 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', 'only_matching': True, + }, { + # non-digits in course id + 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', + 'only_matching': True, }] def _real_extract(self, url): From f1e66cb2eb40b48c6508acbe57207a2d99792bf0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 23:46:07 +0800 Subject: [PATCH 0418/2145] [youku] Change video_id and add a multipart test case --- youtube_dl/extractor/youku.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index d8162a0c5..d5b73ebce 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -25,13 +25,20 @@ class YoukuIE(InfoExtractor): 'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', 'md5': '5f3af4192eabacc4501508d54a8cabd7', 'info_dict': { - 'id': 'XMTc1ODE5Njcy', + 'id': 'XMTc1ODE5Njcy_part1', 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', 'ext': 'flv' } }, { 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', 'only_matching': True, + }, { + 'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', + 'info_dict': { + 'id': 'XODgxNjg1Mzk2', + 'title': '武媚娘传奇 85', + }, + 'playlist_count': 11, }] def construct_video_urls(self, data1, data2): @@ -201,20 +208,14 @@ class YoukuIE(InfoExtractor): for i in range(len(entries)): entries[i].update( { - 'id': '_part%d' % 
(i + 1), + 'id': '%s_part%d' % (video_id, i + 1), 'title': title, } ) - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - - return info + return { + '_type': 'multi_video', + 'id': video_id, + 'title': title, + 'entries': entries, + } From 04e7596680bce28beae2436bac0f6d1f01a45210 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 15 Jun 2015 23:54:55 +0800 Subject: [PATCH 0419/2145] [youku] Better error handling --- youtube_dl/extractor/youku.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index d5b73ebce..91f9f6bff 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -176,13 +176,15 @@ class YoukuIE(InfoExtractor): error_code = data1.get('error_code') if error_code: - # -8 means blocked outside China. - # Chinese and English, separated by newline. 
error = data1.get('error') - raise ExtractorError( - error or 'Server reported error %i' % - error_code, - expected=True) + if error is not None and '因版权原因无法观看此视频' in error: + raise ExtractorError( + 'Youku said: Sorry, this video is available in China only', expected=True) + else: + msg = 'Youku server reported error %i' % error_code + if error is not None: + msg += ': ' + error + raise ExtractorError(msg) title = data1['title'] From 5228b756af2c2bfc2962a5b1bb6db1e6a41c9e05 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 16 Jun 2015 00:06:23 +0800 Subject: [PATCH 0420/2145] [youku] Add cn_verification_proxy support and add a georestricted test case --- youtube_dl/extractor/youku.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 91f9f6bff..ea37dc8b2 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -9,6 +9,7 @@ from ..utils import ExtractorError from ..compat import ( compat_urllib_parse, compat_ord, + compat_urllib_request, ) @@ -39,6 +40,14 @@ class YoukuIE(InfoExtractor): 'title': '武媚娘传奇 85', }, 'playlist_count': 11, + }, { + 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', + 'info_dict': { + 'id': 'XMTI1OTczNDM5Mg', + 'title': '花千骨 04', + }, + 'playlist_count': 13, + 'skip': 'Available in China only', }] def construct_video_urls(self, data1, data2): @@ -165,14 +174,23 @@ class YoukuIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # request basic data - data1_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id - data2_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id + def retrieve_data(req_url, note): + req = compat_urllib_request.Request(req_url) - raw_data1 = self._download_json(data1_url, video_id) - raw_data2 = self._download_json(data2_url, video_id) - data1 = raw_data1['data'][0] - data2 = 
raw_data2['data'][0] + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + req.add_header('Ytdl-request-proxy', cn_verification_proxy) + + raw_data = self._download_json(req, video_id, note=note) + return raw_data['data'][0] + + # request basic data + data1 = retrieve_data( + 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id, + 'Downloading JSON metadata 1') + data2 = retrieve_data( + 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id, + 'Downloading JSON metadata 2') error_code = data1.get('error_code') if error_code: From a155b7e76c5a71c650f62c4716d23a24943fc373 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 16 Jun 2015 00:15:09 +0800 Subject: [PATCH 0421/2145] [youku] Coding style --- youtube_dl/extractor/youku.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index ea37dc8b2..cab5be3a4 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -216,22 +216,18 @@ class YoukuIE(InfoExtractor): for i in range(len(video_urls)): if len(entries) < i + 1: entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': video_urls[i], - 'format_id': self.get_format_name(fm), - 'ext': self.parse_ext_l(fm), - 'filesize': int(data1['segs'][fm][i]['size']) - } - ) + entries[i]['formats'].append({ + 'url': video_urls[i], + 'format_id': self.get_format_name(fm), + 'ext': self.parse_ext_l(fm), + 'filesize': int(data1['segs'][fm][i]['size']) + }) for i in range(len(entries)): - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) + entries[i].update({ + 'id': '%s_part%d' % (video_id, i + 1), + 'title': title, + }) return { '_type': 'multi_video', From 0501bfa159db5b5e8ed7fd1ed966b9989becb3e9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Tue, 16 Jun 2015 00:15:30 +0800 Subject: [PATCH 
0422/2145] [YoutubeDL] Youku extractor now uses the standard format selection --- youtube_dl/YoutubeDL.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index aacec2958..6e4b6f566 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1033,12 +1033,6 @@ class YoutubeDL(object): info_dict['id'], info_dict.get('subtitles'), info_dict.get('automatic_captions')) - # This extractors handle format selection themselves - if info_dict['extractor'] in ['Youku']: - if download: - self.process_info(info_dict) - return info_dict - # We now pick which formats have to be downloaded if info_dict.get('formats') is None: # There's only one format available From 7f0172b3e5e0da2a19708fdf3ec1b521a6e2656f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 15 Jun 2015 22:29:41 +0600 Subject: [PATCH 0423/2145] Credit @jackyzy823 for iqiyi --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index bf2a25cb8..889d599a2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -127,3 +127,4 @@ Julian Richen Ping O. Mister Hat Peter Ding +jackyzy823 From f3aecb27a4d7b178ae66b4a294cff5dbe9bb2b18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 16 Jun 2015 14:41:52 +0200 Subject: [PATCH 0424/2145] [youku] Simplify a bit the 'entries' construction Mainly avoid having to use an index. 
--- youtube_dl/extractor/youku.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index cab5be3a4..ced3a10cd 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -210,25 +210,23 @@ class YoukuIE(InfoExtractor): video_urls_dict = self.construct_video_urls(data1, data2) # construct info - entries = [] + entries = [{ + 'id': '%s_part%d' % (video_id, i + 1), + 'title': title, + 'formats': [], + # some formats are not available for all parts, we have to detect + # which one has all + } for i in range(max(len(v) for v in data1['segs'].values()))] for fm in data1['streamtypes']: video_urls = video_urls_dict[fm] - for i in range(len(video_urls)): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append({ - 'url': video_urls[i], + for video_url, seg, entry in zip(video_urls, data1['segs'][fm], entries): + entry['formats'].append({ + 'url': video_url, 'format_id': self.get_format_name(fm), 'ext': self.parse_ext_l(fm), - 'filesize': int(data1['segs'][fm][i]['size']) + 'filesize': int(seg['size']), }) - for i in range(len(entries)): - entries[i].update({ - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - }) - return { '_type': 'multi_video', 'id': video_id, From 447053668fbed993f6f4fd2e06d9282ea30224bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Jun 2015 21:19:18 +0600 Subject: [PATCH 0425/2145] [spankwire] Fix extraction --- youtube_dl/extractor/spankwire.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 06d6e6640..bff75d6b2 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -27,7 +27,7 @@ class SpankwireIE(InfoExtractor): 'description': 'Crazy Bitch X rated music video.', 'uploader': 'oreusz', 'uploader_id': '124697', - 
'upload_date': '20070508', + 'upload_date': '20070507', 'age_limit': 18, } } @@ -44,7 +44,7 @@ class SpankwireIE(InfoExtractor): title = self._html_search_regex( r'

([^<]+)', webpage, 'title') description = self._html_search_regex( - r'([^<]+)<', + r'(?s)(.+?)', webpage, 'description', fatal=False) thumbnail = self._html_search_regex( r'playerData\.screenShot\s*=\s*["\']([^"\']+)["\']', @@ -64,12 +64,12 @@ class SpankwireIE(InfoExtractor): r'
([\d,\.]+) views
', webpage, 'view count', fatal=False)) comment_count = str_to_int(self._html_search_regex( - r'Comments]+>\s*\(([\d,\.]+)\)', + r']*>([\d,\.]+)', webpage, 'comment count', fatal=False)) video_urls = list(map( compat_urllib_parse.unquote, - re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*["\']([^"\']+)["\']', webpage))) + re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage))) if webpage.find('flashvars\.encrypted = "true"') != -1: password = self._search_regex( r'flashvars\.video_title = "([^"]+)', From 14835de9fb41798c8e6e731a3f07ae871770666f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lio=20A=2E=20Heckert?= Date: Tue, 16 Jun 2015 18:10:31 -0300 Subject: [PATCH 0426/2145] Use shlex.split for --pp-params and update related docs. --- README.md | 2 +- youtube_dl/YoutubeDL.py | 1 + youtube_dl/__init__.py | 6 ++++-- youtube_dl/options.py | 4 ++-- youtube_dl/postprocessor/common.py | 3 ++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 726ec9cf2..813ac4a15 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,7 @@ which means you can modify it, redistribute it or use it however you like. --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default 5) --recode-video FORMAT Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid) - --pp-params Extra parameters for video post-processor. The params will be splited on spaces. + --pp-params Extra parameters for video post-processor. 
-k, --keep-video Keep the video file on disk after the post-processing; the video is erased by default --no-post-overwrites Do not overwrite post-processed files; the post-processed files are overwritten by default --embed-subs Embed subtitles in the video (only for mkv and mp4 videos) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b1f792d4e..3bfe30c76 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -261,6 +261,7 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. + pp_params: Extra parameters for external apps, like avconv. """ params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5b28e4817..8b54d4ae2 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -171,8 +171,10 @@ def _real_main(argv=None): if opts.recodevideo is not None: if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv', 'xvid']: parser.error('invalid video recode format specified') - if opts.pp_params is not None: - opts.pp_params = opts.pp_params.split() + if opts.pp_params is None: + opts.pp_params = [] + else: + opts.pp_params = shlex.split(opts.pp_params) if opts.convertsubtitles is not None: if opts.convertsubtitles not in ['srt', 'vtt', 'ass']: parser.error('invalid subtitle format specified') diff --git a/youtube_dl/options.py b/youtube_dl/options.py index ceb4b5f38..fbba9b9d8 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -689,8 +689,8 @@ def parseOpts(overrideArguments=None): help='Encode the video to another format if necessary (currently supported: mp4|flv|ogg|webm|mkv|xvid)') postproc.add_option( '--pp-params', - dest='pp_params', default=None, - help='Extra parameters for video post-processor. 
The params will be splited on spaces.') + dest='pp_params', default=None, metavar='ARGS', + help='Extra parameters for video post-processor.') postproc.add_option( '-k', '--keep-video', action='store_true', dest='keepvideo', default=False, diff --git a/youtube_dl/postprocessor/common.py b/youtube_dl/postprocessor/common.py index 3b0e8ddd8..d944d9367 100644 --- a/youtube_dl/postprocessor/common.py +++ b/youtube_dl/postprocessor/common.py @@ -22,7 +22,8 @@ class PostProcessor(object): of the chain is reached. PostProcessor objects follow a "mutual registration" process similar - to InfoExtractor objects. + to InfoExtractor objects. And it can receive parameters from CLI trough + --pp-params. """ _downloader = None From 028a33d7f2a0bc028f533530d2722b57b31dabdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Jun 2015 20:27:38 +0600 Subject: [PATCH 0427/2145] [lifenews] Fix extraction --- youtube_dl/extractor/lifenews.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 42cb6e35f..373122c93 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -82,10 +82,11 @@ class LifeNewsIE(InfoExtractor): view_count = self._html_search_regex( r'
\s*(\d+)\s*
', webpage, 'view count', fatal=False) comment_count = self._html_search_regex( - r'
\s*\s*(\d+)\s*', webpage, 'comment count', fatal=False) + r'=\'commentCount\'[^>]*>\s*(\d+)\s*<', + webpage, 'comment count', fatal=False) upload_date = self._html_search_regex( - r'
\s*(.*?)\s*
', webpage, 'categories', fatal=False).split(', ') } # find and add the format From 62b742ece3ec6c7d7fd24898b5413b6b98a4ae8f Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 20:51:11 +0100 Subject: [PATCH 0536/2145] [moviefap] Remove redundant comments --- youtube_dl/extractor/moviefap.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index 23575d30a..b38a8e71f 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -70,19 +70,13 @@ class MovieFapIE(InfoExtractor): def _real_extract(self, url): - # find the video ID video_id = self._match_id(url) - - # retrieve the page HTML webpage = self._download_webpage(url, video_id) - # find the URL of the XML document detailing video download URLs + # find and retrieve the XML document detailing video download URLs info_url = self._html_search_regex(r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') - - # download that XML xml = self._download_xml(info_url, video_id) - # create dictionary of properties we know so far, or can find easily info = { 'id': video_id, 'title': self._html_search_regex(r'

(.*?)

', webpage, 'title'), From 43b925ce74efd0a011f7880dcdcc90f4cf3b8f4b Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 20:52:12 +0100 Subject: [PATCH 0537/2145] [moviefap] Replace calls to `find()` with `util.xpath_text()`. --- youtube_dl/extractor/moviefap.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index b38a8e71f..6da93dbc9 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + xpath_text, + str_to_int +) class MovieFapIE(InfoExtractor): @@ -82,7 +85,7 @@ class MovieFapIE(InfoExtractor): 'title': self._html_search_regex(r'

(.*?)

', webpage, 'title'), 'display_id': re.compile(self._VALID_URL).match(url).group('name'), 'thumbnails': self.__get_thumbnail_data(xml), - 'thumbnail': xml.find('startThumb').text, + 'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'), 'description': self._html_search_regex(r'name="description" value="(.*?)"', webpage, 'description', fatal=False), 'uploader_id': self._html_search_regex(r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False), 'view_count': str_to_int(self._html_search_regex(r'
Views ([0-9]+)', webpage, 'view_count, fatal=False')), @@ -102,7 +105,7 @@ class MovieFapIE(InfoExtractor): # work out the video URL(s) if xml.find('videoLink') is not None: # single format available - info['url'] = xml.find('videoLink').text + info['url'] = xpath_text(xml, 'videoLink', 'url', True) else: # multiple formats available info['formats'] = [] @@ -110,8 +113,8 @@ class MovieFapIE(InfoExtractor): # N.B. formats are already in ascending order of quality for item in xml.find('quality').findall('item'): info['formats'].append({ - 'url': item.find('videoLink').text, - 'resolution': item.find('res').text # 480p etc. + 'url': xpath_text(item, 'videoLink', 'url', True), + 'resolution': xpath_text(item, 'res', 'resolution', True) # 480p etc. }) return info From b971abe897ee17fed7e36868fdc8880f6b145d7b Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 21:04:53 +0100 Subject: [PATCH 0538/2145] [moviefap] Replace call to `str()` with `compat.compat_str()` --- youtube_dl/extractor/moviefap.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index 6da93dbc9..20a78f3b2 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -7,6 +7,7 @@ from ..utils import ( xpath_text, str_to_int ) +from ..compat import compat_str class MovieFapIE(InfoExtractor): @@ -65,7 +66,7 @@ class MovieFapIE(InfoExtractor): thumbnails = [] for i in range(first, last + 1): thumbnails.append({ - 'url': pattern.replace('#', str(i)), + 'url': pattern.replace('#', compat_str(i)), 'width': width, 'height': height }) From 8a1b49ff19a8a1fdc2c30cf10cc0598ac9bc8819 Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 22:27:06 +0100 Subject: [PATCH 0539/2145] [moviefap] Explicitly sort formats to handle possible site changes --- youtube_dl/extractor/moviefap.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index 20a78f3b2..295bfe3f0 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -111,11 +111,14 @@ class MovieFapIE(InfoExtractor): # multiple formats available info['formats'] = [] - # N.B. formats are already in ascending order of quality for item in xml.find('quality').findall('item'): + resolution = xpath_text(item, 'res', 'resolution', True) # 480p etc. info['formats'].append({ 'url': xpath_text(item, 'videoLink', 'url', True), - 'resolution': xpath_text(item, 'res', 'resolution', True) # 480p etc. + 'resolution': resolution, + 'height': int(re.findall(r'\d+', resolution)[0]) }) + self._sort_formats(info['formats']) + return info From 1a5fd4eebc2717b5173df50d65007f90cb05ee30 Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 22:32:56 +0100 Subject: [PATCH 0540/2145] [moviefap] Wrap long lines --- youtube_dl/extractor/moviefap.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index 295bfe3f0..9de052a99 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -78,23 +78,32 @@ class MovieFapIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # find and retrieve the XML document detailing video download URLs - info_url = self._html_search_regex(r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') + info_url = self._html_search_regex( \ + r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') xml = self._download_xml(info_url, video_id) info = { 'id': video_id, - 'title': self._html_search_regex(r'

(.*?)

', webpage, 'title'), + 'title': self._html_search_regex( \ + r'

(.*?)

', webpage, 'title'), 'display_id': re.compile(self._VALID_URL).match(url).group('name'), 'thumbnails': self.__get_thumbnail_data(xml), 'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'), - 'description': self._html_search_regex(r'name="description" value="(.*?)"', webpage, 'description', fatal=False), - 'uploader_id': self._html_search_regex(r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False), - 'view_count': str_to_int(self._html_search_regex(r'
Views ([0-9]+)', webpage, 'view_count, fatal=False')), - 'average_rating': float(self._html_search_regex(r'Current Rating
(.*?)', webpage, 'average_rating', fatal=False)), - 'comment_count': str_to_int(self._html_search_regex(r'([0-9]+)', webpage, 'comment_count', fatal=False)), + 'description': self._html_search_regex( \ + r'name="description" value="(.*?)"', webpage, 'description', fatal=False), + 'uploader_id': self._html_search_regex( \ + r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False), + 'view_count': str_to_int(self._html_search_regex( \ + r'
Views ([0-9]+)', webpage, 'view_count, fatal=False')), + 'average_rating': float(self._html_search_regex( \ + r'Current Rating
(.*?)', webpage, 'average_rating', fatal=False)), + 'comment_count': str_to_int(self._html_search_regex( \ + r'([0-9]+)', webpage, 'comment_count', fatal=False)), 'age_limit': 18, - 'webpage_url': self._html_search_regex(r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False), - 'categories': self._html_search_regex(r'
\s*(.*?)\s*
', webpage, 'categories', fatal=False).split(', ') + 'webpage_url': self._html_search_regex( \ + r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False), + 'categories': self._html_search_regex( \ + r'
\s*(.*?)\s*
', webpage, 'categories', fatal=False).split(', ') } # find and add the format From 5a9cc19972fb3aae7a67470f65ec5cd30918f4e1 Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 23:03:06 +0100 Subject: [PATCH 0541/2145] [moviefap] Move flv videos to formats in the metadata --- youtube_dl/extractor/moviefap.py | 56 +++++++++++++++++--------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index 9de052a99..5e0c701d4 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -82,8 +82,36 @@ class MovieFapIE(InfoExtractor): r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') xml = self._download_xml(info_url, video_id) - info = { + # find the video container + if xml.find('videoConfig') is not None: + ext = xml.find('videoConfig').find('type').text + else: + ext = 'flv' # guess... + + # work out the video URL(s) + formats = [] + if xml.find('videoLink') is not None: + # single format available + formats.append({ + 'url': xpath_text(xml, 'videoLink', 'url', True), + 'ext': ext + }) + else: + # multiple formats available + for item in xml.find('quality').findall('item'): + resolution = xpath_text(item, 'res', 'resolution', True) # 480p etc. + formats.append({ + 'url': xpath_text(item, 'videoLink', 'url', True), + 'ext': ext, + 'resolution': resolution, + 'height': int(re.findall(r'\d+', resolution)[0]) + }) + + self._sort_formats(formats) + + return { 'id': video_id, + 'formats': formats, 'title': self._html_search_regex( \ r'

(.*?)

', webpage, 'title'), 'display_id': re.compile(self._VALID_URL).match(url).group('name'), @@ -105,29 +133,3 @@ class MovieFapIE(InfoExtractor): 'categories': self._html_search_regex( \ r'
\s*(.*?)\s*
', webpage, 'categories', fatal=False).split(', ') } - - # find and add the format - if xml.find('videoConfig') is not None: - info['ext'] = xml.find('videoConfig').find('type').text - else: - info['ext'] = 'flv' # guess... - - # work out the video URL(s) - if xml.find('videoLink') is not None: - # single format available - info['url'] = xpath_text(xml, 'videoLink', 'url', True) - else: - # multiple formats available - info['formats'] = [] - - for item in xml.find('quality').findall('item'): - resolution = xpath_text(item, 'res', 'resolution', True) # 480p etc. - info['formats'].append({ - 'url': xpath_text(item, 'videoLink', 'url', True), - 'resolution': resolution, - 'height': int(re.findall(r'\d+', resolution)[0]) - }) - - self._sort_formats(info['formats']) - - return info From db652ea186586e3eda5006ee096161b1a867c0d0 Mon Sep 17 00:00:00 2001 From: George Brighton Date: Sat, 27 Jun 2015 23:04:55 +0100 Subject: [PATCH 0542/2145] [moviefap] Fix `flake8` warnings introduced in 1a5fd4e --- youtube_dl/extractor/moviefap.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py index 5e0c701d4..82b863539 100644 --- a/youtube_dl/extractor/moviefap.py +++ b/youtube_dl/extractor/moviefap.py @@ -78,8 +78,8 @@ class MovieFapIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # find and retrieve the XML document detailing video download URLs - info_url = self._html_search_regex( \ - r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') + info_url = self._html_search_regex( + r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') xml = self._download_xml(info_url, video_id) # find the video container @@ -112,24 +112,24 @@ class MovieFapIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'title': self._html_search_regex( \ - r'

(.*?)

', webpage, 'title'), + 'title': self._html_search_regex( + r'

(.*?)

', webpage, 'title'), 'display_id': re.compile(self._VALID_URL).match(url).group('name'), 'thumbnails': self.__get_thumbnail_data(xml), 'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'), - 'description': self._html_search_regex( \ - r'name="description" value="(.*?)"', webpage, 'description', fatal=False), - 'uploader_id': self._html_search_regex( \ - r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False), - 'view_count': str_to_int(self._html_search_regex( \ - r'
Views ([0-9]+)', webpage, 'view_count, fatal=False')), - 'average_rating': float(self._html_search_regex( \ - r'Current Rating
(.*?)', webpage, 'average_rating', fatal=False)), - 'comment_count': str_to_int(self._html_search_regex( \ - r'([0-9]+)', webpage, 'comment_count', fatal=False)), + 'description': self._html_search_regex( + r'name="description" value="(.*?)"', webpage, 'description', fatal=False), + 'uploader_id': self._html_search_regex( + r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False), + 'view_count': str_to_int(self._html_search_regex( + r'
Views ([0-9]+)', webpage, 'view_count, fatal=False')), + 'average_rating': float(self._html_search_regex( + r'Current Rating
(.*?)', webpage, 'average_rating', fatal=False)), + 'comment_count': str_to_int(self._html_search_regex( + r'([0-9]+)', webpage, 'comment_count', fatal=False)), 'age_limit': 18, - 'webpage_url': self._html_search_regex( \ - r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False), - 'categories': self._html_search_regex( \ - r'
\s*(.*?)\s*
', webpage, 'categories', fatal=False).split(', ') + 'webpage_url': self._html_search_regex( + r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False), + 'categories': self._html_search_regex( + r'
\s*(.*?)\s*
', webpage, 'categories', fatal=False).split(', ') } From bb512e57dc138b261cf9c71a833b0df5d5ba849f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 28 Jun 2015 13:25:59 +0800 Subject: [PATCH 0543/2145] [twitch:vod] Fix 'Source' format in m3u8 (closes #6115) --- youtube_dl/extractor/twitch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 94bd6345d..3e798e62d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -215,7 +215,7 @@ class TwitchVodIE(TwitchItemBaseIE): '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, 'Downloading %s access token' % self._ITEM_TYPE) formats = self._extract_m3u8_formats( - '%s/vod/%s?nauth=%s&nauthsig=%s' + '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true' % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']), item_id, 'mp4') self._prefer_source(formats) From ac0474f89d3e6f8c8c1fb3223a16a18a2fd02bcb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 28 Jun 2015 13:31:37 +0800 Subject: [PATCH 0544/2145] [twitch:vod] Update _TEST The original test case is gone --- youtube_dl/extractor/twitch.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 3e798e62d..b56ee2959 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -189,17 +189,17 @@ class TwitchVodIE(TwitchItemBaseIE): _ITEM_SHORTCUT = 'v' _TEST = { - 'url': 'http://www.twitch.tv/ksptv/v/3622000', + 'url': 'http://www.twitch.tv/riotgames/v/6528877', 'info_dict': { - 'id': 'v3622000', + 'id': 'v6528877', 'ext': 'mp4', - 'title': '''KSPTV: Squadcast: "Everyone's on vacation so here's Dahud" Edition!''', + 'title': 'LCK Summer Split - Week 6 Day 1', 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 6951, - 'timestamp': 1419028564, - 'upload_date': '20141219', - 'uploader': 'KSPTV', - 
'uploader_id': 'ksptv', + 'duration': 17208, + 'timestamp': 1435131709, + 'upload_date': '20150624', + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', 'view_count': int, }, 'params': { From 9603e8a7d998615d3da1af47461ec9c353ec4e7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Jun 2015 22:55:28 +0600 Subject: [PATCH 0545/2145] [YoutubeDL] Handle None width and height similarly to formats --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ef0f71bad..411de9ac9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1008,7 +1008,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): - if 'width' in t and 'height' in t: + if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i From bf42a9906d9a066d32f1cc50e1b033e6676744ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Jun 2015 22:56:07 +0600 Subject: [PATCH 0546/2145] [utils] Add default value for xpath_text --- youtube_dl/utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 96490f112..942f76d24 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -62,6 +62,8 @@ std_headers = { } +NO_DEFAULT = object() + ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] @@ -171,13 +173,15 @@ def xpath_with_ns(path, ns_map): return '/'.join(replaced) -def xpath_text(node, xpath, name=None, fatal=False): +def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT): if sys.version_info < (2, 7): # Crazy 2.6 xpath = xpath.encode('ascii') n = node.find(xpath) if n is None or n.text is None: - if fatal: 
+ if default is not NO_DEFAULT: + return default + elif fatal: name = xpath if name is None else name raise ExtractorError('Could not find XML element %s' % name) else: From c342041fba9283ba5f05f48427aabf79adcf8647 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Jun 2015 22:56:45 +0600 Subject: [PATCH 0547/2145] [extractor/common] Use NO_DEFAULT from utils --- youtube_dl/extractor/common.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 49e4dc710..7fa46d295 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -22,6 +22,7 @@ from ..compat import ( compat_str, ) from ..utils import ( + NO_DEFAULT, age_restricted, bug_reports_message, clean_html, @@ -33,7 +34,7 @@ from ..utils import ( sanitize_filename, unescapeHTML, ) -_NO_DEFAULT = object() + class InfoExtractor(object): @@ -523,7 +524,7 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. 
@@ -549,7 +550,7 @@ class InfoExtractor(object): return next(g for g in mobj.groups() if g is not None) else: return mobj.group(group) - elif default is not _NO_DEFAULT: + elif default is not NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) @@ -557,7 +558,7 @@ class InfoExtractor(object): self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ From d16154d16327907279eff48a4018c495726d401a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Jun 2015 23:05:09 +0600 Subject: [PATCH 0548/2145] [tnaflix] Generalize tnaflix extractors --- youtube_dl/extractor/__init__.py | 8 +- youtube_dl/extractor/empflix.py | 31 ---- youtube_dl/extractor/moviefap.py | 135 --------------- youtube_dl/extractor/tnaflix.py | 279 +++++++++++++++++++++++++------ 4 files changed, 234 insertions(+), 219 deletions(-) delete mode 100644 youtube_dl/extractor/empflix.py delete mode 100644 youtube_dl/extractor/moviefap.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d41d277c9..d44339200 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -144,7 +144,6 @@ from .ellentv import ( ) from .elpais import ElPaisIE from .embedly import EmbedlyIE -from .empflix import EMPFlixIE from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE @@ -311,7 +310,6 @@ from .morningstar import MorningstarIE from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE -from .moviefap import MovieFapIE from .moviezine import MoviezineIE from .movshare import 
MovShareIE from .mtv import ( @@ -578,7 +576,11 @@ from .tmz import ( TMZIE, TMZArticleIE, ) -from .tnaflix import TNAFlixIE +from .tnaflix import ( + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) from .thvideo import ( THVideoIE, THVideoPlaylistIE diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py deleted file mode 100644 index 4827022e0..000000000 --- a/youtube_dl/extractor/empflix.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -from .tnaflix import TNAFlixIE - - -class EMPFlixIE(TNAFlixIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P.+?)-(?P[0-9]+)\.html' - - _TITLE_REGEX = r'name="title" value="(?P[^"]*)"' - _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"' - _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - - _TESTS = [ - { - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'b1bc15b6412d33902d6e5952035fcabc', - 'info_dict': { - 'id': '33051', - 'display_id': 'Amateur-Finger-Fuck', - 'ext': 'mp4', - 'title': 'Amateur Finger Fuck', - 'description': 'Amateur solo finger fucking.', - 'thumbnail': 're:https?://.*\.jpg$', - 'age_limit': 18, - } - }, - { - 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', - 'only_matching': True, - } - ] diff --git a/youtube_dl/extractor/moviefap.py b/youtube_dl/extractor/moviefap.py deleted file mode 100644 index 82b863539..000000000 --- a/youtube_dl/extractor/moviefap.py +++ /dev/null @@ -1,135 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - xpath_text, - str_to_int -) -from ..compat import compat_str - - -class MovieFapIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[a-z-_]+)' - _TESTS = [{ - # normal, multi-format video - 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', - 'md5': 
'26624b4e2523051b550067d547615906', - 'info_dict': { - 'id': 'be9867c9416c19f54a4a', - 'ext': 'mp4', - 'title': 'Experienced MILF Amazing Handjob', - 'description': 'Experienced MILF giving an Amazing Handjob', - 'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/be/322032-20l.jpg', - 'uploader_id': 'darvinfred06', - 'display_id': 'experienced-milf-amazing-handjob', - 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'] - } - }, { - # quirky single-format case where the extension is given as fid, but the video is really an flv - 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', - 'md5': 'fa56683e291fc80635907168a743c9ad', - 'info_dict': { - 'id': 'e5da0d3edce5404418f5', - 'ext': 'flv', - 'title': 'Jeune Couple Russe', - 'description': 'Amateur', - 'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg', - 'uploader_id': 'whiskeyjar', - 'display_id': 'jeune-couple-russe', - 'categories': ['Amateur', 'Teen'] - } - }] - - @staticmethod - def __get_thumbnail_data(xml): - - """ - Constructs a list of video thumbnails from timeline preview images. 
- :param xml: the information XML document to parse - """ - - timeline = xml.find('timeline') - if timeline is None: - # not all videos have the data - ah well - return [] - - # get the required information from the XML - width = str_to_int(timeline.find('imageWidth').text) - height = str_to_int(timeline.find('imageHeight').text) - first = str_to_int(timeline.find('imageFirst').text) - last = str_to_int(timeline.find('imageLast').text) - pattern = timeline.find('imagePattern').text - - # generate the list of thumbnail information dicts - thumbnails = [] - for i in range(first, last + 1): - thumbnails.append({ - 'url': pattern.replace('#', compat_str(i)), - 'width': width, - 'height': height - }) - return thumbnails - - def _real_extract(self, url): - - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # find and retrieve the XML document detailing video download URLs - info_url = self._html_search_regex( - r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters') - xml = self._download_xml(info_url, video_id) - - # find the video container - if xml.find('videoConfig') is not None: - ext = xml.find('videoConfig').find('type').text - else: - ext = 'flv' # guess... - - # work out the video URL(s) - formats = [] - if xml.find('videoLink') is not None: - # single format available - formats.append({ - 'url': xpath_text(xml, 'videoLink', 'url', True), - 'ext': ext - }) - else: - # multiple formats available - for item in xml.find('quality').findall('item'): - resolution = xpath_text(item, 'res', 'resolution', True) # 480p etc. 
- formats.append({ - 'url': xpath_text(item, 'videoLink', 'url', True), - 'ext': ext, - 'resolution': resolution, - 'height': int(re.findall(r'\d+', resolution)[0]) - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': self._html_search_regex( - r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'), - 'display_id': re.compile(self._VALID_URL).match(url).group('name'), - 'thumbnails': self.__get_thumbnail_data(xml), - 'thumbnail': xpath_text(xml, 'startThumb', 'thumbnail'), - 'description': self._html_search_regex( - r'name="description" value="(.*?)"', webpage, 'description', fatal=False), - 'uploader_id': self._html_search_regex( - r'name="username" value="(.*?)"', webpage, 'uploader_id', fatal=False), - 'view_count': str_to_int(self._html_search_regex( - r'<br>Views <strong>([0-9]+)</strong>', webpage, 'view_count, fatal=False')), - 'average_rating': float(self._html_search_regex( - r'Current Rating<br> <strong>(.*?)</strong>', webpage, 'average_rating', fatal=False)), - 'comment_count': str_to_int(self._html_search_regex( - r'<span id="comCount">([0-9]+)</span>', webpage, 'comment_count', fatal=False)), - 'age_limit': 18, - 'webpage_url': self._html_search_regex( - r'name="link" value="(.*?)"', webpage, 'webpage_url', fatal=False), - 'categories': self._html_search_regex( - r'</div>\s*(.*?)\s*<br>', webpage, 'categories', fatal=False).split(', ') - } diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index c282865b2..49516abca 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -3,39 +3,70 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - parse_duration, fix_xml_ampersands, + float_or_none, + int_or_none, + parse_duration, + str_to_int, + xpath_text, ) -class TNAFlixIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' - - _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos' - _DESCRIPTION_REGEX = r'

([^<]+)

' - _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"' - - _TESTS = [ - { - 'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878', - 'md5': 'ecf3498417d09216374fc5907f9c6ec0', - 'info_dict': { - 'id': '553878', - 'display_id': 'Carmella-Decesare-striptease', - 'ext': 'mp4', - 'title': 'Carmella Decesare - striptease', - 'description': '', - 'thumbnail': 're:https?://.*\.jpg$', - 'duration': 91, - 'age_limit': 18, - } - }, - { - 'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632', - 'only_matching': True, - } +class TNAFlixNetworkBaseIE(InfoExtractor): + # May be overridden in descendants if necessary + _CONFIG_REGEX = [ + r'flashvars\.config\s*=\s*escape\("([^"]+)"', + r']+name="config\d?" value="([^"]+)"', ] + _TITLE_REGEX = r']+name="title" value="([^"]+)"' + _DESCRIPTION_REGEX = r']+name="description" value="([^"]+)"' + _UPLOADER_REGEX = r']+name="username" value="([^"]+)"' + _VIEW_COUNT_REGEX = None + _COMMENT_COUNT_REGEX = None + _AVERAGE_RATING_REGEX = None + _CATEGORIES_REGEX = r']*>\s*]+class="infoTitle"[^>]*>Categories:\s*]+class="listView"[^>]*>(.+?)\s*' + + def _extract_thumbnails(self, flix_xml): + + def get_child(elem, names): + for name in names: + child = elem.find(name) + if child is not None: + return child + + timeline = get_child(flix_xml, ['timeline', 'rolloverBarImage']) + if timeline is None: + return + + pattern_el = get_child(timeline, ['imagePattern', 'pattern']) + if pattern_el is None or not pattern_el.text: + return + + first_el = get_child(timeline, ['imageFirst', 'first']) + last_el = get_child(timeline, ['imageLast', 'last']) + if first_el is None or last_el is None: + return + + first_text = first_el.text + last_text = last_el.text + if not first_text.isdigit() or not last_text.isdigit(): + return + + first = int(first_text) + last = int(last_text) + if first > last: + return + + width = int_or_none(xpath_text(timeline, './imageWidth', 'thumbnail width')) + height = 
int_or_none(xpath_text(timeline, './imageHeight', 'thumbnail height')) + + return [{ + 'url': self._proto_relative_url(pattern_el.text.replace('#', compat_str(i)), 'http:'), + 'width': width, + 'height': height, + } for i in range(first, last + 1)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -44,39 +75,64 @@ class TNAFlixIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + cfg_url = self._proto_relative_url(self._html_search_regex( + self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') + + cfg_xml = self._download_xml( + cfg_url, display_id, 'Downloading metadata', + transform_source=fix_xml_ampersands) + + formats = [] + + def extract_video_url(vl): + return re.sub('speed=\d+', 'speed=', vl.text) + + video_link = cfg_xml.find('./videoLink') + if video_link is not None: + formats.append({ + 'url': extract_video_url(video_link), + 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + }) + + for item in cfg_xml.findall('./quality/item'): + video_link = item.find('./videoLink') + if video_link is None: + continue + res = item.find('res') + format_id = None if res is None else res.text + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), + 'format_id': format_id, + 'height': height, + }) + + self._sort_formats(formats) + + thumbnail = self._proto_relative_url( + xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') + thumbnails = self._extract_thumbnails(cfg_xml) + title = self._html_search_regex( self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) - description = self._html_search_regex( - self._DESCRIPTION_REGEX, webpage, 'description', fatal=False, default='') age_limit = self._rta_search(webpage) duration = parse_duration(self._html_search_meta( 'duration', webpage, 'duration', default=None)) - cfg_url = 
self._proto_relative_url(self._html_search_regex( - self._CONFIG_REGEX, webpage, 'flashvars.config'), 'http:') + def extract_field(pattern, name): + return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None - cfg_xml = self._download_xml( - cfg_url, display_id, note='Downloading metadata', - transform_source=fix_xml_ampersands) + description = extract_field(self._DESCRIPTION_REGEX, 'description') + uploader = extract_field(self._UPLOADER_REGEX, 'uploader') + view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) + comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) + average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) - thumbnail = self._proto_relative_url( - cfg_xml.find('./startThumb').text, 'http:') - - formats = [] - for item in cfg_xml.findall('./quality/item'): - video_url = re.sub('speed=\d+', 'speed=', item.find('videoLink').text) - format_id = item.find('res').text - fmt = { - 'url': self._proto_relative_url(video_url, 'http:'), - 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) - self._sort_formats(formats) + categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') + categories = categories_str.split(', ') if categories_str is not None else [] return { 'id': video_id, @@ -84,7 +140,130 @@ class TNAFlixIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'duration': duration, 'age_limit': age_limit, + 'uploader': uploader, + 'view_count': view_count, + 'comment_count': comment_count, + 'average_rating': average_rating, + 'categories': categories, 'formats': formats, } + + +class TNAFlixIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P[^/]+)/video(?P\d+)' + + _TITLE_REGEX = r'(.+?) - TNAFlix Porn Videos' + _DESCRIPTION_REGEX = r'

([^<]+)

' + _UPLOADER_REGEX = r'(?s)]+class="infoTitle"[^>]*>Uploaded By:(.+?).+?)-(?P[0-9]+)\.html' + + _UPLOADER_REGEX = r']+class="infoTitle"[^>]*>Uploaded By:(.+?)' + + _TESTS = [{ + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', + 'md5': 'b1bc15b6412d33902d6e5952035fcabc', + 'info_dict': { + 'id': '33051', + 'display_id': 'Amateur-Finger-Fuck', + 'ext': 'mp4', + 'title': 'Amateur Finger Fuck', + 'description': 'Amateur solo finger fucking.', + 'thumbnail': 're:https?://.*\.jpg$', + 'duration': 83, + 'age_limit': 18, + 'uploader': 'cwbike', + 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], + } + }, { + 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', + 'only_matching': True, + }] + + +class MovieFapIE(TNAFlixNetworkBaseIE): + _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P[0-9a-f]+)/(?P[^/]+)\.html' + + _VIEW_COUNT_REGEX = r'
Views\s*([\d,.]+)' + _COMMENT_COUNT_REGEX = r']+id="comCount"[^>]*>([\d,.]+)' + _AVERAGE_RATING_REGEX = r'Current Rating\s*
\s*([\d.]+)' + _CATEGORIES_REGEX = r'(?s)]+id="vid_info"[^>]*>\s*]*>.+?
(.*?)
' + + _TESTS = [{ + # normal, multi-format video + 'url': 'http://www.moviefap.com/videos/be9867c9416c19f54a4a/experienced-milf-amazing-handjob.html', + 'md5': '26624b4e2523051b550067d547615906', + 'info_dict': { + 'id': 'be9867c9416c19f54a4a', + 'display_id': 'experienced-milf-amazing-handjob', + 'ext': 'mp4', + 'title': 'Experienced MILF Amazing Handjob', + 'description': 'Experienced MILF giving an Amazing Handjob', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + 'uploader': 'darvinfred06', + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['Amateur', 'Masturbation', 'Mature', 'Flashing'], + } + }, { + # quirky single-format case where the extension is given as fid, but the video is really an flv + 'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html', + 'md5': 'fa56683e291fc80635907168a743c9ad', + 'info_dict': { + 'id': 'e5da0d3edce5404418f5', + 'display_id': 'jeune-couple-russe', + 'ext': 'flv', + 'title': 'Jeune Couple Russe', + 'description': 'Amateur', + 'thumbnail': 're:https?://.*\.jpg$', + 'age_limit': 18, + 'uploader': 'whiskeyjar', + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['Amateur', 'Teen'], + } + }] From 507683780eb14d012d4430044dc402d7e08e36b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Jun 2015 23:08:05 +0600 Subject: [PATCH 0549/2145] Credit @gebn for moviefap --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 889d599a2..117b9c219 100644 --- a/AUTHORS +++ b/AUTHORS @@ -128,3 +128,4 @@ Ping O. 
Mister Hat Peter Ding jackyzy823 +George Brighton From c93d53f5e307dab1b5d03cd3c621a68f40ef840d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Jun 2015 00:48:06 +0600 Subject: [PATCH 0550/2145] [youtube] Fix likes/dislike extraction --- youtube_dl/extractor/youtube.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d9240ff02..8b43e274b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -29,6 +29,7 @@ from ..utils import ( get_element_by_id, int_or_none, orderedSet, + str_to_int, unescapeHTML, unified_strdate, uppercase_escape, @@ -1005,12 +1006,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_description = '' def _extract_count(count_name): - count = self._search_regex( - r'id="watch-%s"[^>]*>.*?([\d,]+)\s*' % re.escape(count_name), - video_webpage, count_name, default=None) - if count is not None: - return int(count.replace(',', '')) - return None + return str_to_int(self._search_regex( + r'-%s-button[^>]+>]+class="yt-uix-button-content"[^>]*>([\d,]+)' + % re.escape(count_name), + video_webpage, count_name, default=None)) + like_count = _extract_count('like') dislike_count = _extract_count('dislike') From 541462379153c19656aa52cc5796dbf05de874ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Jun 2015 00:49:19 +0600 Subject: [PATCH 0551/2145] [extractor/common] Remove superfluous line --- youtube_dl/extractor/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7fa46d295..81623bfe3 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -36,7 +36,6 @@ from ..utils import ( ) - class InfoExtractor(object): """Information Extractor class. 
From 67134eaba1a56cec4117000acb2fc9284c9cdd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Jun 2015 22:08:29 +0200 Subject: [PATCH 0552/2145] [YoutubeDL] rework how the format spec is processed The spec string is processed using 'tokenize.tokenize' to split it in words and operators, the filters are still processed using regular expressions. This should make easier to allow grouping operators with parens. --- test/test_YoutubeDL.py | 27 ++-- youtube_dl/YoutubeDL.py | 298 +++++++++++++++++++++++++--------------- youtube_dl/compat.py | 5 + 3 files changed, 209 insertions(+), 121 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index a13c09ef4..8f7aef512 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -229,21 +229,30 @@ class TestFormatSelection(unittest.TestCase): '141', '172', '140', '171', '139', ] - for f1id, f2id in zip(order, order[1:]): - f1 = YoutubeIE._formats[f1id].copy() - f1['format_id'] = f1id - f1['url'] = 'url:' + f1id - f2 = YoutubeIE._formats[f2id].copy() - f2['format_id'] = f2id - f2['url'] = 'url:' + f2id + def format_info(f_id): + info = YoutubeIE._formats[f_id].copy() + info['format_id'] = f_id + info['url'] = 'url:' + f_id + return info + formats_order = [format_info(f_id) for f_id in order] + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': 'bestvideo+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], '137+141') + self.assertEqual(downloaded['ext'], 'mp4') + + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - 
self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) @@ -251,7 +260,7 @@ class TestFormatSelection(unittest.TestCase): yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) downloaded = ydl.downloaded_info_dicts[0] - self.assertEqual(downloaded['format_id'], f1id) + self.assertEqual(downloaded['format_id'], f1['format_id']) def test_format_filtering(self): formats = [ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index ef0f71bad..17a5407b9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -21,6 +21,7 @@ import subprocess import socket import sys import time +import tokenize import traceback if os.name == 'nt': @@ -34,6 +35,7 @@ from .compat import ( compat_http_client, compat_kwargs, compat_str, + compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, ) @@ -851,8 +853,8 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) - def _apply_format_filter(self, format_spec, available_formats): - " Returns a tuple of the remaining format_spec and filtered formats " + def _build_format_filter(self, filter_spec): + " Returns a function to filter the formats according to the filter_spec " OPERATORS = { '<': operator.lt, @@ -862,13 +864,13 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - operator_rex = re.compile(r'''(?x)\s*\[ + operator_rex = re.compile(r'''(?x)\s* (?Pwidth|height|tbr|abr|vbr|asr|filesize|fps) \s*(?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) 
- \]$ + $ ''' % '|'.join(map(re.escape, OPERATORS.keys()))) - m = operator_rex.search(format_spec) + m = operator_rex.search(filter_spec) if m: try: comparison_value = int(m.group('value')) @@ -879,7 +881,7 @@ class YoutubeDL(object): if comparison_value is None: raise ValueError( 'Invalid value %r in format specification %r' % ( - m.group('value'), format_spec)) + m.group('value'), filter_spec)) op = OPERATORS[m.group('op')] if not m: @@ -887,85 +889,201 @@ class YoutubeDL(object): '=': operator.eq, '!=': operator.ne, } - str_operator_rex = re.compile(r'''(?x)\s*\[ + str_operator_rex = re.compile(r'''(?x) \s*(?Pext|acodec|vcodec|container|protocol) \s*(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9_-]+) - \s*\]$ + \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) - m = str_operator_rex.search(format_spec) + m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') op = STR_OPERATORS[m.group('op')] if not m: - raise ValueError('Invalid format specification %r' % format_spec) + raise ValueError('Invalid filter specification %r' % filter_spec) def _filter(f): actual_value = f.get(m.group('key')) if actual_value is None: return m.group('none_inclusive') return op(actual_value, comparison_value) - new_formats = [f for f in available_formats if _filter(f)] + return _filter - new_format_spec = format_spec[:-len(m.group(0))] - if not new_format_spec: - new_format_spec = 'best' + def build_format_selector(self, format_spec): + def syntax_error(note, start): + message = ( + 'Invalid format specification: ' + '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + return SyntaxError(message) - return (new_format_spec, new_formats) + PICKFIRST = 'PICKFIRST' + MERGE = 'MERGE' + SINGLE = 'SINGLE' + FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) - def select_format(self, format_spec, available_formats): - while format_spec.endswith(']'): - format_spec, available_formats = self._apply_format_filter( - 
format_spec, available_formats) - if not available_formats: - return None + def _parse_filter(tokens): + filter_parts = [] + for type, string, start, _, _ in tokens: + if type == tokenize.OP and string == ']': + return ''.join(filter_parts) + else: + filter_parts.append(string) - if format_spec in ['best', 'worst', None]: - format_idx = 0 if format_spec == 'worst' else -1 - audiovideo_formats = [ - f for f in available_formats - if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] - if audiovideo_formats: - return audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in available_formats) or - all(f.get('vcodec') != 'none' for f in available_formats)): - return available_formats[format_idx] - elif format_spec == 'bestaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[-1] - elif format_spec == 'worstaudio': - audio_formats = [ - f for f in available_formats - if f.get('vcodec') == 'none'] - if audio_formats: - return audio_formats[0] - elif format_spec == 'bestvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[-1] - elif format_spec == 'worstvideo': - video_formats = [ - f for f in available_formats - if f.get('acodec') == 'none'] - if video_formats: - return video_formats[0] - else: - extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] - if format_spec in extensions: - filter_f = lambda f: f['ext'] == format_spec - else: - filter_f = lambda f: f['format_id'] == format_spec - matches = list(filter(filter_f, available_formats)) - if matches: - return matches[-1] - return None + def _parse_format_selection(tokens, endwith=[]): + selectors = [] + current_selector = None + for type, string, start, _, _ in tokens: + # ENCODING is only defined in python 3.x + if 
type == getattr(tokenize, 'ENCODING', None): + continue + elif type in [tokenize.NAME, tokenize.NUMBER]: + current_selector = FormatSelector(SINGLE, string, []) + elif type == tokenize.OP: + if string in endwith: + break + if string == ',': + selectors.append(current_selector) + current_selector = None + elif string == '/': + first_choice = current_selector + second_choice = _parse_format_selection(tokens, [',']) + current_selector = None + selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) + elif string == '[': + if not current_selector: + current_selector = FormatSelector(SINGLE, 'best', []) + format_filter = _parse_filter(tokens) + current_selector.filters.append(format_filter) + elif string == '+': + video_selector = current_selector + audio_selector = _parse_format_selection(tokens, [',']) + current_selector = None + selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), [])) + else: + raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + elif type == tokenize.ENDMARKER: + break + if current_selector: + selectors.append(current_selector) + return selectors + + def _build_selector_function(selector): + if isinstance(selector, list): + fs = [_build_selector_function(s) for s in selector] + + def selector_function(formats): + for f in fs: + for format in f(formats): + yield format + return selector_function + elif selector.type == PICKFIRST: + fs = [_build_selector_function(s) for s in selector.selector] + + def selector_function(formats): + for f in fs: + picked_formats = list(f(formats)) + if picked_formats: + return picked_formats + return [] + elif selector.type == SINGLE: + format_spec = selector.selector + + def selector_function(formats): + if format_spec in ['best', 'worst', None]: + format_idx = 0 if format_spec == 'worst' else -1 + audiovideo_formats = [ + f for f in formats + if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] + if audiovideo_formats: + yield 
audiovideo_formats[format_idx] + # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format + elif (all(f.get('acodec') != 'none' for f in formats) or + all(f.get('vcodec') != 'none' for f in formats)): + yield formats[format_idx] + elif format_spec == 'bestaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[-1] + elif format_spec == 'worstaudio': + audio_formats = [ + f for f in formats + if f.get('vcodec') == 'none'] + if audio_formats: + yield audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in formats + if f.get('acodec') == 'none'] + if video_formats: + yield video_formats[0] + else: + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav'] + if format_spec in extensions: + filter_f = lambda f: f['ext'] == format_spec + else: + filter_f = lambda f: f['format_id'] == format_spec + matches = list(filter(filter_f, formats)) + if matches: + yield matches[-1] + elif selector.type == MERGE: + def _merge(formats_info): + format_1, format_2 = [f['format_id'] for f in formats_info] + # The first format must contain the video and the + # second the audio + if formats_info[0].get('vcodec') == 'none': + self.report_error('The first format must ' + 'contain the video, try using ' + '"-f %s+%s"' % (format_2, format_1)) + return + output_ext = ( + formats_info[0]['ext'] + if self.params.get('merge_output_format') is None + else self.params['merge_output_format']) + return { + 'requested_formats': formats_info, + 'format': '%s+%s' % (formats_info[0].get('format'), + formats_info[1].get('format')), + 'format_id': '%s+%s' % (formats_info[0].get('format_id'), + formats_info[1].get('format_id')), + 'width': formats_info[0].get('width'), + 'height': 
formats_info[0].get('height'), + 'resolution': formats_info[0].get('resolution'), + 'fps': formats_info[0].get('fps'), + 'vcodec': formats_info[0].get('vcodec'), + 'vbr': formats_info[0].get('vbr'), + 'stretched_ratio': formats_info[0].get('stretched_ratio'), + 'acodec': formats_info[1].get('acodec'), + 'abr': formats_info[1].get('abr'), + 'ext': output_ext, + } + video_selector, audio_selector = map(_build_selector_function, selector.selector) + + def selector_function(formats): + formats = list(formats) + for pair in itertools.product(video_selector(formats), audio_selector(formats)): + yield _merge(pair) + + filters = [self._build_format_filter(f) for f in selector.filters] + + def final_selector(formats): + for _filter in filters: + formats = list(filter(_filter, formats)) + return selector_function(formats) + return final_selector + + stream = io.BytesIO(format_spec.encode('utf-8')) + tokens = compat_tokenize_tokenize(stream.readline) + parsed_selector = _parse_format_selection(tokens) + return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): res = std_headers.copy() @@ -1112,52 +1230,8 @@ class YoutubeDL(object): if req_format == 'all': formats_to_download = formats else: - for rfstr in req_format.split(','): - # We can accept formats requested in the format: 34/5/best, we pick - # the first that is available, starting from left - req_formats = rfstr.split('/') - for rf in req_formats: - if re.match(r'.+?\+.+?', rf) is not None: - # Two formats have been requested like '137+139' - format_1, format_2 = rf.split('+') - formats_info = (self.select_format(format_1, formats), - self.select_format(format_2, formats)) - if all(formats_info): - # The first format must contain the video and the - # second the audio - if formats_info[0].get('vcodec') == 'none': - self.report_error('The first format must ' - 'contain the video, try using ' - '"-f %s+%s"' % (format_2, format_1)) - return - output_ext = ( - formats_info[0]['ext'] - if 
self.params.get('merge_output_format') is None - else self.params['merge_output_format']) - selected_format = { - 'requested_formats': formats_info, - 'format': '%s+%s' % (formats_info[0].get('format'), - formats_info[1].get('format')), - 'format_id': '%s+%s' % (formats_info[0].get('format_id'), - formats_info[1].get('format_id')), - 'width': formats_info[0].get('width'), - 'height': formats_info[0].get('height'), - 'resolution': formats_info[0].get('resolution'), - 'fps': formats_info[0].get('fps'), - 'vcodec': formats_info[0].get('vcodec'), - 'vbr': formats_info[0].get('vbr'), - 'stretched_ratio': formats_info[0].get('stretched_ratio'), - 'acodec': formats_info[1].get('acodec'), - 'abr': formats_info[1].get('abr'), - 'ext': output_ext, - } - else: - selected_format = None - else: - selected_format = self.select_format(rf, formats) - if selected_format is not None: - formats_to_download.append(selected_format) - break + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f9529210d..bc218dd71 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -388,6 +388,10 @@ else: pass return _terminal_size(columns, lines) +if sys.version_info >= (3, 0): + from tokenize import tokenize as compat_tokenize_tokenize +else: + from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ 'compat_HTTPError', @@ -408,6 +412,7 @@ __all__ = [ 'compat_socket_create_connection', 'compat_str', 'compat_subprocess_get_DEVNULL', + 'compat_tokenize_tokenize', 'compat_urllib_error', 'compat_urllib_parse', 'compat_urllib_parse_unquote', From 5acfa126c812c3ab7088af6c7df79697baee7831 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 28 Jun 2015 22:48:02 +0200 Subject: [PATCH 0553/2145] [YoutubeDL] format spec: 
treat 'all' like a normal specifier So you can use filters with it, for example 'all[width>=400][width<=600]'. --- test/test_YoutubeDL.py | 5 +++++ youtube_dl/YoutubeDL.py | 13 ++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 8f7aef512..709e3100f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -317,6 +317,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'G') + ydl = YDL({'format': 'all[width>=400][width<=600]'}) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 17a5407b9..258e612af 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -990,7 +990,10 @@ class YoutubeDL(object): format_spec = selector.selector def selector_function(formats): - if format_spec in ['best', 'worst', None]: + if format_spec == 'all': + for f in formats: + yield f + elif format_spec in ['best', 'worst', None]: format_idx = 0 if format_spec == 'worst' else -1 audiovideo_formats = [ f for f in formats @@ -1226,12 +1229,8 @@ class YoutubeDL(object): req_format_list.append('bestvideo+bestaudio') req_format_list.append('best') req_format = '/'.join(req_format_list) - formats_to_download = [] - if req_format == 'all': - formats_to_download = formats - else: - format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + format_selector = self.build_format_selector(req_format) + formats_to_download = list(format_selector(formats)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) From c4bd188da46a837ddf8f8f8d4766eb799fa2b484 Mon Sep 17 00:00:00 2001 
From: Anders Einar Hilden Date: Mon, 29 Jun 2015 00:11:31 +0200 Subject: [PATCH 0554/2145] NRK now supports / requires HTTPS Add s? to regexp to support new urls. Update testcases to use HTTPS. --- youtube_dl/extractor/nrk.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index cc70c2950..9e4581cf9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -13,7 +13,7 @@ from ..utils import ( class NRKIE(InfoExtractor): - _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P\d+)' + _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P\d+)' _TESTS = [ { @@ -76,7 +76,7 @@ class NRKIE(InfoExtractor): class NRKPlaylistIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P[^/]+)' _TESTS = [{ 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', @@ -116,11 +116,11 @@ class NRKPlaylistIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'(?Phttp://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' + _VALID_URL = r'(?Phttps?://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' 
_TESTS = [ { - 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': 'adf2c5454fa2bf032f47a9f8fb351342', 'info_dict': { 'id': 'MUHH48000314', @@ -132,7 +132,7 @@ class NRKTVIE(InfoExtractor): }, }, { - 'url': 'http://tv.nrk.no/program/mdfp15000514', + 'url': 'https://tv.nrk.no/program/mdfp15000514', 'md5': '383650ece2b25ecec996ad7b5bb2a384', 'info_dict': { 'id': 'mdfp15000514', @@ -145,7 +145,7 @@ class NRKTVIE(InfoExtractor): }, { # single playlist video - 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { 'id': 'MSPO40010515-part2', @@ -157,7 +157,7 @@ class NRKTVIE(InfoExtractor): 'skip': 'Only works from Norway', }, { - 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'playlist': [ { 'md5': '9480285eff92d64f06e02a5367970a7a', From bea41c7f3fa4f9072ad2f5354938ab1c8cef0a6d Mon Sep 17 00:00:00 2001 From: corone17 Date: Mon, 29 Jun 2015 00:59:18 +0200 Subject: [PATCH 0555/2145] Update rtlnl.py Better to extract 'http://manifest.us.rtl.nl' from the json, I'd say. And I think it's better to use the default json-url to make it more futureproof. Succesfully tested with tarball. 
--- youtube_dl/extractor/rtlnl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 41d202c28..e708e0093 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -51,7 +51,7 @@ class RtlNlIE(InfoExtractor): def _real_extract(self, url): uuid = self._match_id(url) info = self._download_json( - 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, uuid) material = info['material'][0] @@ -60,8 +60,8 @@ class RtlNlIE(InfoExtractor): description = material.get('synopsis') or info['episodes'][0]['synopsis'] # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - videopath = material['videopath'].replace('.f4m', '.m3u8') - m3u8_url = 'http://manifest.us.rtl.nl' + videopath + videopath = material['videopath'].replace('adaptive', 'flash') + m3u8_url = info['meta']['videohost'] + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') From 738b92632296a9fee3eb7e0c915f6ea6b395125f Mon Sep 17 00:00:00 2001 From: nawl Date: Sun, 28 Jun 2015 17:24:00 -0600 Subject: [PATCH 0556/2145] [hentaistigma] Fix video extractor --- youtube_dl/extractor/hentaistigma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py index 63d87b74c..225af8cb3 100644 --- a/youtube_dl/extractor/hentaistigma.py +++ b/youtube_dl/extractor/hentaistigma.py @@ -32,7 +32,7 @@ class HentaiStigmaIE(InfoExtractor): wrap_webpage = self._download_webpage(wrap_url, video_id) video_url = self._html_search_regex( - r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url') + r'file:"([^"]+)"', wrap_webpage, 'video url') return { 'id': video_id, From 0130afb76e5cb6f470f39f127c8d09eea3e82d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: 
Mon, 29 Jun 2015 12:42:02 +0200 Subject: [PATCH 0557/2145] [YoutubeDL] format spec: allow grouping specifiers with parentheses --- test/test_YoutubeDL.py | 24 ++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 709e3100f..6f374d7ea 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -245,6 +245,30 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], '137+141') self.assertEqual(downloaded['ext'], 'mp4') + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['137+141', '248+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=mp4],bestvideo[ext=webm])[height<=720]+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['136+141', '247+141']) + + info_dict = _make_result(list(formats_order), extractor='youtube') + ydl = YDL({'format': '(bestvideo[ext=none]/bestvideo[ext=webm])+bestaudio'}) + yie = YoutubeIE(ydl) + yie._sort_formats(info_dict['formats']) + ydl.process_ie_result(info_dict) + downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] + self.assertEqual(downloaded_ids, ['248+141']) + for f1, f2 in zip(formats_order, formats_order[1:]): info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL({'format': 'best/bestvideo'}) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 
258e612af..e5b46f87e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -920,6 +920,7 @@ class YoutubeDL(object): PICKFIRST = 'PICKFIRST' MERGE = 'MERGE' SINGLE = 'SINGLE' + GROUP = 'GROUP' FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) def _parse_filter(tokens): @@ -942,6 +943,10 @@ class YoutubeDL(object): elif type == tokenize.OP: if string in endwith: break + elif string == ')': + # ')' will be handled by the parentheses group + tokens.restore_last_token() + break if string == ',': selectors.append(current_selector) current_selector = None @@ -955,6 +960,10 @@ class YoutubeDL(object): current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + current_selector = FormatSelector(GROUP, _parse_format_selection(tokens, [')']), []) elif string == '+': video_selector = current_selector audio_selector = _parse_format_selection(tokens, [',']) @@ -977,6 +986,8 @@ class YoutubeDL(object): for format in f(formats): yield format return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] @@ -1084,8 +1095,32 @@ class YoutubeDL(object): return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) - tokens = compat_tokenize_tokenize(stream.readline) - parsed_selector = _parse_format_selection(tokens) + try: + tokens = list(compat_tokenize_tokenize(stream.readline)) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= 
len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): From cf386750c9194839e419a0412f45f25f28236c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Jun 2015 22:21:09 +0600 Subject: [PATCH 0558/2145] [hentaistigma] Modernize --- youtube_dl/extractor/hentaistigma.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py index 225af8cb3..f5aa73d18 100644 --- a/youtube_dl/extractor/hentaistigma.py +++ b/youtube_dl/extractor/hentaistigma.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'

]*>([^<]+)', + r']+class="posttitle"[^>]*>]*>([^<]+)', webpage, 'title') wrap_url = self._html_search_regex( - r'', start_page, 'xml filename') - xml_decription_url = xml_root + 'xml/' + xml_name - xml_description = self._download_xml(xml_decription_url, display_id) + xml_description_url = xml_root + 'xml/' + xml_name + xml_description = self._download_xml(xml_description_url, display_id) video_title = xml_description.find('./metadata/title').text video_formats = self._parse_mp4(xml_description) From ee114368ad0bb9822449295910263a99f9de4e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Aug 2015 20:22:13 +0600 Subject: [PATCH 0929/2145] [utils] Make value optional for find_xpath_attr This allows selecting particular attributes by name but without specifying the value and similar to xpath syntax `[@attrib]` --- test/test_utils.py | 9 +++++++++ youtube_dl/utils.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 65692a9fb..a759b2da9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -235,12 +235,21 @@ class TestUtil(unittest.TestCase): + ''' doc = xml.etree.ElementTree.fromstring(testxml) + self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'n'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'x'), doc[1]) self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) + self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'b'), doc[3]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y'), doc[2]) self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'd'), doc[3]) + self.assertEqual(find_xpath_attr(doc, './/node', 'x', ''), doc[4]) def test_xpath_with_ns(self): 
testxml = ''' diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 88f9f9070..78dc2b449 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -139,21 +139,24 @@ def write_json_file(obj, fn): if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z-]+$', key) - assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) - expr = xpath + "[@%s='%s']" % (key, val) + if val: + assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val) + expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) return node.find(expr) else: - def find_xpath_attr(node, xpath, key, val): + def find_xpath_attr(node, xpath, key, val=None): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! if isinstance(xpath, compat_str): xpath = xpath.encode('ascii') for f in node.findall(xpath): - if f.attrib.get(key) == val: + if key not in f.attrib: + continue + if val is None or f.attrib.get(key) == val: return f return None From 3f125c8c70e8109bc90d4446b40740133e343b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 1 Aug 2015 21:43:33 +0600 Subject: [PATCH 0930/2145] [nbcnews] Extend _VALID_URL --- youtube_dl/extractor/nbc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index dc2091be0..ccdbfb6c9 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -124,7 +124,7 @@ class NBCSportsIE(InfoExtractor): class NBCNewsIE(InfoExtractor): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P\d+)| - (?:feature|nightly-news)/[^/]+/(?P.+)) + (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+)) ''' _TESTS = [ @@ -169,6 +169,10 @@ class NBCNewsIE(InfoExtractor): 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 
'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', + 'only_matching': True, + }, ] def _real_extract(self, url): From 55eae65b39d754d699ad9de3f9c99fcdf62e0176 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 2 Aug 2015 00:42:23 +0800 Subject: [PATCH 0931/2145] Credit @cyb3r for the ir90tv extractor --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index aa6b88cc0..d16d34272 100644 --- a/AUTHORS +++ b/AUTHORS @@ -136,3 +136,4 @@ sceext Zach Bruggeman Tjark Saul slangangular +Behrouz Abbasi From a107193e4b7a3d5414dd7422263c34ac0e309ec4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:13:21 +0600 Subject: [PATCH 0932/2145] [extractor/common] Extract f4m and m3u8 formats, subtitles and info --- youtube_dl/extractor/common.py | 200 ++++++++++++++++++++++++--------- 1 file changed, 149 insertions(+), 51 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dc5080504..f9578b838 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -18,6 +18,7 @@ from ..compat import ( compat_HTTPError, compat_http_client, compat_urllib_error, + compat_urllib_parse, compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, @@ -37,6 +38,7 @@ from ..utils import ( RegexNotFoundError, sanitize_filename, unescapeHTML, + url_basename, ) @@ -978,69 +980,165 @@ class InfoExtractor(object): self._sort_formats(formats) return formats - # TODO: improve extraction - def _extract_smil_formats(self, smil_url, video_id, fatal=True): - smil = self._download_xml( - smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + @staticmethod + def _xpath_ns(path, namespace=None): + if not namespace: + return path + out = [] + for c in path.split('/'): + if not c or c == '.': + out.append(c) + else: + out.append('{%s}%s' % (namespace, c)) + return 
'/'.join(out) + + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: assert not fatal return [] - base = smil.find('./head/meta').get('base') + namespace = self._search_regex( + r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + + return self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + + def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: + return {} + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) + + def _download_smil(self, smil_url, video_id, fatal=True): + return self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file', fatal=fatal) + + def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): + namespace = self._search_regex( + r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + + formats = self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subtitles = self._parse_smil_subtitles(smil, namespace=namespace) + + video_id = os.path.splitext(url_basename(smil_url))[0] + title = None + description = None + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + name = meta.attrib.get('name') + content = meta.attrib.get('content') + if not name or not content: + continue + if not title and name == 'title': + title = content + elif not description and name in ('description', 'abstract'): + description = content + + return { + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'formats': formats, + 'subtitles': subtitles, + } + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): + base = smil_url + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + b = 
meta.get('base') or meta.get('httpBase') + if b: + base = b + break formats = [] rtmp_count = 0 - if smil.findall('./body/seq/video'): - video = smil.findall('./body/seq/video')[0] - fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) - formats.extend(fmts) - else: - for video in smil.findall('./body/switch/video'): - fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count) - formats.extend(fmts) + http_count = 0 + + videos = smil.findall(self._xpath_ns('.//video', namespace)) + for video in videos: + src = video.get('src') + if not src: + continue + + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + filesize = int_or_none(video.get('size') or video.get('fileSize')) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + ext = video.get('ext') + src_ext = determine_ext(src) + streamer = video.get('streamer') or base + + if proto == 'rtmp' or streamer.startswith('rtmp'): + rtmp_count += 1 + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + continue + + src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + + if proto == 'm3u8' or src_ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls')) + continue + + if src_ext == 'f4m': + f4m_url = src_url + if not f4m_params: + f4m_params = { + 'hdcore': '3.2.0', + 'plugin': 'flowplayer-3.2.0.1', + } + f4m_url += '&' if '?' in f4m_url else '?' 
+ f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8') + formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + continue + + if src_url.startswith('http'): + http_count += 1 + formats.append({ + 'url': src_url, + 'ext': ext or src_ext or 'flv', + 'format_id': 'http-%d' % (bitrate or http_count), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + continue self._sort_formats(formats) return formats - def _parse_smil_video(self, video, video_id, base, rtmp_count): - src = video.get('src') - if not src: - return [], rtmp_count - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - proto = video.get('proto') - if not proto: - if base: - if base.startswith('rtmp'): - proto = 'rtmp' - elif base.startswith('http'): - proto = 'http' - ext = video.get('ext') - if proto == 'm3u8': - return self._extract_m3u8_formats(src, video_id, ext), rtmp_count - elif proto == 'rtmp': - rtmp_count += 1 - streamer = video.get('streamer') or base - return ([{ - 'url': streamer, - 'play_path': src, - 'ext': 'flv', - 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), - 'tbr': bitrate, - 'width': width, - 'height': height, - }], rtmp_count) - elif proto.startswith('http'): - return ([{ - 'url': base + src, - 'ext': ext or 'flv', - 'tbr': bitrate, - 'width': width, - 'height': height, - }], rtmp_count) + def _parse_smil_subtitles(self, smil, namespace=None): + subtitles = {} + for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): + src = textstream.get('src') + if not src: + continue + ext = textstream.get('ext') or determine_ext(src) + if not ext: + type_ = textstream.get('type') + if type_ == 'text/srt': + ext = 'srt' + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') + subtitles.setdefault(lang, []).append({ + 
'url': src, + 'ext': ext, + }) + return subtitles def _live_title(self, name): """ Generate the title for a live video """ From e5e8d20a3a65832c74b002f247866fcbb92e9246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:13:59 +0600 Subject: [PATCH 0933/2145] [extractor/generic] Improve generic SMIL detection --- youtube_dl/extractor/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8cef61c3c..6900ed96f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1110,11 +1110,13 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) - # Is it an RSS feed? + # Is it an RSS feed or a SMIL file? try: doc = parse_xml(webpage) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) + elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): + return self._parse_smil(doc, url, video_id) except compat_xml_parse_error: pass From 308cfe0ab3ec7122602ba2d6a4e3acd2caa7a757 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:14:41 +0600 Subject: [PATCH 0934/2145] [test_downloader] Respect --force-generic-extractor --- test/test_download.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 1110357a7..284418834 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -136,7 +136,9 @@ def generator(test_case): # We're not using .download here sine that is just a shim # for outside error handling, and returns the exit code # instead of the result dict. 
- res_dict = ydl.extract_info(test_case['url']) + res_dict = ydl.extract_info( + test_case['url'], + force_generic_extractor=params.get('force_generic_extractor', False)) except (DownloadError, ExtractorError) as err: # Check if the exception is not a network related one if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503): From 645f814544f9d40386e504a1eb8cf3558f2c109e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:15:33 +0600 Subject: [PATCH 0935/2145] [test/helper] Allow dicts for mincount --- test/helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/helper.py b/test/helper.py index e1129e58f..c8b34654d 100644 --- a/test/helper.py +++ b/test/helper.py @@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict): elif isinstance(expected, compat_str) and expected.startswith('mincount:'): got = got_dict.get(info_field) self.assertTrue( - isinstance(got, list), - 'Expected field %s to be a list, but it is of type %s' % ( + isinstance(got, (list, dict)), + 'Expected field %s to be a list or a dict, but it is of type %s' % ( info_field, type(got).__name__)) expected_num = int(expected.partition(':')[2]) assertGreaterEqual( From 8765222d2211cd6f2a40611249181af0bbb2d531 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:16:21 +0600 Subject: [PATCH 0936/2145] [extractor/generic] Add generic SMIL tests --- youtube_dl/extractor/generic.py | 68 +++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6900ed96f..27584c44c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -130,6 +130,74 @@ class GenericIE(InfoExtractor): 'title': 
'pdv_maddow_netcast_m4v-02-27-2015-201624', } }, + # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng + { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', + 'info_dict': { + 'id': 'smil', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'formats': 'mincount:16', + 'subtitles': 'mincount:1', + }, + 'params': { + 'force_generic_extractor': True, + 'skip_download': True, + }, + }, + # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html + { + 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', + 'info_dict': { + 'id': 'hds', + 'ext': 'flv', + 'title': 'hds', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from https://www.restudy.dk/video/play/id/1637 + { + 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', + 'info_dict': { + 'id': 'video_1637', + 'ext': 'flv', + 'title': 'video_1637', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm + { + 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', + 'info_dict': { + 'id': 'smil-service', + 'ext': 'flv', + 'title': 'smil-service', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 + { + 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, # google redirect { 'url': 
'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 41c3a5a7beebbf5f60c5edb5093d564f0829c5c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:20:49 +0600 Subject: [PATCH 0937/2145] [extractor/common] Fix python 3 --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f9578b838..c123d9fca 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1101,7 +1101,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse.urlencode(f4m_params).encode('utf-8') + f4m_url += compat_urllib_parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) continue From 17712eeb1933f53696c1fc53606174e988a96472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 01:31:17 +0600 Subject: [PATCH 0938/2145] [extractor/common] Extract namespace parse routine --- youtube_dl/extractor/common.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index c123d9fca..717dcec7b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -999,8 +999,7 @@ class InfoExtractor(object): assert not fatal return [] - namespace = self._search_regex( - r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + namespace = self._parse_smil_namespace(smil) return self._parse_smil_formats( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) @@ -1017,8 +1016,7 @@ class InfoExtractor(object): 'Unable to download SMIL file', 
fatal=fatal) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): - namespace = self._search_regex( - r'{([^}]+)?}smil', smil.tag, 'namespace', default=None) + namespace = self._parse_smil_namespace(smil) formats = self._parse_smil_formats( smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) @@ -1045,6 +1043,10 @@ class InfoExtractor(object): 'subtitles': subtitles, } + def _parse_smil_namespace(self, smil): + return self._search_regex( + r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): From fa7a1cc5ef52a8dd9a355ab37a74be55ac2ddc1f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 22 Jul 2015 12:34:42 +0100 Subject: [PATCH 0939/2145] [screenwavemedia] fix info extraction (fixes #6270) Closes #6330. --- youtube_dl/extractor/screenwavemedia.py | 84 +++++++++++-------------- 1 file changed, 36 insertions(+), 48 deletions(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index d1ab66b32..09c085dcf 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -1,12 +1,11 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + js_to_json, ) @@ -22,59 +21,48 @@ class ScreenwaveMediaIE(InfoExtractor): video_id = self._match_id(url) playerdata = self._download_webpage( - 'http://player.screenwavemedia.com/play/player.php?id=%s' % video_id, + 'http://player.screenwavemedia.com/player.php?id=%s' % video_id, video_id, 'Downloading player webpage') vidtitle = self._search_regex( r'\'vidtitle\'\s*:\s*"([^"]+)"', playerdata, 'vidtitle').replace('\\/', '/') - vidurl = self._search_regex( - r'\'vidurl\'\s*:\s*"([^"]+)"', playerdata, 
'vidurl').replace('\\/', '/') - videolist_url = None + playerconfig = self._download_webpage( + 'http://player.screenwavemedia.com/player.js', + video_id, 'Downloading playerconfig webpage') - mobj = re.search(r"'videoserver'\s*:\s*'(?P<videoserver>[^']+)'", playerdata) - if mobj: - videoserver = mobj.group('videoserver') - mobj = re.search(r'\'vidid\'\s*:\s*"(?P<vidid>[^\']+)"', playerdata) - vidid = mobj.group('vidid') if mobj else video_id - videolist_url = 'http://%s/vod/smil:%s.smil/jwplayer.smil' % (videoserver, vidid) - else: - mobj = re.search(r"file\s*:\s*'(?P<smil>http.+?/jwplayer\.smil)'", playerdata) - if mobj: - videolist_url = mobj.group('smil') + videoserver = self._search_regex(r"'videoserver'\s*:\s*'([^']+)", playerconfig, 'videoserver') - if videolist_url: - videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') - formats = [] - baseurl = vidurl[:vidurl.rfind('/') + 1] - for video in videolist.findall('.//video'): - src = video.get('src') - if not src: - continue - file_ = src.partition(':')[-1] - width = int_or_none(video.get('width')) - height = int_or_none(video.get('height')) - bitrate = int_or_none(video.get('system-bitrate'), scale=1000) - format = { - 'url': baseurl + file_, - 'format_id': src.rpartition('.')[0].rpartition('_')[-1], - } - if width or height: - format.update({ - 'tbr': bitrate, - 'width': width, - 'height': height, - }) - else: - format.update({ - 'abr': bitrate, - 'vcodec': 'none', - }) - formats.append(format) - else: - formats = [{ - 'url': vidurl, - }] + sources = self._parse_json( + js_to_json( + self._search_regex( + r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, + 'sources', + ).replace( + "' + thisObj.options.videoserver + '", + videoserver + ).replace( + "' + playerVidId + '", + video_id + ) + ), + video_id + ) + + formats = [] + for source in sources: + if source['type'] == 'hls': + formats.extend(self._extract_m3u8_formats(source['file'], video_id)) + else: + format_label = 
source.get('label') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_label, 'height', default=None)) + formats.append({ + 'url': source['file'], + 'format': format_label, + 'ext': source.get('type'), + 'height': height, + }) self._sort_formats(formats) return { From 9cc93c64aa321260475a2bdf7d8626cdd16bf8ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 1 Aug 2015 22:15:43 +0200 Subject: [PATCH 0940/2145] [screenwavemedia] Use the IP for the videoserver (fixes #6397) For http://cinemassacre.com/2015/07/28/avgn-seaman-for-dreamcast/ the other server returns a 403 error. --- youtube_dl/extractor/screenwavemedia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 09c085dcf..3bc84989e 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -31,7 +31,7 @@ class ScreenwaveMediaIE(InfoExtractor): 'http://player.screenwavemedia.com/player.js', video_id, 'Downloading playerconfig webpage') - videoserver = self._search_regex(r"'videoserver'\s*:\s*'([^']+)", playerconfig, 'videoserver') + videoserver = self._search_regex(r"\[ipaddress\]\s*=>\s*([\d\.]+)", playerdata, 'videoserver') sources = self._parse_json( js_to_json( From cdc682d5a467b7188eb13b5eeb76eb5dd544d1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 04:21:16 +0600 Subject: [PATCH 0941/2145] [nowtv] Fix extraction (Closes #6357) --- youtube_dl/extractor/nowtv.py | 63 +++++++++++++++-------------------- 1 file changed, 27 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 0b5ff4760..de6bc6d96 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import 
compat_str from ..utils import ( ExtractorError, + determine_ext, int_or_none, parse_iso8601, parse_duration, @@ -15,7 +16,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?P<station>rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' _TESTS = [{ # rtl @@ -23,7 +24,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203519', 'display_id': 'bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Die neuen Bauern und eine Hochzeit', 'description': 'md5:e234e1ed6d63cf06be5c070442612e7e', 'thumbnail': 're:^https?://.*\.jpg$', @@ -32,7 +33,7 @@ class NowTVIE(InfoExtractor): 'duration': 2786, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -41,7 +42,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203481', 'display_id': 'berlin-tag-nacht/berlin-tag-nacht-folge-934', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Berlin - Tag & Nacht (Folge 934)', 'description': 'md5:c85e88c2e36c552dfe63433bc9506dd0', 'thumbnail': 're:^https?://.*\.jpg$', @@ -50,7 +51,7 @@ class NowTVIE(InfoExtractor): 'duration': 2641, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -59,7 +60,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '165780', 'display_id': 'alarm-fuer-cobra-11-die-autobahnpolizei/hals-und-beinbruch-2014-08-23-21-10-00', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Hals- und Beinbruch', 'description': 'md5:b50d248efffe244e6f56737f0911ca57', 'thumbnail': 're:^https?://.*\.jpg$', @@ -68,7 +69,7 @@ class NowTVIE(InfoExtractor): 'duration': 2742, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -77,7 +78,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '99205', 'display_id': 'medicopter-117/angst', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Angst!', 
'description': 'md5:30cbc4c0b73ec98bcd73c9f2a8c17c4e', 'thumbnail': 're:^https?://.*\.jpg$', @@ -86,7 +87,7 @@ class NowTVIE(InfoExtractor): 'duration': 3025, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -95,7 +96,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '203521', 'display_id': 'ratgeber-geld/thema-ua-der-erste-blick-die-apple-watch', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Thema u.a.: Der erste Blick: Die Apple Watch', 'description': 'md5:4312b6c9d839ffe7d8caf03865a531af', 'thumbnail': 're:^https?://.*\.jpg$', @@ -104,7 +105,7 @@ class NowTVIE(InfoExtractor): 'duration': 1083, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }, { @@ -113,7 +114,7 @@ class NowTVIE(InfoExtractor): 'info_dict': { 'id': '128953', 'display_id': 'der-hundeprofi/buero-fall-chihuahua-joel', - 'ext': 'mp4', + 'ext': 'flv', 'title': "Büro-Fall / Chihuahua 'Joel'", 'description': 'md5:e62cb6bf7c3cc669179d4f1eb279ad8d', 'thumbnail': 're:^https?://.*\.jpg$', @@ -122,15 +123,13 @@ class NowTVIE(InfoExtractor): 'duration': 3092, }, 'params': { - # m3u8 download + # rtmp download 'skip_download': True, }, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - station = mobj.group('station') + display_id = self._match_id(url) info = self._download_json( 'https://api.nowtv.de/v3/movies/%s?fields=id,title,free,geoblocked,articleLong,articleShort,broadcastStartDate,seoUrl,duration,format,files' % display_id, @@ -148,29 +147,19 @@ class NowTVIE(InfoExtractor): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - f = info.get('format', {}) - station = f.get('station') or station - - STATIONS = { - 'rtl': 'rtlnow', - 'rtl2': 'rtl2now', - 'vox': 'voxnow', - 'nitro': 'rtlnitronow', - 'ntv': 'n-tvnow', - 'superrtl': 'superrtlnow' - } - formats = [] for item in files['items']: - item_path = remove_start(item['path'], '/') - tbr = 
int_or_none(item['bitrate']) - m3u8_url = 'http://hls.fra.%s.de/hls-vod-enc/%s.m3u8' % (STATIONS[station], item_path) - m3u8_url = m3u8_url.replace('now/', 'now/videos/') + if determine_ext(item['path']) != 'f4v': + continue + app, play_path = remove_start(item['path'], '/').split('/', 1) formats.append({ - 'url': m3u8_url, - 'format_id': '%s-%sk' % (item['id'], tbr), - 'ext': 'mp4', - 'tbr': tbr, + 'url': 'rtmpe://fms.rtl.de', + 'app': app, + 'play_path': 'mp4:%s' % play_path, + 'ext': 'flv', + 'page_url': url, + 'player_url': 'http://rtl-now.rtl.de/includes/nc_player.swf', + 'tbr': int_or_none(item.get('bitrate')), }) self._sort_formats(formats) @@ -178,6 +167,8 @@ class NowTVIE(InfoExtractor): description = info.get('articleLong') or info.get('articleShort') timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') duration = parse_duration(info.get('duration')) + + f = info.get('format', {}) thumbnail = f.get('defaultImage169Format') or f.get('defaultImage169Logo') return { From e422d7f4f78994de8483d2207ab4e00174a2408c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 04:26:59 +0600 Subject: [PATCH 0942/2145] [nowtv] Expand _VALID_URL --- youtube_dl/extractor/nowtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index de6bc6d96..11ce37168 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -16,7 +16,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/player' + _VALID_URL = r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl @@ -126,6 +126,9 @@ class NowTVIE(InfoExtractor): # rtmp download 'skip_download': True, }, + }, { + 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', + 
'only_matching': True, }] def _real_extract(self, url): From d41d04c0f513ad3b83ab6aee60cf2201710b6063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 2 Aug 2015 06:35:35 +0600 Subject: [PATCH 0943/2145] [videolectures] Fix _VALID_URL --- youtube_dl/extractor/videolecturesnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index d6a7eb203..24584dc80 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -12,7 +12,7 @@ from ..utils import ( class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$' IE_NAME = 'videolectures.net' _TEST = { From 5c45bbe57bd791debfd64052ab030298a7c6b718 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 2 Aug 2015 15:19:30 +0200 Subject: [PATCH 0944/2145] [nowtv] Remove unused import --- youtube_dl/extractor/nowtv.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 11ce37168..ad938fb62 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( From 25a4c5a9ed59eca0241922363e83e61172527658 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 2 Aug 2015 15:19:57 +0200 Subject: [PATCH 0945/2145] [dailymotion:playlist] Use an iterator for the entries So that using '--playlist-end' only downloads the required pages (reported in #2175). 
--- youtube_dl/extractor/dailymotion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 85d945509..2d90b2224 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -15,7 +15,6 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, - orderedSet, parse_iso8601, str_to_int, unescapeHTML, @@ -278,7 +277,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): }] def _extract_entries(self, id): - video_ids = [] + video_ids = set() processed_urls = set() for pagenum in itertools.count(1): page_url = self._PAGE_TEMPLATE % (id, pagenum) @@ -291,12 +290,13 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): processed_urls.add(urlh.geturl()) - video_ids.extend(re.findall(r'data-xid="(.+?)"', webpage)) + for video_id in re.findall(r'data-xid="(.+?)"', webpage): + if video_id not in video_ids: + yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + video_ids.add(video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: break - return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') - for video_id in orderedSet(video_ids)] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From d7d2a9a3dbf1cef78c5085a4aab5d2f336c64cff Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 2 Aug 2015 03:28:04 +0100 Subject: [PATCH 0946/2145] [utils] restart download if server does not support byte ranges --- youtube_dl/downloader/http.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b7f144af9..b2e82cfde 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -57,6 +57,20 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + + if resume_len > 0: + 
content_range = data.headers.get('Content-Range') + if content_range: + content_range_m = re.search(r'bytes (\d+)-', content_range) + if content_range_m: + # Content-Range is correct - go on + if resume_len == int(content_range_m.group(1)): + break + + # Content-Range is invalid - wipe the file and do entire redownload + resume_len = 0 + open_mode = 'wb' + break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: From 8d5b8b477e4b1051482b21ea451f0de1ce23bce7 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 2 Aug 2015 03:58:02 +0100 Subject: [PATCH 0947/2145] [utils] import re --- youtube_dl/downloader/http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index b2e82cfde..f796ee113 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,6 +4,7 @@ import errno import os import socket import time +import re from .common import FileDownloader from ..compat import ( From c3124c3085e6a9a83ee31ace3a7d528a324c42da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:25:08 +0600 Subject: [PATCH 0948/2145] [downloader/http] Simplify --- youtube_dl/downloader/http.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index f796ee113..0862e90bb 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -58,20 +58,16 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) - if resume_len > 0: content_range = data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) - if content_range_m: - # Content-Range is correct - go on - if resume_len == int(content_range_m.group(1)): - break - + # Content-Range is correct - go on + if content_range_m and 
resume_len == int(content_range_m.group(1)): + break # Content-Range is invalid - wipe the file and do entire redownload resume_len = 0 open_mode = 'wb' - break except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: From 10eaa8ef1d2a9699052af9262aa472456548e99b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:25:40 +0600 Subject: [PATCH 0949/2145] [downloader/http] Report unable to resume --- youtube_dl/downloader/http.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 0862e90bb..2f8490f02 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -66,6 +66,7 @@ class HttpFD(FileDownloader): if content_range_m and resume_len == int(content_range_m.group(1)): break # Content-Range is invalid - wipe the file and do entire redownload + self.report_unable_to_resume() resume_len = 0 open_mode = 'wb' break From 84bc4dcb0f678f0a8c9f993e101b9769e3959f76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:27:47 +0600 Subject: [PATCH 0950/2145] [downloader/http] Clarify rationale for Content-Range check (#6426) --- youtube_dl/downloader/http.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 2f8490f02..a29f5cf31 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -58,14 +58,21 @@ class HttpFD(FileDownloader): # Establish connection try: data = self.ydl.urlopen(request) + # When trying to resume, Content-Range HTTP header of response has to be checked + # to match the value of requested Range HTTP header. 
This is due to webservers + that don't support resuming and serve a whole file with no Content-Range + set in response despite the requested Range (see + https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) if resume_len > 0: content_range = data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) - # Content-Range is correct - go on - if content_range_m and resume_len == int(content_range_m.group(1)): - break - # Content-Range is invalid - wipe the file and do entire redownload + # Content-Range is present and matches requested Range, resume is possible + if content_range_m and resume_len == int(content_range_m.group(1)): + break + # Content-Range is either not present or invalid. Assuming the remote webserver is + # trying to send the whole file, resume is not possible, so wiping the local file + # and performing entire redownload self.report_unable_to_resume() resume_len = 0 open_mode = 'wb' From 754e70cf3e74218ae5d840985fbf07bbe274332a Mon Sep 17 00:00:00 2001 From: George Brighton <george@gebn.co.uk> Date: Sun, 2 Aug 2015 19:21:10 +0100 Subject: [PATCH 0951/2145] [pornhub] Fix video url regular expression. PornHub seems to have subtly changed their JavaScript. Before, video URL strings were embedded directly in the video's `flashvars_*` object, but they are now assigned to variables of the form `player_quality_*`, which are then added to this object later under the relevant quality key. 
--- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 0b7886840..fbaa830d6 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse_unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"var player_quality_[0-9]{3}p = '([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) From 524229a2975c20887a9a71cae77132e775003537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 02:41:17 +0600 Subject: [PATCH 0952/2145] [pornhub] Improve --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index fbaa830d6..fec493046 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -81,7 +81,7 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment') - video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"var player_quality_[0-9]{3}p = '([^']+)'", webpage))) + video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage))) if webpage.find('"encrypted":true') != -1: password = compat_urllib_parse_unquote_plus( self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) From 51a575159a5a83e4477b03544f419dcf2e9ff0fa Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 2 Aug 2015 22:52:12 +0100 Subject: 
[PATCH 0953/2145] [facebook] extract uploader --- youtube_dl/extractor/facebook.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index e17bb9aea..734de4da2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -17,6 +17,8 @@ from ..utils import ( int_or_none, limit_length, urlencode_postdata, + get_element_by_id, + clean_html, ) @@ -161,6 +163,7 @@ class FacebookIE(InfoExtractor): video_title = limit_length(video_title, 80) if not video_title: video_title = 'Facebook video #%s' % video_id + uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) return { 'id': video_id, @@ -168,4 +171,5 @@ class FacebookIE(InfoExtractor): 'formats': formats, 'duration': int_or_none(video_data.get('video_duration')), 'thumbnail': video_data.get('thumbnail_src'), + 'uploader': uploader, } From 67b8a28a2f69764259cf2e90c0a3785c05c55551 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 3 Aug 2015 00:09:21 +0100 Subject: [PATCH 0954/2145] [facebook] add uploader value to the tests --- youtube_dl/extractor/facebook.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 734de4da2..178a7ca4c 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -44,6 +44,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', + 'uploader': 'Tennis on Facebook', } }, { 'note': 'Video without discernible title', @@ -52,6 +53,7 @@ class FacebookIE(InfoExtractor): 'id': '274175099429670', 'ext': 'mp4', 'title': 'Facebook video #274175099429670', + 'uploader': 'Asif Nawab Butt', }, 'expected_warnings': [ 'title' From 8de922724b8f3ad31ff7249799de371ff8a5c3ad Mon Sep 17 00:00:00 2001 From: "Sergey M." 
<dstftw@gmail.com> Date: Mon, 3 Aug 2015 05:36:17 +0600 Subject: [PATCH 0955/2145] [README.md] Clarify using cookies --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index ac54d7b67..2db3139ee 100644 --- a/README.md +++ b/README.md @@ -439,6 +439,12 @@ Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the opt youtube-dl -- -wNyEUrxzFU youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" +### How do I pass cookies to youtube-dl? + +Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. Note that cookies file must be in Mozilla/Netscape format and the first line of cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in cookies file and convert newlines if necessary to correspond your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. + +Passing cookies to youtube-dl is a good way to workaround login when particular extractor does not implement it explicitly. + ### Can you add support for this anime video site, or site which shows current movies for free? As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion to youtube-dl. 
From 47a8b7c14a085ce558db3b5a85ded850cd5df642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 3 Aug 2015 12:00:08 +0200 Subject: [PATCH 0956/2145] [mdr] Change XPath to make it work in python 2.6 (fixes #6443) The 'progressiveDownloadUrl' element is a direct child, so they should be equivalent. --- youtube_dl/extractor/mdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 5fdd19027..fc7499958 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -29,7 +29,7 @@ class MDRIE(InfoExtractor): doc = self._download_xml(domain + xmlurl, video_id) formats = [] for a in doc.findall('./assets/asset'): - url_el = a.find('.//progressiveDownloadUrl') + url_el = a.find('./progressiveDownloadUrl') if url_el is None: continue abr = int(a.find('bitrateAudio').text) // 1000 From 8f5639afcbb967f276fb8b35a24559cdcc3b6d32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 3 Aug 2015 19:37:48 +0600 Subject: [PATCH 0957/2145] [pornhub] Improve video quality regex --- youtube_dl/extractor/pornhub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index fec493046..7b0cdc41a 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -94,7 +94,7 @@ class PornHubIE(InfoExtractor): format = path.split('/')[5].split('_')[:2] format = "-".join(format) - m = re.match(r'^(?P<height>[0-9]+)P-(?P<tbr>[0-9]+)K$', format) + m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format) if m is None: height = None tbr = None From e704f87f869b98bbed56d7dd0fe27710306c8272 Mon Sep 17 00:00:00 2001 From: Niklas Haas <git@nand.wakku.to> Date: Mon, 3 Aug 2015 01:54:21 +0200 Subject: [PATCH 0958/2145] [twitch] Parse start_time from 't' (closes #6441) Eg. 
for VOD links like http://www.twitch.tv/gamesdonequick/v/9136645?t=14h29m15s --- youtube_dl/extractor/twitch.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 73ce335b7..a2b6a35aa 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -7,12 +7,15 @@ import random from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse, + compat_urllib_parse_urlparse, compat_urllib_request, ) from ..utils import ( ExtractorError, + parse_duration, parse_iso8601, ) @@ -185,7 +188,7 @@ class TwitchVodIE(TwitchItemBaseIE): _ITEM_SHORTCUT = 'v' _TEST = { - 'url': 'http://www.twitch.tv/riotgames/v/6528877', + 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', 'info_dict': { 'id': 'v6528877', 'ext': 'mp4', @@ -197,6 +200,7 @@ class TwitchVodIE(TwitchItemBaseIE): 'uploader': 'Riot Games', 'uploader_id': 'riotgames', 'view_count': int, + 'start_time': 310, }, 'params': { # m3u8 download @@ -216,6 +220,12 @@ class TwitchVodIE(TwitchItemBaseIE): item_id, 'mp4') self._prefer_source(formats) info['formats'] = formats + + parsed_url = compat_urllib_parse_urlparse(url) + query = compat_parse_qs(parsed_url.query) + if 't' in query: + info['start_time'] = parse_duration(query['t'][0]) + return info From d96d604e5311628ece0234733dbbfe73a58c8d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 3 Aug 2015 23:04:11 +0200 Subject: [PATCH 0959/2145] YoutubeDL: format spec: don't accept a bare '/' (#6124) --- test/test_YoutubeDL.py | 1 + youtube_dl/YoutubeDL.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 20f45f439..9a3c28f8c 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -311,6 +311,7 @@ class TestFormatSelection(unittest.TestCase): 
assert_syntax_error('bestvideo,,best') assert_syntax_error('+bestaudio') assert_syntax_error('bestvideo+') + assert_syntax_error('/') def test_format_filtering(self): formats = [ diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index efa3254ce..c608ff91a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -960,6 +960,8 @@ class YoutubeDL(object): selectors.append(current_selector) current_selector = None elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector second_choice = _parse_format_selection(tokens, inside_choice=True) current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) From a346b1ff57a94382e80fd4edd5a6d4b91a7cb45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 4 Aug 2015 20:44:22 +0600 Subject: [PATCH 0960/2145] [bbc] Add support for vxp-playlist-data embeds (Closes #6453) --- youtube_dl/extractor/bbc.py | 45 ++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 9a1b6e3dc..abc5a44a1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -526,6 +526,18 @@ class BBCIE(BBCCoUkIE): 'params': { 'skip_download': True, } + }, { + # single video from video playlist embedded with vxp-playlist-data JSON + 'url': 'http://www.bbc.com/news/video_and_audio/must_see/33376376', + 'info_dict': { + 'id': 'p02w6qjc', + 'ext': 'mp4', + 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', + 'duration': 56, + }, + 'params': { + 'skip_download': True, + } }, { # single video story with digitalData 'url': 'http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret', @@ -695,13 +707,36 @@ class BBCIE(BBCCoUkIE): if not medias: # Single video article (e.g. 
http://www.bbc.com/news/video_and_audio/international) - media_asset_page = self._parse_json( + media_asset = self._search_regex( + r'mediaAssetPage\.init\(\s*({.+?}), "/', + webpage, 'media asset', default=None) + if media_asset: + media_asset_page = self._parse_json(media_asset, playlist_id, fatal=False) + medias = [] + for video in media_asset_page.get('videos', {}).values(): + medias.extend(video.values()) + + if not medias: + # Multiple video playlist with single `now playing` entry (e.g. + # http://www.bbc.com/news/video_and_audio/must_see/33767813) + vxp_playlist = self._parse_json( self._search_regex( - r'mediaAssetPage\.init\(\s*({.+?}), "/', webpage, 'media asset'), + r'<script[^>]+class="vxp-playlist-data"[^>]+type="application/json"[^>]*>([^<]+)</script>', + webpage, 'playlist data'), playlist_id) - medias = [] - for video in media_asset_page.get('videos', {}).values(): - medias.extend(video.values()) + playlist_medias = [] + for item in vxp_playlist: + media = item.get('media') + if not media: + continue + playlist_medias.append(media) + # Download single video if found media with asset id matching the video id from URL + if item.get('advert', {}).get('assetId') == playlist_id: + medias = [media] + break + # Fallback to the whole playlist + if not medias: + medias = playlist_medias entries = [] for num, media_meta in enumerate(medias, start=1): From 232541df441741d3d55605f03e28ec3c34249a5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Tue, 4 Aug 2015 22:29:23 +0200 Subject: [PATCH 0961/2145] [YoutubeDL] format spec: correctly handle dashes and other unused operators 'mp4-baseline-16x9' must be handled as a single string, but the '-' was treated as an operator. 
--- test/test_YoutubeDL.py | 6 ++++++ youtube_dl/YoutubeDL.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 9a3c28f8c..0388c0bf3 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -105,6 +105,7 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection(self): formats = [ {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': TEST_URL}, + {'format_id': 'example-with-dashes', 'ext': 'webm', 'preference': 1, 'url': TEST_URL}, {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': TEST_URL}, {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': TEST_URL}, {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': TEST_URL}, @@ -136,6 +137,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], '35') + ydl = YDL({'format': 'example-with-dashes'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'example-with-dashes') + def test_format_selection_audio(self): formats = [ {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': TEST_URL}, diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c608ff91a..1446b3254 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -933,6 +933,37 @@ class YoutubeDL(object): else: filter_parts.append(string) + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the sourrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, 
last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None @@ -1111,7 +1142,7 @@ class YoutubeDL(object): stream = io.BytesIO(format_spec.encode('utf-8')) try: - tokens = list(compat_tokenize_tokenize(stream.readline)) + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) except tokenize.TokenError: raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) From 8a5601e42f6974e6694f01089b4c7e014b6a1b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 19:52:04 +0600 Subject: [PATCH 0962/2145] [lynda] Fix login (Closes #6462) --- youtube_dl/extractor/lynda.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index a00f6e5e5..39214de2f 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -17,7 +17,7 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' - _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn: true' + _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn\s*:\s*true' _ACCOUNT_CREDENTIALS_HINT = 'Use 
--username and --password options to provide lynda.com account credentials.' _NETRC_MACHINE = 'lynda' From 5b7dab2dd640c93ec0f63ca8b901e701679a4c7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:06:48 +0600 Subject: [PATCH 0963/2145] [lynda] Make login more robust --- youtube_dl/extractor/lynda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 39214de2f..deead220a 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -17,7 +17,6 @@ from ..utils import ( class LyndaBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.lynda.com/login/login.aspx' - _SUCCESSFUL_LOGIN_REGEX = r'isLoggedIn\s*:\s*true' _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' _NETRC_MACHINE = 'lynda' @@ -41,7 +40,7 @@ class LyndaBaseIE(InfoExtractor): request, None, 'Logging in as %s' % username) # Not (yet) logged in - m = re.search(r'loginResultJson = \'(?P<json>[^\']+)\';', login_page) + m = re.search(r'loginResultJson\s*=\s*\'(?P<json>[^\']+)\';', login_page) if m is not None: response = m.group('json') response_json = json.loads(response) @@ -70,7 +69,7 @@ class LyndaBaseIE(InfoExtractor): request, None, 'Confirming log in and log out from another device') - if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None: + if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): raise ExtractorError('Unable to log in') From 354b4b8604ec13ccf4bd89b9d1b77cb7246fe379 Mon Sep 17 00:00:00 2001 From: vijayanand nandam <vijay@cybrilla.com> Date: Wed, 5 Aug 2015 19:37:59 +0530 Subject: [PATCH 0964/2145] fixing xhamster file extraction --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 
b4ad513a0..9d025530f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -47,7 +47,7 @@ class XHamsterIE(InfoExtractor): def _real_extract(self, url): def extract_video_url(webpage): - mp4 = re.search(r'<video\s+.*?file="([^"]+)".*?>', webpage) + mp4 = re.search(r'file:\s+\'([^\']+)\'', webpage) if mp4 is None: raise ExtractorError('Unable to extract media URL') else: From be7a8379b47c35afe66abcc02aee597e5143b1d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:32:44 +0600 Subject: [PATCH 0965/2145] [xhamster] Make more robust --- youtube_dl/extractor/xhamster.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 9d025530f..481d79b89 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -46,12 +46,12 @@ class XHamsterIE(InfoExtractor): ] def _real_extract(self, url): - def extract_video_url(webpage): - mp4 = re.search(r'file:\s+\'([^\']+)\'', webpage) - if mp4 is None: - raise ExtractorError('Unable to extract media URL') - else: - return mp4.group(1) + def extract_video_url(webpage, name): + return self._search_regex( + [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''', + r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''', + r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''], + webpage, name, group='mp4') def is_hd(webpage): return '<div class=\'icon iconHD\'' in webpage @@ -97,7 +97,9 @@ class XHamsterIE(InfoExtractor): hd = is_hd(webpage) - video_url = extract_video_url(webpage) + format_id = 'hd' if hd else 'sd' + + video_url = extract_video_url(webpage, format_id) formats = [{ 'url': video_url, 'format_id': 'hd' if hd else 'sd', @@ -108,7 +110,7 @@ class XHamsterIE(InfoExtractor): mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url') webpage = 
self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage') if is_hd(webpage): - video_url = extract_video_url(webpage) + video_url = extract_video_url(webpage, 'hd') formats.append({ 'url': video_url, 'format_id': 'hd', From 251a44b776264c17d7799e017b856143c6cacd9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:36:37 +0600 Subject: [PATCH 0966/2145] [xhamster] Fix thumbnail extraction --- youtube_dl/extractor/xhamster.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 481d79b89..b57e7c813 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -78,7 +78,10 @@ class XHamsterIE(InfoExtractor): uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', webpage, 'uploader id', default='anonymous') - thumbnail = self._html_search_regex(r'<video\s+.*?poster="([^"]+)".*?>', webpage, 'thumbnail', fatal=False) + thumbnail = self._search_regex( + [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', + r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>', webpage, 'duration', fatal=False)) From 3e4852247744b131600ba43275ab321eb1b32bb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 20:41:40 +0600 Subject: [PATCH 0967/2145] [xhamster] Fix uploader extraction --- youtube_dl/extractor/xhamster.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index b57e7c813..06fedf840 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -22,7 +22,7 @@ class XHamsterIE(InfoExtractor): 'ext': 'mp4', 'title': 
'FemaleAgent Shy beauty takes the bait', 'upload_date': '20121014', - 'uploader_id': 'Ruseful2011', + 'uploader': 'Ruseful2011', 'duration': 893, 'age_limit': 18, } @@ -34,7 +34,7 @@ class XHamsterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Britney Spears Sexy Booty', 'upload_date': '20130914', - 'uploader_id': 'jojo747400', + 'uploader': 'jojo747400', 'duration': 200, 'age_limit': 18, } @@ -75,8 +75,9 @@ class XHamsterIE(InfoExtractor): if upload_date: upload_date = unified_strdate(upload_date) - uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)', - webpage, 'uploader id', default='anonymous') + uploader = self._html_search_regex( + r"<a href='[^']+xhamster\.com/user/[^>]+>(?P<uploader>[^<]+)", + webpage, 'uploader', default='anonymous') thumbnail = self._search_regex( [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', @@ -127,7 +128,7 @@ class XHamsterIE(InfoExtractor): 'title': title, 'description': description, 'upload_date': upload_date, - 'uploader_id': uploader_id, + 'uploader': uploader, 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, From 54a9328b205e8a2c916d59fd81bdb1ede25cf87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 21:19:52 +0600 Subject: [PATCH 0968/2145] [generic] Expand jwplayer support --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8cef61c3c..6df89f814 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1655,7 +1655,7 @@ class GenericIE(InfoExtractor): if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( - r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if not found: # Flow 
player found = filter_video(re.findall(r'''(?xs) From c71a3195afa8c2a9ed5fe0ffa56ff6c969147d91 Mon Sep 17 00:00:00 2001 From: Delon <liuxi326@qq.com> Date: Wed, 5 Aug 2015 18:22:25 +0800 Subject: [PATCH 0969/2145] [tudou] Fix extracion --- youtube_dl/extractor/tudou.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c89de5ba4..9b934cb57 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -29,6 +29,8 @@ class TudouIE(InfoExtractor): } }] + _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' + def _url_for_id(self, id, quality=None): info_url = "http://v2.tudou.com/f?id=" + str(id) if quality: @@ -76,6 +78,9 @@ class TudouIE(InfoExtractor): 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, + 'http_headers': { + 'Referer': self._PLAYER_URL, + }, } result.append(part_info) From 238755752f4f9169a1edda91067c8627afe19cce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 23:07:52 +0600 Subject: [PATCH 0970/2145] [tudou] Extract player URL from the webpage --- youtube_dl/extractor/tudou.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 9b934cb57..84fe71aef 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -30,7 +30,7 @@ class TudouIE(InfoExtractor): }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - + def _url_for_id(self, id, quality=None): info_url = "http://v2.tudou.com/f?id=" + str(id) if quality: @@ -56,6 +56,10 @@ class TudouIE(InfoExtractor): thumbnail_url = self._search_regex( r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + player_url = self._search_regex( + r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + webpage, 'player URL', default=self._PLAYER_URL) + segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 
'segments') segments = json.loads(segs_json) # It looks like the keys are the arguments that have to be passed as @@ -79,7 +83,7 @@ class TudouIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail_url, 'http_headers': { - 'Referer': self._PLAYER_URL, + 'Referer': player_url, }, } result.append(part_info) From f535ec8278c8f465b47919d3f451571ae8ccfc7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 23:08:26 +0600 Subject: [PATCH 0971/2145] [xhamster] Remove unused import --- youtube_dl/extractor/xhamster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 06fedf840..f76ee8fd4 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -4,7 +4,6 @@ import re from .common import InfoExtractor from ..utils import ( - ExtractorError, unified_strdate, str_to_int, int_or_none, From c73cdd800f0dc7b465ac0b36d338875bb80c23aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 5 Aug 2015 23:08:55 +0600 Subject: [PATCH 0972/2145] [xhamster] flake8 --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index f76ee8fd4..97315750f 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -81,7 +81,7 @@ class XHamsterIE(InfoExtractor): thumbnail = self._search_regex( [r'''thumb\s*:\s*(?P<q>["'])(?P<thumbnail>.+?)(?P=q)''', r'''<video[^>]+poster=(?P<q>["'])(?P<thumbnail>.+?)(?P=q)[^>]*>'''], - webpage, 'thumbnail', fatal=False, group='thumbnail') + webpage, 'thumbnail', fatal=False, group='thumbnail') duration = parse_duration(self._html_search_regex(r'<span>Runtime:</span> (\d+:\d+)</div>', webpage, 'duration', fatal=False)) From 51f267d9d4d26c3cd67f318a2040513946f2b4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= 
<dstftw@gmail.com> Date: Thu, 6 Aug 2015 22:01:01 +0600 Subject: [PATCH 0973/2145] [YoutubeDL:utils] Move percent encode non-ASCII URLs workaround to http_request and simplify (Closes #6457) --- youtube_dl/YoutubeDL.py | 21 --------------------- youtube_dl/utils.py | 20 ++++++++++++++++++++ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 1446b3254..079d42ce8 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1860,27 +1860,6 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - req_is_string = isinstance(req, compat_basestring) - url = req if req_is_string else req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - if req_is_string: - req = url_escaped - else: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 78dc2b449..c7db75f80 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -651,6 +651,26 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): return ret def http_request(self, req): + # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not + # always respected by 
websites, some tend to give out URLs with non percent-encoded + # non-ASCII characters (see telemb.py, ard.py [#3412]) + # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) + # To work around aforementioned issue we will replace request's original URL with + # percent-encoded one + # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09) + # the code of this workaround has been moved here from YoutubeDL.urlopen() + url = req.get_full_url() + url_escaped = escape_url(url) + + # Substitute URL if any change after escaping + if url != url_escaped: + req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + new_req = req_type( + url_escaped, data=req.data, headers=req.headers, + origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) + new_req.timeout = req.timeout + req = new_req + for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 # The dict keys are capitalized because of this bug by urllib From bd690a9f9368095f561184778fb2f3ef12c66342 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 6 Aug 2015 22:01:31 +0600 Subject: [PATCH 0974/2145] [southpark:de] Add test for non-ASCII in URLs --- youtube_dl/extractor/southpark.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 7fb165a87..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -45,6 +45,14 @@ class SouthParkDeIE(SouthParkIE): 'title': 'The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', }, + }, { + # non-ASCII characters in initial URL + 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', + 'playlist_count': 4, + }, { + # non-ASCII characters in redirect URL + 'url': 
'http://www.southpark.de/alle-episoden/s18e09', + 'playlist_count': 4, }] From 4f34cdb0a87a506d25a352ff265678c86cb9b979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 6 Aug 2015 23:56:44 +0600 Subject: [PATCH 0975/2145] [southpark:de] Skip test --- youtube_dl/extractor/southpark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 87b650468..ad63a8785 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -53,6 +53,7 @@ class SouthParkDeIE(SouthParkIE): # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', 'playlist_count': 4, + 'skip': 'Broken python 3', }] From 671302b5c0ff8cefa5f26e599423ef7799b19631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 00:08:11 +0600 Subject: [PATCH 0976/2145] [YoutubeDL] Remove unused imports --- youtube_dl/YoutubeDL.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 079d42ce8..cad6b026e 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -28,7 +28,6 @@ if os.name == 'nt': import ctypes from .compat import ( - compat_basestring, compat_cookiejar, compat_expanduser, compat_get_terminal_size, @@ -40,7 +39,6 @@ from .compat import ( compat_urllib_request, ) from .utils import ( - escape_url, ContentTooShortError, date_from_str, DateRange, @@ -51,7 +49,6 @@ from .utils import ( ExtractorError, format_bytes, formatSeconds, - HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, From cd6b555e19c601d575679dd29da0080eda7f8890 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 6 Aug 2015 19:17:50 +0100 Subject: [PATCH 0977/2145] [dcn] add origin to api request and fix the test and check with flake8 --- youtube_dl/extractor/dcn.py | 30 ++++++++++++++++++++---------- 1 file 
changed, 20 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index f76ebda9e..d44e8cef0 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,4 +1,9 @@ +# coding: utf-8 +from __future__ import unicode_literals + from .common import InfoExtractor +from ..compat import compat_urllib_request + class DcnIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' @@ -9,24 +14,29 @@ class DcnIE(InfoExtractor): 'id': '17375', 'ext': 'm3u8', 'title': 'رحلة العمر : الحلقة 1', - 'description': '"في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة1"', + 'description': 'في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة\n1', 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', 'duration': '2041' - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url) - json_data = self._download_json( - 'http://admin.mangomolo.com/analytics/index.php/plus/video?id='+video_id, - video_id + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=' + video_id, + headers={'Origin': 'http://www.dcndigital.ae'} ) - title = json_data['title_ar']; - thumbnail = 'http://admin.mangomolo.com/analytics/'+json_data['img']; - duration = json_data['duration']; - description = json_data['description_ar']; + json_data = self._download_json(request, video_id) + title = json_data['title_ar'] + thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data['img'] + duration = json_data['duration'] + description = json_data['description_ar'] webpage = 
self._download_webpage( - 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id='+json_data['id']+'&user_id='+json_data['user_id']+'&countries=Q0M=&w=100%&h=100%&filter=DENY&signature='+json_data['signature'], + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id=' + json_data['id'] + '&user_id=' + json_data['user_id'] + '&countries=Q0M=&w=100%&h=100%&filter=DENY&signature=' + json_data['signature'], video_id ) m3u8_url = self._html_search_regex( From 3be3c622dc1d3d7b92c5268a079d202a9f2b0a5a Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 6 Aug 2015 19:37:45 +0100 Subject: [PATCH 0978/2145] [shahid] generic errors handling and check with flake8 --- youtube_dl/extractor/shahid.py | 42 ++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index b3b45da24..57c159833 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + from .common import InfoExtractor from ..utils import ( js_to_json, @@ -5,6 +8,7 @@ from ..utils import ( int_or_none ) + class ShahidIE(InfoExtractor): _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?' 
_TESTS = [ @@ -23,7 +27,7 @@ class ShahidIE(InfoExtractor): } }, { - #shahid plus subscriber only + # shahid plus subscriber only 'url': 'https://shahid.mbc.net/ar/series/90497/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011.html', 'only_matching': True } @@ -32,31 +36,15 @@ class ShahidIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + player_info = '' - for line in self._search_regex( 'var flashvars = ({[^}]+})', webpage, 'flashvars').splitlines(): + for line in self._search_regex('var flashvars = ({[^}]+})', webpage, 'flashvars').splitlines(): if '+' not in line and '(' not in line and ')' not in line: player_info += line player_info = self._parse_json(js_to_json(player_info), video_id) video_id = player_info['id'] player_type = player_info['playerType'] - video_info = self._download_json( - player_info['url'] + '/' + player_type + '/' + video_id + - '?apiKey=sh%40hid0nlin3&hash=b2wMCTHpSmyxGqQjJFOycRmLSex%2BBpTK%2Fooxy6vHaqs%3D', - video_id - )['data'] - if video_info['error']: - for error in video_info['error']: - raise ExtractorError(error) - video_info = video_info[player_type] - if video_info.get('availabilities').get('plus'): - raise ExtractorError('plus members only') - title = video_info['title'] - thumbnail = video_info.get('thumbnailUrl') - categories = [category['name'] for category in video_info.get('genres')] - description = video_info.get('description') - duration = int_or_none(video_info.get('duration')) - player_json_data = self._download_json( 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-' + video_id + '.type-' + player_info['type'] + '.html', video_id @@ -66,8 +54,22 @@ class ShahidIE(InfoExtractor): else: for error in player_json_data['error'].values(): raise ExtractorError(error) - return formats = self._extract_m3u8_formats(m3u8_url, video_id) + + video_info = self._download_json( + player_info['url'] + '/' + player_type + '/' + video_id + 
'?apiKey=sh%40hid0nlin3&hash=b2wMCTHpSmyxGqQjJFOycRmLSex%2BBpTK%2Fooxy6vHaqs%3D', + video_id + )['data'] + if video_info.get('error'): + for error in video_info['error']: + raise ExtractorError(error) + video_info = video_info[player_type] + title = video_info['title'] + thumbnail = video_info.get('thumbnailUrl') + categories = [category['name'] for category in video_info.get('genres')] + description = video_info.get('description') + duration = int_or_none(video_info.get('duration')) + return { 'id': video_id, 'title': title, From 5a4d9ddb218e761fe7ab15d197690e0cb132a536 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 01:26:40 +0600 Subject: [PATCH 0979/2145] [utils] Percent-encode redirect URL of Location header (Closes #6457) --- youtube_dl/utils.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c7db75f80..e265c7574 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -715,6 +715,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 + if 300 <= resp.code < 400: + location = resp.headers.get('Location') + if location: + # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 + if sys.version_info >= (3, 0): + location = location.encode('iso-8859-1').decode('utf-8') + location_escaped = escape_url(location) + if location != location_escaped: + del resp.headers['Location'] + resp.headers['Location'] = location_escaped return resp https_request = http_request From 9663bd3abb78911bddad75742bd41006677d628e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 01:27:07 +0600 Subject: [PATCH 0980/2145] [southpark:de] Enable non-ASCII redirect 
URL test --- youtube_dl/extractor/southpark.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index ad63a8785..87b650468 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -53,7 +53,6 @@ class SouthParkDeIE(SouthParkIE): # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', 'playlist_count': 4, - 'skip': 'Broken python 3', }] From 3eb5fdb58112032a9831eda1d2e3b8a151ea217f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 6 Aug 2015 22:55:43 +0200 Subject: [PATCH 0981/2145] release 2015.08.06 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fa157cadb..b81d5e658 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.07.28' +__version__ = '2015.08.06' From 430b092a5f59fbe407b92ebcb0c42b9f7062a334 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 6 Aug 2015 23:06:21 +0200 Subject: [PATCH 0982/2145] release 2015.08.06.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b81d5e658..9f209499c 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.06' +__version__ = '2015.08.06.1' From 6d30cf04db9c9662dbb30c2490e24eb5c6dca4c3 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 7 Aug 2015 10:01:18 +0100 Subject: [PATCH 0983/2145] [dcn] fix type and key errors --- youtube_dl/extractor/dcn.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index d44e8cef0..22ff35b56 100644 --- a/youtube_dl/extractor/dcn.py 
+++ b/youtube_dl/extractor/dcn.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_request +from ..utils import int_or_none class DcnIE(InfoExtractor): @@ -16,7 +17,7 @@ class DcnIE(InfoExtractor): 'title': 'رحلة العمر : الحلقة 1', 'description': 'في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة\n1', 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', - 'duration': '2041' + 'duration': 2041 }, 'params': { # m3u8 download @@ -32,9 +33,9 @@ class DcnIE(InfoExtractor): ) json_data = self._download_json(request, video_id) title = json_data['title_ar'] - thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data['img'] - duration = json_data['duration'] - description = json_data['description_ar'] + thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data.get('img') + duration = int_or_none(json_data.get('duration')) + description = json_data.get('description_ar') webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id=' + json_data['id'] + '&user_id=' + json_data['user_id'] + '&countries=Q0M=&w=100%&h=100%&filter=DENY&signature=' + json_data['signature'], video_id From 8002ac9e0a88d918735c06599dbf8f2005f79666 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 19:04:44 +0600 Subject: [PATCH 0984/2145] [nowtv] Add support for .at TLD --- youtube_dl/extractor/nowtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index ad938fb62..78e8851c0 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -14,7 +14,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?nowtv\.de/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' + _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl From acc1adbe7ab93657cd4d303cee1fba4464931a50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 19:50:54 +0600 Subject: [PATCH 0985/2145] [nowtv] Add support for .ch TLD --- youtube_dl/extractor/nowtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 78e8851c0..fc21d8e3f 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -14,7 +14,7 @@ from ..utils import ( class NowTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' + _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<id>.+?)/(?:player|preview)' _TESTS = [{ # rtl From 0f422256d6eea5aff062a4c35d7434cd118c7a0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 7 Aug 2015 19:51:09 +0600 Subject: [PATCH 0986/2145] [nowtv] Add .at test --- youtube_dl/extractor/nowtv.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index fc21d8e3f..66c627bec 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -127,6 +127,9 @@ class NowTVIE(InfoExtractor): }, { 'url': 'http://www.nowtv.de/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview', 'only_matching': True, + }, { + 'url': 'http://www.nowtv.at/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit/preview?return=/rtl/bauer-sucht-frau/die-neuen-bauern-und-eine-hochzeit', + 'only_matching': True, }] def _real_extract(self, url): From f94639fadf91312bf3365802981f506ecba698dc Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 00:06:03 +0600 Subject: [PATCH 0987/2145] [dcn] Improve --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/dcn.py | 78 ++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index eb8ef1fe3..922d9b3d8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -118,7 +118,7 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE -from .dcn import DcnIE +from .dcn import DCNIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 22ff35b56..b98a6c032 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -2,22 +2,30 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_request -from ..utils import int_or_none +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, +) +from ..utils import ( + int_or_none, + parse_iso8601, +) -class DcnIE(InfoExtractor): +class DCNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' 
_TEST = { 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', 'info_dict': { 'id': '17375', - 'ext': 'm3u8', + 'ext': 'mp4', 'title': 'رحلة العمر : الحلقة 1', - 'description': 'في هذه الحلقة من برنامج رحلة العمر يقدّم الدكتور عمر عبد الكافي تبسيطاً لمناسك الحج والعمرة ويجيب مباشرة على استفسارات حجاج بيت الله الحرام بخصوص مناسك الحج والعمرة\n1', - 'thumbnail': 'http://admin.mangomolo.com/analytics/uploads/71/images/media/2/2cefc09d7bec80afa754682f40e49503.jpg', - 'duration': 2041 + 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 2041, + 'timestamp': 1227504126, + 'upload_date': '20081124', }, 'params': { # m3u8 download @@ -27,30 +35,50 @@ class DcnIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + request = compat_urllib_request.Request( - 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=' + video_id, - headers={'Origin': 'http://www.dcndigital.ae'} - ) - json_data = self._download_json(request, video_id) - title = json_data['title_ar'] - thumbnail = 'http://admin.mangomolo.com/analytics/' + json_data.get('img') - duration = int_or_none(json_data.get('duration')) - description = json_data.get('description_ar') + 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + video = self._download_json(request, video_id) + title = video.get('title_en') or video['title_ar'] + webpage = self._download_webpage( - 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?id=' + json_data['id'] + '&user_id=' + json_data['user_id'] + '&countries=Q0M=&w=100%&h=100%&filter=DENY&signature=' + json_data['signature'], - video_id - ) - m3u8_url = self._html_search_regex( - r'file:\s*"([^"]+)', - webpage, - 'm3u8_url' - ) - formats = self._extract_m3u8_formats(m3u8_url, video_id) + 
'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + + compat_urllib_parse.urlencode({ + 'id': video['id'], + 'user_id': video['user_id'], + 'signature': video['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), video_id) + + m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + rtsp_url = self._search_regex( + r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + + img = video.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video.get('duration')) + description = video.get('description_en') or video.get('description_ar') + timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') + return { 'id': video_id, 'title': title, + 'description': description, 'thumbnail': thumbnail, 'duration': duration, - 'description': description, + 'timestamp': timestamp, 'formats': formats, } From 4a7434d0b09e14b773c2d278c8299efa6225b84e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 00:19:40 +0600 Subject: [PATCH 0988/2145] [dcn] Simplify _VALID_URL --- youtube_dl/extractor/dcn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index b98a6c032..82261e25c 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -13,7 +13,7 @@ from ..utils import ( class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)/?' 
+ _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', 'info_dict': From fd5d8270dcd6d8baada3390a4a1cae5bdbcb6da4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 01:10:41 +0600 Subject: [PATCH 0989/2145] [clipfish] Fix extraction, minimize requests, get rid of drm hds, extract m3u8 and more metadata --- youtube_dl/extractor/clipfish.py | 56 ++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 09dfaac60..7af903571 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,18 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, int_or_none, js_to_json, - determine_ext, + parse_iso8601, + remove_end, ) class ClipfishIE(InfoExtractor): - IE_NAME = 'clipfish' - - _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', 'md5': '79bc922f3e8a9097b3d68a93780fd475', @@ -20,35 +21,48 @@ class ClipfishIE(InfoExtractor): 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', + 'timestamp': 1370938118, + 'upload_date': '20130611', 'duration': 82, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_info = self._parse_json( - js_to_json(self._html_search_regex('var videoObject = ({[^}]+?})', webpage, 'videoObject')), - video_id - ) - info_url = self._parse_json( - 
js_to_json(self._html_search_regex('var globalFlashvars = ({[^}]+?})', webpage, 'globalFlashvars')), - video_id - )['data'] - doc = self._download_xml( - info_url, video_id, note='Downloading info page') - title = doc.find('title').text - video_url = doc.find('filename').text - thumbnail = doc.find('imageurl').text - duration = int_or_none(video_info['length']) - formats = [{'url': video_info['videourl']},{'url': video_url}] + webpage = self._download_webpage(url, video_id) + + video_info = self._parse_json( + js_to_json(self._html_search_regex( + '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), + video_id) + + formats = [] + for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.append({ + 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), + 'ext': 'mp4', + 'format_id': 'hls', + }) + else: + formats.append({ + 'url': video_url, + 'format_id': ext, + }) self._sort_formats(formats) + title = remove_end(self._og_search_title(webpage), ' - Video') + thumbnail = self._og_search_thumbnail(webpage) + duration = int_or_none(video_info.get('length')) + timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date')) + return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': timestamp, } From 8a37aa1517ccc474b3e2831b77e48534cb8ed47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 01:55:59 +0600 Subject: [PATCH 0990/2145] [extractor/generic] Expand ooyala regex (Closes #6485) --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6df89f814..649c0bce6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1320,7 +1320,7 @@ class GenericIE(InfoExtractor): return 
self.url_result(mobj.group('url')) # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) From bf94d763ba73e09fd77d25110c7219254b63c786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 02:00:49 +0600 Subject: [PATCH 0991/2145] [extractor/generic] Add test for #6485 --- youtube_dl/extractor/generic.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 649c0bce6..469909a51 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -236,6 +236,19 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, + { + # ooyala video embedded with http://player.ooyala.com/iframe.js + 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', + 'info_dict': { + 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', + 'ext': 'mp4', + 'title': '"Steve Jobs: Man in the Machine" trailer', + 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + }, + 'params': { + 'skip_download': True, + }, + }, # multiple ooyala embeds on SBN network websites { 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', From c29458f3ec77072e9c17169b78871bf4473134d6 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 7 Aug 2015 21:38:50 +0100 Subject: [PATCH 0992/2145] [shahid] change the tests --- youtube_dl/extractor/shahid.py | 12 ++++++------ 1 
file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 57c159833..b2050525e 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -13,13 +13,13 @@ class ShahidIE(InfoExtractor): _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?' _TESTS = [ { - 'url': 'https://shahid.mbc.net/ar/episode/108084/%D8%AE%D9%88%D8%A7%D8%B7%D8%B1-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-11-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', + 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', 'info_dict': { - 'id': '108084', + 'id': '90574', 'ext': 'm3u8', - 'title': 'خواطر الموسم 11 الحلقة 1', - 'description': 'بسم الله', - 'duration': 1166, + 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', + 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', + 'duration': 2972, }, 'params': { # m3u8 download @@ -28,7 +28,7 @@ class ShahidIE(InfoExtractor): }, { # shahid plus subscriber only - 'url': 'https://shahid.mbc.net/ar/series/90497/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011.html', + 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', 'only_matching': True } ] From e0ac521438218e978b9c4bbcd92cfc2d5fef79cb Mon Sep 17 00:00:00 2001 From: vijayanand nandam <vijay@cybrilla.com> Date: Thu, 6 Aug 2015 22:42:58 +0530 Subject: [PATCH 0993/2145] adding support for axel download manager --- youtube_dl/downloader/external.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 1d5cc9904..30699934b 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -83,6 +83,16 @@ class 
CurlFD(ExternalFD): return cmd +class AxelFD(ExternalFD): + def _make_cmd(self, tmpfilename, info_dict): + cmd = [self.exe, '-o', tmpfilename] + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] + cmd += self._configuration_args() + cmd += ['--', info_dict['url']] + return cmd + + class WgetFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] From 5b0c40da24b5ddb789428de731e02ac8759a363c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 03:36:29 +0600 Subject: [PATCH 0994/2145] [extractor/common] Expand meta regex --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dc5080504..507ea5ec0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -636,7 +636,7 @@ class InfoExtractor(object): @staticmethod def _meta_regex(prop): return r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1) + (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1) [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): From 3550821fb4ca2f0e47542a7fa16b6543b06df724 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 03:38:55 +0600 Subject: [PATCH 0995/2145] [periscope] Add extractor (Closes #5850, closes #6459) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/periscope.py | 66 +++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 youtube_dl/extractor/periscope.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 922d9b3d8..bd86a5be2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -432,6 +432,7 @@ from .orf import ( from .parliamentliveuk 
import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE +from .periscope import PeriscopeIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py new file mode 100644 index 000000000..5219e1a75 --- /dev/null +++ b/youtube_dl/extractor/periscope.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + unescapeHTML, +) + + +class PeriscopeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', + 'md5': '65b57957972e503fcbbaeed8f4fa04ca', + 'info_dict': { + 'id': '56102209', + 'ext': 'mp4', + 'title': 'Bec Boop - 🚠✈️🇬🇧 Fly above #London in Emirates Air Line cable car at night 🇬🇧✈️🚠 #BoopScope 🎀💗', + 'timestamp': 1438978559, + 'upload_date': '20150807', + 'uploader': 'Bec Boop', + 'uploader_id': '1465763', + }, + 'skip': 'Expires in 24 hours', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + replay = self._download_json( + 'https://api.periscope.tv/api/v2/getAccessPublic?token=%s' % video_id, video_id) + + video_url = replay['replay_url'] + + webpage = self._download_webpage(url, video_id) + + broadcast_data = self._parse_json( + unescapeHTML(self._html_search_meta( + 'broadcast-data', webpage, 'broadcast data', fatal=True)), + video_id) + + broadcast = broadcast_data['broadcast'] + status = broadcast['status'] + + uploader = broadcast.get('user_display_name') or broadcast_data.get('user', {}).get('display_name') + uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') + + title = '%s - %s' % (uploader, status) if uploader else status + timestamp = parse_iso8601(broadcast.get('created_at')) 
+ + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'url': video_url, + 'ext': 'mp4', + 'protocol': 'm3u8_native', + 'title': title, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'thumbnails': thumbnails, + } From 621d6a9516e0f9cd8c45e12904f4d4b7615e7fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 04:00:52 +0600 Subject: [PATCH 0996/2145] [periscope] Switch to API for broadcast data --- youtube_dl/extractor/periscope.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 5219e1a75..11648a511 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -25,21 +25,17 @@ class PeriscopeIE(InfoExtractor): 'skip': 'Expires in 24 hours', } + def _call_api(self, method, token): + return self._download_json( + 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token) + def _real_extract(self, url): - video_id = self._match_id(url) - - replay = self._download_json( - 'https://api.periscope.tv/api/v2/getAccessPublic?token=%s' % video_id, video_id) + token = self._match_id(url) + replay = self._call_api('getAccessPublic', token) video_url = replay['replay_url'] - webpage = self._download_webpage(url, video_id) - - broadcast_data = self._parse_json( - unescapeHTML(self._html_search_meta( - 'broadcast-data', webpage, 'broadcast data', fatal=True)), - video_id) - + broadcast_data = self._call_api('getBroadcastPublic', token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -54,7 +50,7 @@ class PeriscopeIE(InfoExtractor): } for image in ('image_url', 'image_url_small') if broadcast.get(image)] return { - 'id': broadcast.get('id') or video_id, + 'id': broadcast.get('id') or token, 'url': 
video_url, 'ext': 'mp4', 'protocol': 'm3u8_native', From 1e83741c9a5d67e8bbe65510d41b558361496fe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 05:33:53 +0600 Subject: [PATCH 0997/2145] [periscope] Add support for running streams --- youtube_dl/extractor/periscope.py | 34 +++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 11648a511..de53b752d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -2,13 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - parse_iso8601, - unescapeHTML, +from ..compat import ( + compat_urllib_parse, + compat_urllib_request, ) +from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): + IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' _TEST = { 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -32,9 +34,6 @@ class PeriscopeIE(InfoExtractor): def _real_extract(self, url): token = self._match_id(url) - replay = self._call_api('getAccessPublic', token) - video_url = replay['replay_url'] - broadcast_data = self._call_api('getBroadcastPublic', token) broadcast = broadcast_data['broadcast'] status = broadcast['status'] @@ -43,20 +42,37 @@ class PeriscopeIE(InfoExtractor): uploader_id = broadcast.get('user_id') or broadcast_data.get('user', {}).get('id') title = '%s - %s' % (uploader, status) if uploader else status + state = broadcast.get('state').lower() + if state == 'running': + title = self._live_title(title) timestamp = parse_iso8601(broadcast.get('created_at')) thumbnails = [{ 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + stream = self._call_api('getAccessPublic', token) + + formats = [] + for format_id in 
('replay', 'rtmp', 'hls', 'https_hls'): + video_url = stream.get(format_id + '_url') + if not video_url: + continue + f = { + 'url': video_url, + 'ext': 'flv' if format_id == 'rtmp' else 'mp4', + } + if format_id != 'rtmp': + f['protocol'] = 'm3u8_native' if state == 'ended' else 'm3u8' + formats.append(f) + self._sort_formats(formats) + return { 'id': broadcast.get('id') or token, - 'url': video_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native', 'title': title, 'timestamp': timestamp, 'uploader': uploader, 'uploader_id': uploader_id, 'thumbnails': thumbnails, + 'formats': formats, } From 428e4e4a850df81031e8267dddf759da605639e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 05:37:38 +0600 Subject: [PATCH 0998/2145] [quickscope] Add extractor --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/periscope.py | 21 +++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd86a5be2..e38e77a27 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -432,7 +432,10 @@ from .orf import ( from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeIE, + QuickscopeIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index de53b752d..578b53a24 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -76,3 +76,24 @@ class PeriscopeIE(InfoExtractor): 'thumbnails': thumbnails, 'formats': formats, } + + +class QuickscopeIE(InfoExtractor): + IE_DESC = 'Quisck Scope' + _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)' + _TEST = { + 'url': 
'https://watchonperiscope.com/broadcast/56180087', + 'only_matching': True, + } + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + request = compat_urllib_request.Request( + 'https://watchonperiscope.com/api/accessChannel', compat_urllib_parse.urlencode({ + 'broadcast_id': broadcast_id, + 'entry_ticket': '', + 'from_push': 'false', + 'uses_sessions': 'true', + }).encode('utf-8')) + return self.url_result( + self._download_json(request, broadcast_id)['share_url'], 'Periscope') From b2f82948ee5eadc483c01dc589b82426bb32ba68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 05:40:41 +0600 Subject: [PATCH 0999/2145] [quickscope] Fix typo --- youtube_dl/extractor/periscope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 578b53a24..8ad936758 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -79,7 +79,7 @@ class PeriscopeIE(InfoExtractor): class QuickscopeIE(InfoExtractor): - IE_DESC = 'Quisck Scope' + IE_DESC = 'Quick Scope' _VALID_URL = r'https?://watchonperiscope\.com/broadcast/(?P<id>\d+)' _TEST = { 'url': 'https://watchonperiscope.com/broadcast/56180087', From 59e89e62d7b45554cef502dc4986f35618110679 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 8 Aug 2015 12:59:10 +0100 Subject: [PATCH 1000/2145] [shahid] add default fallbacks for extracting api vars --- youtube_dl/extractor/shahid.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index b2050525e..399140189 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -33,20 +33,30 @@ class ShahidIE(InfoExtractor): } ] + _api_vars = { + 'type': 'player', + 'url': 'http://api.shahid.net/api/v1_1', + 'playerType': 'episode', + } + def 
_real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_info = '' - for line in self._search_regex('var flashvars = ({[^}]+})', webpage, 'flashvars').splitlines(): - if '+' not in line and '(' not in line and ')' not in line: - player_info += line - player_info = self._parse_json(js_to_json(player_info), video_id) - video_id = player_info['id'] - player_type = player_info['playerType'] + flash_vars = self._search_regex('var flashvars = ({[^}]+})', webpage, 'flashvars', None) + if flash_vars is not None: + for line in flash_vars.splitlines(): + if '+' not in line and '(' not in line and ')' not in line: + player_info += line + player_info = self._parse_json(player_info, video_id, js_to_json, False) + if player_info is not None: + for key in self._api_vars: + if key in player_info: + self._api_vars[key] = player_info[key] player_json_data = self._download_json( - 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-' + video_id + '.type-' + player_info['type'] + '.html', + 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-' + video_id + '.type-' + self._api_vars['type'] + '.html', video_id )['data'] if 'url' in player_json_data: @@ -57,13 +67,13 @@ class ShahidIE(InfoExtractor): formats = self._extract_m3u8_formats(m3u8_url, video_id) video_info = self._download_json( - player_info['url'] + '/' + player_type + '/' + video_id + '?apiKey=sh%40hid0nlin3&hash=b2wMCTHpSmyxGqQjJFOycRmLSex%2BBpTK%2Fooxy6vHaqs%3D', + self._api_vars['url'] + '/' + self._api_vars['playerType'] + '/' + video_id + '?apiKey=sh%40hid0nlin3&hash=b2wMCTHpSmyxGqQjJFOycRmLSex%2BBpTK%2Fooxy6vHaqs%3D', video_id )['data'] if video_info.get('error'): for error in video_info['error']: raise ExtractorError(error) - video_info = video_info[player_type] + video_info = video_info[self._api_vars['playerType']] title = video_info['title'] thumbnail = video_info.get('thumbnailUrl') categories = [category['name'] for category in 
video_info.get('genres')] From 154655a85ae8b7740aa9fe7821544050fd65641b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 19:21:05 +0600 Subject: [PATCH 1001/2145] [downloader/external] Respect --no-check-certificate for wget --- youtube_dl/downloader/external.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 30699934b..07ce59f7d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -51,6 +51,9 @@ class ExternalFD(FileDownloader): return [] return [command_option, source_address] + def _no_check_certificate(self, command_option): + return [command_option] if self.params.get('nocheckcertificate', False) else [] + def _configuration_args(self, default=[]): ex_args = self.params.get('external_downloader_args') if ex_args is None: @@ -99,6 +102,7 @@ class WgetFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._source_address('--bind-address') + cmd += self._no_check_certificate('--no-check-certificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd From b465083f45e63fe8aeb0255b5cea7dfbf0770a2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 21:27:10 +0600 Subject: [PATCH 1002/2145] [sexykarma] Fix test --- youtube_dl/extractor/sexykarma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py index 6446d26dc..e33483674 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/sexykarma.py @@ -29,6 +29,7 @@ class SexyKarmaIE(InfoExtractor): 'view_count': int, 'comment_count': int, 'categories': list, + 'age_limit': 18, } }, { 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', From b61b7787cbef408154695bbb9f5c3d29a70fdd38 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 21:30:57 +0600 Subject: [PATCH 1003/2145] [91porn] Extract age limit --- youtube_dl/extractor/porn91.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 72d1b2718..3e15533e9 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -22,6 +22,7 @@ class Porn91IE(InfoExtractor): 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', 'ext': 'mp4', 'duration': 431, + 'age_limit': 18, } } @@ -68,4 +69,5 @@ class Porn91IE(InfoExtractor): 'url': video_url, 'duration': duration, 'comment_count': comment_count, + 'age_limit': self._rta_search(webpage), } From 8e2b1be12791b4e62c463562b570661e7b2c5852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 21:42:50 +0600 Subject: [PATCH 1004/2145] [test/helper] Make age_limit checkable field --- test/helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/helper.py b/test/helper.py index c8b34654d..cb6eec8d9 100644 --- a/test/helper.py +++ b/test/helper.py @@ -160,7 +160,7 @@ def expect_info_dict(self, got_dict, expected_dict): # Are checkable fields missing from the test case definition? 
test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) for key, value in got_dict.items() - if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) + if value and key in ('id', 'title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location', 'age_limit')) missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) if missing_keys: def _repr(v): From 18c3281f9e1e32e00c778b149137fc91accb3b1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 21:43:20 +0600 Subject: [PATCH 1005/2145] [24video] Fix test --- youtube_dl/extractor/fourtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index b2284ab01..3bb4f6239 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -32,6 +32,7 @@ class FourTubeIE(InfoExtractor): 'view_count': int, 'like_count': int, 'categories': list, + 'age_limit': 18, } } From 464e792496665b2e3dcabf5df43a45604673730a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 21:51:21 +0600 Subject: [PATCH 1006/2145] [vpro] Override npo IE_NAME --- youtube_dl/extractor/npo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 0c2d02c10..eb12fb810 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -407,6 +407,7 @@ class NPORadioFragmentIE(InfoExtractor): class VPROIE(NPOIE): + IE_NAME = 'vpro' _VALID_URL = r'https?://(?:www\.)?(?:tegenlicht\.)?vpro\.nl/(?:[^/]+/){2,}(?P<id>[^/]+)\.html' _TESTS = [ From d7bb8884afc8651b0ad86046dcd56a5330c98dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 21:58:24 +0600 Subject: [PATCH 1007/2145] [break] Add 
age_limit to test --- youtube_dl/extractor/breakcom.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 809287d14..aa08051b1 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -18,6 +18,7 @@ class BreakIE(InfoExtractor): 'id': '2468056', 'ext': 'mp4', 'title': 'When Girls Act Like D-Bags', + 'age_limit': 13, } }, { 'url': 'http://www.break.com/video/ugc/baby-flex-2773063', From 9f2e7c2f34c48942a2a3e55532dd0d0ef8ed4d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 22:04:48 +0600 Subject: [PATCH 1008/2145] [ok] Add age_limit to tests --- youtube_dl/extractor/odnoklassniki.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 215ffe87b..e5fd1ba04 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -25,6 +25,7 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader_id': '330537914540', 'uploader': 'Виталий Добровольский', 'like_count': int, + 'age_limit': 0, }, }, { # metadataUrl @@ -38,6 +39,7 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader_id': '534380003155', 'uploader': 'Андрей Мещанинов', 'like_count': int, + 'age_limit': 0, }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', From 887e9bc7b561f9b2b97dec8f99f9c04392d95d40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 22:08:54 +0600 Subject: [PATCH 1009/2145] [ok] Update tests --- youtube_dl/extractor/odnoklassniki.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index e5fd1ba04..003d27de7 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -16,12 +16,13 @@ class OdnoklassnikiIE(InfoExtractor): _TESTS 
= [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '8e24ad2da6f387948e7a7d44eb8668fe', + 'md5': '6ba728d85d60aa2e6dd37c9e70fdc6bc', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', 'title': 'Культура меняет нас (прекрасный ролик!))', 'duration': 100, + 'upload_date': '20141207', 'uploader_id': '330537914540', 'uploader': 'Виталий Добровольский', 'like_count': int, @@ -36,8 +37,9 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'title': 'Девушка без комплексов ...', 'duration': 191, + 'upload_date': '20150518', 'uploader_id': '534380003155', - 'uploader': 'Андрей Мещанинов', + 'uploader': '☭ Андрей Мещанинов ☭', 'like_count': int, 'age_limit': 0, }, From c8d1be772daa496759bd85cb95c4ec799294c7f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 8 Aug 2015 22:11:06 +0600 Subject: [PATCH 1010/2145] [rutube] Add age_limit to test --- youtube_dl/extractor/rutube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 5b1c3577a..d94dc7399 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -30,6 +30,7 @@ class RutubeIE(InfoExtractor): 'uploader': 'NTDRussian', 'uploader_id': '29790', 'upload_date': '20131016', + 'age_limit': 0, }, 'params': { # It requires ffmpeg (m3u8 download) From 08df685fe7764ef9f7dc271075340e4effc5e621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 08:51:37 +0600 Subject: [PATCH 1011/2145] [videolectures] Fix _VALID_URL for test_no_duplicates to pass --- youtube_dl/extractor/videolecturesnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 24584dc80..ef2da5632 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -12,7 +12,7 @@ from ..utils import ( class 
VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$' IE_NAME = 'videolectures.net' _TEST = { From 12bb392a0ff8adbde2ced75b0c4976d0aabc7f4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 17:10:40 +0600 Subject: [PATCH 1012/2145] [vimeo] Fix password protected videos (Closes #6507) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 10d6745af..4c4e3c72a 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -203,7 +203,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = url.replace('http://', 'https://') password_request = compat_urllib_request.Request(url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Cookie', 'xsrft=%s' % token) + password_request.add_header('Referer', url) return self._download_webpage( password_request, video_id, 'Verifying the password', 'Wrong password') From 8d6765cf48138cc44fdbaee4e8c7a199ae348bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:07:18 +0600 Subject: [PATCH 1013/2145] [extractor/generic] Add generic support for xspf playist extraction --- youtube_dl/extractor/common.py | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index def6caa0d..e201ea6db 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -39,6 +39,8 @@ from ..utils import ( sanitize_filename, unescapeHTML, url_basename, + xpath_text, + xpath_with_ns, ) @@ -1142,6 +1144,45 @@ class InfoExtractor(object): }) return subtitles + def _extract_xspf_playlist(self, 
playlist_url, playlist_id): + playlist = self._download_xml( + playlist_url, playlist_id, 'Downloading xpsf playlist', + 'Unable to download xspf manifest') + + NS_MAP = { + 'xspf': 'http://xspf.org/ns/0/', + 's1': 'http://static.streamone.nl/player/ns/0', + } + + entries = [] + for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + title = xpath_text( + track, xpath_with_ns('./xspf:title', NS_MAP), 'title') + description = xpath_text( + track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') + thumbnail = xpath_text( + track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') + duration = float_or_none( + xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) + + formats = [{ + 'url': location.text, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + self._sort_formats(formats) + + entries.append({ + 'id': playlist_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + }) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() From e0b9d78fab76e2c2819c8a9a7512ad4533319b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:09:50 +0600 Subject: [PATCH 1014/2145] [extractor/common] Clarify playlists can have description field --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e201ea6db..9b4775e0a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -204,8 +204,8 @@ class InfoExtractor(object): There must be a key "entries", 
which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title" and "id" attributes with the same - semantics as videos (see above). + Additionally, playlists can have "title", "description" and "id" attributes + with the same semantics as videos (see above). _type "multi_video" indicates that there are multiple videos that From 3a30508b943c044e5f684b703ff58ac352686f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:11:23 +0600 Subject: [PATCH 1015/2145] [telegraaf] Add extractor (Closes #6492) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/telegraaf.py | 35 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 youtube_dl/extractor/telegraaf.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e38e77a27..dad3ec87f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -596,6 +596,7 @@ from .techtalks import TechTalksIE from .ted import TEDIE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE from .telemb import TeleMBIE from .teletask import TeleTaskIE from .tenplay import TenPlayIE diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py new file mode 100644 index 000000000..6f8333cfc --- /dev/null +++ b/youtube_dl/extractor/telegraaf.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import remove_end + + +class TelegraafIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html' + _TEST = { + 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', + 'md5': '83245a9779bcc4a24454bfd53c65b6dc', + 'info_dict': { + 'id': '24353229', + 
'ext': 'mp4', + 'title': 'Tikibad ontruimd wegens brand', + 'description': 'md5:05ca046ff47b931f9b04855015e163a4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 33, + }, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + playlist_url = self._search_regex( + r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + + entries = self._extract_xspf_playlist(playlist_url, playlist_id) + title = remove_end(self._og_search_title(webpage), ' - VIDEO') + description = self._og_search_description(webpage) + + return self.playlist_result(entries, playlist_id, title, description) From f32143469fd0a2720bd40908ea8360490983b97d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:15:00 +0600 Subject: [PATCH 1016/2145] [tweakers] Use _extract_xspf_playlist --- youtube_dl/extractor/tweakers.py | 42 +++----------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py index c80ec15cf..4bbe76e96 100644 --- a/youtube_dl/extractor/tweakers.py +++ b/youtube_dl/extractor/tweakers.py @@ -25,41 +25,7 @@ class TweakersIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - - playlist = self._download_xml( - 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % video_id, - video_id) - - NS_MAP = { - 'xspf': 'http://xspf.org/ns/0/', - 's1': 'http://static.streamone.nl/player/ns/0', - } - - track = playlist.find(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)) - - title = xpath_text( - track, xpath_with_ns('./xspf:title', NS_MAP), 'title') - description = xpath_text( - track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') - thumbnail = xpath_text( - track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') - duration = float_or_none( - xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), - 
1000) - - formats = [{ - 'url': location.text, - 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), - 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), - 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), - } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + playlist_id = self._match_id(url) + entries = self._extract_xspf_playlist( + 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id) + return self.playlist_result(entries, playlist_id) From 0dcb318f622d944ad0f5c23c32c9bc9b00e76aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:15:20 +0600 Subject: [PATCH 1017/2145] [tweakers] Fix test --- youtube_dl/extractor/tweakers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py index 4bbe76e96..6eeffb1cc 100644 --- a/youtube_dl/extractor/tweakers.py +++ b/youtube_dl/extractor/tweakers.py @@ -13,7 +13,7 @@ class TweakersIE(InfoExtractor): _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)' _TEST = { 'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html', - 'md5': '1b5afa817403bb5baa08359dca31e6df', + 'md5': '3147e4ddad366f97476a93863e4557c8', 'info_dict': { 'id': '9926', 'ext': 'mp4', From 98044462b1035000a44b35a41f4f780b2e844f2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:18:50 +0600 Subject: [PATCH 1018/2145] [extractor/common] Use playlist id as default title --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9b4775e0a..be91e03e9 100644 --- 
a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1157,7 +1157,7 @@ class InfoExtractor(object): entries = [] for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): title = xpath_text( - track, xpath_with_ns('./xspf:title', NS_MAP), 'title') + track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) description = xpath_text( track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') thumbnail = xpath_text( From fb2f339fec20c35cb62c1da682e0dfd418faef81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 9 Aug 2015 19:21:25 +0600 Subject: [PATCH 1019/2145] [dhm] Use _extract_xspf_playlist --- youtube_dl/extractor/dhm.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py index 3ed1f1663..127eb0439 100644 --- a/youtube_dl/extractor/dhm.py +++ b/youtube_dl/extractor/dhm.py @@ -34,24 +34,14 @@ class DHMIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + playlist_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, playlist_id) playlist_url = self._search_regex( r"file\s*:\s*'([^']+)'", webpage, 'playlist url') - playlist = self._download_xml(playlist_url, video_id) - - track = playlist.find( - './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track') - - video_url = xpath_text( - track, './{http://xspf.org/ns/0/}location', - 'video url', fatal=True) - thumbnail = xpath_text( - track, './{http://xspf.org/ns/0/}image', - 'thumbnail') + entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = self._search_regex( [r'dc:title="([^"]+)"', r'<title> »([^<]+)'], @@ -63,11 +53,10 @@ class DHMIE(InfoExtractor): r'Length\s*\s*:\s*([^<]+)', webpage, 'duration', default=None)) - return { - 'id': video_id, - 'url': video_url, + entries[0].update({ 
'title': title, 'description': description, 'duration': duration, - 'thumbnail': thumbnail, - } + }) + + return self.playlist_result(entries, playlist_id) From 942acef594428b5f5c7e0ed7860cb6d725d8f1e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Aug 2015 19:41:55 +0600 Subject: [PATCH 1020/2145] [extractor/common] Extract _parse_xspf --- youtube_dl/extractor/common.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index be91e03e9..5982055be 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1144,11 +1144,15 @@ class InfoExtractor(object): }) return subtitles - def _extract_xspf_playlist(self, playlist_url, playlist_id): - playlist = self._download_xml( + def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + xspf = self._download_xml( playlist_url, playlist_id, 'Downloading xpsf playlist', - 'Unable to download xspf manifest') + 'Unable to download xspf manifest', fatal=fatal) + if xspf is False: + return [] + return self._parse_xspf(xspf, playlist_id) + def _parse_xspf(self, playlist, playlist_id): NS_MAP = { 'xspf': 'http://xspf.org/ns/0/', 's1': 'http://static.streamone.nl/player/ns/0', From 729accb48221bd72e40076939616792c1c6fc15f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Aug 2015 19:43:42 +0600 Subject: [PATCH 1021/2145] [extractor/generic] Add support for xspf playlists --- youtube_dl/extractor/generic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 901f77304..a382d6be4 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1198,6 +1198,8 @@ class GenericIE(InfoExtractor): return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): return self._parse_smil(doc, url, video_id) + elif doc.tag 
== '{http://xspf.org/ns/0/}playlist': + return self.playlist_result(self._parse_xspf(doc, video_id), video_id) except compat_xml_parse_error: pass @@ -1799,7 +1801,8 @@ class GenericIE(InfoExtractor): # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] - if determine_ext(video_url) == 'smil': + ext = determine_ext(video_url) + if ext == 'smil': entries.append({ 'id': video_id, 'formats': self._extract_smil_formats(video_url, video_id), @@ -1807,6 +1810,8 @@ class GenericIE(InfoExtractor): 'title': video_title, 'age_limit': age_limit, }) + elif ext == 'xspf': + return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) else: entries.append({ 'id': video_id, From 1de5cd3ba51ce67d9a1cd3b40157058e78e46692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Aug 2015 19:47:08 +0600 Subject: [PATCH 1022/2145] [extractor/generic] Add test for xspf playlist --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a382d6be4..4756a658f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -198,6 +198,21 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html + { + 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', + 'info_dict': { + 'id': 'mZlp2ctYIUEB', + 'ext': 'mp4', + 'title': 'Tikibad ontruimd wegens brand', + 'description': 'md5:05ca046ff47b931f9b04855015e163a4', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 33, + }, + 'params': { + 'skip_download': True, + }, + }, # google redirect { 'url': 
'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 0791ac1b4415601f464f9656a4485b3ae6b67f4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 9 Aug 2015 19:47:58 +0600 Subject: [PATCH 1023/2145] [extractor/generic] Clarify comment --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4756a658f..376feecae 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1206,7 +1206,7 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) - # Is it an RSS feed or a SMIL file? + # Is it an RSS feed, a SMIL file or a XSPF playlist? try: doc = parse_xml(webpage) if doc.tag == 'rss': From 27c7114af6b82bfe8be6b8e4dfa6e11dd1356044 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 9 Aug 2015 20:13:02 +0200 Subject: [PATCH 1024/2145] release 2015.08.09 --- README.md | 2 +- docs/supportedsites.md | 8 ++++++-- youtube_dl/version.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 02b9775f9..15baf75ce 100644 --- a/README.md +++ b/README.md @@ -108,7 +108,7 @@ which means you can modify it, redistribute it or use it however you like. --playlist-reverse Download playlist videos in reverse order --xattr-set-filesize Set file xattribute ytdl.filesize with expected filesize (experimental) --hls-prefer-native Use the native HLS downloader instead of ffmpeg (experimental) - --external-downloader COMMAND Use the specified external downloader. Currently supports aria2c,curl,httpie,wget + --external-downloader COMMAND Use the specified external downloader. 
Currently supports aria2c,axel,curl,httpie,wget --external-downloader-args ARGS Give these arguments to the external downloader ## Filesystem Options: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 657935dc6..e21471102 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -86,7 +86,7 @@ - **chirbit:profile** - **Cinchcast** - **Cinemassacre** - - **clipfish** + - **Clipfish** - **cliphunter** - **Clipsyndicate** - **Cloudy** @@ -116,6 +116,7 @@ - **DailymotionCloud** - **daum.net** - **DBTV** + - **DCN** - **DctpTv** - **DeezerPlaylist** - **defense.gouv.fr** @@ -351,7 +352,6 @@ - **NowTV** - **nowvideo**: NowVideo - **npo**: npo.nl and ntr.nl - - **npo**: npo.nl and ntr.nl - **npo.nl:live** - **npo.nl:radio** - **npo.nl:radio:fragment** @@ -377,6 +377,7 @@ - **parliamentlive.tv**: UK parliament videos - **Patreon** - **PBS** + - **Periscope**: Periscope - **PhilharmonieDeParis**: Philharmonie de Paris - **Phoenix** - **Photobucket** @@ -406,6 +407,7 @@ - **qqmusic:playlist**: QQ音乐 - 歌单 - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 + - **Quickscope**: Quick Scope - **QuickVid** - **R7** - **radio.de** @@ -518,6 +520,7 @@ - **ted** - **TeleBruxelles** - **telecinco.es** + - **Telegraaf** - **TeleMB** - **TeleTask** - **TenPlay** @@ -621,6 +624,7 @@ - **Vodlocker** - **VoiceRepublic** - **Vporn** + - **vpro**: npo.nl and ntr.nl - **VRT** - **vube**: Vube.com - **VuClip** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9f209499c..6462d4477 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.06.1' +__version__ = '2015.08.09' From c5864a8ce6379dca300f447cca12a5a946d67d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Aug 2015 21:38:58 +0600 Subject: [PATCH 1025/2145] [fc2] Fix python 2.6 (Closes #6512) --- youtube_dl/extractor/fc2.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 1ccc1a964..e4f7195a8 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -86,7 +86,7 @@ class FC2IE(InfoExtractor): info_url = ( "http://video.fc2.com/ginfo.php?mimi={1:s}&href={2:s}&v={0:s}&fversion=WIN%2011%2C6%2C602%2C180&from=2&otag=0&upid={0:s}&tk=null&". - format(video_id, mimi, compat_urllib_request.quote(refer, safe='').replace('.', '%2E'))) + format(video_id, mimi, compat_urllib_request.quote(refer, safe=b'').replace('.', '%2E'))) info_webpage = self._download_webpage( info_url, video_id, note='Downloading info page') From f6c3664d717857a7994f189a01a00402df2b4168 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 10 Aug 2015 23:35:08 +0600 Subject: [PATCH 1026/2145] [vimeo] Fix login (Closes #6488) --- youtube_dl/extractor/vimeo.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4c4e3c72a..5bce78ac0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -29,6 +29,7 @@ from ..utils import ( class VimeoBaseInfoExtractor(InfoExtractor): _NETRC_MACHINE = 'vimeo' _LOGIN_REQUIRED = False + _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): (username, password) = self._get_login_info() @@ -37,21 +38,25 @@ class VimeoBaseInfoExtractor(InfoExtractor): raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) return self.report_login() - login_url = 'https://vimeo.com/log_in' - webpage = self._download_webpage(login_url, None, False) - token = self._search_regex(r'xsrft":"(.*?)"', webpage, 'login token') + webpage = self._download_webpage(self._LOGIN_URL, None, False) + token = self._extract_xsrft(webpage) data = urlencode_postdata({ + 'action': 'login', 'email': username, 'password': password, - 'action': 'login', 'service': 'vimeo', 'token': token, }) - login_request = compat_urllib_request.Request(login_url, data) + login_request = compat_urllib_request.Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - login_request.add_header('Cookie', 'xsrft=%s' % token) + login_request.add_header('Referer', self._LOGIN_URL) self._download_webpage(login_request, None, False, 'Wrong login info') + def _extract_xsrft(self, webpage): + return self._search_regex( + r'xsrft\s*[=:]\s*(?P["\'])(?P.+?)(?P=q)', + webpage, 'login token', group='xsrft') + class VimeoIE(VimeoBaseInfoExtractor): """Information extractor for vimeo.com.""" @@ -193,7 +198,7 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) - token = self._search_regex(r'xsrft[\s=:"\']+([^"\']+)', webpage, 'login token') + token = self._extract_xsrft(webpage) data = urlencode_postdata({ 'password': password, 'token': token, @@ -422,7 +427,7 @@ class VimeoIE(VimeoBaseInfoExtractor): } -class VimeoChannelIE(InfoExtractor): +class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' _VALID_URL = r'https://vimeo\.com/channels/(?P[^/?#]+)/?(?:$|[?#])' _MORE_PAGES_INDICATOR = r' Date: Mon, 10 Aug 2015 23:58:01 +0600 Subject: [PATCH 1027/2145] [vimeo:watchlater] Fix extraction (Closes #3886) --- youtube_dl/extractor/vimeo.py | 11 ++++++----- 1 file 
changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 5bce78ac0..1eeb4618e 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -431,6 +431,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:channel' _VALID_URL = r'https://vimeo\.com/channels/(?P[^/?#]+)/?(?:$|[?#])' _MORE_PAGES_INDICATOR = r']+?title="(.*?)"' _TESTS = [{ 'url': 'https://vimeo.com/channels/tributes', @@ -445,7 +446,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return '%s/videos/page:%d/' % (base_url, pagenum) def _extract_list_title(self, webpage): - return self._html_search_regex(self._TITLE_RE, webpage, 'list title') + return self._TITLE or self._html_search_regex(self._TITLE_RE, webpage, 'list title') def _login_list_password(self, page_url, list_id, webpage): login_form = self._search_regex( @@ -611,11 +612,11 @@ class VimeoReviewIE(InfoExtractor): class VimeoWatchLaterIE(VimeoChannelIE): IE_NAME = 'vimeo:watchlater' IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' - _VALID_URL = r'https://vimeo\.com/home/watchlater|:vimeowatchlater' + _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater' + _TITLE = 'Watch Later' _LOGIN_REQUIRED = True - _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<' _TESTS = [{ - 'url': 'https://vimeo.com/home/watchlater', + 'url': 'https://vimeo.com/watchlater', 'only_matching': True, }] @@ -631,7 +632,7 @@ class VimeoWatchLaterIE(VimeoChannelIE): return request def _real_extract(self, url): - return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') + return self._extract_videos('watchlater', 'https://vimeo.com/watchlater') class VimeoLikesIE(InfoExtractor): From 11b5605815d685263b271b4e061c43f9cb55a08c Mon Sep 17 00:00:00 2001 From: Puck Meerburg Date: Mon, 10 Aug 2015 20:52:38 +0200 Subject: [PATCH 1028/2145] [youtube] Use the first v= argument in the URL This is according to 
how youtube handles multiple v= values in one URL. Before this, it was possible to make a single URL show up differently on youtube itself, and if you downloaded/viewed it with youtube-dl/mpv --- youtube_dl/extractor/youtube.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 67a1df9a0..eaf058cfb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -213,7 +213,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) + (?:.*?&)?? # any other preceding param (like /?s=tuff&v=xxxx) v= ) )) @@ -380,6 +380,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': 'setindia' } }, + { + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY', + 'note': 'Use the first video ID in the URL', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], + 'tags': ['youtube-dl'], + 'like_count': int, + 'dislike_count': int, + } + }, { 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', 'note': '256k DASH audio (format 141) via DASH manifest', From b29440aee64027b3e4145070b0235193752b4d9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 01:17:41 +0600 Subject: [PATCH 1029/2145] [vimeo:user] Do not match watchlater --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 1eeb4618e..50df79ca1 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -505,7 +505,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): class VimeoUserIE(VimeoChannelIE): IE_NAME = 'vimeo:user' - _VALID_URL = r'https://vimeo\.com/(?![0-9]+(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P[^/]+)(?:/videos|[#?]|$)' _TITLE_RE = r']+?class="user">([^<>]+?)' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', From 34a7de2970d8bbceeb3f485d64a57f67489a44d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 01:22:06 +0600 Subject: [PATCH 1030/2145] [youtube] Skip download for multiple v= test --- youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index eaf058cfb..01dbbfa3c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -395,7 +395,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': ['youtube-dl'], 'like_count': int, 'dislike_count': int, - } + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', From 34952f09e175e0b78c929fddf56f82ccf028dc5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 01:24:53 +0600 Subject: [PATCH 1031/2145] [youtube] Add age limit to tests --- youtube_dl/extractor/youtube.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 01dbbfa3c..e74a39095 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -365,6 +365,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 
'justintimberlakeVEVO', + 'age_limit': 18, } }, { @@ -475,6 +476,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'upload_date': '20140605', + 'age_limit': 18, }, }, # Age-gate video with encrypted signature @@ -488,6 +490,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'upload_date': '20110629', + 'age_limit': 18, }, }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) From fb0d12c6cbcabd6f9e84d51c82dea6778d0bb863 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 01:46:25 +0600 Subject: [PATCH 1032/2145] [pbs] Add age limit to tests --- youtube_dl/extractor/pbs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a53479aad..683c81de3 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -92,6 +92,7 @@ class PBSIE(InfoExtractor): 'duration': 3172, 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140122', + 'age_limit': 10, }, 'params': { 'skip_download': True, # requires ffmpeg From b1ac38fadc65049dc6f9611fa7e9649de1e7eb93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 01:49:23 +0600 Subject: [PATCH 1033/2145] [tvplay] Add age limit to tests --- youtube_dl/extractor/tvplay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 79863e781..b4683de54 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -104,6 +104,7 @@ class TVPlayIE(InfoExtractor): 'duration': 1492, 'timestamp': 1330522854, 'upload_date': '20120229', + 'age_limit': 18, }, 'params': { # rtmp download From bf812ef71438036c23640f29bd7ae955289720ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 23:00:45 +0600 Subject: [PATCH 1034/2145] [downloader/external] Forward --proxy to 
wget and aria2c --- youtube_dl/downloader/external.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 07ce59f7d..49d806ee4 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -51,6 +51,14 @@ class ExternalFD(FileDownloader): return [] return [command_option, source_address] + def _option(self, command_option, param): + param = self.params.get(param) + if param is None: + return [] + if isinstance(param, bool): + return [command_option] + return [command_option, param] + def _no_check_certificate(self, command_option): return [command_option] if self.params.get('nocheckcertificate', False) else [] @@ -102,6 +110,7 @@ class WgetFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._source_address('--bind-address') + cmd += self._option('--proxy', 'proxy') cmd += self._no_check_certificate('--no-check-certificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] @@ -120,6 +129,7 @@ class Aria2cFD(ExternalFD): for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] cmd += self._source_address('--interface') + cmd += self._option('--all-proxy', 'proxy') cmd += ['--', info_dict['url']] return cmd From 9f3da138606773339de9accc2bc6522ea88185fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 11 Aug 2015 23:05:04 +0600 Subject: [PATCH 1035/2145] [downloader/external] Use generic _option --- youtube_dl/downloader/external.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 49d806ee4..6c310346c 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -45,12 +45,6 @@ class ExternalFD(FileDownloader): def supports(cls, info_dict): return info_dict['protocol'] in 
('http', 'https', 'ftp', 'ftps') - def _source_address(self, command_option): - source_address = self.params.get('source_address') - if source_address is None: - return [] - return [command_option, source_address] - def _option(self, command_option, param): param = self.params.get(param) if param is None: @@ -59,9 +53,6 @@ class ExternalFD(FileDownloader): return [command_option] return [command_option, param] - def _no_check_certificate(self, command_option): - return [command_option] if self.params.get('nocheckcertificate', False) else [] - def _configuration_args(self, default=[]): ex_args = self.params.get('external_downloader_args') if ex_args is None: @@ -88,7 +79,7 @@ class CurlFD(ExternalFD): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--interface') + cmd += self._option('--interface', 'source_address') cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -109,9 +100,9 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--bind-address') + cmd += self._option('--bind-address', 'source_address') cmd += self._option('--proxy', 'proxy') - cmd += self._no_check_certificate('--no-check-certificate') + cmd += self._option('--no-check-certificate', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -128,7 +119,7 @@ class Aria2cFD(ExternalFD): cmd += ['--out', os.path.basename(tmpfilename)] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] - cmd += self._source_address('--interface') + cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += ['--', info_dict['url']] return cmd From 
0a19d4ccd6914d8547fd3e42fd279c960d9f8fad Mon Sep 17 00:00:00 2001 From: sceext Date: Wed, 12 Aug 2015 14:01:48 +0800 Subject: [PATCH 1036/2145] [iqiyi] update md5 salt (2015-08-10 Zombie) --- youtube_dl/extractor/iqiyi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index afb7f4e61..dfc6d58a0 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -201,7 +201,7 @@ class IqiyiIE(InfoExtractor): return raw_data def get_enc_key(self, swf_url, video_id): - enc_key = '8e29ab5666d041c3a1ea76e06dabdffb' + enc_key = '3601ba290e4f4662848c710e2122007e' # last update at 2015-08-10 for Zombie return enc_key def _real_extract(self, url): From 1df3186e0e2c49993f4230ec77a9de351177b271 Mon Sep 17 00:00:00 2001 From: ngld Date: Wed, 12 Aug 2015 16:01:47 +0200 Subject: [PATCH 1037/2145] [funnyordie] Handle protocol-relative URLs (fixes #6490) --- youtube_dl/extractor/funnyordie.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index dd87257c4..f5f13689c 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -53,7 +53,7 @@ class FunnyOrDieIE(InfoExtractor): for bitrate in bitrates: for link in links: formats.append({ - 'url': '%s%d.%s' % (link[0], bitrate, link[1]), + 'url': self._proto_relative_url('%s%d.%s' % (link[0], bitrate, link[1])), 'format_id': '%s-%d' % (link[1], bitrate), 'vbr': bitrate, }) From f57b7835e21b00a1b2205b4bcfba50c630ff68b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 12 Aug 2015 21:27:58 +0600 Subject: [PATCH 1038/2145] [youtube] Update tests --- youtube_dl/extractor/youtube.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e74a39095..facd837ad 100644 --- a/youtube_dl/extractor/youtube.py +++ 
b/youtube_dl/extractor/youtube.py @@ -442,7 +442,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:2acfda1b285bdd478ccec22f9918199d', + 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', @@ -515,7 +515,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', - 'upload_date': '20120731', + 'upload_date': '20120724', 'uploader_id': 'olympic', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympics', @@ -544,7 +544,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'qEJwOuvDf7I', 'info_dict': { 'id': 'qEJwOuvDf7I', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге', 'description': '', 'upload_date': '20150404', From f0f3a6c99d2834ca8af87be4978c0040c3744628 Mon Sep 17 00:00:00 2001 From: ngld Date: Wed, 12 Aug 2015 18:07:27 +0200 Subject: [PATCH 1039/2145] [rtvnhnl] Added new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/rtvnhnl.py | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 youtube_dl/extractor/rtvnhnl.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index dad3ec87f..f026a4171 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -491,6 +491,7 @@ from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE +from .rtvnhnl import RtvnhNlIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, diff --git a/youtube_dl/extractor/rtvnhnl.py b/youtube_dl/extractor/rtvnhnl.py new file mode 100644 index 000000000..ce84900a0 --- /dev/null +++ b/youtube_dl/extractor/rtvnhnl.py @@ -0,0 +1,40 @@ +# 
coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RtvnhNlIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P[0-9]+)' + _TEST = { + 'params': { + 'hls_prefer_native': True + }, + + 'url': 'http://www.rtvnh.nl/video/131946', + 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'info_dict': { + 'id': '131946', + 'ext': 'mp4', + 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', + 'thumbnail': 're:^https?://rtvnh-webfiles\.[^.]+\.amazonaws\.com/data/cache/[0-9]+/basedata/pf_image/[0-9.]+/[0-9\-a-f]+\.jpg$' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._parse_json(self._download_webpage('http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) + formats = self._extract_smil_formats('http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + + for item in meta['source']['fb']: + if item.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats(item['file'], video_id, ext='mp4')) + elif item.get('type') == '': + formats.append({'url': item['file']}) + + return { + 'id': video_id, + 'title': meta['title'].strip(), + 'thumbnail': meta['image'], + 'formats': formats + } From fb124e37419668c34b4056575614776b0c64b401 Mon Sep 17 00:00:00 2001 From: ngld Date: Wed, 12 Aug 2015 20:21:32 +0200 Subject: [PATCH 1040/2145] [rtvnhnl] Relax the thumbnail check --- youtube_dl/extractor/rtvnhnl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvnhnl.py b/youtube_dl/extractor/rtvnhnl.py index ce84900a0..0921e2648 100644 --- a/youtube_dl/extractor/rtvnhnl.py +++ b/youtube_dl/extractor/rtvnhnl.py @@ -17,7 +17,7 @@ class RtvnhNlIE(InfoExtractor): 'id': '131946', 'ext': 'mp4', 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', - 'thumbnail': 're:^https?://rtvnh-webfiles\.[^.]+\.amazonaws\.com/data/cache/[0-9]+/basedata/pf_image/[0-9.]+/[0-9\-a-f]+\.jpg$' + 'thumbnail': 
're:^http:.*\.jpg$' } } From d9ab5262b137962995af1b444f45f7f32dc33a77 Mon Sep 17 00:00:00 2001 From: ngld Date: Wed, 12 Aug 2015 20:26:13 +0200 Subject: [PATCH 1041/2145] [rtvnh] Renamed rtvnhnl -> rtvnh --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{rtvnhnl.py => rtvnh.py} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename youtube_dl/extractor/{rtvnhnl.py => rtvnh.py} (94%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f026a4171..9a6308723 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -491,7 +491,7 @@ from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE -from .rtvnhnl import RtvnhNlIE +from .rtvnh import RTVNHIE from .ruhd import RUHDIE from .rutube import ( RutubeIE, diff --git a/youtube_dl/extractor/rtvnhnl.py b/youtube_dl/extractor/rtvnh.py similarity index 94% rename from youtube_dl/extractor/rtvnhnl.py rename to youtube_dl/extractor/rtvnh.py index 0921e2648..f5c0b94a8 100644 --- a/youtube_dl/extractor/rtvnhnl.py +++ b/youtube_dl/extractor/rtvnh.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -class RtvnhNlIE(InfoExtractor): +class RTVNHIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P[0-9]+)' _TEST = { 'params': { @@ -17,7 +17,7 @@ class RtvnhNlIE(InfoExtractor): 'id': '131946', 'ext': 'mp4', 'title': 'Grote zoektocht in zee bij Zandvoort naar vermiste vrouw', - 'thumbnail': 're:^http:.*\.jpg$' + 'thumbnail': 're:^https?:.*\.jpg$' } } From d7dbfc7cc18c2d54d7e1752def6c4710c58b49fc Mon Sep 17 00:00:00 2001 From: ngld Date: Wed, 12 Aug 2015 20:51:28 +0200 Subject: [PATCH 1042/2145] Use native HLS implementation by default. 
--- youtube_dl/extractor/rtvnh.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index f5c0b94a8..2799f01a6 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -7,10 +7,6 @@ from .common import InfoExtractor class RTVNHIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P[0-9]+)' _TEST = { - 'params': { - 'hls_prefer_native': True - }, - 'url': 'http://www.rtvnh.nl/video/131946', 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', 'info_dict': { @@ -28,7 +24,7 @@ class RTVNHIE(InfoExtractor): for item in meta['source']['fb']: if item.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats(item['file'], video_id, ext='mp4')) + formats.extend(self._extract_m3u8_formats(item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) elif item.get('type') == '': formats.append({'url': item['file']}) From 240ca32e57a027ff8cec8617c154bb7100bead1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:00:05 +0600 Subject: [PATCH 1043/2145] [rtvnh] Carry long lines --- youtube_dl/extractor/rtvnh.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 2799f01a6..998a3c53d 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -19,12 +19,16 @@ class RTVNHIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - meta = self._parse_json(self._download_webpage('http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) - formats = self._extract_smil_formats('http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + + meta = self._parse_json(self._download_webpage( + 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) + formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) for item in 
meta['source']['fb']: if item.get('type') == 'hls': - formats.extend(self._extract_m3u8_formats(item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) + formats.extend(self._extract_m3u8_formats( + item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) elif item.get('type') == '': formats.append({'url': item['file']}) From f196047832a2da74d5adf75759877b5d95ec5b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:00:25 +0600 Subject: [PATCH 1044/2145] [rtvnh] Make thumbnail optional --- youtube_dl/extractor/rtvnh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 998a3c53d..d576a3410 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -35,6 +35,6 @@ class RTVNHIE(InfoExtractor): return { 'id': video_id, 'title': meta['title'].strip(), - 'thumbnail': meta['image'], + 'thumbnail': meta.get('image'), 'formats': formats } From 60231c65b9a50e08967d748c3ed401488fed3587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:02:50 +0600 Subject: [PATCH 1045/2145] [rtvnh] Make SMIL not fatal --- youtube_dl/extractor/rtvnh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index d576a3410..202ea0181 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -23,7 +23,7 @@ class RTVNHIE(InfoExtractor): meta = self._parse_json(self._download_webpage( 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) for item in meta['source']['fb']: if item.get('type') == 'hls': From 2c919adb74893544ab6def1d56ff8ed37c282ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 
Aug 2015 01:11:55 +0600 Subject: [PATCH 1046/2145] [rtvnh] Check status code --- youtube_dl/extractor/rtvnh.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 202ea0181..7c9d4b0cd 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ExtractorError class RTVNHIE(InfoExtractor): @@ -22,6 +23,12 @@ class RTVNHIE(InfoExtractor): meta = self._parse_json(self._download_webpage( 'http://www.rtvnh.nl/video/json?m=' + video_id, video_id), video_id) + + status = meta.get('status') + if status != 200: + raise ExtractorError( + '%s returned error code %d' % (self.IE_NAME, status), expected=True) + formats = self._extract_smil_formats( 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) @@ -31,7 +38,7 @@ class RTVNHIE(InfoExtractor): item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) elif item.get('type') == '': formats.append({'url': item['file']}) - + return { 'id': video_id, 'title': meta['title'].strip(), From 3b7130439aade87b628fa6dd727df5860323a68f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:15:58 +0600 Subject: [PATCH 1047/2145] Credit @ngld for RTVNH (#6537) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index d16d34272..71c420165 100644 --- a/AUTHORS +++ b/AUTHORS @@ -137,3 +137,4 @@ Zach Bruggeman Tjark Saul slangangular Behrouz Abbasi +ngld From b6b2711298f8d43414deac939f92c7c3477826b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:17:15 +0600 Subject: [PATCH 1048/2145] [tweakers] Remove unused imports --- youtube_dl/extractor/tweakers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py index 6eeffb1cc..f3198fb85 
100644 --- a/youtube_dl/extractor/tweakers.py +++ b/youtube_dl/extractor/tweakers.py @@ -1,12 +1,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - xpath_text, - xpath_with_ns, - int_or_none, - float_or_none, -) class TweakersIE(InfoExtractor): From e73c85cb23d278702357412479fd4b162a3abbb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:18:49 +0600 Subject: [PATCH 1049/2145] [iqiyi] PEP 8 --- youtube_dl/extractor/iqiyi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index dfc6d58a0..393e67e35 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -201,7 +201,7 @@ class IqiyiIE(InfoExtractor): return raw_data def get_enc_key(self, swf_url, video_id): - enc_key = '3601ba290e4f4662848c710e2122007e' # last update at 2015-08-10 for Zombie + enc_key = '3601ba290e4f4662848c710e2122007e' # last update at 2015-08-10 for Zombie return enc_key def _real_extract(self, url): From 237c03c8eaa4da1713a635e87f98ac14430b35cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 01:19:23 +0600 Subject: [PATCH 1050/2145] [dhm] Remove unused import --- youtube_dl/extractor/dhm.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py index 127eb0439..44e0c5d4d 100644 --- a/youtube_dl/extractor/dhm.py +++ b/youtube_dl/extractor/dhm.py @@ -1,10 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ( - xpath_text, - parse_duration, -) +from ..utils import parse_duration class DHMIE(InfoExtractor): From 28479149ccf3425e6a6e35d3a155f6802629728a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 13 Aug 2015 12:56:12 +0800 Subject: [PATCH 1051/2145] [theplatform] Fallback to hardcoded releaseUrl if not available Fixes #6546. 
Not adding a test case as test_NBC has the same problem. --- youtube_dl/extractor/theplatform.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 83d833e30..0643eccaf 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -108,7 +108,11 @@ class ThePlatformIE(InfoExtractor): config_url = config_url.replace('swf/', 'config/') config_url = config_url.replace('onsite/', 'onsite/config/') config = self._download_json(config_url, video_id, 'Downloading config') - smil_url = config['releaseUrl'] + '&format=SMIL&formats=MPEG4&manifest=f4m' + if 'releaseUrl' in config: + release_url = config['releaseUrl'] + else: + release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path + smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m' else: smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path From 6828c809e44fca7b19da3c62a11cea313a86b64e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 21:07:14 +0600 Subject: [PATCH 1052/2145] [downloader/fragment] Respect --retries for fragment based downloaders (Closes #6549) --- youtube_dl/downloader/fragment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 5f9d6796d..5a64b29ee 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -35,6 +35,7 @@ class FragmentFD(FileDownloader): 'quiet': True, 'noprogress': True, 'ratelimit': self.params.get('ratelimit', None), + 'retries': self.params.get('retries', 0), 'test': self.params.get('test', False), } ) From 7393746da213bec686f8425165854e5e383b7eb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 21:10:11 +0600 Subject: [PATCH 1053/2145] [downloader/hls] Add _debug_cmd --- youtube_dl/downloader/hls.py | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 60dca0ab1..2b6c3370f 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -32,6 +32,8 @@ class HlsFD(FileDownloader): for opt in (ffpp.executable, '-y', '-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc')] args.append(encodeFilename(tmpfilename, True)) + self._debug_cmd(args) + retval = subprocess.call(args) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) From cb28e0338665c96b2d5b35d203b1d54a57f3feb1 Mon Sep 17 00:00:00 2001 From: nyuszika7h Date: Mon, 10 Aug 2015 19:27:16 +0200 Subject: [PATCH 1054/2145] [indavideo] Add new extractor Closes #2147. --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/indavideo.py | 79 +++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/indavideo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9a6308723..3bcfa93bb 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -242,6 +242,7 @@ from .imdb import ( ) from .imgur import ImgurIE from .ina import InaIE +from .indavideo import IndavideoIE from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py new file mode 100644 index 000000000..2a2cf2bd3 --- /dev/null +++ b/youtube_dl/extractor/indavideo.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .. 
import utils +from .common import InfoExtractor + + +class IndavideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?indavideo\.hu/video/(?P.+)' + _TESTS = [ + { + 'url': 'http://indavideo.hu/video/Cicatanc', + 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', + 'info_dict': { + 'id': '1837039', + 'title': 'Cicatánc', + 'ext': 'mp4', + 'display_id': 'Cicatanc', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': '', + 'uploader': 'cukiajanlo', + 'uploader_id': '83729', + 'duration': 72, + 'age_limit': 0, + 'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'] + }, + }, + { + 'url': 'http://indavideo.hu/video/Vicces_cica_1', + 'md5': '8c82244ba85d2a2310275b318eb51eac', + 'info_dict': { + 'id': '1335611', + 'title': 'Vicces cica', + 'ext': 'mp4', + 'display_id': 'Vicces_cica_1', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'Játszik a tablettel. :D', + 'uploader': 'Jet_Pack', + 'uploader_id': '491217', + 'duration': 7, + 'age_limit': 0, + 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], + }, + }, + ] + + def _real_extract(self, url): + video_disp_id = self._match_id(url) + webpage = self._download_webpage(url, video_disp_id) + + embed_url = self._html_search_regex(r'', webpage, 'embed_url') + video_hash = embed_url.split('/')[-1] + + payload = self._download_json('http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/' + video_hash, video_disp_id) + video_info = payload['data'] + + thumbnails = video_info.get('thumbnails') + if thumbnails: + thumbnails = [{'url': self._proto_relative_url(x)} for x in thumbnails] + + tags = video_info.get('tags') + if tags: + tags = [x['title'] for x in tags] + + return { + 'id': video_info.get('id'), + 'title': video_info['title'], + 'url': video_info['video_file'], + 'ext': 'mp4', + 'display_id': video_disp_id, + 'thumbnails': thumbnails, + 'description': video_info.get('description'), + 'uploader': video_info.get('user_name'), + # TODO: upload date 
(it's in CET/CEST) + 'uploader_id': video_info.get('user_id'), + 'duration': utils.int_or_none(video_info.get('length')), + 'age_limit': utils.int_or_none(video_info.get('age_limit')), + 'tags': tags, + } From 3c12a027d48a2d6d1162ab515df0308237aef881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 23:25:47 +0600 Subject: [PATCH 1055/2145] [indavideo] Split in two extractors, extract all formats and fix timestamp --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/indavideo.py | 178 +++++++++++++++++++----------- 2 files changed, 118 insertions(+), 65 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3bcfa93bb..83d21bd15 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -242,7 +242,10 @@ from .imdb import ( ) from .imgur import ImgurIE from .ina import InaIE -from .indavideo import IndavideoIE +from .indavideo import ( + IndavideoIE, + IndavideoEmbedIE, +) from .infoq import InfoQIE from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 2a2cf2bd3..b75715244 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -3,77 +3,127 @@ from __future__ import unicode_literals from .. 
import utils from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + parse_iso8601, +) + + +class IndavideoEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P[\da-f]+)' + _TESTS = [{ + 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', + 'md5': 'f79b009c66194acacd40712a6778acfa', + 'info_dict': { + 'id': '1837039', + 'ext': 'mp4', + 'title': 'Cicatánc', + 'description': '', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'cukiajanlo', + 'uploader_id': '83729', + 'timestamp': 1439193826, + 'upload_date': '20150810', + 'duration': 72, + 'age_limit': 0, + 'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'], + }, + }, { + 'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', + 'only_matching': True, + }, { + 'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, + video_id)['data'] + + video_id = video['id'] + title = video['title'] + + video_urls = video.get('video_files', []) + video_file = video.get('video_file') + if video: + video_urls.append(video_file) + video_urls = list(set(video_urls)) + + video_prefix = video_urls[0].rsplit('/', 1)[0] + + for flv_file in video.get('flv_files', []): + flv_url = '%s/%s' % (video_prefix, flv_file) + if flv_url not in video_urls: + video_urls.append(flv_url) + + formats = [{ + 'url': video_url, + 'height': self._search_regex(r'\.(\d{3,4})\.mp4$', video_url, 'height', default=None), + } for video_url in video_urls] + self._sort_formats(formats) + + timestamp = video.get('date') + if timestamp: + # upload date is in CEST + timestamp = parse_iso8601(timestamp + ' +0200', ' ') + + 
thumbnails = [{ + 'url': self._proto_relative_url(thumbnail) + } for thumbnail in video.get('thumbnails', [])] + + tags = [tag['title'] for tag in video.get('tags', [])] + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnails': thumbnails, + 'uploader': video.get('user_name'), + 'uploader_id': video.get('user_id'), + 'timestamp': timestamp, + 'duration': int_or_none(video.get('length')), + 'age_limit': parse_age_limit(video.get('age_limit')), + 'tags': tags, + 'formats': formats, + } class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?indavideo\.hu/video/(?P.+)' - _TESTS = [ - { - 'url': 'http://indavideo.hu/video/Cicatanc', - 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', - 'info_dict': { - 'id': '1837039', - 'title': 'Cicatánc', - 'ext': 'mp4', - 'display_id': 'Cicatanc', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', - 'uploader': 'cukiajanlo', - 'uploader_id': '83729', - 'duration': 72, - 'age_limit': 0, - 'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'] - }, + _VALID_URL = r'https?://(?:www\.)?indavideo\.hu/video/(?P[^/#?]+)' + _TEST = { + 'url': 'http://indavideo.hu/video/Vicces_cica_1', + 'md5': '8c82244ba85d2a2310275b318eb51eac', + 'info_dict': { + 'id': '1335611', + 'display_id': 'Vicces_cica_1', + 'ext': 'mp4', + 'title': 'Vicces cica', + 'description': 'Játszik a tablettel. :D', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Jet_Pack', + 'uploader_id': '491217', + 'timestamp': 1390821212, + 'upload_date': '20140127', + 'duration': 7, + 'age_limit': 0, + 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], }, - { - 'url': 'http://indavideo.hu/video/Vicces_cica_1', - 'md5': '8c82244ba85d2a2310275b318eb51eac', - 'info_dict': { - 'id': '1335611', - 'title': 'Vicces cica', - 'ext': 'mp4', - 'display_id': 'Vicces_cica_1', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Játszik a tablettel. 
:D', - 'uploader': 'Jet_Pack', - 'uploader_id': '491217', - 'duration': 7, - 'age_limit': 0, - 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], - }, - }, - ] + } def _real_extract(self, url): - video_disp_id = self._match_id(url) - webpage = self._download_webpage(url, video_disp_id) + display_id = self._match_id(url) - embed_url = self._html_search_regex(r'', webpage, 'embed_url') - video_hash = embed_url.split('/')[-1] - - payload = self._download_json('http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/' + video_hash, video_disp_id) - video_info = payload['data'] - - thumbnails = video_info.get('thumbnails') - if thumbnails: - thumbnails = [{'url': self._proto_relative_url(x)} for x in thumbnails] - - tags = video_info.get('tags') - if tags: - tags = [x['title'] for x in tags] + webpage = self._download_webpage(url, display_id) + embed_url = self._search_regex( + r']+rel="video_src"[^>]+href="(.+?)"', webpage, 'embed url') return { - 'id': video_info.get('id'), - 'title': video_info['title'], - 'url': video_info['video_file'], - 'ext': 'mp4', - 'display_id': video_disp_id, - 'thumbnails': thumbnails, - 'description': video_info.get('description'), - 'uploader': video_info.get('user_name'), - # TODO: upload date (it's in CET/CEST) - 'uploader_id': video_info.get('user_id'), - 'duration': utils.int_or_none(video_info.get('length')), - 'age_limit': utils.int_or_none(video_info.get('age_limit')), - 'tags': tags, + '_type': 'url_transparent', + 'ie_key': 'IndavideoEmbed', + 'url': embed_url, + 'display_id': display_id, } From a34e19629c407a08cd9065223f26f1f5468a4423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 23:40:20 +0600 Subject: [PATCH 1056/2145] [indavideo] Relax _VALID_URL to match subdomains and add tests --- youtube_dl/extractor/indavideo.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git 
a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index b75715244..550a7001b 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -44,7 +44,6 @@ class IndavideoEmbedIE(InfoExtractor): 'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, video_id)['data'] - video_id = video['id'] title = video['title'] video_urls = video.get('video_files', []) @@ -78,7 +77,7 @@ class IndavideoEmbedIE(InfoExtractor): tags = [tag['title'] for tag in video.get('tags', [])] return { - 'id': video_id, + 'id': video.get('id') or video_id, 'title': title, 'description': video.get('description'), 'thumbnails': thumbnails, @@ -93,8 +92,8 @@ class IndavideoEmbedIE(InfoExtractor): class IndavideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?indavideo\.hu/video/(?P[^/#?]+)' - _TEST = { + _VALID_URL = r'https?://(?:.+?\.)?indavideo\.hu/video/(?P[^/#?]+)' + _TESTS = [{ 'url': 'http://indavideo.hu/video/Vicces_cica_1', 'md5': '8c82244ba85d2a2310275b318eb51eac', 'info_dict': { @@ -112,7 +111,22 @@ class IndavideoIE(InfoExtractor): 'age_limit': 0, 'tags': ['vicces', 'macska', 'cica', 'ügyes', 'nevetés', 'játszik', 'Cukiság', 'Jet_Pack'], }, - } + }, { + 'url': 'http://index.indavideo.hu/video/2015_0728_beregszasz', + 'only_matching': True, + }, { + 'url': 'http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko', + 'only_matching': True, + }, { + 'url': 'http://erotika.indavideo.hu/video/Amator_tini_punci', + 'only_matching': True, + }, { + 'url': 'http://film.indavideo.hu/video/f_hrom_nagymamm_volt', + 'only_matching': True, + }, { + 'url': 'http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) From fb56131dd9cf3bfa31d7d6920a135281d151f803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 23:47:12 +0600 Subject: [PATCH 1057/2145] Credit @nyuszika7h 
for indavideo (#6517) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 71c420165..ded9e87d2 100644 --- a/AUTHORS +++ b/AUTHORS @@ -138,3 +138,4 @@ Tjark Saul slangangular Behrouz Abbasi ngld +nyuszika7h From 594f51b85934878ff20b608f312d7f564e3a3d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 13 Aug 2015 23:47:49 +0600 Subject: [PATCH 1058/2145] [indavideo] Remove unused import --- youtube_dl/extractor/indavideo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/indavideo.py b/youtube_dl/extractor/indavideo.py index 550a7001b..12fb5e8e1 100644 --- a/youtube_dl/extractor/indavideo.py +++ b/youtube_dl/extractor/indavideo.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -from .. import utils from .common import InfoExtractor from ..utils import ( int_or_none, From 3cafca04aaf2bfc4d31e8255b9cb75e8f1ad4b16 Mon Sep 17 00:00:00 2001 From: reddraggone9 Date: Fri, 14 Aug 2015 00:35:35 -0500 Subject: [PATCH 1059/2145] Updated line numbers in the fragment portion of README links. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 15baf75ce..e91119d84 100644 --- a/README.md +++ b/README.md @@ -544,7 +544,7 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. 
Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: @@ -572,7 +572,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L69). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L92). For a start, if you want to intercept youtube-dl's output, set a `logger` object. 
Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: From 4d2ad866f347086d3a1cf4cb7e0a8cadd3c87748 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 14 Aug 2015 19:18:03 +0800 Subject: [PATCH 1060/2145] [README.md] Document format_id field in output template section (#6557) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 15baf75ce..8fa402ee2 100644 --- a/README.md +++ b/README.md @@ -272,6 +272,7 @@ The `-o` option allows users to indicate a template for the output file names. T - `autonumber`: The sequence will be replaced by a five-digit number that will be increased with each download, starting at zero. - `playlist`: The name or the id of the playlist that contains the video. - `playlist_index`: The index of the video in the playlist, a five-digit number. + - `format_id`: The sequence will be replaced by the format code specified by `--format`. The current default template is `%(title)s-%(id)s.%(ext)s`. 
From 41dbc50f9c7dfaad4084fbeac77192c7ac37daca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 14 Aug 2015 22:07:02 +0600 Subject: [PATCH 1061/2145] [lynda] Capture and output login error (Closes #6556) --- youtube_dl/extractor/lynda.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index deead220a..5b9157ed4 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -11,6 +11,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + clean_html, int_or_none, ) @@ -70,6 +71,15 @@ class LyndaBaseIE(InfoExtractor): 'Confirming log in and log out from another device') if all(not re.search(p, login_page) for p in ('isLoggedIn\s*:\s*true', r'logout\.aspx', r'>Log out<')): + if 'login error' in login_page: + mobj = re.search( + r'(?s)]+class="topmost">(?P[^<]+)</h1>\s*<div>(?P<description>.+?)</div>', + login_page) + if mobj: + raise ExtractorError( + 'lynda returned error: %s - %s' + % (mobj.group('title'), clean_html(mobj.group('description'))), + expected=True) raise ExtractorError('Unable to log in') From 6be5e46994ea5db76d7a2659260606898c265957 Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Fri, 14 Aug 2015 22:22:39 +0600 Subject: [PATCH 1062/2145] [README.md] Clarify line ranges --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e52cdb941..542a7c26a 100644 --- a/README.md +++ b/README.md @@ -545,7 +545,7 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. 
If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: @@ -573,7 +573,7 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) ``` -Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L92). For a start, if you want to intercept youtube-dl's output, set a `logger` object. +Most likely, you'll want to use various options. For a list of what can be done, have a look at [youtube_dl/YoutubeDL.py](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L117-L265). For a start, if you want to intercept youtube-dl's output, set a `logger` object. 
Here's a more complete example of a program that outputs only errors (and a short message after the download is finished), and downloads/converts the video to an mp3 file: From d0d6c097fc7859180f16a445536029c600b1e57f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 15 Aug 2015 15:17:27 +0800 Subject: [PATCH 1063/2145] [moniker] Support embed- URLs (#6450) --- youtube_dl/extractor/moniker.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/moniker.py b/youtube_dl/extractor/moniker.py index 88dcd4f73..69e4bcd1a 100644 --- a/youtube_dl/extractor/moniker.py +++ b/youtube_dl/extractor/moniker.py @@ -9,7 +9,10 @@ from ..compat import ( compat_urllib_parse, compat_urllib_request, ) -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + remove_start, +) class MonikerIE(InfoExtractor): @@ -24,6 +27,14 @@ class MonikerIE(InfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl test video', }, + }, { + 'url': 'http://allmyvideos.net/embed-jih3nce3x6wn', + 'md5': '710883dee1bfc370ecf9fa6a89307c88', + 'info_dict': { + 'id': 'jih3nce3x6wn', + 'ext': 'mp4', + 'title': 'youtube-dl test video', + }, }, { 'url': 'http://vidspot.net/l2ngsmhs8ci5', 'md5': '710883dee1bfc370ecf9fa6a89307c88', @@ -38,7 +49,10 @@ class MonikerIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + orig_video_id = self._match_id(url) + video_id = remove_start(orig_video_id, 'embed-') + url = url.replace(orig_video_id, video_id) + assert re.match(self._VALID_URL, url) is not None orig_webpage = self._download_webpage(url, video_id) if '>File Not Found<' in orig_webpage: From 8b8c1093b65ee02aad859ed8d82217312ed0d9d8 Mon Sep 17 00:00:00 2001 From: Shaun Walbridge <shaun.walbridge@gmail.com> Date: Sat, 18 Apr 2015 00:37:04 -0400 Subject: [PATCH 1064/2145] [EsriVideo] Add new extractor Add extractor for [videos.esri.com](https://videos.esri.com), a collection of videos relating 
to GIS. --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/videoesri.py | 90 +++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 youtube_dl/extractor/videoesri.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 83d21bd15..a4387636f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -695,6 +695,7 @@ from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE +from .videoesri import VideoEsriIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE diff --git a/youtube_dl/extractor/videoesri.py b/youtube_dl/extractor/videoesri.py new file mode 100644 index 000000000..0f84323a4 --- /dev/null +++ b/youtube_dl/extractor/videoesri.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import re + +from .common import InfoExtractor + +from ..utils import ( + unified_strdate +) + + +class VideoEsriIE(InfoExtractor): + _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://video.esri.com/watch/4228', + 'md5': '170b4d513c2466ed483c150a48384133', + 'info_dict': { + 'id': '4228', + 'ext': 'mp4', + 'title': 'AppStudio for ArcGIS', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20150310', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') + + upload_date_raw = self._search_regex( + r'http-equiv="last-modified" content="(.*)"', + webpage, 'upload date') + upload_date = unified_strdate(upload_date_raw) + + settings_info = self._search_regex( + r'evPlayerSettings = {(.*?);\s*$', + webpage, 'settings info', flags=re.MULTILINE | re.DOTALL) + + # thumbnail includes '_x' for large, also has {_m,_t,_s} 
or + # without size suffix returns full image + thumbnail_path = re.findall( + r'image\': \'(\/thumbs.*)\'', + settings_info)[0] + + if thumbnail_path: + thumbnail = '/'.join(['http://video.esri.com', thumbnail_path]) + + # note that this misses the (exceedly rare) webm files + video_paths = re.findall(r'mp4:(.*)\'', settings_info) + + # find possible http servers of the mp4 files (also has rtsp) + base_url = re.findall( + r'netstreambasepath\':\s\'(h.*)\'', settings_info)[0] + + # these are the numbers used internally, but really map + # to other resolutions, e.g. 960 is 720p. + heights = [480, 720, 960] + videos_by_res = {} + for video_path in video_paths: + url = "{base_url}{video_path}".format( + base_url=base_url, + video_path=video_path) + filename, ext = os.path.splitext(video_path) + height_label = int(filename.split('_')[1]) + videos_by_res[height_label] = { + 'url': url, + 'ext': ext[1:], + 'protocol': 'http', # http-only supported currently + } + + formats = [] + for height in heights: + if height in videos_by_res: + formats.append(videos_by_res[height]) + + result = { + 'id': video_id, + 'title': title, + 'upload_date': upload_date, + 'formats': formats, + } + + if thumbnail: + result['thumbnail'] = thumbnail + + return result From 8b9848ac5678356757f67a412f7ed89a0f559be7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 15:58:30 +0600 Subject: [PATCH 1065/2145] [extractor/common] Expand meta regex --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5982055be..16ae4b98f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -640,7 +640,7 @@ class InfoExtractor(object): @staticmethod def _meta_regex(prop): return r'''(?isx)<meta - (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1) + (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) 
[^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): From 3aa697f993e3719cf032c5b1e192a034100b0534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 15:58:56 +0600 Subject: [PATCH 1066/2145] [esri:video] Extract all formats and simplify --- youtube_dl/extractor/videoesri.py | 106 +++++++++++++----------------- 1 file changed, 45 insertions(+), 61 deletions(-) diff --git a/youtube_dl/extractor/videoesri.py b/youtube_dl/extractor/videoesri.py index 0f84323a4..84faba678 100644 --- a/youtube_dl/extractor/videoesri.py +++ b/youtube_dl/extractor/videoesri.py @@ -1,90 +1,74 @@ # coding: utf-8 from __future__ import unicode_literals -import os import re from .common import InfoExtractor - +from ..compat import compat_urlparse from ..utils import ( - unified_strdate + int_or_none, + parse_filesize, + unified_strdate, ) class VideoEsriIE(InfoExtractor): _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)' _TEST = { - 'url': 'https://video.esri.com/watch/4228', - 'md5': '170b4d513c2466ed483c150a48384133', + 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications', + 'md5': 'd4aaf1408b221f1b38227a9bbaeb95bc', 'info_dict': { - 'id': '4228', + 'id': '1124', 'ext': 'mp4', - 'title': 'AppStudio for ArcGIS', + 'title': 'ArcGIS Online - Developing Applications', + 'description': 'Jeremy Bartley demonstrates how to develop applications with ArcGIS Online.', 'thumbnail': 're:^https?://.*\.jpg$', - 'upload_date': '20150310', + 'duration': 185, + 'upload_date': '20120419', } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title') - - upload_date_raw = self._search_regex( - r'http-equiv="last-modified" content="(.*)"', - webpage, 'upload date') - upload_date = unified_strdate(upload_date_raw) - - 
settings_info = self._search_regex( - r'evPlayerSettings = {(.*?);\s*$', - webpage, 'settings info', flags=re.MULTILINE | re.DOTALL) - - # thumbnail includes '_x' for large, also has {_m,_t,_s} or - # without size suffix returns full image - thumbnail_path = re.findall( - r'image\': \'(\/thumbs.*)\'', - settings_info)[0] - - if thumbnail_path: - thumbnail = '/'.join(['http://video.esri.com', thumbnail_path]) - - # note that this misses the (exceedly rare) webm files - video_paths = re.findall(r'mp4:(.*)\'', settings_info) - - # find possible http servers of the mp4 files (also has rtsp) - base_url = re.findall( - r'netstreambasepath\':\s\'(h.*)\'', settings_info)[0] - - # these are the numbers used internally, but really map - # to other resolutions, e.g. 960 is 720p. - heights = [480, 720, 960] - videos_by_res = {} - for video_path in video_paths: - url = "{base_url}{video_path}".format( - base_url=base_url, - video_path=video_path) - filename, ext = os.path.splitext(video_path) - height_label = int(filename.split('_')[1]) - videos_by_res[height_label] = { - 'url': url, - 'ext': ext[1:], - 'protocol': 'http', # http-only supported currently - } - formats = [] - for height in heights: - if height in videos_by_res: - formats.append(videos_by_res[height]) + for width, height, content in re.findall( + r'(?s)<li><strong>(\d+)x(\d+):</strong>(.+?)</li>', webpage): + for video_url, ext, filesize in re.findall( + r'<a[^>]+href="([^"]+)">([^<]+) \(([^<]+)\)</a>', content): + formats.append({ + 'url': compat_urlparse.urljoin(url, video_url), + 'ext': ext.lower(), + 'format_id': '%s-%s' % (ext.lower(), height), + 'width': int(width), + 'height': int(height), + 'filesize_approx': parse_filesize(filesize), + }) + self._sort_formats(formats) - result = { + title = self._html_search_meta('title', webpage, 'title') + description = self._html_search_meta( + 'description', webpage, 'description', fatal=False) + + thumbnail = self._html_search_meta('thumbnail', webpage, 'thumbnail', 
fatal=False) + if thumbnail: + thumbnail = re.sub(r'_[st]\.jpg$', '_x.jpg', thumbnail) + + duration = int_or_none(self._search_regex( + [r'var\s+videoSeconds\s*=\s*(\d+)', r"'duration'\s*:\s*(\d+)"], + webpage, 'duration', fatal=False)) + + upload_date = unified_strdate(self._html_search_meta( + 'last-modified', webpage, 'upload date', fatal=None)) + + return { 'id': video_id, 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, 'upload_date': upload_date, - 'formats': formats, + 'formats': formats } - - if thumbnail: - result['thumbnail'] = thumbnail - - return result From 9c21f229236c77a8865c857b43c6cbd95dcc6f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 15:59:35 +0600 Subject: [PATCH 1067/2145] [esri:video] Rename extractor --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{videoesri.py => esri.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename youtube_dl/extractor/{videoesri.py => esri.py} (98%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a4387636f..760f73918 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -695,7 +695,7 @@ from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE -from .videoesri import VideoEsriIE +from .esri import EsriVideoIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE diff --git a/youtube_dl/extractor/videoesri.py b/youtube_dl/extractor/esri.py similarity index 98% rename from youtube_dl/extractor/videoesri.py rename to youtube_dl/extractor/esri.py index 84faba678..bf5d2019f 100644 --- a/youtube_dl/extractor/videoesri.py +++ b/youtube_dl/extractor/esri.py @@ -12,7 +12,7 @@ from ..utils import ( ) -class VideoEsriIE(InfoExtractor): +class EsriVideoIE(InfoExtractor): _VALID_URL = 
r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)' _TEST = { 'url': 'https://video.esri.com/watch/1124/arcgis-online-_dash_-developing-applications', From 1d25e9d173931da0d2cb65b114f44bbf24184f6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 16:00:24 +0600 Subject: [PATCH 1068/2145] [extractor/__init__] Fix order --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 760f73918..a8be63624 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -158,6 +158,7 @@ from .eporner import EpornerIE from .eroprofile import EroProfileIE from .escapist import EscapistIE from .espn import ESPNIE +from .esri import EsriVideoIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .expotv import ExpoTVIE @@ -695,7 +696,6 @@ from .vice import ViceIE from .viddler import ViddlerIE from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE -from .esri import EsriVideoIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE From fab83e24567226fa70e7f5076d961b83239ccfbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 16:10:20 +0600 Subject: [PATCH 1069/2145] Credit @scw for video.esri.com (#5459) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index ded9e87d2..d1693224e 100644 --- a/AUTHORS +++ b/AUTHORS @@ -139,3 +139,4 @@ slangangular Behrouz Abbasi ngld nyuszika7h +Shaun Walbridge From c576ef1e7cfd31ca94ca6025c054b3ae4f611b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 18:13:37 +0600 Subject: [PATCH 1070/2145] [shahid] Improve and simplify --- youtube_dl/extractor/shahid.py | 140 ++++++++++++++++++--------------- 1 file 
changed, 78 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index 399140189..6e9903d5e 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -2,90 +2,106 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urllib_parse from ..utils import ( - js_to_json, ExtractorError, - int_or_none + int_or_none, + parse_iso8601, ) class ShahidIE(InfoExtractor): _VALID_URL = r'https?://shahid\.mbc\.net/ar/episode/(?P<id>\d+)/?' - _TESTS = [ - { - 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', - 'info_dict': { - 'id': '90574', - 'ext': 'm3u8', - 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', - 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', - 'duration': 2972, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } + _TESTS = [{ + 'url': 'https://shahid.mbc.net/ar/episode/90574/%D8%A7%D9%84%D9%85%D9%84%D9%83-%D8%B9%D8%A8%D8%AF%D8%A7%D9%84%D9%84%D9%87-%D8%A7%D9%84%D8%A5%D9%86%D8%B3%D8%A7%D9%86-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-3.html', + 'info_dict': { + 'id': '90574', + 'ext': 'm3u8', + 'title': 'الملك عبدالله الإنسان الموسم 1 كليب 3', + 'description': 'الفيلم الوثائقي - الملك عبد الله الإنسان', + 'duration': 2972, + 'timestamp': 1422057420, + 'upload_date': '20150123', }, - { - # shahid plus subscriber only - 'url': 'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', - 'only_matching': True + 'params': { + # m3u8 download + 'skip_download': True, } - ] + }, { + # shahid plus subscriber only + 'url': 
'https://shahid.mbc.net/ar/episode/90511/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1.html', + 'only_matching': True + }] - _api_vars = { - 'type': 'player', - 'url': 'http://api.shahid.net/api/v1_1', - 'playerType': 'episode', - } + def _handle_error(self, response): + if not isinstance(response, dict): + return + error = response.get('error') + if error: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), + expected=True) + + def _download_json(self, url, video_id, note='Downloading JSON metadata'): + response = super(ShahidIE, self)._download_json(url, video_id, note)['data'] + self._handle_error(response) + return response def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - player_info = '' - flash_vars = self._search_regex('var flashvars = ({[^}]+})', webpage, 'flashvars', None) - if flash_vars is not None: - for line in flash_vars.splitlines(): - if '+' not in line and '(' not in line and ')' not in line: - player_info += line - player_info = self._parse_json(player_info, video_id, js_to_json, False) - if player_info is not None: - for key in self._api_vars: - if key in player_info: - self._api_vars[key] = player_info[key] + api_vars = { + 'id': video_id, + 'type': 'player', + 'url': 'http://api.shahid.net/api/v1_1', + 'playerType': 'episode', + } - player_json_data = self._download_json( - 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-' + video_id + '.type-' + self._api_vars['type'] + '.html', - video_id - )['data'] - if 'url' in player_json_data: - m3u8_url = player_json_data['url'] - else: - for error in player_json_data['error'].values(): - raise ExtractorError(error) - formats = self._extract_m3u8_formats(m3u8_url, video_id) + flashvars = self._search_regex( + r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None) + if flashvars: + for key in 
api_vars.keys(): + value = self._search_regex( + r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key, + flashvars, 'type', default=None, group='value') + if value: + api_vars[key] = value - video_info = self._download_json( - self._api_vars['url'] + '/' + self._api_vars['playerType'] + '/' + video_id + '?apiKey=sh%40hid0nlin3&hash=b2wMCTHpSmyxGqQjJFOycRmLSex%2BBpTK%2Fooxy6vHaqs%3D', - video_id - )['data'] - if video_info.get('error'): - for error in video_info['error']: - raise ExtractorError(error) - video_info = video_info[self._api_vars['playerType']] - title = video_info['title'] - thumbnail = video_info.get('thumbnailUrl') - categories = [category['name'] for category in video_info.get('genres')] - description = video_info.get('description') - duration = int_or_none(video_info.get('duration')) + player = self._download_json( + 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' + % (video_id, api_vars['type']), video_id, 'Downloading player JSON') + + formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') + + video = self._download_json( + '%s/%s/%s?%s' % ( + api_vars['url'], api_vars['playerType'], api_vars['id'], + compat_urllib_parse.urlencode({ + 'apiKey': 'sh@hid0nlin3', + 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }).encode('utf-8')), + video_id, 'Downloading video JSON') + + video = video[api_vars['playerType']] + + title = video['title'] + description = video.get('description') + thumbnail = video.get('thumbnailUrl') + duration = int_or_none(video.get('duration')) + timestamp = parse_iso8601(video.get('referenceDate')) + categories = [ + category['name'] + for category in video.get('genres', []) if 'name' in category] return { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, - 'categories': categories, 'description': description, + 'thumbnail': thumbnail, 'duration': duration, + 'timestamp': timestamp, + 'categories': categories, 'formats': formats, } From 
9303ce3e6969b5818982d6214a8d0ff4e3c95286 Mon Sep 17 00:00:00 2001 From: reddraggone9 <cljenkins9@gmail.com> Date: Thu, 13 Aug 2015 22:11:11 -0500 Subject: [PATCH 1071/2145] [youtube] Fix two-factor authentication --- youtube_dl/extractor/youtube.py | 40 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index facd837ad..bfa9a12a8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,7 +46,7 @@ from ..utils import ( class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor' + _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -128,7 +128,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # Two-Factor # TODO add SMS and phone call support - these require making a request and then prompting the user - if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None: + if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None: tfa_code = self._get_tfa_info() if tfa_code is None: @@ -136,31 +136,27 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False - # Unlike the first login form, secTok and timeStmp are both required for the TFA form + def find_value(element_id): + match = re.search(r'id="%s"\s+value="(.+?)">' % element_id, login_results, re.M | re.U) + if match is None: + self._downloader.report_warning('Failed to get %s - did the page structure change?' 
% id) + return match.group(1) - match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get secTok - did the page structure change?') - secTok = match.group(1) - match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get timeStmp - did the page structure change?') - timeStmp = match.group(1) + challengeId = find_value('challengeId') + challengeType = find_value('challengeType') + gxf = find_value('gxf') tfa_form_strs = { + 'challengeId': challengeId, + 'challengeType': challengeType, # This doesn't appear to change 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', - 'smsToken': '', - 'smsUserPin': tfa_code, - 'smsVerifyPin': 'Verify', - - 'PersistentCookie': 'yes', - 'checkConnection': '', - 'checkedDomains': 'youtube', - 'pstMsg': '1', - 'secTok': secTok, - 'timeStmp': timeStmp, 'service': 'youtube', 'hl': 'en_US', + 'checkedDomains': 'youtube', + 'pstMsg': '0', + 'gxf': gxf, + 'Pin': tfa_code, + 'TrustDevice': 'on', } tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items()) tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii') @@ -173,7 +169,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if tfa_results is False: return False - if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None: + if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None: self._downloader.report_warning('Two-factor code expired. 
Please try again, or use a one-use backup code instead.') return False if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: From 201ea3ee8e392d6c82bb8137b80b4328db40a399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 21:52:22 +0600 Subject: [PATCH 1072/2145] [extractor/common] Improve _hidden_inputs --- youtube_dl/extractor/common.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 16ae4b98f..e2ace827f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -724,16 +724,18 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): - return dict([ - (input.group('name'), input.group('value')) for input in re.finditer( - r'''(?x) - <input\s+ - type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+ - name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+ - (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)? 
- value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value) - ''', html) - ]) + hidden_inputs = {} + for input in re.findall(r'<input([^>]+)>', html): + if not re.search(r'type=(["\'])hidden\1', input): + continue + name = re.search(r'name=(["\'])(?P<value>.+?)\1', input) + if not name: + continue + value = re.search(r'value=(["\'])(?P<value>.*?)\1', input) + if not value: + continue + hidden_inputs[name.group('value')] = value.group('value') + return hidden_inputs def _form_hidden_inputs(self, form_id, html): form = self._search_regex( From e64b756943440d602dc757f81787cad6aee8f412 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 21:55:07 +0600 Subject: [PATCH 1073/2145] [extractor/common] Interactive TFA code input --- youtube_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index e2ace827f..65835d257 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,6 +15,7 @@ import xml.etree.ElementTree from ..compat import ( compat_cookiejar, compat_cookies, + compat_getpass, compat_HTTPError, compat_http_client, compat_urllib_error, @@ -610,7 +611,7 @@ class InfoExtractor(object): return (username, password) - def _get_tfa_info(self): + def _get_tfa_info(self, note='two-factor verification code'): """ Get the two-factor authentication info TODO - asking the user will be required for sms/phone verify @@ -624,7 +625,7 @@ class InfoExtractor(object): if downloader_params.get('twofactor', None) is not None: return downloader_params['twofactor'] - return None + return compat_getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod From 041bc3adc55bbe81649a1c5d283302e5a120659e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 15 Aug 2015 22:03:43 +0600 Subject: [PATCH 1074/2145] [youtube] 
Simplify two-factor authentication --- youtube_dl/extractor/youtube.py | 35 +++++++++++---------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bfa9a12a8..887c46d95 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -33,6 +33,7 @@ from ..utils import ( int_or_none, orderedSet, parse_duration, + remove_start, smuggle_url, str_to_int, unescapeHTML, @@ -129,35 +130,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # TODO add SMS and phone call support - these require making a request and then prompting the user if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info() + tfa_code = self._get_tfa_info('2-step verification code') - if tfa_code is None: - self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>') - self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)') + if not tfa_code: + self._downloader.report_warning( + 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False - def find_value(element_id): - match = re.search(r'id="%s"\s+value="(.+?)">' % element_id, login_results, re.M | re.U) - if match is None: - self._downloader.report_warning('Failed to get %s - did the page structure change?' 
% id) - return match.group(1) + tfa_code = remove_start(tfa_code, 'G-') - challengeId = find_value('challengeId') - challengeType = find_value('challengeType') - gxf = find_value('gxf') + tfa_form_strs = self._form_hidden_inputs('challenge', login_results) - tfa_form_strs = { - 'challengeId': challengeId, - 'challengeType': challengeType, # This doesn't appear to change - 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', - 'service': 'youtube', - 'hl': 'en_US', - 'checkedDomains': 'youtube', - 'pstMsg': '0', - 'gxf': gxf, + tfa_form_strs.update({ 'Pin': tfa_code, 'TrustDevice': 'on', - } + }) + tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items()) tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii') @@ -170,7 +159,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return False if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.') + self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') return False if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: self._downloader.report_warning('unable to log in - did the page structure change?') From eaa5646483d22d3b658dcf63b61e6c3b67aa5bc5 Mon Sep 17 00:00:00 2001 From: "Sergey M." <dstftw@gmail.com> Date: Sat, 15 Aug 2015 22:57:06 +0600 Subject: [PATCH 1075/2145] [README.md] Clarify configuration file usage (Closes #6530) --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 542a7c26a..25844eb6d 100644 --- a/README.md +++ b/README.md @@ -236,7 +236,14 @@ which means you can modify it, redistribute it or use it however you like. 
# CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<user name>\youtube-dl.conf`. +You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux, system wide configuration file is located at `/etc/youtube-dl.conf` and user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. For example, with the following configration file youtube-dl will always extract the audio, not copy the mtime and use proxy: +``` +--extract-audio +--no-mtime +--proxy 127.0.0.1:3128 +``` + +You can use `--ignore-config` if you want to disable configuration file for a particular youtube-dl run. 
### Authentication with `.netrc` file ### From 221a59fe6fe22ec286830319e72cbabdc83fd02f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Caletka?= <ondrej@caletka.cz> Date: Sun, 9 Aug 2015 12:27:31 +0200 Subject: [PATCH 1076/2145] [playtvak] Initial support for videos from Mafra group servers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Support for videos in articles from idnes.cz, lidovky.cz, metro.cz, as well as video and live streams from playtvak.cz Signed-off-by: Ondřej Caletka <ondrej@caletka.cz> --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/playtvak.py | 127 +++++++++++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 youtube_dl/extractor/playtvak.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 83d21bd15..5307240f8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -448,6 +448,7 @@ from .planetaplay import PlanetaPlayIE from .pladform import PladformIE from .played import PlayedIE from .playfm import PlayFMIE +from .playtvak import PlaytvakIE from .playvid import PlayvidIE from .playwire import PlaywireIE from .podomatic import PodomaticIE diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py new file mode 100644 index 000000000..6dff6650c --- /dev/null +++ b/youtube_dl/extractor/playtvak.py @@ -0,0 +1,127 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urlparse, + compat_urllib_parse, +) +from ..utils import ExtractorError + + +def _extract_json(code): + return re.sub( + r'(?s)^VideoPlayer.data\("", ({.*})\);?\s*?(?://[^\n]*)*$', r'\1', code) + + +class PlaytvakIE(InfoExtractor): + _VALID_URL = r'https?://.*?(playtvak|idnes|lidovky|metro)\.cz/.*\?c=(?P<id>[A-Z][0-9]{6}_[0-9]{6}_.*)' + _TESTS = [{ + 'url': 
'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', + 'md5': '4525ae312c324b4be2f4603cc78ceb4a', + 'info_dict': { + 'id': 'A150730_150323_hodinovy-manzel_kuko', + 'ext': 'mp4', + 'title': 'Vyžeňte vosy a sršně ze zahrady', + 'thumbnail': 'http://oidnes.cz/15/074/mobil/KUK5cea00_010hodmanel58154.jpg', + 'description': 'Málo co kazí atmosféru venkovního posezení tak jako neustálé bzučení kolem hlavy. Vyzkoušejte náš lapač a odpuzovač vos a sršňů.', + } + }, { # live video test + 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', + 'info_dict': { + 'id': 'A150624_164934_planespotting_cat', + 'ext': 'flv', + 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 'http://data.idnes.cz/soubory/servisni-play-porady/89A150630_ACEK_026_VIDEOPLAYER-STREA.PNG', + 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'is_live': True, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, { # idnes.cz + 'url': 'http://zpravy.idnes.cz/pes-zavreny-v-aute-rozbijeni-okynek-v-aute-fj5-/domaci.aspx?c=A150809_104116_domaci_pku', + 'md5': '819832ba33cd7016e58a6658577fe289', + 'info_dict': { + 'id': 'A150809_104116_domaci_pku', + 'ext': 'mp4', + 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', + 'thumbnail': 'http://i.idnes.cz/15/081/vidw/SHA5d1786_pizzaauto.jpg', + 'description': 'Na sociálních sítích se objevila výzva, aby lidé, kteří v horkých letních dnech uvidí v zaparkovaném autě zavřeného psa, neváhali rozbít okénko. Zastánci tohoto postoje argumentují zdravím zvířete, které v dusnu může zkolabovat. 
Policie doporučuje nejprve volat tísňovou linku.', + } + }, { # lidovky.cz + 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', + 'md5': 'c7209ac4ba9d234d4ad5bab7485bcee8', + 'info_dict': { + 'id': 'A150808_214044_ln-video_ELE', + 'ext': 'mp4', + 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', + 'thumbnail': 'http://i.idnes.cz/15/081/vidw/PID5d1d52_vandas3.jpg', + 'description': 'Desítky lidí se sešly v Praze na protest proti imigrantům. Současně probíhala i demonstrace na jejich podporu. Na Staroměstském náměstí vystoupil i předseda dělnické strany Tomáš Vandas a kontroverzní slovenský politik Marian Kotleba. Dalšího slovenského nacionalistu Mariána Magáta odvedla policie.', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + infourl = self._html_search_regex(r'Misc.videoFLV\({ data: "([^"]+)"', webpage, 'xmlinfourl') + parsedurl = compat_urlparse.urlparse(infourl) + qs = compat_urlparse.parse_qs(parsedurl.query) + if 'reklama' in qs: # Don't ask for ads + qs['reklama'] = ['0'] + qs['type'] = ['js'] # Ask for JS-based info file + newquery = compat_urllib_parse.urlencode(qs, True) + infourl = compat_urlparse.urlunparse(parsedurl[:4] + (newquery, '')) + jsoninfo = self._download_json(infourl, video_id, transform_source=_extract_json) + + item = None + for i in jsoninfo['items']: + if i['type'] == 'video' or i['type'] == 'stream': + item = i + break + if item is None: + raise ExtractorError('No suitable stream found') + title = item['title'] + thumbnail = item['image'] + is_live = item['type'] == 'stream' + if is_live: + title = self._live_title(title) + + formats = [] + for fmt in item['video']: + format_entry = {'url': fmt['file'], + 'format_id': ("%s_%s" % (fmt['format'], fmt['quality'])), + } + if fmt['quality'] == 'middle': + format_entry['quality'] = -2 + elif fmt['quality'] == 'low': + format_entry['quality'] = 
-3 + + if fmt['format'] == 'mp4': + format_entry['ext'] = 'mp4' + elif fmt['format'] == 'webm': + format_entry['ext'] = 'webm' + elif fmt['format'] == 'apple': + format_entry['ext'] = 'mp4' + format_entry['protocol'] = 'm3u8' + # Some streams have mp3 audio which does not play + # well with ffmpeg filter aac_adtstoasc + format_entry['preference'] = -1 + elif fmt['format'] == 'rtmp': + format_entry['ext'] = 'flv' + else: # Other formats not supported yet + continue + + formats.append(format_entry) + + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'description': self._og_search_description(webpage), + 'is_live': is_live, + 'formats': formats, + } From 276c9897720fe087924aef4ac80cf528e621b832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Aug 2015 03:07:05 +0600 Subject: [PATCH 1077/2145] [playtvak] Improve and simplify --- youtube_dl/extractor/playtvak.py | 145 ++++++++++++++++++++----------- 1 file changed, 92 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 6dff6650c..4e5034dc6 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -1,23 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( compat_urlparse, compat_urllib_parse, ) -from ..utils import ExtractorError - - -def _extract_json(code): - return re.sub( - r'(?s)^VideoPlayer.data\("", ({.*})\);?\s*?(?://[^\n]*)*$', r'\1', code) +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + qualities, +) class PlaytvakIE(InfoExtractor): - _VALID_URL = r'https?://.*?(playtvak|idnes|lidovky|metro)\.cz/.*\?c=(?P<id>[A-Z][0-9]{6}_[0-9]{6}_.*)' + IE_DESC = 'Playtvak.cz, iDNES.cz and Lidovky.cz' + _VALID_URL = r'https?://(?:.+?\.)?(?:playtvak|idnes|lidovky|metro)\.cz/.*\?(?:c|idvideo)=(?P<id>[^&]+)' _TESTS = [{ 'url': 
'http://www.playtvak.cz/vyzente-vosy-a-srsne-ze-zahrady-dn5-/hodinovy-manzel.aspx?c=A150730_150323_hodinovy-manzel_kuko', 'md5': '4525ae312c324b4be2f4603cc78ceb4a', @@ -25,8 +24,12 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150730_150323_hodinovy-manzel_kuko', 'ext': 'mp4', 'title': 'Vyžeňte vosy a sršně ze zahrady', - 'thumbnail': 'http://oidnes.cz/15/074/mobil/KUK5cea00_010hodmanel58154.jpg', - 'description': 'Málo co kazí atmosféru venkovního posezení tak jako neustálé bzučení kolem hlavy. Vyzkoušejte náš lapač a odpuzovač vos a sršňů.', + 'description': 'md5:f93d398691044d303bc4a3de62f3e976', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 279, + 'timestamp': 1438732860, + 'upload_date': '20150805', + 'is_live': False, } }, { # live video test 'url': 'http://slowtv.playtvak.cz/planespotting-0pr-/planespotting.aspx?c=A150624_164934_planespotting_cat', @@ -34,8 +37,8 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150624_164934_planespotting_cat', 'ext': 'flv', 'title': 're:^Přímý přenos iDNES.cz [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'thumbnail': 'http://data.idnes.cz/soubory/servisni-play-porady/89A150630_ACEK_026_VIDEOPLAYER-STREA.PNG', 'description': 'Sledujte provoz na ranveji Letiště Václava Havla v Praze', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', 'is_live': True, }, 'params': { @@ -48,8 +51,12 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150809_104116_domaci_pku', 'ext': 'mp4', 'title': 'Zavřeli jsme mraženou pizzu do auta. Upekla se', - 'thumbnail': 'http://i.idnes.cz/15/081/vidw/SHA5d1786_pizzaauto.jpg', - 'description': 'Na sociálních sítích se objevila výzva, aby lidé, kteří v horkých letních dnech uvidí v zaparkovaném autě zavřeného psa, neváhali rozbít okénko. Zastánci tohoto postoje argumentují zdravím zvířete, které v dusnu může zkolabovat. 
Policie doporučuje nejprve volat tísňovou linku.', + 'description': 'md5:01e73f02329e2e5760bd5eed4d42e3c2', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'duration': 39, + 'timestamp': 1438969140, + 'upload_date': '20150807', + 'is_live': False, } }, { # lidovky.cz 'url': 'http://www.lidovky.cz/dalsi-demonstrace-v-praze-o-migraci-duq-/video.aspx?c=A150808_214044_ln-video_ELE', @@ -58,70 +65,102 @@ class PlaytvakIE(InfoExtractor): 'id': 'A150808_214044_ln-video_ELE', 'ext': 'mp4', 'title': 'Táhni! Demonstrace proti imigrantům budila emoce', - 'thumbnail': 'http://i.idnes.cz/15/081/vidw/PID5d1d52_vandas3.jpg', - 'description': 'Desítky lidí se sešly v Praze na protest proti imigrantům. Současně probíhala i demonstrace na jejich podporu. Na Staroměstském náměstí vystoupil i předseda dělnické strany Tomáš Vandas a kontroverzní slovenský politik Marian Kotleba. Dalšího slovenského nacionalistu Mariána Magáta odvedla policie.', + 'description': 'md5:97c81d589a9491fbfa323c9fa3cca72c', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1439052180, + 'upload_date': '20150808', + 'is_live': False, } + }, { + 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - infourl = self._html_search_regex(r'Misc.videoFLV\({ data: "([^"]+)"', webpage, 'xmlinfourl') - parsedurl = compat_urlparse.urlparse(infourl) - qs = compat_urlparse.parse_qs(parsedurl.query) - if 'reklama' in qs: # Don't ask for ads - qs['reklama'] = ['0'] - qs['type'] = ['js'] # Ask for JS-based info file - newquery = compat_urllib_parse.urlencode(qs, True) - infourl = compat_urlparse.urlunparse(parsedurl[:4] + (newquery, '')) - jsoninfo = self._download_json(infourl, video_id, transform_source=_extract_json) + + info_url = self._html_search_regex( + r'Misc\.videoFLV\(\s*{\s*data\s*:\s*"([^"]+)"', webpage, 'info url') + + 
parsed_url = compat_urlparse.urlparse(info_url) + + qs = compat_urlparse.parse_qs(parsed_url.query) + qs.update({ + 'reklama': ['0'], + 'type': ['js'], + }) + + info_url = compat_urlparse.urlunparse( + parsed_url._replace(query = compat_urllib_parse.urlencode(qs, True))) + + json_info = self._download_json( + info_url, video_id, + transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) item = None - for i in jsoninfo['items']: - if i['type'] == 'video' or i['type'] == 'stream': + for i in json_info['items']: + if i.get('type') == 'video' or i.get('type') == 'stream': item = i break - if item is None: + if not item: raise ExtractorError('No suitable stream found') - title = item['title'] - thumbnail = item['image'] - is_live = item['type'] == 'stream' - if is_live: - title = self._live_title(title) + + quality = qualities(['low', 'middle', 'high']) formats = [] for fmt in item['video']: - format_entry = {'url': fmt['file'], - 'format_id': ("%s_%s" % (fmt['format'], fmt['quality'])), - } - if fmt['quality'] == 'middle': - format_entry['quality'] = -2 - elif fmt['quality'] == 'low': - format_entry['quality'] = -3 + video_url = fmt.get('file') + if not video_url: + continue - if fmt['format'] == 'mp4': - format_entry['ext'] = 'mp4' - elif fmt['format'] == 'webm': - format_entry['ext'] = 'webm' - elif fmt['format'] == 'apple': - format_entry['ext'] = 'mp4' - format_entry['protocol'] = 'm3u8' + format_ = fmt['format'] + format_id = '%s_%s' % (format_, fmt['quality']) + preference = None + + if format_ in ['mp4', 'webm']: + ext = format_ + elif format_ == 'rtmp': + ext = 'flv' + elif format_ == 'apple': + ext = 'mp4' # Some streams have mp3 audio which does not play # well with ffmpeg filter aac_adtstoasc - format_entry['preference'] = -1 - elif fmt['format'] == 'rtmp': - format_entry['ext'] = 'flv' + preference = -1 + elif format_ == 'adobe': # f4m manifest fails with 404 in 80% of requests + continue else: # Other formats not supported yet continue - 
formats.append(format_entry) - + formats.append({ + 'url': video_url, + 'ext': ext, + 'format_id': format_id, + 'quality': quality(fmt.get('quality')), + 'preference': preference, + }) self._sort_formats(formats) + + title = item['title'] + is_live = item['type'] == 'stream' + if is_live: + title = self._live_title(title) + timestamp = None + duration = None + if not is_live: + duration = int_or_none(item.get('length')) + timestamp = item.get('published') + if timestamp: + timestamp = parse_iso8601(timestamp[:-5]) + return { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), + 'thumbnail': item.get('image'), + 'duration': duration, + 'timestamp': timestamp, 'is_live': is_live, 'formats': formats, } From 6900b4f6f52ba783bb7a6028fd174250c8832a38 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 16 Aug 2015 01:05:04 +0200 Subject: [PATCH 1078/2145] release 2015.08.16 --- CONTRIBUTING.md | 2 +- docs/supportedsites.md | 6 ++++++ youtube_dl/version.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 588b15bde..42333c450 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,7 +125,7 @@ If you want to add support for a new site, you can follow this quick list (assum ``` 5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will be then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. -7. 
Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L62-L200). Add tests and code for as many as you want. 8. If you can, check the code with [flake8](https://pypi.python.org/pypi/flake8). 9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e21471102..9099e2da4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -150,6 +150,7 @@ - **EroProfile** - **Escapist** - **ESPN** (Currently broken) + - **EsriVideo** - **EveryonesMixtape** - **exfm**: ex.fm - **ExpoTV** @@ -220,6 +221,8 @@ - **imdb:list**: Internet Movie Database lists - **Imgur** - **Ina** + - **Indavideo** + - **IndavideoEmbed** - **InfoQ** - **Instagram** - **instagram:user**: Instagram user profile @@ -386,6 +389,7 @@ - **PlanetaPlay** - **play.fm** - **played.to** + - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** - **Playwire** - **plus.google**: Google Plus @@ -432,6 +436,7 @@ - **rtve.es:alacarta**: RTVE a la carta - **rtve.es:infantil**: RTVE infantil - **rtve.es:live**: RTVE.es live streams + - **RTVNH** - **RUHD** - **rutube**: Rutube videos - **rutube:channel**: Rutube channels @@ -455,6 +460,7 @@ - **ServingSys** - **Sexu** - **SexyKarma**: 
Sexy Karma and Watch Indian Porn + - **Shahid** - **Shared** - **ShareSix** - **Sina** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6462d4477..689d6fca7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.09' +__version__ = '2015.08.16' From 974f1a385a452b1c86d6f3ff16035b30baaeeb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Aug 2015 17:22:13 +0600 Subject: [PATCH 1079/2145] [playtvak] Improve description extraction and add test for metro --- youtube_dl/extractor/playtvak.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 4e5034dc6..2b338966f 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -71,6 +71,19 @@ class PlaytvakIE(InfoExtractor): 'upload_date': '20150808', 'is_live': False, } + }, { # metro.cz + 'url': 'http://www.metro.cz/video-pod-billboardem-se-na-vltavske-roztocil-kolotoc-deti-vozil-jen-par-hodin-1hx-/metro-extra.aspx?c=A141111_173251_metro-extra_row', + 'md5': '84fc1deedcac37b7d4a6ccae7c716668', + 'info_dict': { + 'id': 'A141111_173251_metro-extra_row', + 'ext': 'mp4', + 'title': 'Recesisté udělali z billboardu kolotoč', + 'description': 'md5:7369926049588c3989a66c9c1a043c4c', + 'thumbnail': 're:(?i)^https?://.*\.(?:jpg|png)$', + 'timestamp': 1415725500, + 'upload_date': '20141111', + 'is_live': False, + } }, { 'url': 'http://www.playtvak.cz/embed.aspx?idvideo=V150729_141549_play-porad_kuko', 'only_matching': True, @@ -146,6 +159,8 @@ class PlaytvakIE(InfoExtractor): is_live = item['type'] == 'stream' if is_live: title = self._live_title(title) + description = self._og_search_description(webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description') timestamp = None duration = None if not is_live: @@ -157,7 +172,7 @@ 
class PlaytvakIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': self._og_search_description(webpage), + 'description': description, 'thumbnail': item.get('image'), 'duration': duration, 'timestamp': timestamp, From 7fc18d930917ad407c78bb9b0465dc4fae2fb335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Aug 2015 19:53:14 +0600 Subject: [PATCH 1080/2145] [screenwavemedia] Fix extraction (Closes #6575) --- youtube_dl/extractor/screenwavemedia.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 3bc84989e..78b068be2 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -1,6 +1,8 @@ # encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -35,15 +37,18 @@ class ScreenwaveMediaIE(InfoExtractor): sources = self._parse_json( js_to_json( - self._search_regex( - r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, - 'sources', - ).replace( - "' + thisObj.options.videoserver + '", - videoserver - ).replace( - "' + playerVidId + '", - video_id + re.sub( + r'(?s)/\*.*?\*/', '', + self._search_regex( + r"sources\s*:\s*(\[[^\]]+?\])", playerconfig, + 'sources', + ).replace( + "' + thisObj.options.videoserver + '", + videoserver + ).replace( + "' + playerVidId + '", + video_id + ) ) ), video_id From 008687427725b8d857c44d75f358059c2533539a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Aug 2015 19:59:03 +0600 Subject: [PATCH 1081/2145] [playtvak] Use tuples --- youtube_dl/extractor/playtvak.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 2b338966f..278fdc1aa 100644 --- a/youtube_dl/extractor/playtvak.py +++ 
b/youtube_dl/extractor/playtvak.py @@ -120,7 +120,7 @@ class PlaytvakIE(InfoExtractor): if not item: raise ExtractorError('No suitable stream found') - quality = qualities(['low', 'middle', 'high']) + quality = qualities(('low', 'middle', 'high')) formats = [] for fmt in item['video']: @@ -132,7 +132,7 @@ class PlaytvakIE(InfoExtractor): format_id = '%s_%s' % (format_, fmt['quality']) preference = None - if format_ in ['mp4', 'webm']: + if format_ in ('mp4', 'webm'): ext = format_ elif format_ == 'rtmp': ext = 'flv' From 8626b23e4ea091c4093c25626ca9fc12293b2830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Aug 2015 20:18:15 +0600 Subject: [PATCH 1082/2145] [screenwavemedia] Make more robust --- youtube_dl/extractor/screenwavemedia.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 78b068be2..220d39078 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -51,19 +51,38 @@ class ScreenwaveMediaIE(InfoExtractor): ) ) ), - video_id + video_id, fatal=False ) + # Fallback to hardcoded sources if JS changes again + if not sources: + sources = [{ + 'file': 'http://%s/vod/%s_%s.mp4' % (videoserver, video_id, format_id), + 'type': 'mp4', + 'label': format_label, + } for format_id, format_label in ( + ('low', '144p Low'), ('med', '160p Med'), ('high', '360p High'), ('hd1', '720p HD1'))] + sources.append({ + 'file': 'http://%s/vod/smil:%s.smil/playlist.m3u8' % (videoserver, video_id), + 'type': 'hls', + }) + formats = [] for source in sources: if source['type'] == 'hls': formats.extend(self._extract_m3u8_formats(source['file'], video_id)) else: + file_ = source.get('file') + if not file_: + continue format_label = source.get('label') + format_id = self._search_regex( + r'_(.+?)\.[^.]+$', file_, 'format id', default=None) height = 
int_or_none(self._search_regex( r'^(\d+)[pP]', format_label, 'height', default=None)) formats.append({ 'url': source['file'], + 'format_id': format_id, 'format': format_label, 'ext': source.get('type'), 'height': height, From f74a7348f6ac52259ea66b74a40165b448fbd702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 16 Aug 2015 23:33:17 +0600 Subject: [PATCH 1083/2145] [youtube:search_url] Fix extraction (Closes #6578) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 887c46d95..4d1ca9298 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1762,7 +1762,7 @@ class YoutubeSearchURLIE(InfoExtractor): r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') part_codes = re.findall( - r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) + r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) entries = [] for part_code in part_codes: part_title = self._html_search_regex( From cbaed4bb5e5e90103a1164d9326043a3abd0bf83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Aug 2015 02:04:13 +0600 Subject: [PATCH 1084/2145] [youtube] Expand _VALID_URL to support vid.plus --- youtube_dl/extractor/youtube.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4d1ca9298..8e2da46e3 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -202,7 +202,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): v= ) )) - |youtu\.be/ # just youtu.be/xxxx + |(?: + youtu\.be| # just youtu.be/xxxx + vid\.plus # or vid.plus/xxxx + )/ |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? 
# all until now is optional -> you can pass the naked ID @@ -624,6 +627,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + }, + { + 'url': 'http://vid.plus/FlRa-iH7PGw', + 'only_matching': True, } ] From c00c7c0af0fdcb380aef0ea9e072a61979d17816 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 16 Aug 2015 23:39:50 +0200 Subject: [PATCH 1085/2145] [sportdeutschland] Fix extraction --- youtube_dl/extractor/sportdeutschland.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 1a57aebf1..7ec6c613f 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -38,10 +38,12 @@ class SportDeutschlandIE(InfoExtractor): 'upload_date': '20140825', 'description': 'md5:60a20536b57cee7d9a4ec005e8687504', 'timestamp': 1408976060, + 'duration': 2732, 'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. 
Keun Lee', 'thumbnail': 're:^https?://.*\.jpg$', 'view_count': int, 'categories': ['Li-Ning Badminton WM 2014'], + } }] @@ -50,7 +52,7 @@ class SportDeutschlandIE(InfoExtractor): video_id = mobj.group('id') sport_id = mobj.group('sport') - api_url = 'http://splink.tv/api/permalinks/%s/%s' % ( + api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( sport_id, video_id) req = compat_urllib_request.Request(api_url, headers={ 'Accept': 'application/vnd.vidibus.v2.html+json', @@ -58,12 +60,11 @@ class SportDeutschlandIE(InfoExtractor): }) data = self._download_json(req, video_id) - categories = list(data.get('section', {}).get('tags', {}).values()) asset = data['asset'] - assets_info = self._download_json(asset['url'], video_id) + categories = [data['section']['title']] formats = [] - smil_url = assets_info['video'] + smil_url = asset['video'] if '.smil' in smil_url: m3u8_url = smil_url.replace('.smil', '.m3u8') formats.extend( @@ -91,6 +92,7 @@ class SportDeutschlandIE(InfoExtractor): 'title': asset['title'], 'thumbnail': asset.get('image'), 'description': asset.get('teaser'), + 'duration': asset.get('duration'), 'categories': categories, 'view_count': asset.get('views'), 'rtmp_live': asset.get('live'), From 0fa5795b85f8d97bf67f10e39a79b49656be58db Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 16 Aug 2015 23:40:07 +0200 Subject: [PATCH 1086/2145] release 2015.08.16.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 689d6fca7..c090c6df7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.16' +__version__ = '2015.08.16.1' From 369c12e038c3183a0e725a929dd9bed4ec35fa11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Aug 2015 20:16:43 +0600 Subject: [PATCH 
1087/2145] [twitch] Allow untitled videos (Closes #6585) --- youtube_dl/extractor/twitch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index a2b6a35aa..0521257e5 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -132,7 +132,7 @@ class TwitchItemBaseIE(TwitchBaseIE): def _extract_info(self, info): return { 'id': info['_id'], - 'title': info['title'], + 'title': info.get('title') or 'Untitled Broadcast', 'description': info['description'], 'duration': info['length'], 'thumbnail': info['preview'], From 7a6e8a1b17a6a821d9200531ebf65562ccc2d428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Aug 2015 20:20:04 +0600 Subject: [PATCH 1088/2145] [twitch] Make more robust --- youtube_dl/extractor/twitch.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 0521257e5..8cba97bd4 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -15,6 +15,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + int_or_none, parse_duration, parse_iso8601, ) @@ -133,13 +134,13 @@ class TwitchItemBaseIE(TwitchBaseIE): return { 'id': info['_id'], 'title': info.get('title') or 'Untitled Broadcast', - 'description': info['description'], - 'duration': info['length'], - 'thumbnail': info['preview'], - 'uploader': info['channel']['display_name'], - 'uploader_id': info['channel']['name'], - 'timestamp': parse_iso8601(info['recorded_at']), - 'view_count': info['views'], + 'description': info.get('description'), + 'duration': int_or_none(info.get('length')), + 'thumbnail': info.get('preview'), + 'uploader': info.get('channel', {}).get('display_name'), + 'uploader_id': info.get('channel', {}).get('name'), + 'timestamp': parse_iso8601(info.get('recorded_at')), + 'view_count': 
int_or_none(info.get('views')), } def _real_extract(self, url): From 9c724a980210ec6a7659fe869cce401dde6e189d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 17 Aug 2015 20:23:52 +0600 Subject: [PATCH 1089/2145] [twitch:vod] Add test for #6585 --- youtube_dl/extractor/twitch.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 8cba97bd4..4f4eb6d72 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -188,7 +188,7 @@ class TwitchVodIE(TwitchItemBaseIE): _ITEM_TYPE = 'vod' _ITEM_SHORTCUT = 'v' - _TEST = { + _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', 'info_dict': { 'id': 'v6528877', @@ -207,7 +207,26 @@ class TwitchVodIE(TwitchItemBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + # Untitled broadcast (title is None) + 'url': 'http://www.twitch.tv/belkao_o/v/11230755', + 'info_dict': { + 'id': 'v11230755', + 'ext': 'mp4', + 'title': 'Untitled Broadcast', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1638, + 'timestamp': 1439746708, + 'upload_date': '20150816', + 'uploader': 'BelkAO_o', + 'uploader_id': 'belkao_o', + 'view_count': int, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] def _real_extract(self, url): item_id = self._match_id(url) From 3b9b32f404ab09d9dc801dd8ec57d79711be5cb3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 18 Aug 2015 13:02:41 +0200 Subject: [PATCH 1090/2145] [libsyn] Strip options from player URL --- youtube_dl/extractor/libsyn.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 9ab1416f5..d375695f5 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -8,9 +8,9 @@ from ..utils import unified_strdate class 
LibsynIE(InfoExtractor): - _VALID_URL = r'https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)' + _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' - _TEST = { + _TESTS = [{ 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', 'md5': '443360ee1b58007bc3dcf09b41d093bb', 'info_dict': { @@ -19,12 +19,24 @@ class LibsynIE(InfoExtractor): 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', + 'thumbnail': 're:^https?://.*', }, - } + }, { + 'url': 'https://html5-player.libsyn.com/embed/episode/id/3727166/height/75/width/200/theme/standard/direction/no/autoplay/no/autonext/no/thumbnail/no/preload/no/no_addthis/no/', + 'md5': '6c5cb21acd622d754d3b1a92b582ce42', + 'info_dict': { + 'id': '3727166', + 'ext': 'mp3', + 'title': 'Clients From Hell Podcast - How a Sex Toy Company Kickstarted my Freelance Career', + 'upload_date': '20150818', + 'thumbnail': 're:^https?://.*', + } + }] def _real_extract(self, url): - video_id = self._match_id(url) - + m = re.match(self._VALID_URL, url) + video_id = m.group('id') + url = m.group('mainurl') webpage = self._download_webpage(url, video_id) formats = [{ @@ -32,20 +44,18 @@ class LibsynIE(InfoExtractor): } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] podcast_title = self._search_regex( - r'<h2>([^<]+)</h2>', webpage, 'title') + r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None) episode_title = self._search_regex( - r'<h3>([^<]+)</h3>', webpage, 'title', default=None) + r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title') title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title description = self._html_search_regex( r'<div id="info_text_body">(.+?)</div>', webpage, - 'description', fatal=False) - + 'description', default=None) 
thumbnail = self._search_regex( r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"', webpage, 'thumbnail', fatal=False) - release_date = unified_strdate(self._search_regex( r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False)) From 34a4cd0a34bc9f07d865b02f6982fba60421ed0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Aug 2015 20:02:56 +0600 Subject: [PATCH 1091/2145] [telecinco] Relax _VALID_URL (Closes #6601) --- youtube_dl/extractor/telecinco.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index a0c744fd1..ae94f055c 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -6,7 +6,7 @@ from .mitele import MiTeleIE class TelecincoIE(MiTeleIE): IE_NAME = 'telecinco.es' - _VALID_URL = r'https?://www\.telecinco\.es/[^/]+/[^/]+/(?:[^/]+/)?(?P<id>.*?)\.html' + _VALID_URL = r'https?://www\.telecinco\.es/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', @@ -23,4 +23,7 @@ class TelecincoIE(MiTeleIE): }, { 'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html', 'only_matching': True, + }, { + 'url': 'http://www.telecinco.es/espanasinirmaslejos/Espana-gran-destino-turistico_2_1240605043.html', + 'only_matching': True, }] From 03c635a4b57e6ea4b874029d9fe3738508f6fc7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Aug 2015 20:26:45 +0600 Subject: [PATCH 1092/2145] [twitch] Fix login (Closes #6599) --- youtube_dl/extractor/twitch.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 4f4eb6d72..023911c41 100644 --- 
a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -12,6 +12,7 @@ from ..compat import ( compat_urllib_parse, compat_urllib_parse_urlparse, compat_urllib_request, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -27,7 +28,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'http://usher.twitch.tv' _LOGIN_URL = 'https://secure.twitch.tv/login' - _LOGIN_POST_URL = 'https://passport.twitch.tv/authorize' + _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -70,8 +71,15 @@ class TwitchBaseIE(InfoExtractor): 'password': password.encode('utf-8'), }) + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_POST_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + request = compat_urllib_request.Request( - self._LOGIN_POST_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) request.add_header('Referer', self._LOGIN_URL) response = self._download_webpage( request, None, 'Logging in as %s' % username) From 559f4c550f215a657ce386cab572bfc212128595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Aug 2015 20:27:58 +0600 Subject: [PATCH 1093/2145] [playtvak] PEP 8 --- youtube_dl/extractor/playtvak.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/playtvak.py b/youtube_dl/extractor/playtvak.py index 278fdc1aa..e360404f7 100644 --- a/youtube_dl/extractor/playtvak.py +++ b/youtube_dl/extractor/playtvak.py @@ -106,7 +106,7 @@ class PlaytvakIE(InfoExtractor): }) info_url = compat_urlparse.urlunparse( - parsed_url._replace(query = compat_urllib_parse.urlencode(qs, True))) + parsed_url._replace(query=compat_urllib_parse.urlencode(qs, 
True))) json_info = self._download_json( info_url, video_id, From f3a65d9636908ee49ff3d50c24efb8067caa32c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Aug 2015 21:10:52 +0600 Subject: [PATCH 1094/2145] [travis] Move to new infrastructure We don't use rtmpdump in tests anyway --- .travis.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 511bee64c..e78a2fa76 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,9 +5,7 @@ python: - "3.2" - "3.3" - "3.4" -before_install: - - sudo apt-get update -qq - - sudo apt-get install -yqq rtmpdump +sudo: false script: nosetests test --verbose notifications: email: From a01da8bbf83dfd4f87e3fdd105b9f7c850e76cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 18 Aug 2015 23:02:57 +0600 Subject: [PATCH 1095/2145] [crunchyroll] Workaround fplive.net rtmp URLs (Closes #5881) --- youtube_dl/extractor/crunchyroll.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index d1b6d7366..33a033a7f 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -14,11 +14,13 @@ from ..compat import ( compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, + compat_urlparse, ) from ..utils import ( ExtractorError, bytes_to_intlist, intlist_to_bytes, + remove_end, unified_strdate, urlencode_postdata, ) @@ -279,6 +281,20 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text stream_info = streamdata.find('./{default}preload/stream_info') video_url = stream_info.find('./host').text video_play_path = stream_info.find('./file').text + + if '.fplive.net/' in video_url: + video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) + parsed_video_url = compat_urlparse.urlparse(video_url) + direct_video_url = 
compat_urlparse.urlunparse(parsed_video_url._replace( + netloc='v.lvlt.crcdn.net', + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) + if self._is_valid_url(direct_video_url, video_id, video_format): + formats.append({ + 'url': direct_video_url, + 'format_id': video_format, + }) + continue + formats.append({ 'url': video_url, 'play_path': video_play_path, From ca681f7041838fa215f8ab5266cd7b442f3f9445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 19 Aug 2015 20:52:36 +0600 Subject: [PATCH 1096/2145] [videobam] Remove extractor videobam.com redirects to sendvid.com now --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/videobam.py | 81 -------------------------------- 2 files changed, 82 deletions(-) delete mode 100644 youtube_dl/extractor/videobam.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fa9acc923..c8c9f1855 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -696,7 +696,6 @@ from .vgtv import ( from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE -from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py deleted file mode 100644 index 0eb3d9414..000000000 --- a/youtube_dl/extractor/videobam.py +++ /dev/null @@ -1,81 +0,0 @@ -from __future__ import unicode_literals - -import re -import json - -from .common import InfoExtractor -from ..utils import int_or_none - - -class VideoBamIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P<id>[a-zA-Z]+)' - - _TESTS = [ - { - 'url': 'http://videobam.com/OiJQM', - 'md5': 'db471f27763a531f10416a0c58b5a1e0', - 'info_dict': { - 'id': 'OiJQM', - 'ext': 'mp4', - 'title': 'Is Alcohol Worse Than Ecstasy?', 
- 'description': 'md5:d25b96151515c91debc42bfbb3eb2683', - 'uploader': 'frihetsvinge', - }, - }, - { - 'url': 'http://videobam.com/pqLvq', - 'md5': 'd9a565b5379a99126ef94e1d7f9a383e', - 'note': 'HD video', - 'info_dict': { - 'id': 'pqLvq', - 'ext': 'mp4', - 'title': '_', - } - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page') - - formats = [] - - for preference, format_id in enumerate(['low', 'high']): - mobj = re.search(r"%s: '(?P<url>[^']+)'" % format_id, page) - if not mobj: - continue - formats.append({ - 'url': mobj.group('url'), - 'ext': 'mp4', - 'format_id': format_id, - 'preference': preference, - }) - - if not formats: - player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config')) - formats = [{ - 'url': item['url'], - 'ext': 'mp4', - } for item in player_config['playlist'] if 'autoPlay' in item] - - self._sort_formats(formats) - - title = self._og_search_title(page, default='_', fatal=False) - description = self._og_search_description(page, default=None) - thumbnail = self._og_search_thumbnail(page) - uploader = self._html_search_regex(r'Upload by ([^<]+)</a>', page, 'uploader', fatal=False, default=None) - view_count = int_or_none( - self._html_search_regex(r'<strong>Views:</strong> (\d+) ', page, 'view count', fatal=False)) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'view_count': view_count, - 'formats': formats, - 'age_limit': 18, - } From f877c6ae5a6e252d6904f90d597479451d2107aa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 19 Aug 2015 23:11:25 +0800 Subject: [PATCH 1097/2145] [theplatform] Use InfoExtractor._parse_smil_formats() --- youtube_dl/extractor/common.py | 8 +++- youtube_dl/extractor/theplatform.py | 72 
++++++----------------------- 2 files changed, 20 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 65835d257..ac12be933 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1052,7 +1052,7 @@ class InfoExtractor(object): return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -1091,6 +1091,12 @@ class InfoExtractor(object): 'width': width, 'height': height, }) + if transform_rtmp_url: + streamer, src = transform_rtmp_url(streamer, src) + formats[-1].update({ + 'url': streamer, + 'play_path': src, + }) continue src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 0643eccaf..29f938a76 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -9,9 +9,6 @@ import hashlib from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( determine_ext, ExtractorError, @@ -20,7 +17,8 @@ from ..utils import ( int_or_none, ) -_x = lambda p: xpath_with_ns(p, {'smil': 'http://www.w3.org/2005/SMIL21/Language'}) +default_ns = 'http://www.w3.org/2005/SMIL21/Language' +_x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformIE(InfoExtractor): @@ -145,63 +143,19 @@ class ThePlatformIE(InfoExtractor): 'url': src, }] - head = meta.find(_x('smil:head')) - body = meta.find(_x('smil:body')) - - f4m_node = body.find(_x('smil:seq//smil:video')) - if f4m_node is None: - f4m_node = body.find(_x('smil:seq/smil:video')) - if 
f4m_node is not None and '.f4m' in f4m_node.attrib['src']: - f4m_url = f4m_node.attrib['src'] - if 'manifest.f4m?' not in f4m_url: - f4m_url += '?' + formats = self._parse_smil_formats( + meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com - f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3' - formats = self._extract_f4m_formats(f4m_url, video_id) - else: - formats = [] - switch = body.find(_x('smil:switch')) - if switch is None: - switch = body.find(_x('smil:par//smil:switch')) - if switch is None: - switch = body.find(_x('smil:par/smil:switch')) - if switch is None: - switch = body.find(_x('smil:par')) - if switch is not None: - base_url = head.find(_x('smil:meta')).attrib['base'] - for f in switch.findall(_x('smil:video')): - attr = f.attrib - width = int_or_none(attr.get('width')) - height = int_or_none(attr.get('height')) - vbr = int_or_none(attr.get('system-bitrate'), 1000) - format_id = '%dx%d_%dk' % (width, height, vbr) - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'play_path': 'mp4:' + attr['src'], - 'ext': 'flv', - 'width': width, - 'height': height, - 'vbr': vbr, - }) - else: - switch = body.find(_x('smil:seq//smil:switch')) - if switch is None: - switch = body.find(_x('smil:seq/smil:switch')) - for f in switch.findall(_x('smil:video')): - attr = f.attrib - vbr = int_or_none(attr.get('system-bitrate'), 1000) - ext = determine_ext(attr['src']) - if ext == 'once': - ext = 'mp4' - formats.append({ - 'format_id': compat_str(vbr), - 'url': attr['src'], - 'vbr': vbr, - 'ext': ext, - }) - self._sort_formats(formats) + f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, + transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) + + for _format in formats: + ext = determine_ext(_format['url']) + if ext == 'once': + _format['ext'] = 'mp4' + + self._sort_formats(formats) return { 'id': video_id, From 7900aede14d2e2a46c8fd4430e48cde41f354859 Mon Sep 17 
00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Wed, 22 Jul 2015 14:31:29 +0800 Subject: [PATCH 1098/2145] [mwave] New extractor for mwave.interest.me --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mwave.py | 46 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 youtube_dl/extractor/mwave.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c8c9f1855..006ef3922 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -345,6 +345,7 @@ from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE from .musicvault import MusicVaultIE from .muzu import MuzuTVIE +from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import MyviIE diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py new file mode 100644 index 000000000..7f91aa269 --- /dev/null +++ b/youtube_dl/extractor/mwave.py @@ -0,0 +1,46 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MwaveIE(InfoExtractor): + IE_NAME = 'mwave' + _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', + 'info_dict': { + 'id': '168859', + 'ext': 'flv', + 'title': '[M COUNTDOWN] SISTAR - SHAKE IT', + 'creator': 'M COUNTDOWN', + } + }, { + 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168860', + 'info_dict': { + 'id': '168860', + 'ext': 'flv', + 'title': '[Full Ver.] M GIGS Ep. 
59 - IDIOTAPE Live Part 1', + 'creator': 'M-GIGS', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream_info = self._download_json( + 'http://mwave.interest.me/onair/vod_info.m?vodtype=CL§orid=&endinfo=Y&id=%s' % video_id, + 'Download stream info') + + formats = [] + for info in stream_info['cdn']: + f4m_stream = self._download_json(info['url'], video_id, 'Download f4m stream') + formats.extend( + self._extract_f4m_formats(f4m_stream['fileurl'] + '&g=PCROWKHLYUDY&hdcore=3.0.3', video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': stream_info['title'], + 'creator': stream_info.get('program_title'), + 'formats': formats, + } From 22c83245c51faa53118a8f815b13b2e4c2df9923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 19 Aug 2015 23:07:41 +0600 Subject: [PATCH 1099/2145] [mwave] Improve --- youtube_dl/extractor/mwave.py | 50 ++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 7f91aa269..66b523197 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -1,46 +1,58 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_duration, +) class MwaveIE(InfoExtractor): - IE_NAME = 'mwave' _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' - _TESTS = [{ + _TEST = { 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', + 'md5': 'c930e27b7720aaa3c9d0018dfc8ff6cc', 'info_dict': { 'id': '168859', 'ext': 'flv', 'title': '[M COUNTDOWN] SISTAR - SHAKE IT', - 'creator': 'M COUNTDOWN', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'M COUNTDOWN', + 'duration': 206, + 'view_count': int, } - }, { - 'url': 
'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168860', - 'info_dict': { - 'id': '168860', - 'ext': 'flv', - 'title': '[Full Ver.] M GIGS Ep. 59 - IDIOTAPE Live Part 1', - 'creator': 'M-GIGS', - } - }] + } def _real_extract(self, url): video_id = self._match_id(url) - stream_info = self._download_json( + vod_info = self._download_json( 'http://mwave.interest.me/onair/vod_info.m?vodtype=CL§orid=&endinfo=Y&id=%s' % video_id, - 'Download stream info') + video_id, 'Download vod JSON') formats = [] - for info in stream_info['cdn']: - f4m_stream = self._download_json(info['url'], video_id, 'Download f4m stream') + for num, cdn_info in enumerate(vod_info['cdn']): + stream_url = cdn_info.get('url') + if not stream_url: + continue + stream_name = cdn_info.get('name') or compat_str(num) + f4m_stream = self._download_json( + stream_url, video_id, + 'Download %s stream JSON' % stream_name) + f4m_url = f4m_stream.get('fileurl') + if not f4m_url: + continue formats.extend( - self._extract_f4m_formats(f4m_stream['fileurl'] + '&g=PCROWKHLYUDY&hdcore=3.0.3', video_id)) + self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name)) self._sort_formats(formats) return { 'id': video_id, - 'title': stream_info['title'], - 'creator': stream_info.get('program_title'), + 'title': vod_info['title'], + 'thumbnail': vod_info.get('cover'), + 'uploader': vod_info.get('program_title'), + 'duration': parse_duration(vod_info.get('time')), + 'view_count': int_or_none(vod_info.get('hit')), 'formats': formats, } From 26e1c3514f4af1ed60cd1114a653fe49e1fa8d11 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 20 Aug 2015 01:24:32 +0800 Subject: [PATCH 1100/2145] [theplatform] Add ThePlatformFeedIE --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/theplatform.py | 162 +++++++++++++++++++++------- 2 files changed, 126 insertions(+), 41 deletions(-) diff --git a/youtube_dl/extractor/__init__.py 
b/youtube_dl/extractor/__init__.py index fa9acc923..b5f7ff9a9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -612,7 +612,10 @@ from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE from .theonion import TheOnionIE -from .theplatform import ThePlatformIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) from .thesixtyone import TheSixtyOneIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 29f938a76..f02e0f58d 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -15,13 +15,68 @@ from ..utils import ( xpath_with_ns, unsmuggle_url, int_or_none, + url_basename, + float_or_none, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformIE(InfoExtractor): +class ThePlatformBaseIE(InfoExtractor): + def _extract_theplatform_smil_formats(self, smil_url, video_id, note='Downloading SMIL data'): + meta = self._download_xml(smil_url, video_id, note=note) + try: + error_msg = next( + n.attrib['abstract'] + for n in meta.findall(_x('.//smil:ref')) + if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') + except StopIteration: + pass + else: + raise ExtractorError(error_msg, expected=True) + + formats = self._parse_smil_formats( + meta, smil_url, video_id, namespace=default_ns, + # the parameters are from syfy.com, other sites may use others, + # they also work for nbc.com + f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, + transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) + + for _format in formats: + ext = determine_ext(_format['url']) + if ext == 'once': + _format['ext'] = 'mp4' + + self._sort_formats(formats) + + return formats + + def get_metadata(self, path, video_id): + info_url = 
'http://link.theplatform.com/s/%s?format=preview' % path + info_json = self._download_webpage(info_url, video_id) + info = json.loads(info_json) + + subtitles = {} + captions = info.get('captions') + if isinstance(captions, list): + for caption in captions: + lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') + subtitles[lang] = [{ + 'ext': 'srt' if mime == 'text/srt' else 'ttml', + 'url': src, + }] + + return { + 'title': info['title'], + 'subtitles': subtitles, + 'description': info['description'], + 'thumbnail': info['defaultThumbnailUrl'], + 'duration': int_or_none(info.get('duration'), 1000), + } + + +class ThePlatformIE(ThePlatformBaseIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? @@ -118,51 +173,78 @@ class ThePlatformIE(InfoExtractor): if sig: smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) - meta = self._download_xml(smil_url, video_id) - try: - error_msg = next( - n.attrib['abstract'] - for n in meta.findall(_x('.//smil:ref')) - if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired') - except StopIteration: - pass - else: - raise ExtractorError(error_msg, expected=True) + formats = self._extract_theplatform_smil_formats(smil_url, video_id) - info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info_json = self._download_webpage(info_url, video_id) - info = json.loads(info_json) + ret = self.get_metadata(path, video_id) + ret.update({ + 'id': video_id, + 'formats': formats, + }) - subtitles = {} - captions = info.get('captions') - if isinstance(captions, list): - for caption in captions: - lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles[lang] = [{ - 'ext': 'srt' if mime == 'text/srt' else 'ttml', - 'url': src, - }] + return ret - formats = 
self._parse_smil_formats( - meta, smil_url, video_id, namespace=default_ns, - # the parameters are from syfy.com, other sites may use others, - # they also work for nbc.com - f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, - transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - for _format in formats: - ext = determine_ext(_format['url']) - if ext == 'once': - _format['ext'] = 'mp4' +class ThePlatformFeedIE(ThePlatformBaseIE): + _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s' + _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)' + _TEST = { + # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 + 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', + 'md5': '22d2b84f058d3586efcd99e57d59d314', + 'info_dict': { + 'id': 'n_hardball_5biden_140207', + 'ext': 'mp4', + 'title': 'The Biden factor: will Joe run in 2016?', + 'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? 
Mark Halperin and Sam Stein weigh in.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140208', + 'timestamp': 1391824260, + 'duration': 467.0, + 'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'], + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + provider_id = mobj.group('provider_id') + feed_id = mobj.group('feed_id') + + real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id) + feed = self._download_json(real_url, video_id) + entry = feed['entries'][0] + + formats = [] + first_video_id = None + duration = None + for item in entry['media$content']: + smil_url = item['plfile$url'] + '&format=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M' + cur_video_id = url_basename(smil_url) + if first_video_id is None: + first_video_id = cur_video_id + duration = float_or_none(item.get('plfile$duration')) + formats.extend(self._extract_theplatform_smil_formats(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)) self._sort_formats(formats) - return { + thumbnails = [{ + 'url': thumbnail['plfile$url'], + 'width': int_or_none(thumbnail.get('plfile$width')), + 'height': int_or_none(thumbnail.get('plfile$height')), + } for thumbnail in entry.get('media$thumbnails', [])] + + timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) + categories = [item['media$name'] for item in entry.get('media$categories', [])] + + ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + ret.update({ 'id': video_id, - 'title': info['title'], - 'subtitles': subtitles, 'formats': formats, - 'description': info['description'], - 'thumbnail': info['defaultThumbnailUrl'], - 'duration': int_or_none(info.get('duration'), 1000), - } + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': timestamp, + 'categories': categories, + }) + + return ret From 05fe2594e4589b4e714a423550172eeec3949a70 Mon Sep 17 
00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 20 Aug 2015 01:38:39 +0800 Subject: [PATCH 1101/2145] [theplatform] Support URLs with 'guid=' --- youtube_dl/extractor/theplatform.py | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index f02e0f58d..883bf491c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -9,6 +9,10 @@ import hashlib from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -120,6 +124,20 @@ class ThePlatformIE(ThePlatformBaseIE): }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, + }, { + 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', + 'md5': '734f3790fb5fc4903da391beeebc4836', + 'info_dict': { + 'id': 'tdy_or_siri_150701', + 'ext': 'mp4', + 'title': 'iPhone Siri’s sassy response to a math question has people talking', + 'description': 'md5:a565d1deadd5086f3331d57298ec6333', + 'duration': 83.0, + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1435752600, + 'upload_date': '20150701', + 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], + }, }] @staticmethod @@ -154,6 +172,24 @@ class ThePlatformIE(ThePlatformBaseIE): path += '/media' path += '/' + video_id + qs_dict = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if 'guid' in qs_dict: + webpage = self._download_webpage(url, video_id) + scripts = re.findall(r'<script[^>]+src="([^"]+)"', webpage) + feed_id = None + # feed id usually locates in the last script. 
+ # Seems there's no pattern for the interested script filename, so + # I try one by one + for script in reversed(scripts): + feed_script = self._download_webpage(script, video_id, 'Downloading feed script') + feed_id = self._search_regex(r'defaultFeedId\s*:\s*"([^"]+)"', feed_script, 'default feed id', default=None) + if feed_id is not None: + break + if feed_id is None: + raise ExtractorError('Unable to find feed id') + return self.url_result('http://feed.theplatform.com/f/%s/%s?byGuid=%s' % ( + provider_id, feed_id, qs_dict['guid'][0])) + if smuggled_data.get('force_smil_url', False): smil_url = url elif mobj.group('config'): From dac14bf311fd1b3c6af6c57b3b03878a11ef5aae Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 20 Aug 2015 01:41:18 +0800 Subject: [PATCH 1102/2145] [nbc] Add MSNBCIE --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nbc.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index b5f7ff9a9..86ea0576a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -358,6 +358,7 @@ from .nbc import ( NBCNewsIE, NBCSportsIE, NBCSportsVPlayerIE, + MSNBCIE, ) from .ndr import ( NDRIE, diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index ccdbfb6c9..e683d24c4 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -236,3 +236,28 @@ class NBCNewsIE(InfoExtractor): 'url': info['videoAssets'][-1]['publicUrl'], 'ie_key': 'ThePlatform', } + + +class MSNBCIE(InfoExtractor): + # https URLs redirect to corresponding http ones + _VALID_URL = r'http://www\.msnbc\.com/[^/]+/watch/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', + 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', + 'info_dict': { + 'id': 'n_hayes_Aimm_140801_272214', + 'ext': 'mp4', + 'title': 'The chaotic GOP 
immigration vote', + 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1406937606, + 'upload_date': '20140802', + 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + embed_url = self._html_search_meta('embedURL', webpage) + return self.url_result(embed_url) From aa6cd05ed82b14af0e3827b2ff43eed02087b574 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 20 Aug 2015 01:47:55 +0800 Subject: [PATCH 1103/2145] [theplatform] Fix Python 2: declare coding --- youtube_dl/extractor/theplatform.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 883bf491c..adaec3375 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from __future__ import unicode_literals import re From ce00af87670d47f4dff6ad80e46a29e49cbdfe4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 20 Aug 2015 00:56:17 +0600 Subject: [PATCH 1104/2145] [extractor/common] Add default subtitles lang --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ac12be933..b1af45870 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1135,7 +1135,7 @@ class InfoExtractor(object): return formats - def _parse_smil_subtitles(self, smil, namespace=None): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): subtitles = {} for num, textstream in 
enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): src = textstream.get('src') @@ -1146,7 +1146,7 @@ class InfoExtractor(object): type_ = textstream.get('type') if type_ == 'text/srt': ext = 'srt' - lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, 'ext': ext, From 5cdefc46257802708816e1d4ea7ff5cafe910ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 20 Aug 2015 01:02:50 +0600 Subject: [PATCH 1105/2145] [extractor/common] Add more subtitle mime types for guess when ext is missing --- youtube_dl/extractor/common.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b1af45870..ce2030d28 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1144,8 +1144,13 @@ class InfoExtractor(object): ext = textstream.get('ext') or determine_ext(src) if not ext: type_ = textstream.get('type') - if type_ == 'text/srt': - ext = 'srt' + SUBTITLES_TYPES = { + 'text/vtt': 'vtt', + 'text/srt': 'srt', + 'application/smptett+xml': 'tt', + } + if type_ in SUBTITLES_TYPES: + ext = SUBTITLES_TYPES[type_] lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, From dd565ac1ad22fe48f8e358d95ea912b1768b1e5a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 20 Aug 2015 03:07:04 +0800 Subject: [PATCH 1106/2145] [theplatform] Use _download_json --- youtube_dl/extractor/theplatform.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index adaec3375..ba05ada39 100644 --- a/youtube_dl/extractor/theplatform.py +++ 
b/youtube_dl/extractor/theplatform.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json import time import hmac import binascii @@ -59,8 +58,7 @@ class ThePlatformBaseIE(InfoExtractor): def get_metadata(self, path, video_id): info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info_json = self._download_webpage(info_url, video_id) - info = json.loads(info_json) + info = self._download_json(info_url, video_id) subtitles = {} captions = info.get('captions') From 061f62da54cb4184a039108e40dee8e9eb2611c1 Mon Sep 17 00:00:00 2001 From: ping <lipng.ong@gmail.com> Date: Thu, 20 Aug 2015 12:56:11 +0800 Subject: [PATCH 1107/2145] [vlive] New extractor for vlive.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vlive.py | 94 ++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 youtube_dl/extractor/vlive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1c53a5632..6bee5b63c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -735,6 +735,7 @@ from .vk import ( VKIE, VKUserVideosIE, ) +from .vlive import VLiveIE from .vodlocker import VodlockerIE from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py new file mode 100644 index 000000000..b3bbd80fb --- /dev/null +++ b/youtube_dl/extractor/vlive.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hmac +from hashlib import sha1 +from base64 import b64encode +from time import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext +) +from ..compat import compat_urllib_parse + + +class VLiveIE(InfoExtractor): + IE_NAME = 'vlive' + _VALID_URL = r'https?://(?:(www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://m.vlive.tv/video/1326', + 'md5': 'cc7314812855ce56de70a06a27314983', 
+ 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': '[V] Girl\'s Day\'s Broadcast', + 'creator': 'Girl\'s Day', + 'upload_date': '20150817', + }, + } + _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://m.vlive.tv/video/%s' % video_id, + video_id, note='Download video page') + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + creator = self._html_search_regex( + r'<span class="name">([^<>]+)</span>', webpage, 'creator') + upload_date = self._html_search_regex( + r'<span class="time">(\d{4}\.\d{2}\.\d{2})</span>', webpage, + 'upload date', default=None, fatal=False) + if upload_date: + upload_date = upload_date.replace('.', '') + + url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id + msgpad = {'msgpad': '%.0f' % (time() * 1000)} + md = { + 'md': b64encode( + hmac.new(self._SECRET.encode('ascii'), + (url[:255] + msgpad['msgpad']).encode('ascii'), sha1).digest()) + } + url += '&' + compat_urllib_parse.urlencode(msgpad) + '&' + compat_urllib_parse.urlencode(md) + + playinfo = self._download_json(url, video_id, 'Downloading video json') + + if playinfo.get('message', '') != 'success': + raise ExtractorError(playinfo['message']) + + if not playinfo.get('result'): + raise ExtractorError('No videos found.') + + formats = [] + for vid in playinfo['result'].get('videos', {}).get('list', []): + formats.append({ + 'url': vid['source'], + 'ext': 'mp4', + 'abr': vid.get('bitrate', {}).get('audio'), + 'vbr': vid.get('bitrate', {}).get('video'), + 'format_id': vid['encodingOption']['name'], + 'height': vid.get('height'), + 'width': vid.get('width'), + }) + self._sort_formats(formats) + + subtitles = {} + for caption in playinfo['result'].get('captions', {}).get('list', []): + subtitles[caption['language']] = [ + {'ext': determine_ext(caption['source'], 
default_ext='vtt'), + 'url': caption['source']}] + + return { + 'id': video_id, + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + 'formats': formats, + 'upload_date': upload_date, + 'subtitles': subtitles, + } From 03bc7237add1747de4c0c5d09e72e03639b4fd21 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 20 Aug 2015 23:18:58 +0800 Subject: [PATCH 1108/2145] [common] _parse_smil_subtitles: accept `lang` as the subtitle language --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index ce2030d28..999afc110 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1151,7 +1151,7 @@ class InfoExtractor(object): } if type_ in SUBTITLES_TYPES: ext = SUBTITLES_TYPES[type_] - lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or subtitles_lang + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, 'ext': ext, From 912e0b7e46d795df3ec1866f9b0ff071cca8d550 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 01:37:07 +0800 Subject: [PATCH 1109/2145] [common] Add _merge_subtitles() --- youtube_dl/extractor/common.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 999afc110..b7437af5a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1279,6 +1279,26 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") + @staticmethod + def _merge_subtitle_items(subtitle_list1, subtitle_list2): + """ Merge subtitle items for one language. Items with duplicated URLs + will be dropped. 
""" + list1_urls = set([item['url'] for item in subtitle_list1]) + ret = list(subtitle_list1) + ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + return ret + + @classmethod + def _merge_subtitles(kls, subtitle_dict1, subtitle_dict2): + """ Merge two subtitle dictionaries, language by language. """ + print(subtitle_dict1) + print(subtitle_dict2) + ret = dict(subtitle_dict1) + for lang in subtitle_dict2: + ret[lang] = kls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) + print(ret) + return ret + def extract_automatic_captions(self, *args, **kwargs): if (self._downloader.params.get('writeautomaticsub', False) or self._downloader.params.get('listsubtitles')): From c687ac745b3c94b2fd246214e78c92a31bd9fc0f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 01:37:43 +0800 Subject: [PATCH 1110/2145] [theplatform] Use subtitles from SMIL, too --- youtube_dl/extractor/theplatform.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index ba05ada39..25edc3100 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -28,7 +28,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(InfoExtractor): - def _extract_theplatform_smil_formats(self, smil_url, video_id, note='Downloading SMIL data'): + def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note) try: error_msg = next( @@ -54,7 +54,9 @@ class ThePlatformBaseIE(InfoExtractor): self._sort_formats(formats) - return formats + subtitles = self._parse_smil_subtitles(meta, default_ns) + + return formats, subtitles def get_metadata(self, path, video_id): info_url = 'http://link.theplatform.com/s/%s?format=preview' % path @@ -208,12 +210,14 @@ class ThePlatformIE(ThePlatformBaseIE): 
if sig: smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) - formats = self._extract_theplatform_smil_formats(smil_url, video_id) + formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) ret = self.get_metadata(path, video_id) + combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ 'id': video_id, 'formats': formats, + 'subtitles': combined_subtitles, }) return ret @@ -251,6 +255,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): entry = feed['entries'][0] formats = [] + subtitles = {} first_video_id = None duration = None for item in entry['media$content']: @@ -259,7 +264,9 @@ class ThePlatformFeedIE(ThePlatformBaseIE): if first_video_id is None: first_video_id = cur_video_id duration = float_or_none(item.get('plfile$duration')) - formats.extend(self._extract_theplatform_smil_formats(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)) + cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id) + formats.extend(cur_formats) + subtitles = self._merge_subtitles(subtitles, cur_subtitles) self._sort_formats(formats) @@ -273,9 +280,11 @@ class ThePlatformFeedIE(ThePlatformBaseIE): categories = [item['media$name'] for item in entry.get('media$categories', [])] ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + subtitles = self._merge_subtitles(subtitles, ret['subtitles']) ret.update({ 'id': video_id, 'formats': formats, + 'subtitles': subtitles, 'thumbnails': thumbnails, 'duration': duration, 'timestamp': timestamp, From f908b74fa38b2678e26aea128dbd934cd781a9b6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 01:38:57 +0800 Subject: [PATCH 1111/2145] [test/subtitles] Add test for ThePlatformFeedIE --- test/test_subtitles.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 
c4e3adb67..0343967d9 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -25,6 +25,7 @@ from youtube_dl.extractor import ( RaiIE, VikiIE, ThePlatformIE, + ThePlatformFeedIE, RTVEALaCartaIE, FunnyOrDieIE, ) @@ -307,6 +308,18 @@ class TestThePlatformSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), '97e7670cbae3c4d26ae8bcc7fdd78d4b') +class TestThePlatformFeedSubtitles(BaseTestSubtitles): + url = 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207' + IE = ThePlatformFeedIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), '48649a22e82b2da21c9a67a395eedade') + + class TestRtveSubtitles(BaseTestSubtitles): url = 'http://www.rtve.es/alacarta/videos/los-misterios-de-laura/misterios-laura-capitulo-32-misterio-del-numero-17-2-parte/2428621/' IE = RTVEALaCartaIE From f738dd7b7c7aefe4d26a65905dee9567a691d262 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 01:43:22 +0800 Subject: [PATCH 1112/2145] [common] Remove debugging codes --- youtube_dl/extractor/common.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b7437af5a..f731703fb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1291,12 +1291,9 @@ class InfoExtractor(object): @classmethod def _merge_subtitles(kls, subtitle_dict1, subtitle_dict2): """ Merge two subtitle dictionaries, language by language. 
""" - print(subtitle_dict1) - print(subtitle_dict2) ret = dict(subtitle_dict1) for lang in subtitle_dict2: ret[lang] = kls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) - print(ret) return ret def extract_automatic_captions(self, *args, **kwargs): From dc95bd503e82d3eb04c347ac0cdbcbabd7e14552 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 21 Aug 2015 08:54:28 +0600 Subject: [PATCH 1113/2145] [folketinget] Add skip_download for test --- youtube_dl/extractor/folketinget.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/folketinget.py b/youtube_dl/extractor/folketinget.py index 0fb29de75..75399fa7d 100644 --- a/youtube_dl/extractor/folketinget.py +++ b/youtube_dl/extractor/folketinget.py @@ -30,6 +30,10 @@ class FolketingetIE(InfoExtractor): 'upload_date': '20141120', 'duration': 3960, }, + 'params': { + # rtmp download + 'skip_download': True, + }, } def _real_extract(self, url): From 5d003e29b188dd2f140fe1b9b93f1bb1ad8263a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 21 Aug 2015 08:56:05 +0600 Subject: [PATCH 1114/2145] [rtp] Add skip_download for test --- youtube_dl/extractor/rtp.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/rtp.py b/youtube_dl/extractor/rtp.py index ecf4939cd..82b323cdd 100644 --- a/youtube_dl/extractor/rtp.py +++ b/youtube_dl/extractor/rtp.py @@ -18,6 +18,10 @@ class RTPIE(InfoExtractor): 'description': 'As paixões musicais de António Cartaxo e António Macedo', 'thumbnail': 're:^https?://.*\.jpg', }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, { 'url': 'http://www.rtp.pt/play/p831/a-quimica-das-coisas', 'only_matching': True, From 4932a817a0c2375df14d66c9ac86cfa28988327d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 13:00:08 +0800 Subject: [PATCH 1115/2145] [rtl2] Add skip_download for test --- 
youtube_dl/extractor/rtl2.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 72cd80498..e9589449e 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -15,6 +15,10 @@ class RTL2IE(InfoExtractor): 'title': 'GRIP sucht den Sommerkönig', 'description': 'Matthias, Det und Helge treten gegeneinander an.' }, + 'params': { + # rtmp download + 'skip_download': True, + }, }, { 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', 'md5': 'ffcd517d2805b57ce11a58a2980c2b02', From 9eb4ab6ad915a777b6f7d7b39d03d05d7d31cd24 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 13:04:25 +0800 Subject: [PATCH 1116/2145] [rtl2] Remove an unused line --- youtube_dl/extractor/rtl2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index e9589449e..9e0c6890e 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -43,7 +43,6 @@ class RTL2IE(InfoExtractor): vivi_id = self._html_search_regex( r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id - webpage = self._download_webpage(info_url, '') info = self._download_json(info_url, video_id) video_info = info['video'] From 5e1a5ac8de12391cb22d2fa0dfb2119527bd7fc2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 13:20:32 +0800 Subject: [PATCH 1117/2145] [rtl2] Fix extraction for test_RTL2_1 --- youtube_dl/extractor/rtl2.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 9e0c6890e..276612fc7 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -1,6 +1,7 @@ # encoding: utf-8 from __future__ import unicode_literals +import re from 
.common import InfoExtractor @@ -28,6 +29,10 @@ class RTL2IE(InfoExtractor): 'title': 'Anna erwischt Alex!', 'description': 'Anna ist Alex\' Tochter bei Köln 50667.' }, + 'params': { + # rtmp download + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -38,10 +43,17 @@ class RTL2IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - vico_id = self._html_search_regex( - r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') - vivi_id = self._html_search_regex( - r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') + mobj = re.search( + r'<div[^>]+data-collection="(?P<vico_id>\d+)"[^>]+data-video="(?P<vivi_id>\d+)"', + webpage) + if mobj: + vico_id = mobj.group('vico_id') + vivi_id = mobj.group('vivi_id') + else: + vico_id = self._html_search_regex( + r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') + vivi_id = self._html_search_regex( + r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id info = self._download_json(info_url, video_id) From d7c16305707f2af5c47d91b67cf0850b4dcada3a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 21 Aug 2015 13:21:21 +0800 Subject: [PATCH 1118/2145] [rtl2] Remove MD5 checksums --- youtube_dl/extractor/rtl2.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 276612fc7..25f7faf76 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -9,7 +9,6 @@ class RTL2IE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))' _TESTS = [{ 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', - 'md5': 'bfcc179030535b08dc2b36b469b5adc7', 'info_dict': { 'id': 'folge-203-0', 'ext': 'f4v', @@ -22,7 +21,6 @@ class RTL2IE(InfoExtractor): }, }, { 'url': 'http://www.rtl2.de/sendung/koeln-50667/video/5512-anna/21040-anna-erwischt-alex/', - 
'md5': 'ffcd517d2805b57ce11a58a2980c2b02', 'info_dict': { 'id': '21040-anna-erwischt-alex', 'ext': 'mp4', From 8c97f81943de1c2bf8d2f524ba5ca09b29579dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 21 Aug 2015 11:35:51 +0200 Subject: [PATCH 1119/2145] [common] Follow convention of using 'cls' in classmethods --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f731703fb..5d24bcb6a 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1289,11 +1289,11 @@ class InfoExtractor(object): return ret @classmethod - def _merge_subtitles(kls, subtitle_dict1, subtitle_dict2): + def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): """ Merge two subtitle dictionaries, language by language. """ ret = dict(subtitle_dict1) for lang in subtitle_dict2: - ret[lang] = kls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) + ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) return ret def extract_automatic_captions(self, *args, **kwargs): From 66ce97024d0de7836777562a6eb60603796636d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 22 Aug 2015 06:30:00 +0600 Subject: [PATCH 1120/2145] [soundcloud:user] Update tests --- youtube_dl/extractor/soundcloud.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 6ce86cbcd..ed5dcc0d3 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -309,7 +309,7 @@ class SoundcloudUserIE(SoundcloudIE): 'id': '114582580', 'title': 'The Akashic Chronicler (All)', }, - 'playlist_mincount': 112, + 'playlist_mincount': 111, }, { 'url': 
'https://soundcloud.com/the-akashic-chronicler/tracks', 'info_dict': { @@ -330,14 +330,14 @@ class SoundcloudUserIE(SoundcloudIE): 'id': '114582580', 'title': 'The Akashic Chronicler (Reposts)', }, - 'playlist_mincount': 9, + 'playlist_mincount': 7, }, { 'url': 'https://soundcloud.com/the-akashic-chronicler/likes', 'info_dict': { 'id': '114582580', 'title': 'The Akashic Chronicler (Likes)', }, - 'playlist_mincount': 333, + 'playlist_mincount': 321, }, { 'url': 'https://soundcloud.com/grynpyret/spotlight', 'info_dict': { From 483fc223bb1509d11ac1843a5852f75c0aec3475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 23 Aug 2015 10:42:34 +0600 Subject: [PATCH 1121/2145] [pluralsight] Add extractor (Closes #6090) --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/pluralsight.py | 218 ++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 youtube_dl/extractor/pluralsight.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1c53a5632..d59882598 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -454,6 +454,10 @@ from .playfm import PlayFMIE from .playtvak import PlaytvakIE from .playvid import PlayvidIE from .playwire import PlaywireIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) from .podomatic import PodomaticIE from .porn91 import Porn91IE from .pornhd import PornHdIE diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py new file mode 100644 index 000000000..1bdcacbaa --- /dev/null +++ b/youtube_dl/extractor/pluralsight.py @@ -0,0 +1,218 @@ +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + int_or_none, + parse_duration, +) + + +class 
PluralsightIE(InfoExtractor): + IE_NAME = 'pluralsight' + _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/training/player\?author=(?P<author>[^&]+)&name=(?P<name>[^&]+)(?:&mode=live)?&clip=(?P<clip>\d+)&course=(?P<course>[^&]+)' + _LOGIN_URL = 'https://www.pluralsight.com/id/' + _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' + _NETRC_MACHINE = 'pluralsight' + + _TEST = { + 'url': 'http://www.pluralsight.com/training/player?author=mike-mckeown&name=hosting-sql-server-windows-azure-iaas-m7-mgmt&mode=live&clip=3&course=hosting-sql-server-windows-azure-iaas', + 'md5': '4d458cf5cf4c593788672419a8dd4cf8', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas-m7-mgmt-04', + 'ext': 'mp4', + 'title': 'Management of SQL Server - Demo Monitoring', + 'duration': 338, + }, + 'skip': 'Requires pluralsight account credentials', + } + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + raise ExtractorError( + 'Pluralsight account is required, use --username and --password options to provide account credentials.', + expected=True) + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'Username': username.encode('utf-8'), + 'Password': password.encode('utf-8'), + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + request = compat_urllib_request.Request( + post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + error = 
self._search_regex( + r'<span[^>]+class="field-validation-error"[^>]*>([^<]+)</span>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + author = mobj.group('author') + name = mobj.group('name') + clip_id = mobj.group('clip') + course = mobj.group('course') + + display_id = '%s-%s' % (name, clip_id) + + webpage = self._download_webpage(url, display_id) + + collection = self._parse_json( + self._search_regex( + r'moduleCollection\s*:\s*new\s+ModuleCollection\((\[.+?\])\s*,\s*\$rootScope\)', + webpage, 'modules'), + display_id) + + module, clip = None, None + + for module_ in collection: + if module_.get('moduleName') == name: + module = module_ + for clip_ in module_.get('clips', []): + clip_index = clip_.get('clipIndex') + if clip_index is None: + continue + if compat_str(clip_index) == clip_id: + clip = clip_ + break + + if not clip: + raise ExtractorError('Unable to resolve clip') + + QUALITIES = { + 'low': {'width': 640, 'height': 480}, + 'medium': {'width': 848, 'height': 640}, + 'high': {'width': 1024, 'height': 768}, + } + + ALLOWED_QUALITIES = ( + ('webm', ('high',)), + ('mp4', ('low', 'medium', 'high',)), + ) + + formats = [] + for ext, qualities in ALLOWED_QUALITIES: + for quality in qualities: + f = QUALITIES[quality].copy() + clip_post = { + 'a': author, + 'cap': 'false', + 'cn': clip_id, + 'course': course, + 'lc': 'en', + 'm': name, + 'mt': ext, + 'q': '%dx%d' % (f['width'], f['height']), + } + request = compat_urllib_request.Request( + 'http://www.pluralsight.com/training/Player/ViewClip', + json.dumps(clip_post).encode('utf-8')) + request.add_header('Content-Type', 'application/json;charset=utf-8') + format_id = '%s-%s' % (ext, quality) + clip_url = self._download_webpage( + request, display_id, 'Downloading %s URL' % format_id, fatal=False) + if not clip_url: + continue + f.update({ + 'url': clip_url, 
+ 'ext': ext, + 'format_id': format_id, + }) + formats.append(f) + self._sort_formats(formats) + + # TODO: captions + # http://www.pluralsight.com/training/Player/ViewClip + cap = true + # or + # http://www.pluralsight.com/training/Player/Captions + # { a = author, cn = clip_id, lc = end, m = name } + + return { + 'id': clip['clipName'], + 'title': '%s - %s' % (module['title'], clip['title']), + 'duration': int_or_none(clip.get('duration')) or parse_duration(clip.get('formattedDuration')), + 'creator': author, + 'formats': formats + } + + +class PluralsightCourseIE(InfoExtractor): + IE_NAME = 'pluralsight:course' + _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/courses/(?P<id>[^/]+)' + _TEST = { + # Free course from Pluralsight Starter Subscription for Microsoft TechNet + # https://offers.pluralsight.com/technet?loc=zTS3z&prod=zOTprodz&tech=zOttechz&prog=zOTprogz&type=zSOz&media=zOTmediaz&country=zUSz + 'url': 'http://www.pluralsight.com/courses/hosting-sql-server-windows-azure-iaas', + 'info_dict': { + 'id': 'hosting-sql-server-windows-azure-iaas', + 'title': 'Hosting SQL Server in Microsoft Azure IaaS Fundamentals', + 'description': 'md5:61b37e60f21c4b2f91dc621a977d0986', + }, + 'playlist_count': 31, + } + + def _real_extract(self, url): + course_id = self._match_id(url) + + course = self._download_json( + 'http://www.pluralsight.com/data/course/%s' % course_id, + course_id, 'Downloading course JSON') + + title = course['title'] + description = course.get('description') or course.get('shortDescription') + + course_data = self._download_json( + 'http://www.pluralsight.com/data/course/content/%s' % course_id, + course_id, 'Downloading course data JSON') + + may_not_view = 0 + + entries = [] + for module in course_data: + for clip in module.get('clips', []): + if clip.get('userMayViewClip') is False: + may_not_view += 1 + continue + player_parameters = clip.get('playerParameters') + if not player_parameters: + continue + entries.append(self.url_result( + 
'http://www.pluralsight.com/training/player?%s' % player_parameters, + 'Pluralsight')) + + if may_not_view > 0: + self._downloader.report_warning( + 'There are %d videos in this course that are not available for you. ' + 'Upgrade your account to get access to these videos.' % may_not_view) + + return self.playlist_result(entries, course_id, title, description) From 468083d2f5596314a0813859f3afe7d2fce3cac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 23 Aug 2015 10:44:10 +0600 Subject: [PATCH 1122/2145] [pluralsight] Remove unused const --- youtube_dl/extractor/pluralsight.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 1bdcacbaa..7c7f762ff 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -21,7 +21,6 @@ class PluralsightIE(InfoExtractor): IE_NAME = 'pluralsight' _VALID_URL = r'https?://(?:www\.)?pluralsight\.com/training/player\?author=(?P<author>[^&]+)&name=(?P<name>[^&]+)(?:&mode=live)?&clip=(?P<clip>\d+)&course=(?P<course>[^&]+)' _LOGIN_URL = 'https://www.pluralsight.com/id/' - _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.' 
_NETRC_MACHINE = 'pluralsight' _TEST = { From 2b6bda1ed86e1b64242b33c032286dc315d541ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 23 Aug 2015 11:21:56 +0600 Subject: [PATCH 1123/2145] [pluralsight] Do not yet rely on userMayViewClip --- youtube_dl/extractor/pluralsight.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 7c7f762ff..7ba396aef 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -183,6 +183,8 @@ class PluralsightCourseIE(InfoExtractor): def _real_extract(self, url): course_id = self._match_id(url) + # TODO: PSM cookie + course = self._download_json( 'http://www.pluralsight.com/data/course/%s' % course_id, course_id, 'Downloading course JSON') @@ -194,14 +196,9 @@ class PluralsightCourseIE(InfoExtractor): 'http://www.pluralsight.com/data/course/content/%s' % course_id, course_id, 'Downloading course data JSON') - may_not_view = 0 - entries = [] for module in course_data: for clip in module.get('clips', []): - if clip.get('userMayViewClip') is False: - may_not_view += 1 - continue player_parameters = clip.get('playerParameters') if not player_parameters: continue @@ -209,9 +206,4 @@ class PluralsightCourseIE(InfoExtractor): 'http://www.pluralsight.com/training/player?%s' % player_parameters, 'Pluralsight')) - if may_not_view > 0: - self._downloader.report_warning( - 'There are %d videos in this course that are not available for you. ' - 'Upgrade your account to get access to these videos.' 
% may_not_view) - return self.playlist_result(entries, course_id, title, description) From 2006a06eff606c5a996c315a3e597b9d2603db9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 23 Aug 2015 21:43:28 +0600 Subject: [PATCH 1124/2145] [kontrtube] Fix extraction (Closes #6644) --- youtube_dl/extractor/kontrtube.py | 40 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py index 720bc939b..a59c529f4 100644 --- a/youtube_dl/extractor/kontrtube.py +++ b/youtube_dl/extractor/kontrtube.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_duration, +) class KontrTubeIE(InfoExtractor): @@ -34,33 +37,28 @@ class KontrTubeIE(InfoExtractor): webpage = self._download_webpage( url, display_id, 'Downloading page') - video_url = self._html_search_regex( + video_url = self._search_regex( r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL') - thumbnail = self._html_search_regex( - r"preview_url\s*:\s*'(.+?)/?',", webpage, 'video thumbnail', fatal=False) + thumbnail = self._search_regex( + r"preview_url\s*:\s*'(.+?)/?',", webpage, 'thumbnail', fatal=False) title = self._html_search_regex( - r'<title>(.+?)', webpage, 'video title') + r'(?s)

(.+?)

', webpage, 'title') description = self._html_search_meta( - 'description', webpage, 'video description') + 'description', webpage, 'description') - mobj = re.search( - r'
Длительность: (?P\d+)м:(?P\d+)с
', - webpage) - duration = int(mobj.group('minutes')) * 60 + int(mobj.group('seconds')) if mobj else None + duration = self._search_regex( + r'Длительность: ([^<]+)', webpage, 'duration', fatal=False) + if duration: + duration = parse_duration(duration.replace('мин', 'min').replace('сек', 'sec')) - view_count = self._html_search_regex( - r'
Просмотров: (\d+)
', + view_count = self._search_regex( + r'Просмотров: ([^<]+)', webpage, 'view count', fatal=False) + if view_count: + view_count = int_or_none(view_count.replace(' ', '')) - comment_count = None - comment_str = self._html_search_regex( - r'Комментарии: ([^<]+)', webpage, 'comment count', fatal=False) - if comment_str.startswith('комментариев нет'): - comment_count = 0 - else: - mobj = re.search(r'\d+ из (?P\d+) комментариев', comment_str) - if mobj: - comment_count = mobj.group('total') + comment_count = int_or_none(self._search_regex( + r'Комментарии \((\d+)\)<', webpage, ' comment count', fatal=False)) return { 'id': video_id, From 9990c960f2d944cfbecb7d613062b98fe99464a7 Mon Sep 17 00:00:00 2001 From: clauderains Date: Sun, 23 Aug 2015 02:46:29 -0700 Subject: [PATCH 1125/2145] [spankwire] Fixed uploader_id field extraction so that test case passes --- youtube_dl/extractor/spankwire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 5fa6faf18..0a35c2b3b 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -54,7 +54,7 @@ class SpankwireIE(InfoExtractor): r'by:\s*]*>(.+?)', webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s* on (.+?) 
at \d+:\d+', From 59e6acc757a9df85ca78e519623e84072ffd9c01 Mon Sep 17 00:00:00 2001 From: clauderains Date: Sun, 23 Aug 2015 02:47:20 -0700 Subject: [PATCH 1126/2145] [spankwire] Support new cdn video url format --- youtube_dl/extractor/spankwire.py | 84 +++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 0a35c2b3b..0f2d8d0de 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -17,20 +17,34 @@ from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' - _TEST = { - 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': '8bbfde12b101204b39e4b9fe7eb67095', - 'info_dict': { - 'id': '103545', - 'ext': 'mp4', - 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', - 'description': 'Crazy Bitch X rated music video.', - 'uploader': 'oreusz', - 'uploader_id': '124697', - 'upload_date': '20070507', - 'age_limit': 18, - } - } + _TESTS = [{ + 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', + 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'info_dict': { + 'id': '103545', + 'ext': 'mp4', + 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', + 'description': 'Crazy Bitch X rated music video.', + 'uploader': 'oreusz', + 'uploader_id': '124697', + 'upload_date': '20070507', + 'age_limit': 18, + } + }, + { + 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', + 'md5': '09b3c20833308b736ae8902db2f8d7e6', + 'info_dict': { + 'id': '1921551', + 'ext': 'mp4', + 'title': 'Titcums Compiloation I', + 'description': 'cum on tits', + 'uploader': 'dannyh78999', + 'uploader_id': '3056053', + 'upload_date': '20150822', + 'age_limit': 18, + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -82,18 +96,36 @@ class 
SpankwireIE(InfoExtractor): for video_url in video_urls: path = compat_urllib_parse_urlparse(video_url).path format = path.split('/')[4].split('_')[:2] - resolution, bitrate_str = format - format = "-".join(format) - height = int(resolution.rstrip('Pp')) - tbr = int(bitrate_str.rstrip('Kk')) - formats.append({ - 'url': video_url, - 'resolution': resolution, - 'format': format, - 'tbr': tbr, - 'height': height, - 'format_id': format, - }) + if format[0] == 'mp4': + format_id, quality = format + format = "-".join(format) + if quality == 'normal': + height = 180 + elif quality == 'high': + height = 240 + elif quality == 'ultra': + height = 480 + elif quality == '720p': + height = 720 + formats.append({ + 'url': video_url, + 'format': format, + 'height': height, + 'format_id': format, + }) + else: + resolution, bitrate_str = format + format = "-".join(format) + height = int(resolution.rstrip('Pp')) + tbr = int(bitrate_str.rstrip('Kk')) + formats.append({ + 'url': video_url, + 'resolution': resolution, + 'format': format, + 'tbr': tbr, + 'height': height, + 'format_id': format, + }) self._sort_formats(formats) age_limit = self._rta_search(webpage) From 551c7837ace81190ce9141551dceec24dfdae1bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Aug 2015 22:32:20 +0600 Subject: [PATCH 1127/2145] [spankwire] Simplify and properly format --- youtube_dl/extractor/spankwire.py | 105 +++++++++++++----------------- 1 file changed, 45 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 0f2d8d0de..0a47441b1 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -18,33 +18,34 @@ from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' _TESTS = [{ - 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', - 'md5': 
'8bbfde12b101204b39e4b9fe7eb67095', - 'info_dict': { - 'id': '103545', - 'ext': 'mp4', - 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', - 'description': 'Crazy Bitch X rated music video.', - 'uploader': 'oreusz', - 'uploader_id': '124697', - 'upload_date': '20070507', - 'age_limit': 18, - } - }, - { - 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', - 'md5': '09b3c20833308b736ae8902db2f8d7e6', - 'info_dict': { - 'id': '1921551', - 'ext': 'mp4', - 'title': 'Titcums Compiloation I', - 'description': 'cum on tits', - 'uploader': 'dannyh78999', - 'uploader_id': '3056053', - 'upload_date': '20150822', - 'age_limit': 18, - } - }] + # download URL pattern: */P_K_.mp4 + 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', + 'md5': '8bbfde12b101204b39e4b9fe7eb67095', + 'info_dict': { + 'id': '103545', + 'ext': 'mp4', + 'title': 'Buckcherry`s X Rated Music Video Crazy Bitch', + 'description': 'Crazy Bitch X rated music video.', + 'uploader': 'oreusz', + 'uploader_id': '124697', + 'upload_date': '20070507', + 'age_limit': 18, + } + }, { + # download URL pattern: */mp4__.mp4 + 'url': 'http://www.spankwire.com/Titcums-Compiloation-I/video1921551/', + 'md5': '09b3c20833308b736ae8902db2f8d7e6', + 'info_dict': { + 'id': '1921551', + 'ext': 'mp4', + 'title': 'Titcums Compiloation I', + 'description': 'cum on tits', + 'uploader': 'dannyh78999', + 'uploader_id': '3056053', + 'upload_date': '20150822', + 'age_limit': 18, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -81,9 +82,10 @@ class SpankwireIE(InfoExtractor): r']*>([\d,\.]+)', webpage, 'comment count', fatal=False)) - video_urls = list(map( - compat_urllib_parse_unquote, - re.findall(r'playerData\.cdnPath[0-9]{3,}\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage))) + videos = re.findall( + r'playerData\.cdnPath([0-9]{3,})\s*=\s*(?:encodeURIComponent\()?["\']([^"\']+)["\']', webpage) + heights = [int(video[0]) 
for video in videos] + video_urls = list(map(compat_urllib_parse_unquote, [video[1] for video in videos])) if webpage.find('flashvars\.encrypted = "true"') != -1: password = self._search_regex( r'flashvars\.video_title = "([^"]+)', @@ -93,39 +95,22 @@ class SpankwireIE(InfoExtractor): video_urls)) formats = [] - for video_url in video_urls: + for height, video_url in zip(heights, video_urls): path = compat_urllib_parse_urlparse(video_url).path - format = path.split('/')[4].split('_')[:2] - if format[0] == 'mp4': - format_id, quality = format - format = "-".join(format) - if quality == 'normal': - height = 180 - elif quality == 'high': - height = 240 - elif quality == 'ultra': - height = 480 - elif quality == '720p': - height = 720 - formats.append({ - 'url': video_url, - 'format': format, - 'height': height, - 'format_id': format, + _, quality = path.split('/')[4].split('_')[:2] + f = { + 'url': video_url, + 'height': height, + } + tbr = self._search_regex(r'^(\d+)[Kk]$', quality, 'tbr', default=None) + if tbr: + f.update({ + 'tbr': int(tbr), + 'format_id': '%dp' % height, }) else: - resolution, bitrate_str = format - format = "-".join(format) - height = int(resolution.rstrip('Pp')) - tbr = int(bitrate_str.rstrip('Kk')) - formats.append({ - 'url': video_url, - 'resolution': resolution, - 'format': format, - 'tbr': tbr, - 'height': height, - 'format_id': format, - }) + f['format_id'] = quality + formats.append(f) self._sort_formats(formats) age_limit = self._rta_search(webpage) From 28b83495d898530e72d242874576f4d2d6d8ab3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Aug 2015 22:32:44 +0600 Subject: [PATCH 1128/2145] [spankwire] Simplify --- youtube_dl/extractor/spankwire.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 0a47441b1..609f78294 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ 
-16,7 +16,7 @@ from ..aes import aes_decrypt_text class SpankwireIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' + _VALID_URL = r'https?://(?:www\.)?(?Pspankwire\.com/[^/]*/video(?P[0-9]+)/?)' _TESTS = [{ # download URL pattern: */P_K_.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', @@ -49,10 +49,9 @@ class SpankwireIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('videoid') - url = 'http://www.' + mobj.group('url') + video_id = mobj.group('id') - req = compat_urllib_request.Request(url) + req = compat_urllib_request.Request('http://www.' + mobj.group('url')) req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) From 90076b6172f25a36ca2a00c1b85cda169f2133c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 23 Aug 2015 22:33:26 +0600 Subject: [PATCH 1129/2145] [spankwire] Preserve old uploader pattern --- youtube_dl/extractor/spankwire.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py index 609f78294..9e8fb35b2 100644 --- a/youtube_dl/extractor/spankwire.py +++ b/youtube_dl/extractor/spankwire.py @@ -68,7 +68,7 @@ class SpankwireIE(InfoExtractor): r'by:\s*]*>(.+?)', webpage, 'uploader', fatal=False) uploader_id = self._html_search_regex( - r'by:\s* on (.+?) 
at \d+:\d+', From e7c14660d3aef3a5a303a82dee7e11dfe063048d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 24 Aug 2015 00:36:24 +0600 Subject: [PATCH 1130/2145] [yandexmusic] Defer link resolve till actual download time to prevent link expiry (Closes #6650) --- youtube_dl/extractor/yandexmusic.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index f4c0f5702..85c495c11 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals import re @@ -64,7 +64,15 @@ class YandexMusicTrackIE(YandexMusicBaseIE): return self._get_track_info(track) -class YandexMusicAlbumIE(YandexMusicBaseIE): +class YandexMusicPlaylistBaseIE(InfoExtractor): + def _build_playlist(self, tracks): + return [ + self.url_result( + 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) + for track in tracks] + + +class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:album' IE_DESC = 'Яндекс.Музыка - Альбом' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/?(\?|$)' @@ -85,7 +93,7 @@ class YandexMusicAlbumIE(YandexMusicBaseIE): 'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id, album_id, 'Downloading album JSON') - entries = [self._get_track_info(track) for track in album['volumes'][0]] + entries = self._build_playlist(album['volumes'][0]) title = '%s - %s' % (album['artists'][0]['name'], album['title']) year = album.get('year') @@ -95,7 +103,7 @@ class YandexMusicAlbumIE(YandexMusicBaseIE): return self.playlist_result(entries, compat_str(album['id']), title) -class YandexMusicPlaylistIE(YandexMusicBaseIE): +class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' _VALID_URL = 
r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' @@ -120,8 +128,7 @@ class YandexMusicPlaylistIE(YandexMusicBaseIE): r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), playlist_id)['pageData']['playlist'] - entries = [self._get_track_info(track) for track in playlist['tracks']] - return self.playlist_result( - entries, compat_str(playlist_id), + self._build_playlist(playlist['tracks']), + compat_str(playlist_id), playlist['title'], playlist.get('description')) From e4df2f98ccbe2e24785dd6883d7fd495193fd8e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 24 Aug 2015 00:36:54 +0600 Subject: [PATCH 1131/2145] [yandexmusic:track] Eliminate base class --- youtube_dl/extractor/yandexmusic.py | 36 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 85c495c11..91829be1c 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -12,7 +12,23 @@ from ..utils import ( ) -class YandexMusicBaseIE(InfoExtractor): +class YandexMusicTrackIE(InfoExtractor): + IE_NAME = 'yandexmusic:track' + IE_DESC = 'Яндекс.Музыка - Трек' + _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' + + _TEST = { + 'url': 'http://music.yandex.ru/album/540508/track/4878838', + 'md5': 'f496818aa2f60b6c0062980d2e00dc20', + 'info_dict': { + 'id': '4878838', + 'ext': 'mp3', + 'title': 'Carlo Ambrosio - Gypsy Eyes 1', + 'filesize': 4628061, + 'duration': 193.04, + } + } + def _get_track_url(self, storage_dir, track_id): data = self._download_json( 'http://music.yandex.ru/api/v1.5/handlers/api-jsonp.jsx?action=getTrackSrc&p=download-info/%s' @@ -35,24 +51,6 @@ class YandexMusicBaseIE(InfoExtractor): 'duration': float_or_none(track.get('durationMs'), 1000), } - -class YandexMusicTrackIE(YandexMusicBaseIE): - IE_NAME = 'yandexmusic:track' - IE_DESC = 'Яндекс.Музыка - Трек' - 
_VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P\d+)/track/(?P\d+)' - - _TEST = { - 'url': 'http://music.yandex.ru/album/540508/track/4878838', - 'md5': 'f496818aa2f60b6c0062980d2e00dc20', - 'info_dict': { - 'id': '4878838', - 'ext': 'mp3', - 'title': 'Carlo Ambrosio - Gypsy Eyes 1', - 'filesize': 4628061, - 'duration': 193.04, - } - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) album_id, track_id = mobj.group('album_id'), mobj.group('id') From 11addc50ffa9ce65ac3bef7af6b1c38d7eae1af6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Aug 2015 23:52:47 +0200 Subject: [PATCH 1132/2145] release 2015.08.23 --- docs/supportedsites.md | 6 +++++- youtube_dl/version.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9099e2da4..8d9db53a6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -301,6 +301,7 @@ - **Moviezine** - **movshare**: MovShare - **MPORA** + - **MSNBC** - **MTV** - **mtviggy.com** - **mtvservices:embedded** @@ -308,6 +309,7 @@ - **MusicPlayOn** - **MusicVault** - **muzu.tv** + - **Mwave** - **MySpace** - **MySpace:album** - **MySpass** @@ -392,6 +394,8 @@ - **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz - **Playvid** - **Playwire** + - **pluralsight** + - **pluralsight:course** - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** @@ -534,6 +538,7 @@ - **TF1** - **TheOnion** - **ThePlatform** + - **ThePlatformFeed** - **TheSixtyOne** - **ThisAmericanLife** - **ThisAV** @@ -599,7 +604,6 @@ - **Viddler** - **video.google:search**: Google Video search - **video.mit.edu** - - **VideoBam** - **VideoDetective** - **videofy.me** - **videolectures.net** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c090c6df7..394951ca7 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.16.1' +__version__ = 
'2015.08.23' From eba470f2f22389ab32164e4eb39067ceecf900f5 Mon Sep 17 00:00:00 2001 From: ping Date: Mon, 24 Aug 2015 16:30:00 +0800 Subject: [PATCH 1133/2145] [vlive] Remove upload_date extraction & cleanup --- youtube_dl/extractor/vlive.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index b3bbd80fb..6a403cc64 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -25,7 +25,6 @@ class VLiveIE(InfoExtractor): 'ext': 'mp4', 'title': '[V] Girl\'s Day\'s Broadcast', 'creator': 'Girl\'s Day', - 'upload_date': '20150817', }, } _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH' @@ -41,21 +40,14 @@ class VLiveIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) creator = self._html_search_regex( r'([^<>]+)', webpage, 'creator') - upload_date = self._html_search_regex( - r'(\d{4}\.\d{2}\.\d{2})', webpage, - 'upload date', default=None, fatal=False) - if upload_date: - upload_date = upload_date.replace('.', '') - + url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' 
% video_id - msgpad = {'msgpad': '%.0f' % (time() * 1000)} - md = { - 'md': b64encode( - hmac.new(self._SECRET.encode('ascii'), - (url[:255] + msgpad['msgpad']).encode('ascii'), sha1).digest()) - } - url += '&' + compat_urllib_parse.urlencode(msgpad) + '&' + compat_urllib_parse.urlencode(md) - + msgpad = '%.0f' % (time() * 1000) + md = b64encode( + hmac.new(self._SECRET.encode('ascii'), + (url[:255] + msgpad).encode('ascii'), sha1).digest() + ) + url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md}) playinfo = self._download_json(url, video_id, 'Downloading video json') if playinfo.get('message', '') != 'success': @@ -89,6 +81,5 @@ class VLiveIE(InfoExtractor): 'creator': creator, 'thumbnail': thumbnail, 'formats': formats, - 'upload_date': upload_date, 'subtitles': subtitles, } From 95e431e9ec2477694d368a050222d6381a6f88ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 25 Aug 2015 21:08:38 +0600 Subject: [PATCH 1134/2145] [mailru] Skip tests --- youtube_dl/extractor/mailru.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 54a14cb94..ab1300185 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -25,6 +25,7 @@ class MailRuIE(InfoExtractor): 'uploader_id': 'sonypicturesrus@mail.ru', 'duration': 184, }, + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', @@ -39,6 +40,7 @@ class MailRuIE(InfoExtractor): 'uploader_id': 'hitech@corp.mail.ru', 'duration': 245, }, + 'skip': 'Not accessible from Travis CI server', }, ] From ebbf078c7df575903ceb1be53e53533508c79dad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 25 Aug 2015 21:19:21 +0600 Subject: [PATCH 1135/2145] [krasview] Skip download for test --- youtube_dl/extractor/krasview.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/krasview.py 
b/youtube_dl/extractor/krasview.py index 96f95979a..0ae8ebd68 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -25,6 +25,9 @@ class KrasViewIE(InfoExtractor): 'duration': 27, 'thumbnail': 're:^https?://.*\.jpg', }, + 'params': { + 'skip_download': 'Not accessible from Travis CI server', + }, } def _real_extract(self, url): From 6d53cdd6ce441dd7bc1d93bf1445f0594cfdffef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 25 Aug 2015 23:29:02 +0600 Subject: [PATCH 1136/2145] [yandexmusic] Skip removed tracks (#6666) --- youtube_dl/extractor/yandexmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 91829be1c..166cbf344 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -67,7 +67,7 @@ class YandexMusicPlaylistBaseIE(InfoExtractor): return [ self.url_result( 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) - for track in tracks] + for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): From baf510bf8cb296d2ed2a2f742ec9387d094623e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 00:11:15 +0600 Subject: [PATCH 1137/2145] [yandexmusic:playlist] Handle playlists with more than 150 tracks (Closes #6666) --- youtube_dl/extractor/yandexmusic.py | 51 +++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 166cbf344..4098e4629 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -5,7 +5,11 @@ import re import hashlib from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse, + compat_urllib_request, 
+) from ..utils import ( int_or_none, float_or_none, @@ -106,7 +110,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_DESC = 'Яндекс.Музыка - Плейлист' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', 'info_dict': { 'id': '1245', @@ -114,19 +118,54 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, - } + }, { + # playlist exceeding the limit of 150 tracks shipped with webpage (see + # https://github.com/rg3/youtube-dl/issues/6666) + 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', + 'info_dict': { + 'id': '1036', + 'title': 'Музыка 90-х', + }, + 'playlist_count': 310, + }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - playlist = self._parse_json( + mu = self._parse_json( self._search_regex( r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), - playlist_id)['pageData']['playlist'] + playlist_id) + + playlist = mu['pageData']['playlist'] + tracks, track_ids = playlist['tracks'], playlist['trackIds'] + + # tracks dictionary shipped with webpage is limited to 150 tracks, + # missing tracks should be retrieved manually. 
+ if len(tracks) < len(track_ids): + present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) + missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) + request = compat_urllib_request.Request( + 'https://music.yandex.ru/handlers/track-entries.jsx', + compat_urllib_parse.urlencode({ + 'entries': ','.join(missing_track_ids), + 'lang': mu.get('settings', {}).get('lang', 'en'), + 'external-domain': 'music.yandex.ru', + 'overembed': 'false', + 'sign': mu.get('authData', {}).get('user', {}).get('sign'), + 'strict': 'true', + }).encode('utf-8')) + request.add_header('Referer', url) + request.add_header('X-Requested-With', 'XMLHttpRequest') + + missing_tracks = self._download_json( + request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + if missing_tracks: + tracks.extend(missing_tracks) return self.playlist_result( - self._build_playlist(playlist['tracks']), + self._build_playlist(tracks), compat_str(playlist_id), playlist['title'], playlist.get('description')) From 4bc8eec4ebf5ffcca3b2e17c864be08df5215f44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 26 Aug 2015 15:21:55 +0200 Subject: [PATCH 1138/2145] [youtube] Adapt player version regex to handle urls ending in '/html5player-new.js' It was always extracting 'new' as the version, breaking the cache system. 
--- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e2da46e3..ab6754154 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -660,7 +660,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1289,7 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - r'html5player-([^/]+?)(?:/html5player)?\.js', + r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version From 2f72e83bbd915054cac0e8f70df0c2cab4b9c116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 20:47:57 +0600 Subject: [PATCH 1139/2145] [crunchyroll] Detect required login (#6677) --- youtube_dl/extractor/crunchyroll.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 33a033a7f..98d1881ae 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -237,7 +237,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage_url = 'http://www.' + mobj.group('url') webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') - note_m = self._html_search_regex(r'
(.+?)
', webpage, 'trailer-notice', default='') + note_m = self._html_search_regex( + r'
(.+?)
', + webpage, 'trailer-notice', default='') if note_m: raise ExtractorError(note_m) @@ -247,6 +249,12 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if msg.get('type') == 'error': raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) + if 'To view this, please log in to verify you are 18 or older.' in webpage: + raise ExtractorError( + 'This video is only available for registered users, ' + 'use --username and --password options to provide account credentials.', + expected=True) + video_title = self._html_search_regex(r']*>(.+?)

', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') From 43e7d3c9453338ae29552311b1447fe95be05db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:24:47 +0600 Subject: [PATCH 1140/2145] [extractor/common] Add raise_login_required --- youtube_dl/extractor/common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5d24bcb6a..39cef9c5b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -510,6 +510,12 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): From bbb43a39fd11c2fdf28ae593eaa994f22ce663bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:25:04 +0600 Subject: [PATCH 1141/2145] [crunchyroll] Use raise_login_required --- youtube_dl/extractor/crunchyroll.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 98d1881ae..801b9b48e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -250,10 +250,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) if 'To view this, please log in to verify you are 18 or older.' 
in webpage: - raise ExtractorError( - 'This video is only available for registered users, ' - 'use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required(video_id) video_title = self._html_search_regex(r']*>(.+?)

', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) From 3c53455d15035a94bcd2bc915f565420e1a4279f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:25:37 +0600 Subject: [PATCH 1142/2145] [eroprofile] Use raise_login_required --- youtube_dl/extractor/eroprofile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 316033cf1..7fcd0151d 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -71,8 +71,7 @@ class EroProfileIE(InfoExtractor): m = re.search(r'You must be logged in to view this video\.', webpage) if m: - raise ExtractorError( - 'This video requires login. Please specify a username and password and try again.', expected=True) + self.raise_login_required('This video requires login') video_id = self._search_regex( [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], From 62984e4584c2962e622514c7d6a475636a8c21d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:25:53 +0600 Subject: [PATCH 1143/2145] [lynda] Use raise_login_required --- youtube_dl/extractor/lynda.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5b9157ed4..378117270 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -118,9 +118,7 @@ class LyndaIE(LyndaBaseIE): 'lynda returned error: %s' % video_json['Message'], expected=True) if video_json['HasAccess'] is False: - raise ExtractorError( - 'Video %s is only available for members. 
' - % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True) + self.raise_login_required('Video %s is only available for members' % video_id) video_id = compat_str(video_json['ID']) duration = video_json['DurationInSeconds'] From e7ddaef5bd209dd8d24b0025631cde1f5969e71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:09 +0600 Subject: [PATCH 1144/2145] [pluralsight] Use raise_login_required --- youtube_dl/extractor/pluralsight.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 7ba396aef..fd32836cc 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -41,9 +41,7 @@ class PluralsightIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Pluralsight account is required, use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Pluralsight account is required') login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') From e269d3ae7dbebb22d5b51bd5e6d477a69ae4f3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:24 +0600 Subject: [PATCH 1145/2145] [safari] Use raise_login_required --- youtube_dl/extractor/safari.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index f3c80708c..a602af692 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -20,7 +20,6 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' _SUCCESSFUL_LOGIN_REGEX = r']*>Sign Out' - _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' _NETRC_MACHINE = 'safari' _API_BASE 
= 'https://www.safaribooksonline.com/api/v1/book' @@ -37,9 +36,7 @@ class SafariBaseIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - self._ACCOUNT_CREDENTIALS_HINT, - expected=True) + self.raise_login_required('safaribooksonline.com account is required') headers = std_headers if 'Referer' not in headers: From 42e7373bd3c819ee7cebf5898e4bdd33730dde6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:35 +0600 Subject: [PATCH 1146/2145] [smotri] Use raise_login_required --- youtube_dl/extractor/smotri.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 93a7cfe15..35a81ee87 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -330,10 +330,7 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Erotic broadcasts allowed only for registered users, ' - 'use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', From 61a7ff16222accdb259f771d0a6f0adb229b34dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:47 +0600 Subject: [PATCH 1147/2145] [tubitv] Use raise_login_required --- youtube_dl/extractor/tubitv.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 2c4b21807..4f86b3ee9 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -60,9 +60,7 @@ class TubiTvIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): - raise ExtractorError( - 'This video requires login, 
use --username and --password ' - 'options to provide account credentials.', expected=True) + self.raise_login_required('This video requires login') title = self._og_search_title(webpage) description = self._og_search_description(webpage) From a882c5f4747c527bb50d87828ea4cceae6d12533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:27:07 +0600 Subject: [PATCH 1148/2145] [udemy] Use raise_login_required --- youtube_dl/extractor/udemy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4a0eaf65f..365d8b4bf 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -70,9 +70,7 @@ class UdemyIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Udemy account is required, use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Udemy account is required') login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') From 39affb5aa427a3a1830e97523470d11bfdbd067e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:27:57 +0600 Subject: [PATCH 1149/2145] [crunchyroll] Fix typo --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 801b9b48e..c2162aa68 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -250,7 +250,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) if 'To view this, please log in to verify you are 18 or older.' 
in webpage: - self.raise_login_required(video_id) + self.raise_login_required() video_title = self._html_search_regex(r']*>(.+?)', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) From 3d8132f5e20b7cbdaa8f69aca482553b2c02bed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:03:58 +0600 Subject: [PATCH 1150/2145] [shared] Extend _VALID_URL to support vivo.sx (Closes #6681) --- youtube_dl/extractor/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index a07677686..000ef1a07 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -14,7 +14,7 @@ from ..utils import ( class SharedIE(InfoExtractor): - _VALID_URL = r'http://shared\.sx/(?P[\da-z]{10})' + _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' _TEST = { 'url': 'http://shared.sx/0060718775', From 70113c38c9e551d7d9ab2a4d1f7e76c81b68ae76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:04:39 +0600 Subject: [PATCH 1151/2145] [shared] Clarify IE_DESC --- youtube_dl/extractor/shared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 000ef1a07..cf0a3bfef 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -14,6 +14,7 @@ from ..utils import ( class SharedIE(InfoExtractor): + IE_DESC = 'shared.sx and vivo.sx' _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' _TEST = { From f62e02c24f1f0e0488b40df178ddb9bb5fdf9fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:05:45 +0600 Subject: [PATCH 1152/2145] [shared] Add test for vivo --- youtube_dl/extractor/shared.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 
cf0a3bfef..4fa991dff 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -17,7 +17,7 @@ class SharedIE(InfoExtractor): IE_DESC = 'shared.sx and vivo.sx' _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' - _TEST = { + _TESTS = [{ 'url': 'http://shared.sx/0060718775', 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { @@ -25,7 +25,16 @@ class SharedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Bmp4', }, - } + }, { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', + 'ext': 'mp4', + 'title': 'Chicken', + 'filesize': 528031, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) From f11c316347bea41d9148d1c8d5d7738a594a06d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:06:10 +0600 Subject: [PATCH 1153/2145] [shared] Add filesize to test --- youtube_dl/extractor/shared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 4fa991dff..c5636e8e9 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -24,6 +24,7 @@ class SharedIE(InfoExtractor): 'id': '0060718775', 'ext': 'mp4', 'title': 'Bmp4', + 'filesize': 1720110, }, }, { 'url': 'http://vivo.sx/d7ddda0e78', From d7e8264517d29156697f82b7761dc99d13994c21 Mon Sep 17 00:00:00 2001 From: nmrugg Date: Thu, 27 Aug 2015 23:24:13 +0800 Subject: [PATCH 1154/2145] Make FoxBusiness work. 
--- youtube_dl/extractor/foxnews.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 917f76b1e..7de88ab66 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -8,7 +10,7 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): - _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://video\.fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -47,8 +49,10 @@ class FoxNewsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + m = re.match(r'^https?://video\.fox(news|business)', url) + video = self._download_json( - 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id) + 'http://video.fox' + m.group(1) + '.com/v/feed/video/%s.js?template=fox' % video_id, video_id) item = video['channel']['item'] title = item['title'] From 8df8c278b6d5e2b5a350446690873dc9f5f48aff Mon Sep 17 00:00:00 2001 From: nmrugg Date: Thu, 27 Aug 2015 23:24:28 +0800 Subject: [PATCH 1155/2145] Added matching test for FoxBusiness. 
--- youtube_dl/extractor/foxnews.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 7de88ab66..a8902c960 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -44,6 +44,10 @@ class FoxNewsIE(InfoExtractor): 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', 'only_matching': True, }, + { + 'url': 'http://video.foxbusiness.com/v/4442309889001', + 'only_matching': True, + }, ] def _real_extract(self, url): From 1b660cce120c733f2bb195ef1cfe2ff2421b439f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 27 Aug 2015 21:48:03 +0600 Subject: [PATCH 1156/2145] [foxnews] Simplify (Closes #6694) --- youtube_dl/extractor/foxnews.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index a8902c960..244c75f0b 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -10,7 +10,7 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): - _VALID_URL = r'https?://video\.fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -51,12 +51,12 @@ class FoxNewsIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - - m = re.match(r'^https?://video\.fox(news|business)', url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') video = self._download_json( - 'http://video.fox' + m.group(1) + '.com/v/feed/video/%s.js?template=fox' % video_id, video_id) + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) item = video['channel']['item'] title = item['title'] From 
5307c332329d6a1f3eec240b66a4f11905889f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 27 Aug 2015 21:48:47 +0600 Subject: [PATCH 1157/2145] [foxnews] Clarify IE_DESC --- youtube_dl/extractor/foxnews.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 244c75f0b..3a4a59135 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -10,6 +10,7 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): + IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { From a4962b80d668de704fc347d5e76587be0e95dfef Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 28 Aug 2015 05:04:39 +0200 Subject: [PATCH 1158/2145] release 2015.08.28 --- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8d9db53a6..328a819b3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -166,7 +166,7 @@ - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Foxgay** - - **FoxNews** + - **FoxNews**: Fox News and Fox Business Video - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** @@ -465,7 +465,7 @@ - **Sexu** - **SexyKarma**: Sexy Karma and Watch Indian Porn - **Shahid** - - **Shared** + - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** - **Slideshare** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 394951ca7..a07bc9233 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.23' +__version__ = '2015.08.28' From 071c10137b6b17b79ecfc8676736d5cc243022f6 Mon Sep 17 00:00:00 2001 From: Paul Hartmann Date: Wed, 26 Aug 2015 00:06:44 +0200 Subject: [PATCH 1159/2145] [MTV] move 
German mtv site to new class --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mtv.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d59882598..66422b005 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -340,6 +340,7 @@ from .mtv import ( MTVIE, MTVServicesEmbeddedIE, MTVIggyIE, + MTVDEIE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index b48fac5e3..15df62649 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -288,3 +288,40 @@ class MTVIggyIE(MTVServicesInfoExtractor): } } _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' + +class MTVDEIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv.de' + _VALID_URL = r'''(?x)^https?://(?:www\.)?mtv\.de(?P/artists/.*)''' + _TESTS = [ + { + 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', + 'info_dict': { + 'id': 'a50bc5f0b3aa4b3190aa', + 'ext': 'mp4', + 'title': 'cro-traum', + 'description': 'Cro - Traum', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + return self._get_videos_info(url, mobj.group('video_path')) + + def _get_videos_info(self, url, video_path): + webpage = self._download_webpage(url, video_path) + playlist_js = self._search_regex(r'|$)', + webpage, 'videoplayer applet', default=None) + if config_json: + config = self._parse_json(config_json, display_id, fatal=False) + if config: + sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') + if sapi: + return self._extract_info(display_id, sapi, webpage) + items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) @@ -190,22 +217,10 @@ class YahooIE(InfoExtractor): video_id = info['id'] return self._get_info(video_id, display_id, webpage) 
- def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ - 'protocol': 'http', - 'region': region, - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - - info = query_result['query']['results']['mediaObj'][0] + def _extract_info(self, display_id, query, webpage): + info = query['query']['results']['mediaObj'][0] meta = info.get('meta') + video_id = info.get('id') if not meta: msg = info['status'].get('msg') @@ -231,6 +246,9 @@ class YahooIE(InfoExtractor): 'ext': 'flv', }) else: + if s.get('format') == 'm3u8_playlist': + format_info['protocol'] = 'm3u8_native' + format_info['ext'] = 'mp4' format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url formats.append(format_info) @@ -264,6 +282,21 @@ class YahooIE(InfoExtractor): 'subtitles': subtitles, } + def _get_info(self, video_id, display_id, webpage): + region = self._search_regex( + r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', + webpage, 'region', fatal=False, default='US') + data = compat_urllib_parse.urlencode({ + 'protocol': 'http', + 'region': region, + }) + query_url = ( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + '{id}?{data}'.format(id=video_id, data=data)) + query_result = self._download_json( + query_url, display_id, 'Downloading video info') + return self._extract_info(display_id, query_result, webpage) + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From 1721fef28b89ac4264db978ab7fee3b4dd154056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 Sep 2015 02:58:40 +0600 Subject: [PATCH 1295/2145] [yahoo] Fix test --- youtube_dl/extractor/yahoo.py | 5 +---- 1 file changed, 1 insertion(+), 4 
deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 1d9b98750..fca5ddc69 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -147,6 +147,7 @@ class YahooIE(InfoExtractor): }, { # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', 'info_dict': { 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 'ext': 'mp4', @@ -154,10 +155,6 @@ class YahooIE(InfoExtractor): 'description': 'md5:8fc39608213295748e1e289807838c97', 'duration': 1646, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, } ] From 689fb748ee1ba8e61f99d21a3bcb1bc83b708649 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:44:17 +0100 Subject: [PATCH 1296/2145] [utlis] add extract_attributes for extracting html tags attributes --- youtube_dl/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 206dd56bc..bcebf9cc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -248,6 +248,14 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): + attributes = re.findall(attributes_regex, attributes_str) + attributes_dict = {} + if attributes: + attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + return attributes_dict + + def clean_html(html): """Clean an HTML snippet into a readable string""" From ed1269000f24a6ddc683a295ff402ef3ded5c4fb Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 11 Sep 2015 04:46:21 +0100 Subject: [PATCH 1297/2145] [brightcove] add support for brightcove in page embed(fixes #6824) --- youtube_dl/extractor/__init__.py | 5 +- 
youtube_dl/extractor/brightcove.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 21 ++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..fcd9edec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,7 +59,10 @@ from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..a07c0888f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -22,6 +22,10 @@ from ..utils import ( fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -346,3 +350,91 @@ class BrightcoveIE(InfoExtractor): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P\d+)' + TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'flv', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'duration': 165768, + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)]*)>.*?', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), 
r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + account_id, player_id, embed, video_id = mobj.groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('name') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') + if src: + formats.append({ + 'url': src, + 'abr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('container'), + }) + else: + formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) + 
+ self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..7a3a7f66b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -29,7 +29,10 @@ from ..utils import ( url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -1012,6 +1015,17 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@ -1288,6 +1302,11 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for Brightcove In Page Embed: + brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', From 73eb13dfc74132b8f0e5c1ac1ea75f66e0aca6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 Sep 2015 20:43:05 +0600 Subject: [PATCH 1298/2145] [extractor/common] Case insensitive inputs extraction --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5eeeda08d..835f6f368 100644 --- 
a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -732,7 +732,7 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): hidden_inputs = {} - for input in re.findall(r']+)>', html): + for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue name = re.search(r'name=(["\'])(?P.+?)\1', input) @@ -746,7 +746,7 @@ class InfoExtractor(object): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) From 586f1cc532d167c28e733779cbf132b94d8f76e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 Sep 2015 21:07:32 +0600 Subject: [PATCH 1299/2145] [extractor/common] Skip html comment tags (Closes #6822) --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 835f6f368..d694e818e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -731,6 +731,7 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): + html = re.sub(r'', '', html) hidden_inputs = {} for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): From 60ed60353b9ca57e8181f0b14d525ce487e673ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Sep 2015 20:34:48 +0600 Subject: [PATCH 1300/2145] [openfilm] Remove extractor OpenFilm has been shut down --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/openfilm.py | 70 -------------------------------- 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/openfilm.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..2e7272931 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -432,7 +432,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py deleted file mode 100644 index d2ceedd01..000000000 --- a/youtube_dl/extractor/openfilm.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - parse_iso8601, - parse_age_limit, 
- int_or_none, -) - - -class OpenFilmIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P.+)' - _TEST = { - 'url': 'http://www.openfilm.com/videos/human-resources-remastered', - 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37', - 'info_dict': { - 'id': '32736', - 'display_id': 'human-resources-remastered', - 'ext': 'mp4', - 'title': 'Human Resources (Remastered)', - 'description': 'Social Engineering in the 20th Century.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 7164, - 'timestamp': 1334756988, - 'upload_date': '20120418', - 'uploader_id': '41117', - 'view_count': int, - 'age_limit': 0, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player = compat_urllib_parse_unquote_plus( - self._og_search_video_url(webpage)) - - video = json.loads(self._search_regex( - r'\bp=({.+?})(?:&|$)', player, 'video JSON')) - - video_url = '%s1.mp4' % video['location'] - video_id = video.get('video_id') - display_id = video.get('alias') or display_id - title = video.get('title') - description = video.get('description') - thumbnail = video.get('main_thumb') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('dt_published'), ' ') - uploader_id = video.get('user_id') - view_count = int_or_none(video.get('views_count')) - age_limit = parse_age_limit(video.get('age_limit')) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'age_limit': age_limit, - } From 41ebd6530b124b9265a3df9d7d09aef02041b088 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:42:57 +0800 Subject: [PATCH 1301/2145] [tudou] Add the test case (#6273) --- youtube_dl/extractor/tudou.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index e800477e2..950c42afb 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -27,6 +27,9 @@ class TudouIE(InfoExtractor): 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', 'thumbnail': 're:^https?://.*\.jpg$', } + }, { + 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html', + 'only_matching': True, }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' From 94e507aea798dac6974237cc44257dda45d5fa5a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:45:09 +0800 Subject: [PATCH 1302/2145] [tudou] A more comprehensive _VALID_URL --- youtube_dl/extractor/tudou.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 950c42afb..68712cb4a 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -9,7 +9,7 @@ from .common import InfoExtractor class TudouIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/?.*/(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': '140a49ed444bd22f93330985d8475fcb', From 141ba36996f77a420df69903a59792f6f93ae314 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:51:49 +0800 Subject: [PATCH 1303/2145] [tudou] Modernize --- youtube_dl/extractor/tudou.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 68712cb4a..c9d80a7ef 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -2,9 +2,6 @@ from __future__ import unicode_literals -import re -import json - from .common import 
InfoExtractor @@ -46,13 +43,10 @@ class TudouIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) - if m and m.group(1): - return { - '_type': 'url', - 'url': 'youku:' + m.group(1), - 'ie_key': 'Youku' - } + youku_vcode = self._search_regex( + r'vcode:\s*[\'"](.+?)[\'"]', webpage, 'youku vcode', default=None) + if youku_vcode: + return self.url_result('youku:' + youku_vcode, ie='Youku') title = self._search_regex( r",kw:\s*['\"](.+?)[\"']", webpage, 'title') @@ -63,8 +57,8 @@ class TudouIE(InfoExtractor): r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", webpage, 'player URL', default=self._PLAYER_URL) - segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') - segments = json.loads(segs_json) + segments = self._parse_json(self._search_regex( + r'segs: \'(.*)\'', webpage, 'segments'), video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). 
From aab135516b288f24c55178b024976fd3e130c7b8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:52:51 +0800 Subject: [PATCH 1304/2145] [tudou] Avoid shadowing builtin names --- youtube_dl/extractor/tudou.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c9d80a7ef..6116b209d 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -31,11 +31,11 @@ class TudouIE(InfoExtractor): _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - def _url_for_id(self, id, quality=None): - info_url = "http://v2.tudou.com/f?id=" + str(id) + def _url_for_id(self, video_id, quality=None): + info_url = "http://v2.tudou.com/f?id=" + str(video_id) if quality: info_url += '&hd' + quality - webpage = self._download_webpage(info_url, id, "Opening the info webpage") + webpage = self._download_webpage(info_url, video_id, "Opening the info webpage") final_url = self._html_search_regex('>(.+?)', webpage, 'video url') return final_url From 87813a857009dc3c3dfcc421679e5e806d363863 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 02:36:51 +0800 Subject: [PATCH 1305/2145] [tudou] Use _download_xml --- youtube_dl/extractor/tudou.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 6116b209d..3b993192c 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -35,8 +35,8 @@ class TudouIE(InfoExtractor): info_url = "http://v2.tudou.com/f?id=" + str(video_id) if quality: info_url += '&hd' + quality - webpage = self._download_webpage(info_url, video_id, "Opening the info webpage") - final_url = self._html_search_regex('>(.+?)', webpage, 'video url') + xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") + final_url = xml_data.text return final_url def _real_extract(self, url): From 
349b3a2ea0d6c264facacd92508516e8530108b2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 02:51:20 +0800 Subject: [PATCH 1306/2145] [tudou] Improve regexs --- youtube_dl/extractor/tudou.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 3b993192c..53ba8511f 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -44,21 +44,21 @@ class TudouIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youku_vcode = self._search_regex( - r'vcode:\s*[\'"](.+?)[\'"]', webpage, 'youku vcode', default=None) + r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None) if youku_vcode: return self.url_result('youku:' + youku_vcode, ie='Youku') title = self._search_regex( - r",kw:\s*['\"](.+?)[\"']", webpage, 'title') + r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title') thumbnail_url = self._search_regex( - r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False) player_url = self._search_regex( - r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]', webpage, 'player URL', default=self._PLAYER_URL) segments = self._parse_json(self._search_regex( - r'segs: \'(.*)\'', webpage, 'segments'), video_id) + r'segs: \'([^\']+)\'', webpage, 'segments'), video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). 
From b264c2130221912adfc7cc35d73c2a88d79eafeb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 02:57:14 +0800 Subject: [PATCH 1307/2145] [tudou] Use single quotes and compat_str --- youtube_dl/extractor/tudou.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 53ba8511f..5f7ac4b35 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str class TudouIE(InfoExtractor): @@ -32,7 +33,7 @@ class TudouIE(InfoExtractor): _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' def _url_for_id(self, video_id, quality=None): - info_url = "http://v2.tudou.com/f?id=" + str(video_id) + info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) if quality: info_url += '&hd' + quality xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") From 2ffe3bc14b5e65c902fe5ddd610143c791edaa52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Sep 2015 04:15:49 +0600 Subject: [PATCH 1308/2145] [ndr] Rework and cover with tests --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ndr.py | 469 ++++++++++++++++++++++--------- 2 files changed, 334 insertions(+), 136 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 44ab7ce3c..fadba905d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -367,6 +367,7 @@ from .nbc import ( from .ndr import ( NDRIE, NJoyIE, + NDREmbedBaseIE, NDREmbedIE, NJoyEmbedIE, ) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 87f3edbbe..e3cc6fde8 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -1,183 +1,380 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from 
.common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, int_or_none, + parse_iso8601, qualities, ) -preference = qualities(['xs', 's', 'm','l', 'xl']) - - class NDRBaseIE(InfoExtractor): - - def extract_video_info(self, playlist, video_id): - formats = [] - streamType = playlist.get('config').get('streamType') - if streamType == 'httpVideo': - for key, f in playlist.items(): - if key != 'config': - src = f['src'] - if '.f4m' in src: - formats.extend(self._extract_f4m_formats(src, video_id)) - elif '.m3u8' in src: - formats.extend(self._extract_m3u8_formats(src, video_id, fatal=False)) - else: - quality = f.get('quality') - formats.append({ - 'url': src, - 'format_id': quality, - 'preference': preference(quality), - }) - elif streamType == 'httpAudio': - for key, f in playlist.items(): - if key != 'config': - formats.append({ - 'url': f['src'], - 'format_id': 'mp3', - 'vcodec': 'none', - }) - else: - raise ExtractorError('No media links available for %s' % video_id) - - self._sort_formats(formats) - - config = playlist.get('config') - - title = config['title'] - duration = int_or_none(config.get('duration')) - thumbnails = [{ - 'id': thumbnail.get('quality'), - 'url': thumbnail.get('src'), - 'preference': preference(thumbnail.get('quality')) - } for thumbnail in config.get('poster').values()] - - return { - 'id': video_id, - 'title': title, - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - } - def _real_extract(self, url): - video_id = self._match_id(url) - - json_data = self._download_json('http://www.ndr.de/%s-ppjson.json' % video_id, video_id, fatal=False) - - if not json_data: - webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_regex(r']+id="pp_\w+"[^>]+src="(/.*)"', webpage, 'embed url', None, False) - if not embed_url: - embed_url = self._html_search_meta('embedURL', webpage, fatal=False) - if embed_url: - if embed_url.startswith('/'): - return 
self.url_result('http://www.ndr.de%s' % embed_url, 'NDREmbed') - else: - return self.url_result(embed_url, 'NDREmbed') - raise ExtractorError('No media links available for %s' % video_id) - - return self.extract_video_info(json_data['playlist'], video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._extract_embed(webpage, display_id) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' - IE_DESC = 'NDR.de - Mediathek' - _VALID_URL = r'https?://www\.ndr\.de/.+?,(?P\w+)\.html' + IE_DESC = 'NDR.de - Norddeutscher Rundfunk' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P[^/?#]+),[\da-z]+\.html' + _TESTS = [{ + # httpVideo, same content id + 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': '6515bc255dc5c5f8c85bbc38e035a659', + 'info_dict': { + 'id': 'hafengeburtstag988', + 'display_id': 'Party-Poette-und-Parade', + 'ext': 'mp4', + 'title': 'Party, Pötte und Parade', + 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', + 'uploader': 'ndrtv', + 'timestamp': 1431108900, + 'upload_date': '20150510', + 'duration': 3498, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideo, different content id + 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', + 'md5': '1043ff203eab307f0c51702ec49e9a71', + 'info_dict': { + 'id': 'osna272', + 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', + 'ext': 'mp4', + 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', + 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', + 'uploader': 'ndrtv', + 'timestamp': 1442059200, + 'upload_date': '20150912', + 'duration': 510, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudio, same content id + 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'display_id': 'La-Valette-entgeht-der-Hinrichtung', 
+ 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'uploader': 'ndrinfo', + 'timestamp': 1290626100, + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }] - _TESTS = [ - { - 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', - 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', - 'note': 'Video file', - 'info_dict': { - 'id': 'nordmagazin25866', - 'ext': 'mp4', - 'title': 'Kartoffeltage in der Lewitz', - 'duration': 166, - }, - 'skip': '404 Not found', - }, - { - 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', - 'md5': 'dadc003c55ae12a5d2f6bd436cd73f59', - 'info_dict': { - 'id': 'hafengeburtstag988', - 'ext': 'mp4', - 'title': 'Party, Pötte und Parade', - 'duration': 3498, - }, - }, - { - 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'note': 'Audio file', - 'info_dict': { - 'id': 'audio51535', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'duration': 884, - } + def _extract_embed(self, webpage, display_id): + embed_url = self._html_search_meta( + 'embedURL', webpage, 'embed URL', fatal=True) + description = self._search_regex( + r']+itemprop="description">([^<]+)

', + webpage, 'description', fatal=False) + timestamp = parse_iso8601( + self._search_regex( + r'