From a0758dfa1afd5b04773ba3b3b17ac71d22054821 Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 5 Aug 2015 22:40:46 +0200 Subject: [PATCH 001/137] [filmon] new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/filmon.py | 144 +++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 youtube_dl/extractor/filmon.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 578359a5e..c9b9ebd23 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,6 +287,7 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE +from .filmon import FilmOnIE, FilmOnVODIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE diff --git a/youtube_dl/extractor/filmon.py b/youtube_dl/extractor/filmon.py new file mode 100644 index 000000000..987792fec --- /dev/null +++ b/youtube_dl/extractor/filmon.py @@ -0,0 +1,144 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import qualities +from ..compat import compat_urllib_request + + +_QUALITY = qualities(('low', 'high')) + + +class FilmOnIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P[a-z0-9-]+)' + _TESTS = [{ + 'url': 'https://www.filmon.com/channel/filmon-sports', + 'only_matching': True, + }, { + 'url': 'https://www.filmon.com/tv/2894', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + request = compat_urllib_request.Request('https://www.filmon.com/channel/%s' % (channel_id)) + request.add_header('X-Requested-With', 'XMLHttpRequest') + channel_info = self._download_json(request, channel_id) + now_playing = channel_info['now_playing'] + + thumbnails = [] + for thumb in now_playing.get('images', ()): + if thumb['type'] != '2': + continue + thumbnails.append({ + 'url': thumb['url'], + 'width': int(thumb['width']), + 'height': int(thumb['height']), + }) + + formats = [] + + for stream in channel_info['streams']: + formats.append({ + 'format_id': str(stream['id']), + # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats + # because 0) it doesn't have bitrate variants anyway, and 1) the ids generated + # by that method are highly unstable (because the bitrate is variable) + 'url': stream['url'], + 'resolution': stream['name'], + 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), + 'ext': 'mp4', + 'quality': _QUALITY(stream['quality']), + 'preference': int(stream['watch-timeout']), + }) + self._sort_formats(formats) + + return { + 'id': str(channel_info['id']), + 'display_id': channel_info['alias'], + 'formats': formats, + # XXX: use the channel description (channel_info['description'])? + 'uploader_id': channel_info['alias'], + 'uploader': channel_info['title'], # XXX: kinda stretching it... + 'title': now_playing.get('programme_name') or channel_info['title'], + 'description': now_playing.get('programme_description'), + 'thumbnails': thumbnails, + 'is_live': True, + } + + +class FilmOnVODIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmon\.com/vod/view/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.filmon.com/vod/view/24869-0-plan-9-from-outer-space', + 'info_dict': { + 'id': '24869', + 'ext': 'mp4', + 'title': 'Plan 9 From Outer Space', + 'description': 'Dead human, zombies and vampires', + }, + }, { + 'url': 'https://www.filmon.com/vod/view/2825-1-popeye-series-1', + 'info_dict': { + 'id': '2825', + 'title': 'Popeye Series 1', + }, + 'playlist_count': 8, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + result = self._download_json('https://www.filmon.com/api/vod/movie?id=%s' % (video_id), video_id) + if result['code'] != 200: + raise ExtractorError('FilmOn said: %s' % (result['reason']), expected=True) + + response = result['response'] + + if response.get('episodes'): + return { + '_type': 'playlist', + 'id': video_id, + 'title': response['title'], + 'entries': [{ + '_type': 'url', + 'url': 'https://www.filmon.com/vod/view/%s' % (ep), + } for ep in response['episodes']] + } + + formats = [] + for (id, stream) in response['streams'].items(): + formats.append({ + 'format_id': id, + 'url': stream['url'], + 'resolution': stream['name'], + 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), + 'ext': 'mp4', + 'quality': _QUALITY(stream['quality']), + 'preference': int(stream['watch-timeout']), + }) + self._sort_formats(formats) + + poster = response['poster'] + thumbnails = [{ + 'id': 'poster', + 'url': poster['url'], + 'width': poster['width'], + 'height': poster['height'], + }] + for (id, thumb) in poster['thumbs'].items(): + thumbnails.append({ + 'id': id, + 'url': thumb['url'], + 'width': thumb['width'], + 'height': thumb['height'], + }) + + return { + 'id': video_id, + 'title': response['title'], + 'formats': formats, + 'description': response['description'], + 'thumbnails': thumbnails, + } From 0c1c6f4b9f97375ffc68cbc9c7276838f7bf8514 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 23 Jan 2017 23:31:43 +0800 Subject: [PATCH 002/137] [utils] Add another date format seen in NextTV --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 12863e74a..98acc2b45 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -143,6 +143,7 @@ DATE_FORMATS = ( '%Y/%m/%d', '%Y/%m/%d %H:%M', '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M', '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f', '%d.%m.%Y %H:%M', From bc35ed3fb6fcae88d59fd440b505b9e1a7cf112e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 23 Jan 2017 23:33:30 +0800 Subject: [PATCH 003/137] =?UTF-8?q?[nextmedia]=20Add=20support=20for=20Nex?= =?UTF-8?q?tTV=20(=E5=A3=B9=E9=9B=BB=E8=A6=96)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ChangeLog | 6 ++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nextmedia.py | 54 +++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index beea17e54..ba2f5cffc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors ++ [nextmedia] Add support for NextTV (壹電視) + + version 2017.01.22 Extractors diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index cfddf5b92..e23b5d0f6 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -598,6 +598,7 @@ from .nextmedia import ( NextMediaIE, NextMediaActionNewsIE, AppleDailyIE, + NextTVIE, ) from .nfb import NFBIE from .nfl import NFLIE diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index 626ed8b49..680f03aad 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -3,7 +3,14 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse -from ..utils import parse_iso8601 +from ..utils import ( + clean_html, + get_element_by_class, + int_or_none, + parse_iso8601, + remove_start, + unified_timestamp, +) class NextMediaIE(InfoExtractor): @@ -184,3 +191,48 @@ class AppleDailyIE(NextMediaIE): def _fetch_description(self, page): return self._html_search_meta('description', page, 'news description') + + +class NextTVIE(InfoExtractor): + IE_DESC = '壹電視' + _VALID_URL = r'https?://(?:www\.)?nexttv\.com\.tw/(?:[^/]+/)+(?P\d+)' + + _TEST = { + 'url': 'http://www.nexttv.com.tw/news/realtime/politics/11779671', + 'info_dict': { + 'id': '11779671', + 'ext': 'mp4', + 'title': '「超收稅」近4千億! 藍議員籲發消費券', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1484825400, + 'upload_date': '20170119', + 'view_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r']*>([^<]+)', webpage, 'title') + + data = self._hidden_inputs(webpage) + + video_url = data['ntt-vod-src-detailview'] + + date_str = get_element_by_class('date', webpage) + timestamp = unified_timestamp(date_str + '+0800') if date_str else None + + view_count = int_or_none(remove_start( + clean_html(get_element_by_class('click', webpage)), '點閱:')) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': data.get('ntt-vod-img-src'), + 'timestamp': timestamp, + 'view_count': view_count, + } From b494d6856c55bd351107fd7266f8ac2eeaee341f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Jan 2017 02:50:49 +0700 Subject: [PATCH 004/137] [pluralsight] Fix extraction (closes #11820) --- youtube_dl/extractor/pluralsight.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 0ffd41ecd..5c798e874 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -157,13 +157,10 @@ class PluralsightIE(PluralsightBaseIE): display_id = '%s-%s' % (name, clip_id) - parsed_url = compat_urlparse.urlparse(url) - - payload_url = compat_urlparse.urlunparse(parsed_url._replace( - netloc='app.pluralsight.com', path='player/api/v1/payload')) - course = self._download_json( - payload_url, display_id, headers={'Referer': url})['payload']['course'] + 'https://app.pluralsight.com/player/user/api/v1/player/payload', + display_id, data=urlencode_postdata({'courseId': course_name}), + headers={'Referer': url}) collection = course['modules'] From ee4c091ce5bb3732c3016410230f45f2283e5055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Jan 2017 02:56:19 +0700 Subject: [PATCH 005/137] [ChangeLog] Actualize --- ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChangeLog b/ChangeLog index ba2f5cffc..406301549 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,11 @@ version Extractors +* [pluralsight] Fix extraction (#11820) + [nextmedia] Add support for NextTV (壹電視) +* [24video] Fix extraction (#11811) +* [youtube:playlist] Fix nonexistent and private playlist detection (#11604) ++ [chirbit] Extract uploader (#11809) version 2017.01.22 From c3a65c3de0667b8de4af8fdc8c1eb04a1498e104 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Jan 2017 02:58:37 +0700 Subject: [PATCH 006/137] release 2017.01.24 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 30cc27c7b..f771d72c0 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.22** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.24** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.22 +[debug] youtube-dl version 2017.01.24 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 406301549..4bc30cff7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.01.24 Extractors * [pluralsight] Fix extraction (#11820) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b906d443a..2d28b3f72 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -485,6 +485,7 @@ - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 + - **NextTV**: 壹電視 - **nfb**: National Film Board of Canada - **nfl.com** - **NhkVod** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9466c9637..8a66c2fb9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.22' +__version__ = '2017.01.24' From d61aa5eb37244a04caa09f1f238a4f81366c109b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Jan 2017 22:46:40 +0700 Subject: [PATCH 007/137] [vimeo:review] Fix config URL extraction (closes #11821) --- youtube_dl/extractor/vimeo.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index a6bbd4c05..c12eeadd4 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -884,10 +884,14 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): def _get_config_url(self, webpage_url, video_id, video_password_verified=False): webpage = self._download_webpage(webpage_url, video_id) - data = self._parse_json(self._search_regex( - r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', - default=NO_DEFAULT if video_password_verified else '{}'), video_id) - config_url = data.get('vimeo_esi', {}).get('config', {}).get('configUrl') + config_url = self._html_search_regex( + r'data-config-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'config URL', default=None, group='url') + if not config_url: + data = self._parse_json(self._search_regex( + r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data', + default=NO_DEFAULT if video_password_verified else '{}'), video_id) + config_url = data.get('vimeo_esi', {}).get('config', {}).get('configUrl') if config_url is None: self._verify_video_password(webpage_url, video_id, webpage) config_url = self._get_config_url( From 74af9c700d308e3638db0ff2e4510770f9daf31c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Jan 2017 22:55:49 +0700 Subject: [PATCH 008/137] [konserthusetplay] Add support for hls formats (closes #11823) --- youtube_dl/extractor/konserthusetplay.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index 55291c66f..7e6ea9696 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + determine_ext, float_or_none, int_or_none, ) @@ -42,12 +43,18 @@ class KonserthusetPlayIE(InfoExtractor): player_config = media['playerconfig'] playlist = player_config['playlist'] - source = next(f for f in playlist if f.get('bitrates')) + source = next(f for f in playlist if f.get('bitrates') or f.get('provider')) FORMAT_ID_REGEX = r'_([^_]+)_h264m\.mp4' formats = [] + m3u8_url = source.get('url') + if m3u8_url and determine_ext(m3u8_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + fallback_url = source.get('fallbackUrl') fallback_format_id = None if fallback_url: From 23b35a634e06d9b92c9650b0d66a3d5d7eb03a54 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 24 Jan 2017 16:55:07 +0100 Subject: [PATCH 009/137] [crackle] improve extraction - extract vtt subtitles - extract multiple resolutions for thumbnails - pass geo verification proxy headers - add support for mobile urls --- youtube_dl/extractor/crackle.py | 53 ++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index 25c5e7d04..377fb45e9 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class CrackleIE(InfoExtractor): - _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' + _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P\d+)' _TEST = { 'url': 'http://www.crackle.com/comedians-in-cars-getting-coffee/2498934', 'info_dict': { @@ -31,8 +31,32 @@ class CrackleIE(InfoExtractor): } } + _THUMBNAIL_RES = [ + (120, 90), + (208, 156), + (220, 124), + (220, 220), + (240, 180), + (250, 141), + (315, 236), + (320, 180), + (360, 203), + (400, 300), + (421, 316), + (460, 330), + (460, 460), + (462, 260), + (480, 270), + (587, 330), + (640, 480), + (700, 330), + (700, 394), + (854, 480), + (1024, 1024), + (1920, 1080), + ] + # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx - _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614' _MEDIA_FILE_SLOTS = { 'c544.flv': { 'width': 544, @@ -61,17 +85,25 @@ class CrackleIE(InfoExtractor): item = self._download_xml( 'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, - video_id).find('i') + video_id, headers=self.geo_verification_headers()).find('i') title = item.attrib['t'] subtitles = {} formats = self._extract_m3u8_formats( 'http://content.uplynk.com/ext/%s/%s.m3u8' % (config_doc.attrib['strUplynkOwnerId'], video_id), video_id, 'mp4', m3u8_id='hls', fatal=None) - thumbnail = None + thumbnails = [] path = item.attrib.get('p') if path: - thumbnail = self._THUMBNAIL_TEMPLATE % path + for width, height in self._THUMBNAIL_RES: + res = '%dx%d' % (width, height) + thumbnails.append({ + 'id': res, + 'url': 'http://images-us-am.crackle.com/%stnl_%s.jpg' % (path, res), + 'width': width, + 'height': height, + 'resolution': res, + }) http_base_url = 'http://ahttp.crackle.com/' + path for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items(): formats.append({ @@ -86,10 +118,11 @@ class CrackleIE(InfoExtractor): if locale and v: if locale not in subtitles: subtitles[locale] = [] - subtitles[locale] = [{ - 'url': '%s/%s%s_%s.xml' % (config_doc.attrib['strSubtitleServer'], path, locale, v), - 'ext': 'ttml', - }] + for url_ext, ext in (('vtt', 'vtt'), ('xml', 'tt')): + subtitles.setdefault(locale, []).append({ + 'url': '%s/%s%s_%s.%s' % (config_doc.attrib['strSubtitleServer'], path, locale, v, url_ext), + 'ext': ext, + }) self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) return { @@ -100,7 +133,7 @@ class CrackleIE(InfoExtractor): 'series': item.attrib.get('sn'), 'season_number': int_or_none(item.attrib.get('se')), 'episode_number': int_or_none(item.attrib.get('ep')), - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } From af59bddc4e4a6c260e7966fe75d9d687c3b13b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 24 Jan 2017 23:02:20 +0700 Subject: [PATCH 010/137] [konserthusetplay] Extract subtitles (#11823) --- youtube_dl/extractor/konserthusetplay.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index 7e6ea9696..3ae2aa317 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, @@ -104,6 +105,13 @@ class KonserthusetPlayIE(InfoExtractor): thumbnail = media.get('image') duration = float_or_none(media.get('duration'), 1000) + subtitles = {} + captions = source.get('captionsAvailableLanguages') + if isinstance(captions, dict): + for lang, subtitle_url in captions.items(): + if lang != 'none' and isinstance(subtitle_url, compat_str): + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + return { 'id': video_id, 'title': title, @@ -111,4 +119,5 @@ class KonserthusetPlayIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } From c60089c0222433775dcc1305d85b42fc6158c8df Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 25 Jan 2017 07:38:17 +0100 Subject: [PATCH 011/137] [afreecatv:global] Add new extractor(closes #11807) --- youtube_dl/extractor/afreecatv.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 96 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 75b366993..4f6cdb8a2 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -18,6 +18,7 @@ from ..utils import ( class AfreecaTVIE(InfoExtractor): + IE_NAME = 'afreecatv' IE_DESC = 'afreecatv.com' _VALID_URL = r'''(?x) https?:// @@ -143,3 +144,94 @@ class AfreecaTVIE(InfoExtractor): expected=True) return info + + +class AfreecaTVGlobalIE(AfreecaTVIE): + IE_NAME = 'afreecatv:global' + _VALID_URL = r'https?://(?:www\.)?afreeca\.tv/(?P\d+)(?:/v/(?P\d+))?' + _TESTS = [{ + 'url': 'http://afreeca.tv/36853014/v/58301', + 'info_dict': { + 'id': '58301', + 'title': 'tryhard top100', + 'uploader_id': '36853014', + 'uploader': 'makgi Hearthstone Live!', + }, + 'playlist_count': 3, + }] + + def _real_extract(self, url): + channel_id, video_id = re.match(self._VALID_URL, url).groups() + video_type = 'video' if video_id else 'live' + query = { + 'pt': 'view', + 'bid': channel_id, + } + if video_id: + query['vno'] = video_id + video_data = self._download_json( + 'http://api.afreeca.tv/%s/view_%s.php' % (video_type, video_type), + video_id or channel_id, query=query)['channel'] + + if video_data.get('result') != 1: + raise ExtractorError('%s said: %s' % (self.IE_NAME, video_data['remsg'])) + + title = video_data['title'] + + info = { + 'thumbnail': video_data.get('thumb'), + 'view_count': int_or_none(video_data.get('vcnt')), + 'age_limit': int_or_none(video_data.get('grade')), + 'uploader_id': channel_id, + 'uploader': video_data.get('cname'), + } + + if video_id: + entries = [] + for i, f in enumerate(video_data.get('flist', [])): + video_key = self.parse_video_key(f.get('key', '')) + f_url = f.get('file') + if not video_key or not f_url: + continue + entries.append({ + 'id': '%s_%s' % (video_id, video_key.get('part', i + 1)), + 'title': title, + 'upload_date': video_key.get('upload_date'), + 'duration': int_or_none(f.get('length')), + 'url': f_url, + 'protocol': 'm3u8_native', + 'ext': 'mp4', + }) + + info.update({ + 'id': video_id, + 'title': title, + 'duration': int_or_none(video_data.get('length')), + }) + if len(entries) > 1: + info['_type'] = 'multi_video' + info['entries'] = entries + elif len(entries) == 1: + i = entries[0].copy() + i.update(info) + info = i + else: + formats = [] + for s in video_data.get('strm', []): + s_url = s.get('purl') + if not s_url: + continue + # TODO: extract rtmp formats + if s.get('stype') == 'HLS': + formats.extend(self._extract_m3u8_formats( + s_url, channel_id, 'mp4', fatal=False)) + self._sort_formats(formats) + + info.update({ + 'id': channel_id, + 'title': self._live_title(title), + 'is_live': True, + 'formats': formats, + }) + + return info diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index e23b5d0f6..f09b4cf2c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -30,7 +30,10 @@ from .aenetworks import ( AENetworksIE, HistoryTopicIE, ) -from .afreecatv import AfreecaTVIE +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVGlobalIE, +) from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE From b8a03b66601f6af9e6b4009cba634dac6e0d30e6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 25 Jan 2017 07:39:11 +0100 Subject: [PATCH 012/137] [srgssr] fix rts video extraction(closes #11831) --- youtube_dl/extractor/srgssr.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 47aa887cc..319a48a7a 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -48,9 +48,6 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - if bu == 'rts': - return self.url_result('rts:%s' % media_id, 'RTS') - media_data = self.get_media_data(bu, media_type, media_id) metadata = media_data['AssetMetadatas']['AssetMetadata'][0] From 17f8deeb481a7aa3079d7e11da2c255f893b9e8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Jan 2017 23:27:22 +0700 Subject: [PATCH 013/137] [extractor/generic] Add support for openload embeds (closes #11536, closes #11812) --- youtube_dl/extractor/generic.py | 7 +++++++ youtube_dl/extractor/openload.py | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40201f311..a23486620 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -80,6 +80,7 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .openload import OpenloadIE class GenericIE(InfoExtractor): @@ -2431,6 +2432,12 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( twentymin_urls, ie=TwentyMinutenIE.ie_key()) + # Look for Openload embeds + openload_urls = OpenloadIE._extract_urls(webpage) + if openload_urls: + return _playlist_from_matches( + openload_urls, ie=OpenloadIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 3d4ad7dca..4893ade5d 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_chr from ..utils import ( @@ -56,6 +58,12 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+src=["\']((?:https?://)?(?:openload\.(?:co|io)|oload\.tv)/embed/[a-zA-Z0-9-_]+)', + webpage) + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) From c1fa3f46727ccbbb75389ce82753f2e63449ece6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Jan 2017 23:28:45 +0700 Subject: [PATCH 014/137] [openload] Fallback video extension to mp4 --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 4893ade5d..32289d897 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -101,7 +101,7 @@ class OpenloadIE(InfoExtractor): 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, # Seems all videos have extensions in their titles - 'ext': determine_ext(title), + 'ext': determine_ext(title, 'mp4'), 'subtitles': subtitles, } return info_dict From 2c302cf66b235aed6be5786489f259c0fa993fae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Jan 2017 23:33:46 +0700 Subject: [PATCH 015/137] [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 4bc30cff7..e0af3f671 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,18 @@ +version + +Extractors ++ [openload] Fallback video extension to mp4 ++ [extractor/generic] Add support for Openload embeds (#11536, #11812) +* [srgssr] Fix rts video extraction (#11831) ++ [afreecatv:global] Add support for afreeca.tv (#11807) ++ [crackle] Extract vtt subtitles ++ [crackle] Extract multiple resolutions for thumbnails ++ [crackle] Add support for mobile URLs ++ [konserthusetplay] Extract subtitles (#11823) ++ [konserthusetplay] Add support for HLS videos (#11823) +* [vimeo:review] Fix config URL extraction (#11821) + + version 2017.01.24 Extractors From 2417d41535a907a2da05a8b6490198916279d2ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 25 Jan 2017 23:36:03 +0700 Subject: [PATCH 016/137] release 2017.01.25 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index f771d72c0..4d409f785 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.24** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.25*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.25** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.24 +[debug] youtube-dl version 2017.01.25 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e0af3f671..ff305d7e8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.01.25 Extractors + [openload] Fallback video extension to mp4 diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2d28b3f72..f640cfcaa 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -33,7 +33,8 @@ - **AdobeTVVideo** - **AdultSwim** - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network - - **AfreecaTV**: afreecatv.com + - **afreecatv**: afreecatv.com + - **afreecatv:global**: afreecatv.com - **AirMozilla** - **AlJazeera** - **Allocine** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8a66c2fb9..c23fe85de 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.24' +__version__ = '2017.01.25' From 556dbe7fe35667cb061dbf0ee84d3a065ad11055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Jan 2017 21:43:14 +0700 Subject: [PATCH 017/137] [youtube] Add fallback for duration extraction (closes #11841) --- youtube_dl/extractor/youtube.py | 36 ++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5202beb3e..630586796 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -40,6 +40,7 @@ from ..utils import ( sanitized_Request, smuggle_url, str_to_int, + try_get, unescapeHTML, unified_strdate, unsmuggle_url, @@ -383,6 +384,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -402,6 +404,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], + 'duration': 180, 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', @@ -419,6 +422,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'alt_title': 'Tunnel Vision', 'description': 'md5:64249768eec3bc4276236606ea996373', + 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', @@ -458,6 +462,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, 'like_count': int, 'dislike_count': int, }, @@ -493,6 +498,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', @@ -512,6 +518,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Taylor Swift - Shake It Off', 'alt_title': 'Shake It Off', 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', @@ -529,6 +536,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'T4XJQO3qol8', 'ext': 'mp4', + 'duration': 219, 'upload_date': '20100909', 'uploader': 'The Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', @@ -546,6 +554,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', @@ -562,6 +571,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'duration': 247, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -576,6 +586,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': '__2ABJjxzNo', 'ext': 'mp4', + 'duration': 266, 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', @@ -596,6 +607,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', + 'duration': 6085, 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', @@ -615,6 +627,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '_b-2C3KPAM0', 'ext': 'mp4', 'stretched_ratio': 16 / 9., + 'duration': 85, 'upload_date': '20110310', 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', @@ -649,6 +662,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, 'upload_date': '20150625', 'uploader_id': 'dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', @@ -691,6 +705,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7335, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -703,6 +718,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -715,6 +731,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -727,6 +744,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (zim)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7334, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -768,6 +786,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', + 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', @@ -809,10 +828,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'md5:e41008789470fc2533a3252216f1c1d1', 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'duration': 721, 'upload_date': '20150127', 'uploader_id': 'BerkmanCenter', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'BerkmanCenter', + 'uploader': 'The Berkman Klein Center for Internet & Society', 'license': 'Creative Commons Attribution license (reuse allowed)', }, 'params': { @@ -827,6 +847,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'duration': 4060, 'upload_date': '20151119', 'uploader': 'Bernie 2016', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', @@ -871,7 +892,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:3a72f23c086a1496c9e2c54a25fa0822', + 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', 'uploader_id': 'Vsauce', @@ -1516,11 +1538,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - if 'length_seconds' not in video_info: - self._downloader.report_warning('unable to extract video duration') - video_duration = None - else: - video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0])) + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) # annotations video_annotations = None From cf0cabbe5011228c78a3d88c1a1b179b10333d6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Jan 2017 21:49:34 +0700 Subject: [PATCH 018/137] [cmt,mtv,southpark] Add support for episode URLs (closes #11837) --- youtube_dl/extractor/cmt.py | 2 +- youtube_dl/extractor/mtv.py | 5 ++++- youtube_dl/extractor/southpark.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f6b794fb3..e701fbeab 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -5,7 +5,7 @@ from .mtv import MTVIE class CMTIE(MTVIE): IE_NAME = 'cmt.com' - _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|full-episodes|video-clips)/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e48ea2481..855c3996f 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -304,7 +304,7 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): class MTVIE(MTVServicesInfoExtractor): IE_NAME = 'mtv' - _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|full-episodes)/(?P[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?mtv\.com/(?:video-clips|(?:full-)?episodes)/(?P[^/?#.]+)' _FEED_URL = 'http://www.mtv.com/feeds/mrss/' _TESTS = [{ @@ -321,6 +321,9 @@ class MTVIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101', 'only_matching': True, + }, { + 'url': 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index 08f8c5744..d8ce416fc 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -6,7 +6,7 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|full-episodes)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|(?:full-)?episodes)/(?P.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' @@ -75,7 +75,7 @@ class SouthParkDeIE(SouthParkIE): class SouthParkNlIE(SouthParkIE): IE_NAME = 'southpark.nl' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.nl/(?:clips|full-episodes)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.nl/(?:clips|(?:full-)?episodes)/(?P.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' _TESTS = [{ From 9bccdc7004f48963da9a51b6fe24a398d59da725 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 26 Jan 2017 16:06:01 +0100 Subject: [PATCH 019/137] [vevo] remove request to old api and catch apiv2 errors --- youtube_dl/extractor/vevo.py | 267 +++++++++++------------------------ 1 file changed, 79 insertions(+), 188 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index f0a8075fb..c4e37f694 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -4,9 +4,9 @@ import re from .common import InfoExtractor from ..compat import ( - compat_etree_fromstring, compat_str, compat_urlparse, + compat_HTTPError, ) from ..utils import ( ExtractorError, @@ -140,21 +140,6 @@ class VevoIE(VevoBaseIE): 'url': 'http://www.vevo.com/watch/INS171400764', 'only_matching': True, }] - _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' - _SOURCE_TYPES = { - 0: 'youtube', - 1: 'brightcove', - 2: 'http', - 3: 'hls_ios', - 4: 'hls', - 5: 'smil', # http - 7: 'f4m_cc', - 8: 'f4m_ak', - 9: 'f4m_l3', - 10: 'ism', - 13: 'smil', # rtmp - 18: 'dash', - } _VERSIONS = { 0: 'youtube', # only in AuthenticateVideo videoVersions 1: 'level3', @@ -163,41 +148,6 @@ class VevoIE(VevoBaseIE): 4: 'amazon', } - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - formats = [] - els = smil.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') - for el in els: - src = el.attrib['src'] - m = re.match(r'''(?xi) - (?P[a-z0-9]+): - (?P - [/a-z0-9]+ # The directory and main part of the URL - _(?P[0-9]+)k - _(?P[0-9]+)x(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - \.[a-z0-9]+ # File extension - )''', src) - if not m: - continue - - format_url = self._SMIL_BASE_URL + m.group('path') - formats.append({ - 'url': format_url, - 'format_id': 'smil_' + m.group('tbr'), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'tbr': int(m.group('tbr')), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - return formats - def _initialize_api(self, video_id): req = sanitized_Request( 'http://www.vevo.com/auth', data=b'') @@ -214,148 +164,91 @@ class VevoIE(VevoBaseIE): self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] def _call_api(self, path, *args, **kwargs): - return self._download_json(self._api_url_template % path, *args, **kwargs) + try: + data = self._download_json(self._api_url_template % path, *args, **kwargs) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errors = self._parse_json(e.cause.read().decode(), None)['errors'] + error_message = ', '.join([error['message'] for error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + raise + return data def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - response = self._download_json( - json_url, video_id, 'Downloading video info', - 'Unable to download info', fatal=False) or {} - video_info = response.get('video') or {} + self._initialize_api(video_id) + + video_info = self._call_api( + 'video/%s' % video_id, video_id, 'Downloading api video info', + 'Failed to download video info') + + video_versions = self._call_api( + 'video/%s/streams' % video_id, video_id, + 'Downloading video versions info', + 'Failed to download video versions info', + fatal=False) + + # Some videos are only available via webpage (e.g. + # https://github.com/rg3/youtube-dl/issues/9366) + if not video_versions: + webpage = self._download_webpage(url, video_id) + video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] + + uploader = None artist = None featured_artist = None - uploader = None - view_count = None + artists = video_info.get('artists') + for curr_artist in artists: + if curr_artist.get('role') == 'Featured': + featured_artist = curr_artist['name'] + else: + artist = uploader = curr_artist['name'] + formats = [] + for video_version in video_versions: + version = self._VERSIONS.get(video_version['version']) + version_url = video_version.get('url') + if not version_url: + continue - if not video_info: - try: - self._initialize_api(video_id) - except ExtractorError: - ytid = response.get('errorInfo', {}).get('ytid') - if ytid: - self.report_warning( - 'Video is geoblocked, trying with the YouTube video %s' % ytid) - return self.url_result(ytid, 'Youtube', ytid) - - raise - - video_info = self._call_api( - 'video/%s' % video_id, video_id, 'Downloading api video info', - 'Failed to download video info') - - video_versions = self._call_api( - 'video/%s/streams' % video_id, video_id, - 'Downloading video versions info', - 'Failed to download video versions info', - fatal=False) - - # Some videos are only available via webpage (e.g. - # https://github.com/rg3/youtube-dl/issues/9366) - if not video_versions: - webpage = self._download_webpage(url, video_id) - video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] - - timestamp = parse_iso8601(video_info.get('releaseDate')) - artists = video_info.get('artists') - for curr_artist in artists: - if curr_artist.get('role') == 'Featured': - featured_artist = curr_artist['name'] - else: - artist = uploader = curr_artist['name'] - view_count = int_or_none(video_info.get('views', {}).get('total')) - - for video_version in video_versions: - version = self._VERSIONS.get(video_version['version']) - version_url = video_version.get('url') - if not version_url: + if '.ism' in version_url: + continue + elif '.mpd' in version_url: + formats.extend(self._extract_mpd_formats( + version_url, video_id, mpd_id='dash-%s' % version, + note='Downloading %s MPD information' % version, + errnote='Failed to download %s MPD information' % version, + fatal=False)) + elif '.m3u8' in version_url: + formats.extend(self._extract_m3u8_formats( + version_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls-%s' % version, + note='Downloading %s m3u8 information' % version, + errnote='Failed to download %s m3u8 information' % version, + fatal=False)) + else: + m = re.search(r'''(?xi) + _(?P[0-9]+)x(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + _(?P[a-z0-9]+) + _(?P[0-9]+) + \.(?P[a-z0-9]+)''', version_url) + if not m: continue - if '.ism' in version_url: - continue - elif '.mpd' in version_url: - formats.extend(self._extract_mpd_formats( - version_url, video_id, mpd_id='dash-%s' % version, - note='Downloading %s MPD information' % version, - errnote='Failed to download %s MPD information' % version, - fatal=False)) - elif '.m3u8' in version_url: - formats.extend(self._extract_m3u8_formats( - version_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - else: - m = re.search(r'''(?xi) - _(?P[0-9]+)x(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - _(?P[a-z0-9]+) - _(?P[0-9]+) - \.(?P[a-z0-9]+)''', version_url) - if not m: - continue - - formats.append({ - 'url': version_url, - 'format_id': 'http-%s-%s' % (version, video_version['quality']), - 'vcodec': m.group('vcodec'), - 'acodec': m.group('acodec'), - 'vbr': int(m.group('vbr')), - 'abr': int(m.group('abr')), - 'ext': m.group('ext'), - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - else: - timestamp = int_or_none(self._search_regex( - r'/Date\((\d+)\)/', - video_info['releaseDate'], 'release date', fatal=False), - scale=1000) - artists = video_info.get('mainArtists') - if artists: - artist = uploader = artists[0]['artistName'] - - featured_artists = video_info.get('featuredArtists') - if featured_artists: - featured_artist = featured_artists[0]['artistName'] - - smil_parsed = False - for video_version in video_info['videoVersions']: - version = self._VERSIONS.get(video_version['version']) - if version == 'youtube': - continue - else: - source_type = self._SOURCE_TYPES.get(video_version['sourceType']) - renditions = compat_etree_fromstring(video_version['data']) - if source_type == 'http': - for rend in renditions.findall('rendition'): - attr = rend.attrib - formats.append({ - 'url': attr['url'], - 'format_id': 'http-%s-%s' % (version, attr['name']), - 'height': int_or_none(attr.get('frameheight')), - 'width': int_or_none(attr.get('frameWidth')), - 'tbr': int_or_none(attr.get('totalBitrate')), - 'vbr': int_or_none(attr.get('videoBitrate')), - 'abr': int_or_none(attr.get('audioBitrate')), - 'vcodec': attr.get('videoCodec'), - 'acodec': attr.get('audioCodec'), - }) - elif source_type == 'hls': - formats.extend(self._extract_m3u8_formats( - renditions.find('rendition').attrib['url'], video_id, - 'mp4', 'm3u8_native', m3u8_id='hls-%s' % version, - note='Downloading %s m3u8 information' % version, - errnote='Failed to download %s m3u8 information' % version, - fatal=False)) - elif source_type == 'smil' and version == 'level3' and not smil_parsed: - formats.extend(self._extract_smil_formats( - renditions.find('rendition').attrib['url'], video_id, False)) - smil_parsed = True + formats.append({ + 'url': version_url, + 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'vcodec': m.group('vcodec'), + 'acodec': m.group('acodec'), + 'vbr': int(m.group('vbr')), + 'abr': int(m.group('abr')), + 'ext': m.group('ext'), + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) self._sort_formats(formats) track = video_info['title'] @@ -376,17 +269,15 @@ class VevoIE(VevoBaseIE): else: age_limit = None - duration = video_info.get('duration') - return { 'id': video_id, 'title': title, 'formats': formats, 'thumbnail': video_info.get('imageUrl') or video_info.get('thumbnailUrl'), - 'timestamp': timestamp, + 'timestamp': parse_iso8601(video_info.get('releaseDate')), 'uploader': uploader, - 'duration': duration, - 'view_count': view_count, + 'duration': int_or_none(video_info.get('duration')), + 'view_count': int_or_none(video_info.get('views', {}).get('total')), 'age_limit': age_limit, 'track': track, 'artist': uploader, From b3277115a192b88df34692e42f62f39bd4a65bac Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 26 Jan 2017 16:14:42 +0100 Subject: [PATCH 020/137] [disney] Add new extractor(closes #7409)(closes #11801)(#4975)(#11000) --- youtube_dl/extractor/disney.py | 115 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 116 insertions(+) create mode 100644 youtube_dl/extractor/disney.py diff --git a/youtube_dl/extractor/disney.py b/youtube_dl/extractor/disney.py new file mode 100644 index 000000000..396873c6d --- /dev/null +++ b/youtube_dl/extractor/disney.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + compat_str, + determine_ext, +) + + +class DisneyIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?P(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|starwars\.com))/(?:embed/|(?:[^/]+/)+[\w-]+-)(?P[a-z0-9]{24})''' + _TESTS = [{ + 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', + 'info_dict': { + 'id': '545ed1857afee5a0ec239977', + 'ext': 'mp4', + 'title': 'Moana - Trailer', + 'description': 'A fun adventure for the entire Family! Bring home Moana on Digital HD Feb 21 & Blu-ray March 7', + 'upload_date': '20170112', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://videos.disneylatino.com/ver/spider-man-de-regreso-a-casa-primer-adelanto-543a33a1850bdcfcca13bae2', + 'only_matching': True, + }, { + 'url': 'http://video.en.disneyme.com/watch/future-worm/robo-carp-2001-544b66002aa7353cdd3f5114', + 'only_matching': True, + }, { + 'url': 'http://video.disneyturkiye.com.tr/izle/7c-7-cuceler/kimin-sesi-zaten-5456f3d015f6b36c8afdd0e2', + 'only_matching': True, + }, { + 'url': 'http://disneyjunior.disney.com/embed/546a4798ddba3d1612e4005d', + 'only_matching': True, + }, { + 'url': 'http://www.starwars.com/embed/54690d1e6c42e5f09a0fb097', + 'only_matching': True, + }] + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage( + 'http://%s/embed/%s' % (domain, video_id), video_id) + video_data = self._parse_json(self._search_regex( + r'Disney\.EmbedVideo=({.+});', webpage, 'embed data'), video_id)['video'] + + for external in video_data.get('externals', []): + if external.get('source') == 'vevo': + return self.url_result('vevo:' + external['data_id'], 'Vevo') + + title = video_data['title'] + + formats = [] + for flavor in video_data.get('flavors', []): + flavor_format = flavor.get('format') + flavor_url = flavor.get('url') + if not flavor_url or not re.match(r'https?://', flavor_url): + continue + tbr = int_or_none(flavor.get('bitrate')) + if tbr == 99999: + formats.extend(self._extract_m3u8_formats( + flavor_url, video_id, 'mp4', m3u8_id=flavor_format, fatal=False)) + continue + format_id = [] + if flavor_format: + format_id.append(flavor_format) + if tbr: + format_id.append(compat_str(tbr)) + ext = determine_ext(flavor_url) + if flavor_format == 'applehttp' or ext == 'm3u8': + ext = 'mp4' + width = int_or_none(flavor.get('width')) + height = int_or_none(flavor.get('height')) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': flavor_url, + 'width': width, + 'height': height, + 'tbr': tbr, + 'ext': ext, + 'vcodec': 'none' if (width == 0 and height == 0) else None, + }) + self._sort_formats(formats) + + subtitles = {} + for caption in video_data.get('captions', []): + caption_url = caption.get('url') + caption_format = caption.get('format') + if not caption_url or caption_format.startswith('unknown'): + continue + subtitles.setdefault(caption.get('language', 'en'), []).append({ + 'url': caption_url, + 'ext': { + 'webvtt': 'vtt', + }.get(caption_format, caption_format), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description') or video_data.get('short_desc'), + 'thumbnail': video_data.get('thumb') or video_data.get('thumb_secure'), + 'duration': int_or_none(video_data.get('duration_sec')), + 'upload_date': unified_strdate(video_data.get('publish_date')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f09b4cf2c..0c3e081ad 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -251,6 +251,7 @@ from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .discoverygo import DiscoveryGoIE +from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE from .dw import ( From c19ef77c3138ecf1ce5c988de2d94031f58b4f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20P=C3=B6schel?= Date: Wed, 25 Jan 2017 20:44:03 +0100 Subject: [PATCH 021/137] [jamendo] Extract full title --- youtube_dl/extractor/jamendo.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 51d19e67d..3db07e79f 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -16,7 +16,7 @@ class JamendoIE(InfoExtractor): 'id': '196219', 'display_id': 'stories-from-emona-i', 'ext': 'flac', - 'title': 'Stories from Emona I', + 'title': 'Maya Filipič - Stories from Emona I', 'thumbnail': r're:^https?://.*\.jpg' } } @@ -28,7 +28,7 @@ class JamendoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = self._html_search_meta('name', webpage, 'title') + title = self._search_regex(r'(.*?)\ \|\ Jamendo\ Music\ .*', webpage, 'title') formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -62,21 +62,21 @@ class JamendoAlbumIE(InfoExtractor): 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', 'info_dict': { 'id': '121486', - 'title': 'Duck On Cover' + 'title': 'Shearer - Duck On Cover' }, 'playlist': [{ 'md5': 'e1a2fcb42bda30dfac990212924149a8', 'info_dict': { 'id': '1032333', 'ext': 'flac', - 'title': 'Warmachine' + 'title': 'Shearer - Warmachine' } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'info_dict': { 'id': '1032330', 'ext': 'flac', - 'title': 'Without Your Ghost' + 'title': 'Shearer - Without Your Ghost' } }], 'params': { @@ -90,7 +90,7 @@ class JamendoAlbumIE(InfoExtractor): webpage = self._download_webpage(url, mobj.group('display_id')) - title = self._html_search_meta('name', webpage, 'title') + title = self._search_regex(r'(.*?)\ \|\ Jamendo\ Music\ .*', webpage, 'title') entries = [ self.url_result( From 15846398ca0af9154b88a69f594557568c6a4782 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Jan 2017 23:23:08 +0700 Subject: [PATCH 022/137] [utils] Improve parse_duration --- test/test_utils.py | 1 + youtube_dl/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e99bf794e..a74d59f34 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -510,6 +510,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('1 hour 3 minutes'), 3780) self.assertEqual(parse_duration('87 Min.'), 5220) self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) + self.assertEqual(parse_duration('PT00H03M30SZ'), 210) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 98acc2b45..cf46711b9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1773,7 +1773,7 @@ def parse_duration(s): s = s.strip() days, hours, mins, secs, ms = [None] * 5 - m = re.match(r'(?:(?:(?:(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+)(?P\.[0-9]+)?$', s) + m = re.match(r'(?:(?:(?:(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+)(?P\.[0-9]+)?Z?$', s) if m: days, hours, mins, secs, ms = m.groups() else: @@ -1790,11 +1790,11 @@ def parse_duration(s): )? (?: (?P[0-9]+)(?P\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* - )?$''', s) + )?Z?$''', s) if m: days, hours, mins, secs, ms = m.groups() else: - m = re.match(r'(?i)(?:(?P[0-9.]+)\s*(?:hours?)|(?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s) + m = re.match(r'(?i)(?:(?P[0-9.]+)\s*(?:hours?)|(?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) if m: hours, mins = m.groups() else: From 3cbecdd11121b9c7ff0284e481992f7230806399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 26 Jan 2017 23:25:40 +0700 Subject: [PATCH 023/137] [jamendo] Improve and extract more metadata (closes #11836) --- youtube_dl/extractor/jamendo.py | 65 ++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py index 3db07e79f..595d7a5b7 100644 --- a/youtube_dl/extractor/jamendo.py +++ b/youtube_dl/extractor/jamendo.py @@ -5,9 +5,27 @@ import re from ..compat import compat_urlparse from .common import InfoExtractor +from ..utils import parse_duration -class JamendoIE(InfoExtractor): +class JamendoBaseIE(InfoExtractor): + def _extract_meta(self, webpage, fatal=True): + title = self._og_search_title( + webpage, default=None) or self._search_regex( + r'([^<]+)', webpage, + 'title', default=None) + if title: + title = self._search_regex( + r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None) + if not title: + title = self._html_search_meta( + 'name', webpage, 'title', fatal=fatal) + mobj = re.search(r'(.+) - (.+)', title or '') + artist, second = mobj.groups() if mobj else [None] * 2 + return title, artist, second + + +class JamendoIE(JamendoBaseIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)' _TEST = { 'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i', @@ -17,6 +35,9 @@ class JamendoIE(InfoExtractor): 'display_id': 'stories-from-emona-i', 'ext': 'flac', 'title': 'Maya Filipič - Stories from Emona I', + 'artist': 'Maya Filipič', + 'track': 'Stories from Emona I', + 'duration': 210, 'thumbnail': r're:^https?://.*\.jpg' } } @@ -28,7 +49,7 @@ class JamendoIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - title = self._search_regex(r'<title>(.*?)\ \|\ Jamendo\ Music\ .*', webpage, 'title') + title, artist, track = self._extract_meta(webpage) formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -46,17 +67,23 @@ class JamendoIE(InfoExtractor): thumbnail = self._html_search_meta( 'image', webpage, 'thumbnail', fatal=False) + duration = parse_duration(self._search_regex( + r']+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']', + webpage, 'duration', fatal=False)) return { 'id': track_id, 'display_id': display_id, 'thumbnail': thumbnail, 'title': title, + 'duration': duration, + 'artist': artist, + 'track': track, 'formats': formats } -class JamendoAlbumIE(InfoExtractor): +class JamendoAlbumIE(JamendoBaseIE): _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P[0-9]+)/(?P[\w-]+)' _TEST = { 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', @@ -69,14 +96,18 @@ class JamendoAlbumIE(InfoExtractor): 'info_dict': { 'id': '1032333', 'ext': 'flac', - 'title': 'Shearer - Warmachine' + 'title': 'Shearer - Warmachine', + 'artist': 'Shearer', + 'track': 'Warmachine', } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'info_dict': { 'id': '1032330', 'ext': 'flac', - 'title': 'Shearer - Without Your Ghost' + 'title': 'Shearer - Without Your Ghost', + 'artist': 'Shearer', + 'track': 'Without Your Ghost', } }], 'params': { @@ -90,18 +121,18 @@ class JamendoAlbumIE(InfoExtractor): webpage = self._download_webpage(url, mobj.group('display_id')) - title = self._search_regex(r'(.*?)\ \|\ Jamendo\ Music\ .*', webpage, 'title') + title, artist, album = self._extract_meta(webpage, fatal=False) - entries = [ - self.url_result( - compat_urlparse.urljoin(url, m.group('path')), - ie=JamendoIE.ie_key(), - video_id=self._search_regex( - r'/track/(\d+)', m.group('path'), - 'track id', default=None)) - for m in re.finditer( - r']+href=(["\'])(?P(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', - webpage) - ] + entries = [{ + '_type': 'url_transparent', + 'url': compat_urlparse.urljoin(url, m.group('path')), + 'ie_key': JamendoIE.ie_key(), + 'id': self._search_regex( + r'/track/(\d+)', m.group('path'), 'track id', default=None), + 'artist': artist, + 'album': album, + } for m in re.finditer( + r']+href=(["\'])(?P(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link', + webpage)] return self.playlist_result(entries, album_id, title) From 9463637887ba784e3499410ab0945dcd68002bc1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 26 Jan 2017 18:36:28 +0100 Subject: [PATCH 024/137] [tva] Add new extractor(closes #11842) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tva.py | 54 ++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 youtube_dl/extractor/tva.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0c3e081ad..81366f933 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -985,6 +985,7 @@ from .tv2 import ( ) from .tv3 import TV3IE from .tv4 import TV4IE +from .tva import TVAIE from .tvanouvelles import ( TVANouvellesIE, TVANouvellesArticleIE, diff --git a/youtube_dl/extractor/tva.py b/youtube_dl/extractor/tva.py new file mode 100644 index 000000000..3ced098f9 --- /dev/null +++ b/youtube_dl/extractor/tva.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + smuggle_url, +) + + +class TVAIE(InfoExtractor): + _VALID_URL = r'https?://videos\.tva\.ca/episode/(?P\d+)' + _TEST = { + 'url': 'http://videos.tva.ca/episode/85538', + 'info_dict': { + 'id': '85538', + 'ext': 'mp4', + 'title': 'Épisode du 25 janvier 2017', + 'description': 'md5:e9e7fb5532ab37984d2dc87229cadf98', + 'upload_date': '20170126', + 'timestamp': 1485442329, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + "https://d18jmrhziuoi7p.cloudfront.net/isl/api/v1/dataservice/Items('%s')" % video_id, + video_id, query={ + '$expand': 'Metadata,CustomId', + '$select': 'Metadata,Id,Title,ShortDescription,LongDescription,CreatedDate,CustomId,AverageUserRating,Categories,ShowName', + '$format': 'json', + }) + metadata = video_data.get('Metadata', {}) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': video_data['Title'], + 'url': smuggle_url('ooyala:' + video_data['CustomId'], {'supportedformats': 'm3u8,hds'}), + 'description': video_data.get('LongDescription') or video_data.get('ShortDescription'), + 'series': video_data.get('ShowName'), + 'episode': metadata.get('EpisodeTitle'), + 'episode_number': int_or_none(metadata.get('EpisodeNumber')), + 'categories': video_data.get('Categories'), + 'average_rating': video_data.get('AverageUserRating'), + 'timestamp': parse_iso8601(video_data.get('CreatedDate')), + 'ie_key': 'Ooyala', + } From b51a4ebed45a3944c02bb3c36778630fd9306de7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 26 Jan 2017 19:15:43 +0100 Subject: [PATCH 025/137] [aenetworks] fix season episodes extraction(fixes #11669) --- youtube_dl/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index c5e079a40..c97317400 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -87,7 +87,7 @@ class AENetworksIE(AENetworksBaseIE): self._html_search_meta('aetn:SeriesTitle', webpage)) elif url_parts_len == 2: entries = [] - for episode_item in re.findall(r'(?s)]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): episode_attributes = extract_attributes(episode_item) episode_url = compat_urlparse.urljoin( url, episode_attributes['data-canonical']) From 0b23c222ba099d73c287d024f45f90714c15f289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Jan 2017 21:31:26 +0700 Subject: [PATCH 026/137] [twitch:vod] Expand _VALID_URL (closes #11846) --- youtube_dl/extractor/twitch.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 6d67bda86..1ca159a4d 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -209,7 +209,7 @@ class TwitchVodIE(TwitchItemBaseIE): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?twitch\.tv/[^/]+/v/| + (?:www\.)?twitch\.tv/(?:[^/]+/v|videos)/| player\.twitch\.tv/\?.*?\bvideo=v ) (?P\d+) @@ -259,6 +259,9 @@ class TwitchVodIE(TwitchItemBaseIE): }, { 'url': 'http://player.twitch.tv/?t=5m10s&video=v6528877', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/6528877', + 'only_matching': True, }] def _real_extract(self, url): From 489ffc118232056537e86bd0281488e217fce7d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Jan 2017 22:55:42 +0700 Subject: [PATCH 027/137] [soundcloud] Fix track URL extraction (closes #11852) --- youtube_dl/extractor/soundcloud.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 5a201eaa8..96bebeec5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -173,11 +173,12 @@ class SoundcloudIE(InfoExtractor): }) # We have to retrieve the url - streams_url = ('http://api.soundcloud.com/i1/tracks/{0}/streams?' - 'client_id={1}&secret_token={2}'.format(track_id, self._IPHONE_CLIENT_ID, secret_token)) format_dict = self._download_json( - streams_url, - track_id, 'Downloading track url') + 'http://api.soundcloud.com/i1/tracks/%s/streams' % track_id, + track_id, 'Downloading track url', query={ + 'client_id': self._CLIENT_ID, + 'secret_token': secret_token, + }) for key, stream_url in format_dict.items(): if key.startswith('http'): From 9b73471801d24cec678226c82cce9e9ece92732e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Jan 2017 23:08:32 +0700 Subject: [PATCH 028/137] [soundcloud] Extract hls formats --- youtube_dl/extractor/soundcloud.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 96bebeec5..55c80e1cc 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -198,6 +198,13 @@ class SoundcloudIE(InfoExtractor): 'ext': 'flv', 'vcodec': 'none', }) + elif key.startswith('hls'): + m3u8_formats = self._extract_m3u8_formats( + stream_url, track_id, 'mp3', entry_protocol='m3u8_native', + m3u8_id=key, fatal=False) + for f in m3u8_formats: + f['vcodec'] = 'none' + formats.extend(m3u8_formats) if not formats: # We fallback to the stream_url in the original info, this From 3a194cb4ecfa8c2590f22236dffc84e1b1565196 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Jan 2017 23:16:30 +0700 Subject: [PATCH 029/137] [soundcloud] Improve formats extraction and extract audio bitrate --- youtube_dl/extractor/soundcloud.py | 48 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 55c80e1cc..b3aa4ce26 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -181,46 +181,46 @@ class SoundcloudIE(InfoExtractor): }) for key, stream_url in format_dict.items(): + abr = int_or_none(self._search_regex( + r'_(\d+)_url', key, 'audio bitrate', default=None)) if key.startswith('http'): - formats.append({ + stream_formats = [{ 'format_id': key, 'ext': ext, 'url': stream_url, - 'vcodec': 'none', - }) + }] elif key.startswith('rtmp'): # The url doesn't have an rtmp app, we have to extract the playpath url, path = stream_url.split('mp3:', 1) - formats.append({ + stream_formats = [{ 'format_id': key, 'url': url, 'play_path': 'mp3:' + path, 'ext': 'flv', - 'vcodec': 'none', - }) + }] elif key.startswith('hls'): - m3u8_formats = self._extract_m3u8_formats( + stream_formats = self._extract_m3u8_formats( stream_url, track_id, 'mp3', entry_protocol='m3u8_native', m3u8_id=key, fatal=False) - for f in m3u8_formats: - f['vcodec'] = 'none' - formats.extend(m3u8_formats) + else: + continue - if not formats: - # We fallback to the stream_url in the original info, this - # cannot be always used, sometimes it can give an HTTP 404 error - formats.append({ - 'format_id': 'fallback', - 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, - 'ext': ext, - 'vcodec': 'none', - }) + for f in stream_formats: + f['abr'] = abr - for f in formats: - if f['format_id'].startswith('http'): - f['protocol'] = 'http' - if f['format_id'].startswith('rtmp'): - f['protocol'] = 'rtmp' + formats.extend(stream_formats) + + if not formats: + # We fallback to the stream_url in the original info, this + # cannot be always used, sometimes it can give an HTTP 404 error + formats.append({ + 'format_id': 'fallback', + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, + 'ext': ext, + }) + + for f in formats: + f['vcodec'] = 'none' self._check_formats(formats, track_id) self._sort_formats(formats) From e0b6e50ccd124c6f618bf25bc94361d83cbc8b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 27 Jan 2017 23:55:55 +0700 Subject: [PATCH 030/137] [crunchyroll] Improve series and season metadata extraction (closes #11832) --- youtube_dl/extractor/crunchyroll.py | 38 ++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 559044352..f811c7f33 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -166,6 +166,25 @@ class CrunchyrollIE(CrunchyrollBaseIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', + 'info_dict': { + 'id': '727589', + 'ext': 'mp4', + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", + 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Kadokawa Pictures Inc.', + 'upload_date': '20170118', + 'series': "KONOSUBA -God's blessing on this wonderful world!", + 'season_number': 2, + 'episode': 'Give Me Deliverance from this Judicial Injustice!', + 'episode_number': 1, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -439,6 +458,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text subtitles = self.extract_subtitles(video_id, webpage) + # webpage provide more accurate data than series_title from XML + series = self._html_search_regex( + r'id=["\']showmedia_about_episode_num[^>]+>\s*]+>([^<]+)', + webpage, 'series', default=xpath_text(metadata, 'series_title')) + + episode = xpath_text(metadata, 'episode_title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + + season_number = int_or_none(self._search_regex( + r'(?s)]+id=["\']showmedia_about_episode_num[^>]+>.+?\s*

\s*Season (\d+)', + webpage, 'season number', default=None)) + return { 'id': video_id, 'title': video_title, @@ -446,9 +477,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, - 'series': xpath_text(metadata, 'series_title'), - 'episode': xpath_text(metadata, 'episode_title'), - 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'subtitles': subtitles, 'formats': formats, } From 815d2a36d81c4cc6181d0536ce811b0e2e4a5021 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 00:03:21 +0700 Subject: [PATCH 031/137] [ChangeLog] Actualize --- ChangeLog | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ChangeLog b/ChangeLog index ff305d7e8..2c670c62e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +version + +Core +* [utils] Improve parse_duration + +Extractors +* [crunchyroll] Improve series and season metadata extraction (#11832) +* [soundcloud] Improve formats extraction and extract audio bitrate ++ [soundcloud] Extract HLS formats +* [soundcloud] Fix track URL extraction (#11852) ++ [twitch:vod] Expand URL regular expressions (#11846) +* [aenetworks] Fix season episodes extraction (#11669) ++ [tva] Add support for videos.tva.ca (#11842) +* [jamendo] Improve and extract more metadata (#11836) ++ [disney] Add support for Disney sites (#7409, #11801, #4975, #11000) +* [vevo] Remove request to old API and catch API v2 errors ++ [cmt,mtv,southpark] Add support for episode URLs (#11837) ++ [youtube] Add fallback for duration extraction (#11841) + + version 2017.01.25 Extractors From d41ed6d243c2079db123963a7f65e91f24b390f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 00:33:55 +0700 Subject: [PATCH 032/137] release 2017.01.28 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 4d409f785..693f3b745 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.25*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.25** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.28** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.25 +[debug] youtube-dl version 2017.01.28 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 2c670c62e..8e5a04b42 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.01.28 Core * [utils] Improve parse_duration diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f640cfcaa..6318a862f 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -202,6 +202,7 @@ - **Digiteka** - **Discovery** - **DiscoveryGo** + - **Disney** - **Dotsub** - **DouyuTV**: 斗鱼 - **DPlay** @@ -785,6 +786,7 @@ - **TV2Article** - **TV3** - **TV4**: tv4.se and tv4play.se + - **TVA** - **TVANouvelles** - **TVANouvellesArticle** - **TVC** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c23fe85de..c22c410a8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.25' +__version__ = '2017.01.28' From 99a0baf370c7652f6103cff71f878872229b4129 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Tue, 24 Jan 2017 17:42:00 +0100 Subject: [PATCH 033/137] [konserthusetplay] Add support for rspoplay.se --- youtube_dl/extractor/konserthusetplay.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/konserthusetplay.py b/youtube_dl/extractor/konserthusetplay.py index 3ae2aa317..c11cbcf47 100644 --- a/youtube_dl/extractor/konserthusetplay.py +++ b/youtube_dl/extractor/konserthusetplay.py @@ -11,22 +11,22 @@ from ..utils import ( class KonserthusetPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?konserthusetplay\.se/\?.*\bm=(?P[^&]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:konserthusetplay|rspoplay)\.se/\?.*\bm=(?P[^&]+)' + _TESTS = [{ 'url': 'http://www.konserthusetplay.se/?m=CKDDnlCY-dhWAAqiMERd-A', + 'md5': 'e3fd47bf44e864bd23c08e487abe1967', 'info_dict': { 'id': 'CKDDnlCY-dhWAAqiMERd-A', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Orkesterns instrument: Valthornen', 'description': 'md5:f10e1f0030202020396a4d712d2fa827', 'thumbnail': 're:^https?://.*$', - 'duration': 398.8, + 'duration': 398.76, }, - 'params': { - # rtmp download - 'skip_download': True, - }, - } + }, { + 'url': 'http://rspoplay.se/?m=elWuEH34SMKvaO4wO_cHBw', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 26e40542dd730b1a18f9d7eebe241972b77810cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 17:50:56 +0700 Subject: [PATCH 034/137] [kaltura] Improve uploader_id extraction --- youtube_dl/extractor/kaltura.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index c0ddad6f9..a57d913af 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -319,6 +319,6 @@ class KalturaIE(InfoExtractor): 'thumbnail': info.get('thumbnailUrl'), 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), - 'uploader_id': info.get('userId'), + 'uploader_id': info.get('userId') if info.get('userId') != 'None' else None, 'view_count': info.get('plays'), } From ab6f6aee78fc4757fcb65bd8f4699aaf9feac3a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 18:27:42 +0700 Subject: [PATCH 035/137] [kaltura] Add fallback for fileExt --- youtube_dl/extractor/kaltura.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a57d913af..5ef382f9f 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -266,9 +266,12 @@ class KalturaIE(InfoExtractor): # skip for now. if f.get('fileExt') == 'chun': continue - if not f.get('fileExt') and f.get('containerFormat') == 'qt': + if not f.get('fileExt'): # QT indicates QuickTime; some videos have broken fileExt - f['fileExt'] = 'mov' + if f.get('containerFormat') == 'qt': + f['fileExt'] = 'mov' + else: + f['fileExt'] = 'mp4' video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) # audio-only has no videoCodecId (e.g. kaltura:1926081:0_c03e1b5g From b92d3c5343536eb0a865afa79e3787fc384ec0ec Mon Sep 17 00:00:00 2001 From: ping Date: Tue, 24 Jan 2017 13:52:17 +0800 Subject: [PATCH 036/137] [vlive] Add support for channels --- youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/vlive.py | 68 ++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 81366f933..c781c9b87 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1129,7 +1129,10 @@ from .vk import ( VKUserVideosIE, VKWallPostIE, ) -from .vlive import VLiveIE +from .vlive import ( + VLiveIE, + VLiveChannelIE +) from .vodlocker import VodlockerIE from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 540246c79..70bab1f04 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import re +import time +import itertools from .common import InfoExtractor from ..utils import ( @@ -169,3 +171,69 @@ class VLiveIE(InfoExtractor): 'subtitles': subtitles, }) return info + + +class VLiveChannelIE(InfoExtractor): + IE_NAME = 'vlive:channel' + _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)/video' + _TEST = { + 'url': 'http://channels.vlive.tv/FCD4B/video', + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 + } + _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + def _real_extract(self, url): + channel_code = self._match_id(url) + + webpage = self._download_webpage( + 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + app_js_url = self._search_regex( + r'(http[^\'\"\s]+app\.js)', webpage, 'app js', default='') + + if app_js_url: + app_js = self._download_webpage(app_js_url, channel_code, 'app js') + app_id = self._search_regex( + r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', + app_js, 'app id', default=self._APP_ID) + else: + app_id = self._APP_ID + + channel_info = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', + channel_code, note='decode channel code', + query={'app_id': app_id, 'channelCode': channel_code, '_': int(time.time())}) + + channel_seq = channel_info['result']['channelSeq'] + channel_name = None + entries = [] + + for page_num in itertools.count(1): + video_list = self._download_json( + 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', + channel_code, note='channel list %d' % page_num, + query={ + 'app_id': app_id, + 'channelSeq': channel_seq, + 'maxNumOfRows': 1000, + '_': int(time.time()), + 'pageNo': page_num + } + ) + if not channel_name: + channel_name = video_list['result']['channelInfo']['channelName'] + + if not video_list['result'].get('videoList'): + break + + for video in video_list['result']['videoList']: + video_id = str(video['videoSeq']) + entries.append( + self.url_result( + 'http://www.vlive.tv/video/%s' % video_id, 'Vlive', video_id)) + + return self.playlist_result( + entries, channel_code, channel_name) From 661cc229d2e885dd303d26535477c8905805ddf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 19:08:01 +0700 Subject: [PATCH 037/137] [vlive:channel] Improve --- youtube_dl/extractor/vlive.py | 62 ++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 70bab1f04..b9718901b 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -6,15 +6,19 @@ import time import itertools from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_str, +) from ..utils import ( dict_get, ExtractorError, float_or_none, int_or_none, remove_start, + try_get, urlencode_postdata, ) -from ..compat import compat_urllib_parse_urlencode class VLiveIE(InfoExtractor): @@ -175,9 +179,9 @@ class VLiveIE(InfoExtractor): class VLiveChannelIE(InfoExtractor): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)/video' + _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)' _TEST = { - 'url': 'http://channels.vlive.tv/FCD4B/video', + 'url': 'http://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', @@ -191,21 +195,31 @@ class VLiveChannelIE(InfoExtractor): webpage = self._download_webpage( 'http://channels.vlive.tv/%s/video' % channel_code, channel_code) + + app_id = None + app_js_url = self._search_regex( - r'(http[^\'\"\s]+app\.js)', webpage, 'app js', default='') + r']+src=(["\'])(?Phttp.+?/app\.js.*?)\1', + webpage, 'app js', default=None, group='url') if app_js_url: - app_js = self._download_webpage(app_js_url, channel_code, 'app js') - app_id = self._search_regex( - r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', - app_js, 'app id', default=self._APP_ID) - else: - app_id = self._APP_ID + app_js = self._download_webpage( + app_js_url, channel_code, 'Downloading app JS', fatal=False) + if app_js: + app_id = self._search_regex( + r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', + app_js, 'app id', default=None) + + app_id = app_id or self._APP_ID channel_info = self._download_json( 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', - channel_code, note='decode channel code', - query={'app_id': app_id, 'channelCode': channel_code, '_': int(time.time())}) + channel_code, note='Downloading decode channel code', + query={ + 'app_id': app_id, + 'channelCode': channel_code, + '_': int(time.time()) + }) channel_seq = channel_info['result']['channelSeq'] channel_name = None @@ -214,7 +228,7 @@ class VLiveChannelIE(InfoExtractor): for page_num in itertools.count(1): video_list = self._download_json( 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', - channel_code, note='channel list %d' % page_num, + channel_code, note='Downloading channel list page #%d' % page_num, query={ 'app_id': app_id, 'channelSeq': channel_seq, @@ -223,17 +237,27 @@ class VLiveChannelIE(InfoExtractor): 'pageNo': page_num } ) - if not channel_name: - channel_name = video_list['result']['channelInfo']['channelName'] - if not video_list['result'].get('videoList'): + if not channel_name: + channel_name = try_get( + video_list, + lambda x: x['result']['channelInfo']['channelName'], + compat_str) + + videos = try_get( + video_list, lambda x: x['result']['videoList'], list) + if not videos: break - for video in video_list['result']['videoList']: - video_id = str(video['videoSeq']) + for video in videos: + video_id = video.get('videoSeq') + if not video_id: + continue + video_id = compat_str(video_id) entries.append( self.url_result( - 'http://www.vlive.tv/video/%s' % video_id, 'Vlive', video_id)) + 'http://www.vlive.tv/video/%s' % video_id, + ie=VLiveIE.ie_key(), video_id=video_id)) return self.playlist_result( entries, channel_code, channel_name) From 008f247077027f10c947060d8f3bb886c9af6aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 20:29:22 +0700 Subject: [PATCH 038/137] [mtv81] Add extractor (closes #7619) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/mtv.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c781c9b87..915291f74 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -553,6 +553,7 @@ from .mtv import ( MTVVideoIE, MTVServicesEmbeddedIE, MTVDEIE, + MTV81IE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 855c3996f..8acea1461 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -327,6 +327,35 @@ class MTVIE(MTVServicesInfoExtractor): }] +class MTV81IE(InfoExtractor): + IE_NAME = 'mtv81' + _VALID_URL = r'https?://(?:www\.)?mtv81\.com/videos/(?P[^/?#.]+)' + + _TEST = { + 'url': 'http://www.mtv81.com/videos/artist-to-watch/the-godfather-of-japanese-hip-hop-segment-1/', + 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b', + 'info_dict': { + 'id': '5e14040d-18a4-47c4-a582-43ff602de88e', + 'ext': 'mp4', + 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer', + 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.', + 'timestamp': 1468846800, + 'upload_date': '20160718', + }, + } + + def _extract_mgid(self, webpage): + return self._search_regex( + r'getTheVideo\((["\'])(?Pmgid:.+?)\1', webpage, + 'mgid', group='id') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) + + class MTVVideoIE(MTVServicesInfoExtractor): IE_NAME = 'mtv:video' _VALID_URL = r'''(?x)^https?:// From 732fb3f8be6cca47c60b3befee83ee9b5002984d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 21:06:22 +0700 Subject: [PATCH 039/137] [options] Move --abort-on-unavailable-fragment to download section --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 0d2ce8d15..5e2936555 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -450,7 +450,7 @@ def parseOpts(overrideArguments=None): '--skip-unavailable-fragments', action='store_true', dest='skip_unavailable_fragments', default=True, help='Skip unavailable fragments (DASH and hlsnative only)') - general.add_option( + downloader.add_option( '--abort-on-unavailable-fragment', action='store_false', dest='skip_unavailable_fragments', help='Abort downloading when some fragment is not available') From a71b8d3b3bb399acb82f3ccfbd8a19d411848db4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 28 Jan 2017 15:51:52 +0100 Subject: [PATCH 040/137] [itv] Add new extractor(closes #9240) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/itv.py | 181 +++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 youtube_dl/extractor/itv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 915291f74..086a2296d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -415,6 +415,7 @@ from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE +from .itv import ITVIE from .ivi import ( IviIE, IviCompilationIE diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py new file mode 100644 index 000000000..d029609c3 --- /dev/null +++ b/youtube_dl/extractor/itv.py @@ -0,0 +1,181 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import uuid +import xml.etree.ElementTree as etree +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + extract_attributes, + xpath_with_ns, + xpath_element, + xpath_text, + int_or_none, + parse_duration, + ExtractorError, + determine_ext, +) + + +class ITVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-z]+)' + _TEST = { + 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', + 'info_dict': { + 'id': '2a2936a0053', + 'ext': 'flv', + 'title': 'Home Movie', + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + params = extract_attributes(self._search_regex( + r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) + + ns_map = { + 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/', + 'tem': 'http://tempuri.org/', + 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types', + 'com': 'http://schemas.itv.com/2009/05/Common', + } + for ns, full_ns in ns_map.items(): + etree.register_namespace(ns, full_ns) + + def _add_ns(name): + return xpath_with_ns(name, ns_map) + + def _add_sub_element(element, name): + return etree.SubElement(element, _add_ns(name)) + + req_env = etree.Element(_add_ns('soapenv:Envelope')) + _add_sub_element(req_env, 'soapenv:Header') + body = _add_sub_element(req_env, 'soapenv:Body') + get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) + request = _add_sub_element(get_playlist, 'tem:request') + _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id'] + _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() + vodcrid = _add_sub_element(request, 'itv:Vodcrid') + _add_sub_element(vodcrid, 'com:Id') + _add_sub_element(request, 'itv:Partition') + user_info = _add_sub_element(get_playlist, 'tem:userInfo') + _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv' + _add_sub_element(user_info, 'itv:DM') + _add_sub_element(user_info, 'itv:RevenueScienceValue') + _add_sub_element(user_info, 'itv:SessionId') + _add_sub_element(user_info, 'itv:SsoToken') + _add_sub_element(user_info, 'itv:UserToken') + site_info = _add_sub_element(get_playlist, 'tem:siteInfo') + _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None' + _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV' + _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any' + _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO' + _add_sub_element(site_info, 'itv:Category') + _add_sub_element(site_info, 'itv:Platform').text = 'DotCom' + _add_sub_element(site_info, 'itv:Site').text = 'ItvCom' + device_info = _add_sub_element(get_playlist, 'tem:deviceInfo') + _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big' + player_info = _add_sub_element(get_playlist, 'tem:playerInfo') + _add_sub_element(player_info, 'itv:Version').text = '2' + + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'text/xml; charset=utf-8', + 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', + }) + resp_env = self._download_xml( + params['data-playlist-url'], video_id, + headers=headers, data=etree.tostring(req_env)) + playlist = xpath_element(resp_env, './/Playlist') + if playlist is None: + fault_string = xpath_text(resp_env, './/faultstring') + raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) + title = xpath_text(playlist, 'EpisodeTitle', fatal=True) + media_files = xpath_element(playlist, 'VideoEntries/Video/MediaFiles', fatal=True) + rtmp_url = media_files.attrib['base'] + + formats = [] + for media_file in media_files.findall('MediaFile'): + play_path = xpath_text(media_file, 'URL') + if not play_path: + continue + tbr = int_or_none(media_file.get('bitrate'), 1000) + formats.append({ + 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), + 'url': rtmp_url, + 'play_path': play_path, + 'tbr': tbr, + 'ext': 'flv', + }) + + ios_playlist_url = params.get('data-video-playlist') + hmac = params.get('data-video-hmac') + if ios_playlist_url and hmac: + headers = self.geo_verification_headers() + headers.update({ + 'Accept': 'application/vnd.itv.vod.playlist.v2+json', + 'Content-Type': 'application/json', + 'hmac': hmac.upper(), + }) + ios_playlist = self._download_json( + ios_playlist_url, video_id, data=json.dumps({ + 'user': { + 'itvUserId': '', + 'entitlements': [], + 'token': '' + }, + 'device': { + 'manufacturer': 'Apple', + 'model': 'iPad', + 'os': { + 'name': 'iPhone OS', + 'version': '9.3', + 'type': 'ios' + } + }, + 'client': { + 'version': '4.1', + 'id': 'browser' + }, + 'variantAvailability': { + 'featureset': { + 'min': ['hls', 'aes'], + 'max': ['hls', 'aes'] + }, + 'platformTag': 'mobile' + } + }).encode(), headers=headers, fatal=False) + if ios_playlist: + video_data = ios_playlist.get('Playlist', {}).get('Video', {}) + ios_base_url = video_data.get('Base') + for media_file in video_data.get('MediaFiles', []): + href = media_file.get('Href') + if not href: + continue + if ios_base_url: + href = ios_base_url + href + ext = determine_ext(href) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': href, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'episode_title': title, + 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), + 'series': xpath_text(playlist, 'ProgrammeTitle'), + 'duartion': parse_duration(xpath_text(playlist, 'Duration')), + } From 24ee6b9721770b7066f10f6a6773f1ce15f82ed0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 22:40:07 +0700 Subject: [PATCH 041/137] [options] Remove experimental mark from some options --- youtube_dl/options.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5e2936555..09c9387ca 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -216,23 +216,23 @@ def parseOpts(overrideArguments=None): network.add_option( '--source-address', metavar='IP', dest='source_address', default=None, - help='Client-side IP address to bind to (experimental)', + help='Client-side IP address to bind to', ) network.add_option( '-4', '--force-ipv4', action='store_const', const='0.0.0.0', dest='source_address', - help='Make all connections via IPv4 (experimental)', + help='Make all connections via IPv4', ) network.add_option( '-6', '--force-ipv6', action='store_const', const='::', dest='source_address', - help='Make all connections via IPv6 (experimental)', + help='Make all connections via IPv6', ) network.add_option( '--geo-verification-proxy', dest='geo_verification_proxy', default=None, metavar='URL', help='Use this proxy to verify the IP address for some geo-restricted sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' + 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading.' ) network.add_option( '--cn-verification-proxy', @@ -297,7 +297,7 @@ def parseOpts(overrideArguments=None): '--match-filter', metavar='FILTER', dest='match_filter', default=None, help=( - 'Generic video filter (experimental). ' + 'Generic video filter. ' 'Specify any key (see help for -o for a list of available keys) to' ' match if the key is present, ' '!key to check if the key is not present,' From f592ff98683794e0f79c96cbec67b737ae8da00c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 28 Jan 2017 17:25:15 +0100 Subject: [PATCH 042/137] [itv] extract subtitles --- youtube_dl/extractor/itv.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index d029609c3..d65cdc6af 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -20,7 +20,7 @@ from ..utils import ( class ITVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-z]+)' + _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' _TEST = { 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', 'info_dict': { @@ -98,7 +98,8 @@ class ITVIE(InfoExtractor): fault_string = xpath_text(resp_env, './/faultstring') raise ExtractorError('%s said: %s' % (self.IE_NAME, fault_string)) title = xpath_text(playlist, 'EpisodeTitle', fatal=True) - media_files = xpath_element(playlist, 'VideoEntries/Video/MediaFiles', fatal=True) + video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) + media_files = xpath_element(video_element, 'MediaFiles', fatal=True) rtmp_url = media_files.attrib['base'] formats = [] @@ -170,10 +171,21 @@ class ITVIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): + if not caption_url.text: + continue + ext = determine_ext(caption_url.text, 'ttml') + subtitles.setdefault('en', []).append({ + 'url': caption_url, + 'ext': 'ttml' if ext == 'xml' else ext, + }) + return { 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'episode_title': title, 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), 'series': xpath_text(playlist, 'ProgrammeTitle'), From 4edeac5bfae76966fd14f636bd68850ea0403ece Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 28 Jan 2017 17:28:18 +0100 Subject: [PATCH 043/137] [itv] fix subtitle extraction --- youtube_dl/extractor/itv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index d65cdc6af..0328c7093 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -177,7 +177,7 @@ class ITVIE(InfoExtractor): continue ext = determine_ext(caption_url.text, 'ttml') subtitles.setdefault('en', []).append({ - 'url': caption_url, + 'url': caption_url.text, 'ext': 'ttml' if ext == 'xml' else ext, }) From acbb2374bce27eda16764b80832f88cf833a51e5 Mon Sep 17 00:00:00 2001 From: Costy Petrisor Date: Sun, 1 May 2016 12:34:11 +0000 Subject: [PATCH 044/137] added --autonumber-start NUMBER as a command line option to be able to offset the index at which autonumber formats filenames --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/__init__.py | 1 + youtube_dl/options.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 41d9a63ee..c71e94518 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -584,7 +584,7 @@ class YoutubeDL(object): if autonumber_size is None: autonumber_size = 5 autonumber_templ = '%0' + str(autonumber_size) + 'd' - template_dict['autonumber'] = autonumber_templ % self._num_downloads + template_dict['autonumber'] = autonumber_templ % (self.params.get('autonumber_start', 1) - 1 + self._num_downloads) if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index']) if template_dict.get('resolution') is None: diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index dfa4ae839..577bc880f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -321,6 +321,7 @@ def _real_main(argv=None): 'listformats': opts.listformats, 'outtmpl': outtmpl, 'autonumber_size': opts.autonumber_size, + 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, 'ignoreerrors': opts.ignoreerrors, 'force_generic_extractor': opts.force_generic_extractor, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 09c9387ca..571525434 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -663,6 +663,10 @@ def parseOpts(overrideArguments=None): '--autonumber-size', dest='autonumber_size', metavar='NUMBER', help='Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') + filesystem.add_option( + '--autonumber-start', + dest='autonumber_start', metavar='NUMBER', type="int", default=1, + help='Specify the start value for the %(autonumber)s counter. Defaults to 1.') filesystem.add_option( '--restrict-filenames', action='store_true', dest='restrictfilenames', default=False, From 1a241a2d02e2507219e81d7b18c18f10937ae6e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 28 Jan 2017 23:57:56 +0700 Subject: [PATCH 045/137] [options] Refactor autonumber options and add validation (closes #727, closes #2702, closes #9362) --- youtube_dl/__init__.py | 6 ++++++ youtube_dl/options.py | 8 ++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 577bc880f..2b156342a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -133,6 +133,12 @@ def _real_main(argv=None): parser.error('TV Provider account username missing\n') if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): parser.error('using output template conflicts with using title, video ID or auto number') + if opts.autonumber_size is not None: + if opts.autonumber_size <= 0: + parser.error('auto number size must be positive') + if opts.autonumber_start is not None: + if opts.autonumber_start < 0: + parser.error('auto number start must be positive or 0') if opts.usetitle and opts.useid: parser.error('using title conflicts with using video ID') if opts.username is not None and opts.password is None: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 571525434..3abf621c0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -661,12 +661,12 @@ def parseOpts(overrideArguments=None): help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) filesystem.add_option( '--autonumber-size', - dest='autonumber_size', metavar='NUMBER', - help='Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given') + dest='autonumber_size', metavar='NUMBER', default=5, type=int, + help='Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option is given (default is %default)') filesystem.add_option( '--autonumber-start', - dest='autonumber_start', metavar='NUMBER', type="int", default=1, - help='Specify the start value for the %(autonumber)s counter. Defaults to 1.') + dest='autonumber_start', metavar='NUMBER', default=1, type=int, + help='Specify the start value for %(autonumber)s (default is %default)') filesystem.add_option( '--restrict-filenames', action='store_true', dest='restrictfilenames', default=False, From c0af11abeeaad75f4387ad77adc751715dfc0cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 00:52:23 +0700 Subject: [PATCH 046/137] Credit @AVerwer for showroomlive (#11458) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 9e092cccc..90ff4d789 100644 --- a/AUTHORS +++ b/AUTHORS @@ -191,3 +191,4 @@ Rich Leeper Zhong Jianxin Thor77 Mattias Wadman +Arjan Verwer From ffcfb7e3e01cec5f5468e4639b2e4d44a0c7bfba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 00:54:31 +0700 Subject: [PATCH 047/137] Credit @costypetrisor for autonumber start (#9362) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 90ff4d789..6b6e38613 100644 --- a/AUTHORS +++ b/AUTHORS @@ -192,3 +192,4 @@ Zhong Jianxin Thor77 Mattias Wadman Arjan Verwer +Costy Petrisor From 34cea6137e6df158c99d83fd1c1af55f94ee4a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 00:57:15 +0700 Subject: [PATCH 048/137] Credit @einstein95 for pornflip (#11795) and chaturbate fix (#11797) --- AUTHORS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS b/AUTHORS index 6b6e38613..600e2c55b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -193,3 +193,5 @@ Thor77 Mattias Wadman Arjan Verwer Costy Petrisor +Logan B + From 186f4abe938e0f631b63c5dc1aaa4d622513a366 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 00:59:17 +0700 Subject: [PATCH 049/137] Credit @goggle for 20min (#11683) and azmedien (#11805) --- AUTHORS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AUTHORS b/AUTHORS index 600e2c55b..b3193f7da 100644 --- a/AUTHORS +++ b/AUTHORS @@ -194,4 +194,4 @@ Mattias Wadman Arjan Verwer Costy Petrisor Logan B - +Alex Seiler From f5169501d2749503e5d19f9c51937aedcce357e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 01:00:17 +0700 Subject: [PATCH 050/137] Credit @sudovijay for openload fix (#11646) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index b3193f7da..434217abb 100644 --- a/AUTHORS +++ b/AUTHORS @@ -195,3 +195,4 @@ Arjan Verwer Costy Petrisor Logan B Alex Seiler +Vijay Singh From 4d07b748c2e8057fa6417ab5422cb19be313d7b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 01:01:39 +0700 Subject: [PATCH 051/137] Credit @bastik for zdf fix (#11063) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 434217abb..49ffc99aa 100644 --- a/AUTHORS +++ b/AUTHORS @@ -196,3 +196,4 @@ Costy Petrisor Logan B Alex Seiler Vijay Singh +Paul Hartmann From 59c307891ac2cca2b2db42a534a3f4de61820450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 01:02:28 +0700 Subject: [PATCH 052/137] Credit @RPing for cntv (#8541) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 49ffc99aa..78a7a5291 100644 --- a/AUTHORS +++ b/AUTHORS @@ -197,3 +197,4 @@ Logan B Alex Seiler Vijay Singh Paul Hartmann +Stephen Chen From 0842b8241d9f8984dd70266b59aa68241259401f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 01:03:59 +0700 Subject: [PATCH 053/137] Credit @fast90 for config location (#10648) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 78a7a5291..022a5de84 100644 --- a/AUTHORS +++ b/AUTHORS @@ -198,3 +198,4 @@ Alex Seiler Vijay Singh Paul Hartmann Stephen Chen +Fabian Stahl From 56fc078da84a7f26d8290b2b425cc2da66a5975a Mon Sep 17 00:00:00 2001 From: Andre Walker Date: Sat, 28 Jan 2017 16:19:38 +0100 Subject: [PATCH 054/137] [npo] Update subtitles url NPO websites changed the domain they used for subtitles, from e.omroep.nl to tt888.omroep.nl. --- youtube_dl/extractor/npo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index c91f58461..962437145 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -241,7 +241,7 @@ class NPOIE(NPOBaseIE): if metadata.get('tt888') == 'ja': subtitles['nl'] = [{ 'ext': 'vtt', - 'url': 'http://e.omroep.nl/tt888/%s' % video_id, + 'url': 'http://tt888.omroep.nl/tt888/%s' % video_id, }] return { From 76aaf1faaed613569cb71e4f9aa7bd218f27c54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 03:43:46 +0700 Subject: [PATCH 055/137] Credit @BagiraHun for videa (#11133) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 022a5de84..3ef2800c9 100644 --- a/AUTHORS +++ b/AUTHORS @@ -199,3 +199,4 @@ Vijay Singh Paul Hartmann Stephen Chen Fabian Stahl +Bagira From d04621daf451d601dba80dc0f2baa29e404e4ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 05:36:53 +0700 Subject: [PATCH 056/137] [extractor/common] Fix duration per dash segment (closes #11868) --- youtube_dl/extractor/common.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dce8c7d0d..a3048fb59 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1751,14 +1751,16 @@ class InfoExtractor(object): # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] - s_num = 0 - for segment_url in representation_ms_info['segment_urls']: - s = representation_ms_info['s'][s_num] + segment_index = 0 + timescale = representation_ms_info['timescale'] + for s in representation_ms_info['s']: + duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): fragments.append({ - 'url': segment_url, - 'duration': float_or_none(s['d'], representation_ms_info['timescale']), + 'url': representation_ms_info['segment_urls'][segment_index], + 'duration': duration, }) + segment_index += 1 representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. From c58c2d63cbde07af66885829b7c3dbcdfbc096dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 05:56:43 +0700 Subject: [PATCH 057/137] [extractor/common] Document forgotten fragment base and path interfaces --- youtube_dl/extractor/common.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a3048fb59..fb484b6f2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -121,9 +121,19 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". - * fragments A list of fragments of the fragmented media, - with the following entries: - * "url" (mandatory) - fragment's URL + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) * preference Order number of this format. If this field is From e228616c6e73561f0c6d32d6b681bbba321c06aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 29 Jan 2017 06:57:39 +0700 Subject: [PATCH 058/137] [extractor/common] Fix initialization template (closes #11605, closes #11825) --- youtube_dl/extractor/common.py | 48 ++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fb484b6f2..5a15a9536 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1637,12 +1637,12 @@ class InfoExtractor(object): segment_template = element.find(_add_ns('SegmentTemplate')) if segment_template is not None: extract_common(segment_template) - media_template = segment_template.get('media') - if media_template: - ms_info['media_template'] = media_template + media = segment_template.get('media') + if media: + ms_info['media'] = media initialization = segment_template.get('initialization') if initialization: - ms_info['initialization_url'] = initialization + ms_info['initialization'] = initialization else: extract_Initialization(segment_template) return ms_info @@ -1686,6 +1686,7 @@ class InfoExtractor(object): lang = representation_attrib.get('lang') url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + bandwidth = int_or_none(representation_attrib.get('bandwidth')) f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, @@ -1693,7 +1694,7 @@ class InfoExtractor(object): 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), - 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), + 'tbr': int_or_none(bandwidth, 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, @@ -1702,13 +1703,32 @@ class InfoExtractor(object): } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: - media_template = representation_ms_info['media_template'] - media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) - media_template.replace('$$', '$') + def prepare_template(template_name, identifiers): + t = representation_ms_info[template_name] + t = t.replace('$RepresentationID$', representation_id) + t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) + t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t.replace('$$', '$') + return t + + # @initialization is a regular template like @media one + # so it should be handled just the same way (see + # https://github.com/rg3/youtube-dl/issues/11605) + if 'initialization' in representation_ms_info: + initialization_template = prepare_template( + 'initialization', + # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and + # $Time$ shall not be included for @initialization thus + # only $Bandwidth$ remains + ('Bandwidth', )) + representation_ms_info['initialization_url'] = initialization_template % { + 'Bandwidth': bandwidth, + } + + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + + media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time @@ -1720,7 +1740,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = [{ 'url': media_template % { 'Number': segment_number, - 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), + 'Bandwidth': bandwidth, }, 'duration': segment_duration, } for segment_number in range( @@ -1738,7 +1758,7 @@ class InfoExtractor(object): def add_segment_url(): segment_url = media_template % { 'Time': segment_time, - 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), + 'Bandwidth': bandwidth, 'Number': segment_number, } representation_ms_info['fragments'].append({ @@ -1780,7 +1800,7 @@ class InfoExtractor(object): 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) + initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url f['fragments'].append({'url': initialization_url}) From f13da8af289d7d9365e34ef705a53ac62aa3b570 Mon Sep 17 00:00:00 2001 From: Alex Seiler Date: Sat, 28 Jan 2017 17:52:07 +0100 Subject: [PATCH 059/137] [azmedien:playlist] Add support for topic and themen playlists --- youtube_dl/extractor/azmedien.py | 39 +++++++++++++++++++++++++----- youtube_dl/extractor/extractors.py | 2 +- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index a89f71c20..cbc3ed564 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -5,8 +5,9 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( - get_element_by_class, + get_element_by_id, strip_or_none, + urljoin, ) @@ -83,8 +84,8 @@ class AZMedienIE(AZMedienBaseIE): return self._kaltura_video(partner_id, entry_id) -class AZMedienShowIE(AZMedienBaseIE): - IE_DESC = 'AZ Medien shows' +class AZMedienPlaylistIE(AZMedienBaseIE): + IE_DESC = 'AZ Medien playlists' _VALID_URL = r'''(?x) https?:// (?:www\.)? @@ -93,7 +94,12 @@ class AZMedienShowIE(AZMedienBaseIE): telebaern\.tv| telem1\.ch )/ - (?P[0-9]+-show-[^/\#]+ + (?P[0-9]+- + (?: + show| + topic| + themen + )-[^/\#]+ (?: /[0-9]+-episode-[^/\#]+ )? @@ -108,6 +114,18 @@ class AZMedienShowIE(AZMedienBaseIE): 'title': 'News - Donnerstag, 15. Dezember 2016', }, 'playlist_count': 9, + }, { + # URL with 'themen' + 'url': 'http://www.telem1.ch/258-themen-tele-m1-classics', + 'info_dict': { + 'id': '258-themen-tele-m1-classics', + 'title': 'Tele M1 Classics', + }, + 'playlist_mincount': 15, + }, { + # URL with 'topic', contains nested playlists + 'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen', + 'only_matching': True, }, { # URL with 'show' only 'url': 'http://www.telezueri.ch/86-show-talktaeglich', @@ -136,10 +154,19 @@ class AZMedienShowIE(AZMedienBaseIE): for m in re.finditer( r']+data-real=(["\'])(?Phttp.+?)\1', webpage)] + if not entries: + entries = [ + # May contain nested playlists (e.g. [1]) thus no explicit + # ie_key + # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen) + self.url_result(urljoin(url, m.group('url'))) + for m in re.finditer( + r']+name=[^>]+href=(["\'])(?P/.+?)\1', webpage)] + title = self._search_regex( r'episodeShareTitle\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'title', - default=strip_or_none(get_element_by_class( - 'title-block-cell', webpage)), group='title') + default=strip_or_none(get_element_by_id( + 'video-title', webpage)), group='title') return self.playlist_result(entries, show_id, title) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 086a2296d..2590b5e1b 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -82,7 +82,7 @@ from .awaan import ( ) from .azmedien import ( AZMedienIE, - AZMedienShowIE, + AZMedienPlaylistIE, ) from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE From fe323a4800d67d0ad2fecebcc3b627a7a22be427 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Jan 2017 21:21:26 +0700 Subject: [PATCH 060/137] [ChangeLog] Actualize --- ChangeLog | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ChangeLog b/ChangeLog index 8e5a04b42..ab2818f9e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,24 @@ +version <unreleased> + +Core +* [extractor/common] Fix initialization template (#11605, #11825) ++ [extractor/common] Document fragment_base_url and fragment's path fields +* [extractor/common] Fix duration per DASH segment (#11868) ++ Introduce --autonumber-start option for initial value of %(autonumber)s + template (#727, #2702, #9362, #10457, #10529, #11862) + +Extractors ++ [azmedien:playlist] Add support for topic and themen playlists (#11817) +* [npo] Fix subtitles extraction ++ [itv] Extract subtitles ++ [itv] Add support for itv.com (#9240) ++ [mtv81] Add support for mtv81.com (#7619) ++ [vlive] Add support for channels (#11826) ++ [kaltura] Add fallback for fileExt ++ [kaltura] Improve uploader_id extraction ++ [konserthusetplay] Add support for rspoplay.se (#11828) + + version 2017.01.28 Core From 4d2fdb07c47e2d9f96d58f5fbf3da8665a1144a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 29 Jan 2017 13:21:42 +0700 Subject: [PATCH 061/137] release 2017.01.29 --- .github/ISSUE_TEMPLATE.md | 6 ++--- ChangeLog | 2 +- README.md | 47 +++++++++++++++++++-------------------- docs/supportedsites.md | 5 ++++- youtube_dl/version.py | 2 +- 5 files changed, 32 insertions(+), 30 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 693f3b745..10c982fd0 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.28*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.28** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.29*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.29** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.28 +[debug] youtube-dl version 2017.01.29 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index ab2818f9e..cd7017f6d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.01.29 Core * [extractor/common] Fix initialization template (#11605, #11825) diff --git a/README.md b/README.md index 4f677d0cc..2ee00f515 100644 --- a/README.md +++ b/README.md @@ -88,8 +88,6 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --mark-watched Mark videos watched (YouTube only) --no-mark-watched Do not mark videos watched (YouTube only) --no-color Do not emit color codes in output - --abort-on-unavailable-fragment Abort downloading when some fragment is not - available ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. @@ -99,16 +97,13 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds --source-address IP Client-side IP address to bind to - (experimental) -4, --force-ipv4 Make all connections via IPv4 - (experimental) -6, --force-ipv6 Make all connections via IPv6 - (experimental) --geo-verification-proxy URL Use this proxy to verify the IP address for some geo-restricted sites. The default proxy specified by --proxy (or none, if the options is not present) is used for the - actual downloading. (experimental) + actual downloading. ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -139,23 +134,23 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo COUNT views --max-views COUNT Do not download any videos with more than COUNT views - --match-filter FILTER Generic video filter (experimental). - Specify any key (see help for -o for a list - of available keys) to match if the key is - present, !key to check if the key is not - present,key > NUMBER (like "comment_count > - 12", also works with >=, <, <=, !=, =) to - compare against a number, and & to require - multiple matches. Values which are not - known are excluded unless you put a - question mark (?) after the operator.For - example, to only match videos that have - been liked more than 100 times and disliked - less than 50 times (or the dislike - functionality is not available at the given - service), but who also have a description, - use --match-filter "like_count > 100 & - dislike_count <? 50 & description" . + --match-filter FILTER Generic video filter. Specify any key (see + help for -o for a list of available keys) + to match if the key is present, !key to + check if the key is not present,key > + NUMBER (like "comment_count > 12", also + works with >=, <, <=, !=, =) to compare + against a number, and & to require multiple + matches. Values which are not known are + excluded unless you put a question mark (?) + after the operator.For example, to only + match videos that have been liked more than + 100 times and disliked less than 50 times + (or the dislike functionality is not + available at the given service), but who + also have a description, use --match-filter + "like_count > 100 & dislike_count <? 50 & + description" . --no-playlist Download only the video, if the URL refers to a video and a playlist. --yes-playlist Download the playlist, if the URL refers to @@ -178,6 +173,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo only) --skip-unavailable-fragments Skip unavailable fragments (DASH and hlsnative only) + --abort-on-unavailable-fragment Abort downloading when some fragment is not + available --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer @@ -210,7 +207,9 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --autonumber-size NUMBER Specify the number of digits in %(autonumber)s when it is present in output filename template or --auto-number option - is given + is given (default is 5) + --autonumber-start NUMBER Specify the start value for %(autonumber)s + (default is 1) --restrict-filenames Restrict filenames to only ASCII characters, and avoid "&" and spaces in filenames diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 6318a862f..d4231577b 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -76,7 +76,7 @@ - **awaan:season** - **awaan:video** - **AZMedien**: AZ Medien videos - - **AZMedienShow**: AZ Medien shows + - **AZMedienPlaylist**: AZ Medien playlists - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -337,6 +337,7 @@ - **IPrima** - **iqiyi**: 爱奇艺 - **Ir90Tv** + - **ITV** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV @@ -445,6 +446,7 @@ - **mtg**: MTG services - **mtv** - **mtv.de** + - **mtv81** - **mtv:video** - **mtvservices:embedded** - **MuenchenTV**: münchen.tv @@ -887,6 +889,7 @@ - **vk:uservideos**: VK - User's Videos - **vk:wallpost** - **vlive** + - **vlive:channel** - **Vodlocker** - **VODPlatform** - **VoiceRepublic** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c22c410a8..a37a65db9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.28' +__version__ = '2017.01.29' From c2d9c25f818da2e0e622b475ffc714f35df0887c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 29 Jan 2017 16:03:39 +0100 Subject: [PATCH 062/137] [compat] add compat_etree_register_namespace --- youtube_dl/compat.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 02abf8c1e..49e3c90e2 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2529,6 +2529,24 @@ else: el.text = el.text.decode('utf-8') return doc +if hasattr(etree, 'register_namespace'): + compat_etree_register_namespace = etree.register_namespace +else: + def compat_etree_register_namespace(prefix, uri): + """Register a namespace prefix. + The registry is global, and any existing mapping for either the + given prefix or the namespace URI will be removed. + *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and + attributes in this namespace will be serialized with prefix if possible. + ValueError is raised if prefix is reserved or is invalid. + """ + if re.match(r"ns\d+$", prefix): + raise ValueError("Prefix format reserved for internal use") + for k, v in list(etree._namespace_map.items()): + if k == uri or v == prefix: + del etree._namespace_map[k] + etree._namespace_map[uri] = prefix + if sys.version_info < (2, 7): # Here comes the crazy part: In 2.6, if the xpath is a unicode, # .//node does not match if a node is a direct child of . ! From 4719419951ced20e42cddb26b437908ba636debb Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sun, 29 Jan 2017 16:04:15 +0100 Subject: [PATCH 063/137] [itv] fix extraction in python 2.6 --- youtube_dl/extractor/itv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 0328c7093..b0d860452 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -6,7 +6,10 @@ import xml.etree.ElementTree as etree import json from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_etree_register_namespace, +) from ..utils import ( extract_attributes, xpath_with_ns, @@ -47,7 +50,7 @@ class ITVIE(InfoExtractor): 'com': 'http://schemas.itv.com/2009/05/Common', } for ns, full_ns in ns_map.items(): - etree.register_namespace(ns, full_ns) + compat_etree_register_namespace(ns, full_ns) def _add_ns(name): return xpath_with_ns(name, ns_map) From dadb836139f070da9364439bf3b148eec8bc0b11 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Mon, 30 Jan 2017 09:32:31 +0100 Subject: [PATCH 064/137] [ruutu] extract dash formats --- youtube_dl/extractor/ruutu.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index f12bc5614..20d01754a 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -81,6 +81,9 @@ class RuutuIE(InfoExtractor): elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) else: proto = compat_urllib_parse_urlparse(video_url).scheme if not child.tag.startswith('HTTP') and proto != 'rtmp': From 75822ca7909d7f7e15694f73b05b2bf0f1fa61f3 Mon Sep 17 00:00:00 2001 From: Thomas Christlieb <thomaschristlieb@hotmail.com> Date: Tue, 31 Jan 2017 10:03:31 +0100 Subject: [PATCH 065/137] New parameter --playlist-random to randomize playlist download order. Fixes #11889 --- youtube_dl/YoutubeDL.py | 5 +++++ youtube_dl/__init__.py | 1 + youtube_dl/options.py | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c71e94518..a7bf5a1b0 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,6 +24,7 @@ import sys import time import tokenize import traceback +import random from .compat import ( compat_basestring, @@ -159,6 +160,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. playlistreverse: Download playlist items in reverse order. + playlistrandom: Download playlist items in random order. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. @@ -842,6 +844,9 @@ class YoutubeDL(object): if self.params.get('playlistreverse', False): entries = entries[::-1] + if self.params.get('playlistrandom', False): + random.shuffle(entries) + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) extra = { diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2b156342a..5c5b8094b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -344,6 +344,7 @@ def _real_main(argv=None): 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, + 'playlistrandom': opts.playlist_random, 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3abf621c0..349f44778 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -470,6 +470,10 @@ def parseOpts(overrideArguments=None): '--playlist-reverse', action='store_true', help='Download playlist videos in reverse order') + downloader.add_option( + '--playlist-random', + action='store_true', + help='Download playlist videos in random order') downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', From ae9a173b6421a3fdf70dd50d2dc0386f8861fe71 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Tue, 31 Jan 2017 14:47:56 +0100 Subject: [PATCH 066/137] [vimeo] extract both mixed and separated dash formats --- youtube_dl/extractor/vimeo.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c12eeadd4..8b6a5cc3c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -142,10 +142,19 @@ class VimeoBaseInfoExtractor(InfoExtractor): note='Downloading %s m3u8 information' % cdn_name, fatal=False)) elif files_type == 'dash': - formats.extend(self._extract_mpd_formats( - manifest_url.replace('/master.json', '/master.mpd'), video_id, format_id, - 'Downloading %s MPD information' % cdn_name, - fatal=False)) + mpd_pattern = r'/%s/(?:sep/)?video/' % video_id + mpd_manifest_urls = [] + if re.search(mpd_pattern, manifest_url): + for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): + mpd_manifest_urls.append((format_id + suffix, re.sub( + mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url))) + else: + mpd_manifest_urls = [(format_id, manifest_url)] + for f_id, m_url in mpd_manifest_urls: + formats.extend(self._extract_mpd_formats( + m_url.replace('/master.json', '/master.mpd'), video_id, f_id, + 'Downloading %s MPD information' % cdn_name, + fatal=False)) subtitles = {} text_tracks = config['request'].get('text_tracks') From 3c90cc8b6fc069930264b41f5505dc34c1077442 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 31 Jan 2017 22:19:29 +0700 Subject: [PATCH 067/137] [youtube] Fix extraction for domainless player URLs Closes #11890 Closes #11891 Closes #11892 Closes #11894 Closes #11895 Closes #11897 Closes #11900 Closes #11903 Closes #11904 Closes #11906 Closes #11907 Closes #11909 Closes #11913 Closes #11914 Closes #11915 Closes #11916 Closes #11917 Closes #11918 Closes #11919 --- youtube_dl/extractor/youtube.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 630586796..ea398bcc8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1028,8 +1028,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, - 'Initial JS player signature function name') + (r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) @@ -1050,6 +1051,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: From 3a528ffd8944417c99b139da18d0dff907ade517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 31 Jan 2017 22:21:54 +0700 Subject: [PATCH 068/137] [ChangeLog] Actualize --- ChangeLog | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ChangeLog b/ChangeLog index cd7017f6d..e331acacc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +version <unreleased> + +Core ++ [compat] Add compat_etree_register_namespace + +Extractors +* [youtube] Fix extraction for domainless player URLs (#11890, #11891, #11892, + #11894, #11895, #11897, #11900, #11903, #11904, #11906, #11907, #11909, + #11913, #11914, #11915, #11916, #11917, #11918, #11919) ++ [vimeo] Extract both mixed and separated DASH formats ++ [ruutu] Extract DASH formats +* [itv] Fix extraction for python 2.6 + + version 2017.01.29 Core From d7e215b42dcaf71298a7e1dc953cf93523b3da81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 31 Jan 2017 22:24:45 +0700 Subject: [PATCH 069/137] release 2017.01.31 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 10c982fd0..180013f72 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.29*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.29** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.31** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.29 +[debug] youtube-dl version 2017.01.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e331acacc..d5ab0e0a7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.01.31 Core + [compat] Add compat_etree_register_namespace diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a37a65db9..fee0ac7c5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.29' +__version__ = '2017.01.31' From 8fd65faece98139def3a6538e98053bebd400263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Odd=20Str=C3=A5b=C3=B8?= <oddstr13@openshell.no> Date: Sat, 14 Jan 2017 02:36:04 +0100 Subject: [PATCH 070/137] [NRKTV] Added NRKTVSeriesIE [NRKTV] Added season and episode number to metadata. [NRKTV] Added category to metadata. [NRKTV] Added tests to NRKTVSeries. [NRKTV] Fixed whitespace issues (flake8). --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 49 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2590b5e1b..06e6d4620 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -668,6 +668,7 @@ from .nrk import ( NRKTVIE, NRKTVDirekteIE, NRKTVEpisodesIE, + NRKTVSeriesIE, ) from .ntvde import NTVDeIE from .ntvru import NTVRuIE diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index ea7be005a..26604f84f 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -128,6 +128,18 @@ class NRKBaseIE(InfoExtractor): series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + season_number = None + episode_number = None + if data.get('mediaElementType') == 'Episode': + _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ + data.get('relativeOriginUrl', '') + EPISODENUM_RE = [ + r'/s(?P<season>\d+)e(?P<episode>\d+)\.', + r'/sesong-(?P<season>\d+)/episode-(?P<episode>\d+)', + ] + season_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='season')) + episode_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='episode')) + thumbnails = None images = data.get('images') if images and isinstance(images, dict): @@ -140,11 +152,15 @@ class NRKBaseIE(InfoExtractor): } for image in web_images if image.get('imageUrl')] description = data.get('description') + category = data.get('mediaAnalytics', {}).get('category') common_info = { 'description': description, 'series': series, 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'categories': [category] if category else None, 'age_limit': parse_age_limit(data.get('legalAge')), 'thumbnails': thumbnails, } @@ -360,6 +376,39 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) +class NRKTVSeriesIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+)/?' + _ITEM_RE = r'data-season=["\'](?P<id>\d+)["\']' + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'playlist_count': 1, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'playlist_count': 1, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'playlist_count': 3, + }, { + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'playlist_count': 9, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) + + entries = [ + self.url_result('https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, + season=season_id + )) + for season_id in re.findall(self._ITEM_RE, webpage) + ] + + return self.playlist_result(entries) + + class NRKSkoleIE(InfoExtractor): IE_DESC = 'NRK Skole' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)' From 7c5329e6f4152b48c5476b1b9b8ab931caa10331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 00:29:29 +0700 Subject: [PATCH 071/137] [nrk] Improve extraction and update tests (closes #11571) --- youtube_dl/extractor/nrk.py | 145 +++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 26604f84f..fc3c0cd3c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -134,11 +134,15 @@ class NRKBaseIE(InfoExtractor): _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ data.get('relativeOriginUrl', '') EPISODENUM_RE = [ - r'/s(?P<season>\d+)e(?P<episode>\d+)\.', - r'/sesong-(?P<season>\d+)/episode-(?P<episode>\d+)', + r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.', + r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})', ] - season_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='season')) - episode_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='episode')) + season_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'season number', + default=None, group='season')) + episode_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'episode number', + default=None, group='episode')) thumbnails = None images = data.get('images') @@ -243,54 +247,102 @@ class NRKTVIE(NRKBaseIE): 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, + 'series': '20 spørsmål - TV', + 'episode': '23.05.2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'md5': '43d0be26663d380603a9cf0c24366531', 'info_dict': { 'id': 'MDFP15000514CA', 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, }, }, { # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { 'id': 'MSPO40010515-part2', 'ext': 'flv', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', }, - 'skip': 'Only works from Norway', + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'playlist': [{ - 'md5': '9480285eff92d64f06e02a5367970a7a', 'info_dict': { - 'id': 'MSPO40010515-part1', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, }, }, { - 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, }, }], 'info_dict': { 'id': 'MSPO40010515', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'duration': 6947.52, + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Only works from Norway', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, @@ -377,36 +429,61 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): class NRKTVSeriesIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+)/?' - _ITEM_RE = r'data-season=["\'](?P<id>\d+)["\']' + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 'title': 'Labyrint', + 'description': 'md5:58afd450974c89e27d5a19212eee7115', + }, + 'playlist_mincount': 3, + }, { 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'playlist_count': 1, + 'only_matching': True, }, { 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'playlist_count': 1, + 'only_matching': True, }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'playlist_count': 3, - }, { - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'playlist_count': 9, + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) + def _real_extract(self, url): series_id = self._match_id(url) webpage = self._download_webpage(url, series_id) entries = [ - self.url_result('https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, - season=season_id - )) + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) for season_id in re.findall(self._ITEM_RE, webpage) ] - return self.playlist_result(entries) + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) + + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) class NRKSkoleIE(InfoExtractor): From 363245ad94dfdf0c34b4c2c801e7cf6cea74f39c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 00:30:19 +0700 Subject: [PATCH 072/137] Credit @oddstr13 for nrk:series (#11571) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 3ef2800c9..f2875d504 100644 --- a/AUTHORS +++ b/AUTHORS @@ -200,3 +200,4 @@ Paul Hartmann Stephen Chen Fabian Stahl Bagira +Odd Stråbø From c38a67bcd5df639b9d7e7faa8685e76446803527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 00:49:28 +0700 Subject: [PATCH 073/137] [vimeo] Extract license (closes #11880) --- youtube_dl/extractor/vimeo.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8b6a5cc3c..32179e915 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -218,6 +218,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user7108434', 'uploader': 'Filippo Valsorda', 'duration': 10, + 'license': 'by-sa', }, }, { @@ -486,6 +487,8 @@ class VimeoIE(VimeoBaseInfoExtractor): '%s said: %s' % (self.IE_NAME, seed_status['title']), expected=True) + cc_license = None + # Extract the config JSON try: try: @@ -499,8 +502,9 @@ class VimeoIE(VimeoBaseInfoExtractor): vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json( - vimeo_clip_page_config, video_id)['player']['config_url'] + page_config = self._parse_json(vimeo_clip_page_config, video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: @@ -609,6 +613,12 @@ class VimeoIE(VimeoBaseInfoExtractor): info_dict = self._parse_config(config, video_id) formats.extend(info_dict['formats']) self._vimeo_sort_formats(formats) + + if not cc_license: + cc_license = self._search_regex( + r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1', + webpage, 'license', default=None, group='license') + info_dict.update({ 'id': video_id, 'formats': formats, @@ -618,6 +628,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, + 'license': cc_license, }) return info_dict From c15cd296404e164b72fd7f2666d5875f35057d93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 00:58:02 +0700 Subject: [PATCH 074/137] [vimeo] Extract upload timestamp --- youtube_dl/extractor/vimeo.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 32179e915..8ba222224 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -21,7 +21,9 @@ from ..utils import ( sanitized_Request, smuggle_url, std_headers, + try_get, unified_strdate, + unified_timestamp, unsmuggle_url, urlencode_postdata, unescapeHTML, @@ -213,6 +215,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'timestamp': 1355990239, 'upload_date': '20121220', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', 'uploader_id': 'user7108434', @@ -259,6 +262,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', + 'timestamp': 1371200155, 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', @@ -281,7 +285,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', - 'upload_date': '20130927', + 'timestamp': 1380339469, + 'upload_date': '20130928', 'duration': 187, }, }, @@ -293,6 +298,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', + 'timestamp': 1381846109, 'upload_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', @@ -324,6 +330,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'The DMCI', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', + 'timestamp': 1324343742, 'upload_date': '20111220', 'description': 'md5:ae23671e82d05415868f7ad1aec21147', }, @@ -339,6 +346,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Casey Donahue', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', 'uploader_id': 'caseydonahue', + 'timestamp': 1250886430, 'upload_date': '20090821', 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', }, @@ -488,6 +496,7 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) cc_license = None + timestamp = None # Extract the config JSON try: @@ -505,6 +514,9 @@ class VimeoIE(VimeoBaseInfoExtractor): page_config = self._parse_json(vimeo_clip_page_config, video_id) config_url = page_config['player']['config_url'] cc_license = page_config.get('cc_license') + timestamp = try_get( + page_config, lambda x: x['clip']['uploaded_on'], + compat_str) config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: @@ -573,10 +585,10 @@ class VimeoIE(VimeoBaseInfoExtractor): self._downloader.report_warning('Cannot find video description') # Extract upload date - video_upload_date = None - mobj = re.search(r'<time[^>]+datetime="([^"]+)"', webpage) - if mobj is not None: - video_upload_date = unified_strdate(mobj.group(1)) + if not timestamp: + timestamp = self._search_regex( + r'<time[^>]+datetime="([^"]+)"', webpage, + 'timestamp', default=None) try: view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -622,7 +634,7 @@ class VimeoIE(VimeoBaseInfoExtractor): info_dict.update({ 'id': video_id, 'formats': formats, - 'upload_date': video_upload_date, + 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, 'view_count': view_count, From 26c0f09935d51cc8837230ad48db08acd3744dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 02:15:52 +0700 Subject: [PATCH 075/137] [vimeo] PEP 8 --- youtube_dl/extractor/vimeo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8ba222224..61cc469bf 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -22,13 +22,11 @@ from ..utils import ( smuggle_url, std_headers, try_get, - unified_strdate, unified_timestamp, unsmuggle_url, urlencode_postdata, unescapeHTML, parse_filesize, - try_get, ) From 2b2d5d319b563a12e26c55966a047fa5bb039cd0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 1 Feb 2017 16:39:32 +0800 Subject: [PATCH 076/137] [crunchyroll] Remove ScaledBorderAndShadow settings See https://github.com/rg3/youtube-dl/pull/9028, especially @lachs0r's comments for the reason behind this change --- youtube_dl/extractor/crunchyroll.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f811c7f33..109d1c5a8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -255,8 +255,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ScaledBorderAndShadow: no - + output += """ [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding """ From 7882f1115e8eca2d2c958e2dbb6be45450e4027c Mon Sep 17 00:00:00 2001 From: Thomas Christlieb <thomaschristlieb@hotmail.com> Date: Wed, 1 Feb 2017 16:00:41 +0100 Subject: [PATCH 077/137] Added new Regex for prosiebensat1 Extractor Description. Fixes #11810 (#11929) * Added new Regex for prosiebensat1 Extractor Description. Fixes #11810 * Using _og_search_description() as a Fallback for Description-Regex * Using _og_search_description() as a Fallback for Description-Regex - Second try * Also added fallback regex * Using _og_search_description() as a Fallback for Description-Regex - Third try * removed fatal=False from search for description regex. default=None should be preferred only * Using fatal=false for _og_search_description * Revert "Using fatal=false for _og_search_description" This reverts commit 2b7e123f9d0f2bd6ada54fa8e4e6035fece5dbf4. * Deleted default=None Parameter for _og_search_property --- youtube_dl/extractor/prosiebensat1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 03e1b1f7f..6856bacaf 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -375,7 +375,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') info = self._extract_video_info(url, clip_id) description = self._html_search_regex( - self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) From fe5aa197b58be1bbf88a152be0e84f24f1711bd7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 1 Feb 2017 23:13:45 +0800 Subject: [PATCH 078/137] [prosiebensat1] PEP8 and update _TESTS --- youtube_dl/extractor/prosiebensat1.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 6856bacaf..5091d8456 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -147,16 +147,12 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', @@ -258,7 +254,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', 'info_dict': { 'id': '2572814', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', 'upload_date': '20131017', @@ -272,7 +268,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', 'info_dict': { 'id': '2156342', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', 'duration': 307.24, @@ -289,12 +285,13 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'description': 'md5:63b8963e71f481782aeea877658dec84', }, 'playlist_count': 2, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge', 'info_dict': { 'id': '4187506', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Best of Circus HalliGalli', 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9', 'upload_date': '20151229', @@ -376,7 +373,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): info = self._extract_video_info(url, clip_id) description = self._html_search_regex( self._DESCRIPTION_REGEXES, webpage, 'description', default=None) - if description is None: + if description is None: description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( From 000f207944e277e63dbec5a60007c30e3187d3fd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 1 Feb 2017 23:16:35 +0800 Subject: [PATCH 079/137] [prosiebensat1] Update ChangeLog --- ChangeLog | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index d5ab0e0a7..da5b75b47 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +version <unreleased> + +Extractors +* [prosiebensat1] Fix extraction of descriptions (#11810, #11929) + version 2017.01.31 Core From b83ef507b457e6ea8c52265ea42b6c5d2c500a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 23:15:38 +0700 Subject: [PATCH 080/137] [facebook] Fix extraction (closes #11926) --- youtube_dl/extractor/facebook.py | 36 ++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index c0a7fc7d8..47bcc0dbc 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -12,14 +12,16 @@ from ..compat import ( compat_urllib_parse_unquote_plus, ) from ..utils import ( + clean_html, error_to_compat_str, ExtractorError, + get_element_by_id, int_or_none, + js_to_json, limit_length, sanitized_Request, + try_get, urlencode_postdata, - get_element_by_id, - clean_html, ) @@ -243,14 +245,30 @@ class FacebookIE(InfoExtractor): video_data = None + def extract_video_data(instances): + for item in instances: + if item[1][0] == 'VideoConfig': + video_item = item[2][0] + if video_item.get('video_id') == video_id: + return video_item['videoData'] + server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})(?:\);|,")', webpage, 'server js data', default='{}'), video_id) - for item in server_js_data.get('instances', []): - if item[1][0] == 'VideoConfig': - video_item = item[2][0] - if video_item.get('video_id') == video_id: - video_data = video_item['videoData'] - break + r'handleServerJS\(({.+})(?:\);|,")', webpage, + 'server js data', default='{}'), video_id, fatal=False) + + if server_js_data: + video_data = extract_video_data(server_js_data.get('instances', [])) + + if not video_data: + server_js_data = self._parse_json( + self._search_regex( + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+stream_pagelet', + webpage, 'js data', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if server_js_data: + video_data = extract_video_data(try_get( + server_js_data, lambda x: x['jsmods']['instances'], + list) or []) if not video_data: if not fatal_if_no_video: From b996b8809285c2c8526dfe96f5ea9835ea799fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 23:29:59 +0700 Subject: [PATCH 081/137] [ChangeLog] Actualize --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index da5b75b47..d24169af8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,13 @@ version <unreleased> Extractors ++ [facebook] Add another fallback extraction scenario (#11926) * [prosiebensat1] Fix extraction of descriptions (#11810, #11929) +- [crunchyroll] Remove ScaledBorderAndShadow settings (#9028) ++ [vimeo] Extract upload timestamp ++ [vimeo] Extract license (#8726, #11880) ++ [nrk:series] Add support for series (#11571, #11711) + version 2017.01.31 From 50695949937bf399b611ef7957f44aac9fbee9dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 1 Feb 2017 03:20:09 +0700 Subject: [PATCH 082/137] release 2017.02.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 180013f72..8914569b6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.31 +[debug] youtube-dl version 2017.02.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d24169af8..c1e8f643a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.02.01 Extractors + [facebook] Add another fallback extraction scenario (#11926) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4231577b..d900f5e12 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -528,6 +528,7 @@ - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte - **NRKTVEpisodes** + - **NRKTVSeries** - **ntv.ru** - **Nuvid** - **NYTimes** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fee0ac7c5..0f9b6b703 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.31' +__version__ = '2017.02.01' From da162c1135febbb653a302b598dba2d24ac4e24e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Wed, 1 Feb 2017 20:15:25 +0100 Subject: [PATCH 083/137] [compat] add compat_etree_register_namespace to __all__ list --- youtube_dl/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 49e3c90e2..718902019 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2883,6 +2883,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookies', 'compat_etree_fromstring', + 'compat_etree_register_namespace', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', From 020c5df52d61af0630be8c982282e110a83fc8df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 1 Feb 2017 23:48:34 +0100 Subject: [PATCH 084/137] [elpais] Fix extraction for some URLs (closes #11765) --- ChangeLog | 1 + youtube_dl/extractor/elpais.py | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index c1e8f643a..8e3a04d7d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,7 @@ Extractors + [vimeo] Extract upload timestamp + [vimeo] Extract license (#8726, #11880) + [nrk:series] Add support for series (#11571, #11711) ++ [elpais] Fix extraction for some URLs (#11765) version 2017.01.31 diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 8c725a4e6..99e00cf3c 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import strip_jsonp, unified_strdate class ElPaisIE(InfoExtractor): @@ -29,6 +29,16 @@ class ElPaisIE(InfoExtractor): 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.', 'upload_date': '20160303', } + }, { + 'url': 'http://elpais.com/elpais/2017/01/26/ciencia/1485456786_417876.html', + 'md5': '9c79923a118a067e1a45789e1e0b0f9c', + 'info_dict': { + 'id': '1485456786_417876', + 'ext': 'mp4', + 'title': 'Hallado un barco de la antigua Roma que naufragó en Baleares hace 1.800 años', + 'description': 'La nave portaba cientos de ánforas y se hundió cerca de la isla de Cabrera por razones desconocidas', + 'upload_date': '20170127', + }, }] def _real_extract(self, url): @@ -37,8 +47,15 @@ class ElPaisIE(InfoExtractor): prefix = self._html_search_regex( r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix') - video_suffix = self._search_regex( - r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') + id_multimedia = self._search_regex( + r"id_multimedia\s*=\s*'([^']+)'", webpage, 'ID multimedia', default=None) + if id_multimedia: + url_info = self._download_json( + 'http://elpais.com/vdpep/1/?pepid=' + id_multimedia, video_id, transform_source=strip_jsonp) + video_suffix = url_info['mp4'] + else: + video_suffix = self._search_regex( + r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", From 8bdc149441a86e01c56946090087c005a525260e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 2 Feb 2017 08:05:16 +0100 Subject: [PATCH 085/137] [downloader/external:ffmpeg] minimize the use of aac_adtstoasc filter --- youtube_dl/downloader/external.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 5d3e5d8d3..138f353ef 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -17,6 +17,7 @@ from ..utils import ( encodeArgument, handle_youtubedl_headers, check_executable, + is_outdated_version, ) @@ -264,7 +265,9 @@ class FFmpegFD(ExternalFD): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: - args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + args += ['-f', 'mp4'] + if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + args += ['-bsf:a', 'aac_adtstoasc'] elif protocol == 'rtmp': args += ['-f', 'flv'] else: From 81aeafeb44a16b341e47c3bb85d288252a095eda Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Thu, 2 Feb 2017 08:07:06 +0100 Subject: [PATCH 086/137] [cbc:watch] extract audio codec for audion only formats(fixes #11893) --- youtube_dl/extractor/cbc.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index a291685bf..cf678e7f8 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -296,6 +296,12 @@ class CBCWatchVideoIE(CBCWatchBaseIE): formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) if len(formats) < 2: formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + for f in formats: + format_id = f.get('format_id') + if format_id.startswith('AAC'): + f['acodec'] = 'aac' + elif format_id.startswith('AC3'): + f['acodec'] = 'ac-3' self._sort_formats(formats) info = { From bd8f48c78b952ebe3bf335185c819e265f63cb50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 2 Feb 2017 21:51:31 +0800 Subject: [PATCH 087/137] [bilibili] Support new Bangumi URLs (closes #11845) To reduce complexity, I don't support old Bangumi URLs directly via _VALID_URL. Instead, I choose to let it go to generic redirection. An example can be found in #10190: http://bangumi.bilibili.com/anime/v/40062 --- ChangeLog | 5 ++ youtube_dl/extractor/bilibili.py | 135 ++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 5 +- 3 files changed, 134 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8e3a04d7d..c27907f51 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +version <unreleased> + +Extractors ++ [bilibili] Support new Bangumi URLs (#11845) + version 2017.02.01 Extractors diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 85ea5e6ee..80dd8382e 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -5,19 +5,27 @@ import hashlib import re from .common import InfoExtractor -from ..compat import compat_parse_qs +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) from ..utils import ( + ExtractorError, int_or_none, float_or_none, + parse_iso8601, + smuggle_url, + strip_jsonp, unified_timestamp, + unsmuggle_url, urlencode_postdata, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { @@ -32,25 +40,61 @@ class BiliBiliIE(InfoExtractor): 'uploader': '菊子桑', 'uploader_id': '156160', }, - } + }, { + # Tested in BiliBiliBangumiIE + 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', + 'only_matching': True, + }, { + 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', + 'md5': '3f721ad1e75030cc06faf73587cfec57', + 'info_dict': { + 'id': '100643', + 'ext': 'mp4', + 'title': 'CHAOS;CHILD', + 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', + }, + 'skip': 'Geo-restricted to China', + }] _APP_KEY = '84956560bc028eb7' _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' + def _report_error(self, result): + if 'message' in result: + raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) + elif 'code' in result: + raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) + else: + raise ExtractorError('Can\'t extract Bangumi episode ID') + def _real_extract(self, url): - video_id = self._match_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + anime_id = mobj.group('anime_id') webpage = self._download_webpage(url, video_id) - if 'anime/v' not in url: + if 'anime/' not in url: cid = compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters'))['cid'][0] else: + if 'no_bangumi_tip' not in smuggled_data: + self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % ( + video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) + headers = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + } + headers.update(self.geo_verification_headers()) + js = self._download_json( 'http://bangumi.bilibili.com/web_api/get_source', video_id, data=urlencode_postdata({'episode_id': video_id}), - headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}) + headers=headers) + if 'result' not in js: + self._report_error(js) cid = js['result']['cid'] payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) @@ -58,7 +102,11 @@ class BiliBiliIE(InfoExtractor): video_info = self._download_json( 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page') + video_id, note='Downloading video info page', + headers=self.geo_verification_headers()) + + if 'durl' not in video_info: + self._report_error(video_info) entries = [] @@ -85,7 +133,7 @@ class BiliBiliIE(InfoExtractor): title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False)) + r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None)) thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript @@ -99,7 +147,7 @@ class BiliBiliIE(InfoExtractor): } uploader_mobj = re.search( - r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', + r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"', webpage) if uploader_mobj: info.update({ @@ -123,3 +171,70 @@ class BiliBiliIE(InfoExtractor): 'description': description, 'entries': entries, } + + +class BiliBiliBangumiIE(InfoExtractor): + _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)' + + IE_NAME = 'bangumi.bilibili.com' + IE_DESC = 'BiliBili番剧' + + _TESTS = [{ + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist_count': 26, + }, { + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist': [{ + 'md5': '91da8621454dd58316851c27c68b0c13', + 'info_dict': { + 'id': '40062', + 'ext': 'mp4', + 'title': '混沌武士', + 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...', + 'timestamp': 1414538739, + 'upload_date': '20141028', + 'episode': '疾风怒涛 Tempestuous Temperaments', + 'episode_number': 1, + }, + }], + 'params': { + 'playlist_items': '1', + }, + }] + + @classmethod + def suitable(cls, url): + return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url) + + def _real_extract(self, url): + bangumi_id = self._match_id(url) + + # Sometimes this API returns a JSONP response + season_info = self._download_json( + 'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id, + bangumi_id, transform_source=strip_jsonp)['result'] + + entries = [{ + '_type': 'url_transparent', + 'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}), + 'ie_key': BiliBiliIE.ie_key(), + 'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '), + 'episode': episode.get('index_title'), + 'episode_number': int_or_none(episode.get('index')), + } for episode in season_info['episodes']] + + entries = sorted(entries, key=lambda entry: entry.get('episode_number')) + + return self.playlist_result( + entries, bangumi_id, + season_info.get('bangumi_title'), season_info.get('evaluate')) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 06e6d4620..1d1c05d42 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -103,7 +103,10 @@ from .beatport import BeatportIE from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE -from .bilibili import BiliBiliIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, +) from .biobiochiletv import BioBioChileTVIE from .biqle import BIQLEIE from .bleacherreport import ( From a685751051f277b8ce99ee0949420bca4ea28c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Feb 2017 22:01:11 +0700 Subject: [PATCH 088/137] [youtube:playlist] Recognize TL playlists (closes #11945) --- youtube_dl/extractor/youtube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ea398bcc8..0e67fdd12 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1857,13 +1857,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' @@ -1985,6 +1985,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, }] def _real_initialize(self): From 5a116e13020813f9f1d952504455043986c28b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Feb 2017 22:45:18 +0700 Subject: [PATCH 089/137] [facebook] Fix title extraction (closes #11941) --- youtube_dl/extractor/facebook.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 47bcc0dbc..b325c8200 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -73,7 +73,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Facebook video #274175099429670', + 'title': 'Asif Nawab Butt posted a video to his Timeline.', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -318,10 +318,16 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', webpage, 'alternative title', default=None) - video_title = limit_length(video_title, 80) if not video_title: + video_title = self._html_search_meta( + 'description', webpage, 'title') + if video_title: + video_title = limit_length(video_title, 80) + else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + uploader = clean_html(get_element_by_id( + 'fbPhotoPageAuthorName', webpage)) or self._search_regex( + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) From c54c01f82dba6d3e982c73c81ad71c49f31d8af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Feb 2017 23:03:38 +0700 Subject: [PATCH 090/137] [go] Relax video id regex (closes #11937) --- youtube_dl/extractor/go.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index c7776b186..a34779b16 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -43,7 +43,10 @@ class GoIE(InfoExtractor): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() if not video_id: webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'data-video-id=["\']VDKA(\w+)', webpage, 'video id') + video_id = self._search_regex( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') brand = self._BRANDS[sub_domain] video_data = self._download_json( 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), From a22b2fd19bd8c08d50f884d1903486d4f00f76ec Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 3 Feb 2017 01:28:24 +0800 Subject: [PATCH 091/137] [youtube] Fix ytsearch* when cookies are provided Closes #11924 The API with `page` is no longer used in browsers, and YouTube always returns {'reload': 'now'} when cookies are provided. See http://youtube.github.io/spfjs/documentation/start/ for how SPF works. Basically appending static link with a `spf` parameter yields the corresponding dynamic link. --- ChangeLog | 1 + youtube_dl/extractor/youtube.py | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index c27907f51..c80126cfb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [youtube] Fix ytsearch when cookies are provided (#11924) + [bilibili] Support new Bangumi URLs (#11845) version 2017.02.01 diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0e67fdd12..f2f751104 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2348,18 +2348,18 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos = [] limit = n + url_query = { + 'search_query': query.encode('utf-8'), + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) + for pagenum in itertools.count(1): - url_query = { - 'search_query': query.encode('utf-8'), - 'page': pagenum, - 'spf': 'navigate', - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, - errnote='Unable to download API page') + errnote='Unable to download API page', + query={'spf': 'navigate'}) html_content = data[1]['body']['content'] if 'class="search-message' in html_content: @@ -2371,6 +2371,12 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos += new_videos if not new_videos or len(videos) > limit: break + next_link = self._html_search_regex( + r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + html_content, 'next link', default=None) + if next_link is None: + break + result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) if len(videos) > n: videos = videos[:n] From b3ee552e4b918fb720111b23147e24fa5475a74b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= <michal@cihar.com> Date: Tue, 31 Jan 2017 07:54:53 +0100 Subject: [PATCH 092/137] [utils] Handle single-line comments in js_to_json --- test/test_utils.py | 3 +++ youtube_dl/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a74d59f34..954bb7d8b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -791,6 +791,9 @@ class TestUtil(unittest.TestCase): on = js_to_json('{ 0: /* " \n */ ",]" , }') self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ 0: // comment\n1 }') + self.assertEqual(json.loads(on), {'0': 1}) + on = js_to_json(r'["<p>x<\/p>"]') self.assertEqual(json.loads(on), ['<p>x</p>']) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cf46711b9..6c462625b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2107,7 +2107,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v == ',': return "" if v[0] in ("'", '"'): @@ -2134,7 +2134,7 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|,(?=\s*[\]}])| + /\*.*?\*/|//[^\n]*|,(?=\s*[\]}])| [a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) From 0bbcc8a10a4bd339540bf149dd263419fd8b6e66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= <michal@cihar.com> Date: Tue, 31 Jan 2017 07:59:55 +0100 Subject: [PATCH 093/137] [iprima] Fix extraction (closes #11920, closes #11896) --- youtube_dl/extractor/iprima.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index da2cdc656..0fe576883 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -65,7 +65,7 @@ class IPrimaIE(InfoExtractor): options = self._parse_json( self._search_regex( - r'(?s)var\s+playerOptions\s*=\s*({.+?});', + r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]', playerpage, 'player options', default='{}'), video_id, transform_source=js_to_json, fatal=False) if options: From 4195096ea8da8237a63e1ba3876dc8856b8605c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 02:55:06 +0700 Subject: [PATCH 094/137] [utils] Improve comments processing in js_to_json (closes #11947) --- test/test_utils.py | 24 ++++++++++++++++++++++++ youtube_dl/utils.py | 20 +++++++++++--------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 954bb7d8b..edc712f07 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -785,12 +785,24 @@ class TestUtil(unittest.TestCase): on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) + on = js_to_json('[/*comment\n*/"abc"/*comment\n*/,/*comment\n*/"def",/*comment\n*/]') + self.assertEqual(json.loads(on), ['abc', 'def']) + + on = js_to_json('[//comment\n"abc" //comment\n,//comment\n"def",//comment\n]') + self.assertEqual(json.loads(on), ['abc', 'def']) + on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{/*comment\n*/"abc"/*comment\n*/:/*comment\n*/"def"/*comment\n*/,/*comment\n*/}') + self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{ 0: /* " \n */ ",]" , }') self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ /*comment\n*/0/*comment\n*/: /* " \n */ ",]" , }') + self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ 0: // comment\n1 }') self.assertEqual(json.loads(on), {'0': 1}) @@ -803,15 +815,27 @@ class TestUtil(unittest.TestCase): on = js_to_json("['a\\\nb']") self.assertEqual(json.loads(on), ['ab']) + on = js_to_json("/*comment\n*/[/*comment\n*/'a\\\nb'/*comment\n*/]/*comment\n*/") + self.assertEqual(json.loads(on), ['ab']) + on = js_to_json('{0xff:0xff}') self.assertEqual(json.loads(on), {'255': 255}) + on = js_to_json('{/*comment\n*/0xff/*comment\n*/:/*comment\n*/0xff/*comment\n*/}') + self.assertEqual(json.loads(on), {'255': 255}) + on = js_to_json('{077:077}') self.assertEqual(json.loads(on), {'63': 63}) + on = js_to_json('{/*comment\n*/077/*comment\n*/:/*comment\n*/077/*comment\n*/}') + self.assertEqual(json.loads(on), {'63': 63}) + on = js_to_json('{42:42}') self.assertEqual(json.loads(on), {'42': 42}) + on = js_to_json('{/*comment\n*/42/*comment\n*/:/*comment\n*/42/*comment\n*/}') + self.assertEqual(json.loads(on), {'42': 42}) + def test_extract_attributes(self): self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6c462625b..67a847eba 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2103,6 +2103,13 @@ def strip_jsonp(code): def js_to_json(code): + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' + SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + INTEGER_TABLE = ( + (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), + (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + ) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): @@ -2118,11 +2125,6 @@ def js_to_json(code): '\\x': '\\u00', }.get(m.group(0), m.group(0)), v[1:-1]) - INTEGER_TABLE = ( - (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16), - (r'^(0+[0-7]+)\s*:?$', 8), - ) - for regex, base in INTEGER_TABLE: im = re.match(regex, v) if im: @@ -2134,11 +2136,11 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|//[^\n]*|,(?=\s*[\]}])| + {comment}|,(?={skip}[\]}}])| [a-zA-Z_][.a-zA-Z_0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| - [0-9]+(?=\s*:) - ''', fix_kv, code) + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| + [0-9]+(?={skip}:) + '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) def qualities(quality_ids): From 33da98f4933ddc54c944bae985cfcc7b53563208 Mon Sep 17 00:00:00 2001 From: Justsoos <justso@gmail.com> Date: Wed, 1 Feb 2017 21:30:01 +0800 Subject: [PATCH 095/137] [douyutv] Improve room id regex http://www.douyu.com/t/lpl source get extra '\' with "room_id\" (from js coding) --- youtube_dl/extractor/douyutv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 2f3c5113e..911594413 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ from ..utils import ( class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -68,6 +68,10 @@ class DouyuTVIE(InfoExtractor): }, { 'url': 'http://www.douyu.com/xiaocang', 'only_matching': True, + }, { + # \"room_id\" + 'url': 'http://www.douyu.com/t/lpl', + 'only_matching': True, }] # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf @@ -82,7 +86,7 @@ class DouyuTVIE(InfoExtractor): else: page = self._download_webpage(url, video_id) room_id = self._html_search_regex( - r'"room_id"\s*:\s*(\d+),', page, 'room id') + r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') room = self._download_json( 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, From 45024183aea169dc898902388f782485de02cbac Mon Sep 17 00:00:00 2001 From: Mattias Wadman <mattias.wadman@gmail.com> Date: Fri, 3 Feb 2017 05:10:13 +0100 Subject: [PATCH 096/137] [infoq] Add audio only format if available (#11565) * [infoq] Add audio only format if available Refactor cookie code into a function. Renamed formats to http_video, http_audio, rtmp_video Renamed extract functions to video instead of videos as they return one or no video. * [infoq] Rename to _extract_cookies as it more than one * [infoq] Remove redundant determine_ext * [infoq] Add comment about hardcoded URL * [infoq] Use _hidden_inputs instead of messy regex * [infoq] Probe if audio URL is valid Make it possible to pass headers to _is_valid_url * [infoq] Add audio only test --- youtube_dl/extractor/common.py | 4 +-- youtube_dl/extractor/infoq.py | 63 ++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a15a9536..2c8ec1417 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1025,13 +1025,13 @@ class InfoExtractor(object): unique_formats.append(f) formats[:] = unique_formats - def _is_valid_url(self, url, video_id, item='video'): + def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid if not (url.startswith('http://') or url.startswith('https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True except ExtractorError as e: if isinstance(e.cause, compat_urllib_error.URLError): diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index cca0b8a93..9fb71e8ef 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import base64 -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import determine_ext from .bokecc import BokeCCBaseIE @@ -33,9 +36,21 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + }, { + 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', + 'md5': '0e34642d4d9ef44bf86f66f6399672db', + 'info_dict': { + 'id': 'Simple-Made-Easy', + 'title': 'Simple Made Easy', + 'ext': 'mp3', + 'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b', + }, + 'params': { + 'format': 'bestaudio', + }, }] - def _extract_rtmp_videos(self, webpage): + def _extract_rtmp_video(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -47,28 +62,53 @@ class InfoQIE(BokeCCBaseIE): playpath = 'mp4:' + real_id return [{ - 'format_id': 'rtmp', + 'format_id': 'rtmp_video', 'url': video_url, 'ext': determine_ext(playpath), 'play_path': playpath, }] - def _extract_http_videos(self, webpage): - http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') - + def _extract_cookies(self, webpage): policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + return 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( + policy, signature, key_pair_id) + def _extract_http_video(self, webpage): + http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') return [{ - 'format_id': 'http', + 'format_id': 'http_video', 'url': http_video_url, 'http_headers': { - 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( - policy, signature, key_pair_id), + 'Cookie': self._extract_cookies(webpage) }, }] + def _extract_http_audio(self, webpage, video_id): + fields = self._hidden_inputs(webpage) + http_audio_url = fields['filename'] + if http_audio_url is None: + return [] + + cookies_header = {'Cookie': self._extract_cookies(webpage)} + + # base URL is found in the Location header in the response returned by + # GET https://www.infoq.com/mp3download.action?filename=... when logged in. + http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + + # audio file seem to be missing some times even if there is a download link + # so probe URL to make sure + if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header): + return [] + + return [{ + 'format_id': 'http_audio', + 'url': http_audio_url, + 'vcodec': 'none', + 'http_headers': cookies_header, + }] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -80,7 +120,10 @@ class InfoQIE(BokeCCBaseIE): # for China videos, HTTP video URL exists but always fails with 403 formats = self._extract_bokecc_formats(webpage, video_id) else: - formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) + formats = ( + self._extract_rtmp_video(webpage) + + self._extract_http_video(webpage) + + self._extract_http_audio(webpage, video_id)) self._sort_formats(formats) From d7f9242e301fa7c08542932c9348140cf2e07172 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 3 Feb 2017 12:13:24 +0800 Subject: [PATCH 097/137] [ChangeLog] Update after #11565 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index c80126cfb..487ed3f0f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors ++ [infoq] Add audio only formats (#11565) * [youtube] Fix ytsearch when cookies are provided (#11924) + [bilibili] Support new Bangumi URLs (#11845) From 4ce3407d089ae8c34341e6d68267910683d4b500 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 3 Feb 2017 10:15:03 +0100 Subject: [PATCH 098/137] [filmon] improve extraction --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/filmon.py | 236 +++++++++++++++++------------ 2 files changed, 139 insertions(+), 102 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9b9ebd23..e4ee43ee3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,7 +287,10 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE -from .filmon import FilmOnIE, FilmOnVODIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE diff --git a/youtube_dl/extractor/filmon.py b/youtube_dl/extractor/filmon.py index 987792fec..f775fe0ba 100644 --- a/youtube_dl/extractor/filmon.py +++ b/youtube_dl/extractor/filmon.py @@ -2,74 +2,21 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import qualities -from ..compat import compat_urllib_request - - -_QUALITY = qualities(('low', 'high')) +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + qualities, + strip_or_none, + int_or_none, + ExtractorError, +) class FilmOnIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P<id>[a-z0-9-]+)' - _TESTS = [{ - 'url': 'https://www.filmon.com/channel/filmon-sports', - 'only_matching': True, - }, { - 'url': 'https://www.filmon.com/tv/2894', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - - request = compat_urllib_request.Request('https://www.filmon.com/channel/%s' % (channel_id)) - request.add_header('X-Requested-With', 'XMLHttpRequest') - channel_info = self._download_json(request, channel_id) - now_playing = channel_info['now_playing'] - - thumbnails = [] - for thumb in now_playing.get('images', ()): - if thumb['type'] != '2': - continue - thumbnails.append({ - 'url': thumb['url'], - 'width': int(thumb['width']), - 'height': int(thumb['height']), - }) - - formats = [] - - for stream in channel_info['streams']: - formats.append({ - 'format_id': str(stream['id']), - # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats - # because 0) it doesn't have bitrate variants anyway, and 1) the ids generated - # by that method are highly unstable (because the bitrate is variable) - 'url': stream['url'], - 'resolution': stream['name'], - 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), - 'ext': 'mp4', - 'quality': _QUALITY(stream['quality']), - 'preference': int(stream['watch-timeout']), - }) - self._sort_formats(formats) - - return { - 'id': str(channel_info['id']), - 'display_id': channel_info['alias'], - 'formats': formats, - # XXX: use the channel description (channel_info['description'])? - 'uploader_id': channel_info['alias'], - 'uploader': channel_info['title'], # XXX: kinda stretching it... - 'title': now_playing.get('programme_name') or channel_info['title'], - 'description': now_playing.get('programme_description'), - 'thumbnails': thumbnails, - 'is_live': True, - } - - -class FilmOnVODIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filmon\.com/vod/view/(?P<id>\d+)' + IE_NAME = 'filmon' + _VALID_URL = r'(?:https?://(?:www\.)?filmon\.com/vod/view/|filmon:)(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.filmon.com/vod/view/24869-0-plan-9-from-outer-space', 'info_dict': { @@ -83,62 +30,149 @@ class FilmOnVODIE(InfoExtractor): 'info_dict': { 'id': '2825', 'title': 'Popeye Series 1', + 'description': 'The original series of Popeye.', }, - 'playlist_count': 8, + 'playlist_mincount': 8, }] def _real_extract(self, url): video_id = self._match_id(url) - result = self._download_json('https://www.filmon.com/api/vod/movie?id=%s' % (video_id), video_id) - if result['code'] != 200: - raise ExtractorError('FilmOn said: %s' % (result['reason']), expected=True) + try: + response = self._download_json( + 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, + video_id)['response'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + raise - response = result['response'] + title = response['title'] + description = strip_or_none(response.get('description')) - if response.get('episodes'): - return { - '_type': 'playlist', - 'id': video_id, - 'title': response['title'], - 'entries': [{ - '_type': 'url', - 'url': 'https://www.filmon.com/vod/view/%s' % (ep), - } for ep in response['episodes']] - } + if response.get('type_id') == 1: + entries = [self.url_result('filmon:' + episode_id) for episode_id in response.get('episodes', [])] + return self.playlist_result(entries, video_id, title, description) + QUALITY = qualities(('low', 'high')) formats = [] - for (id, stream) in response['streams'].items(): + for format_id, stream in response.get('streams', {}).items(): + stream_url = stream.get('url') + if not stream_url: + continue formats.append({ - 'format_id': id, - 'url': stream['url'], - 'resolution': stream['name'], - 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), + 'format_id': format_id, + 'url': stream_url, 'ext': 'mp4', - 'quality': _QUALITY(stream['quality']), - 'preference': int(stream['watch-timeout']), + 'quality': QUALITY(stream.get('quality')), + 'protocol': 'm3u8_native', }) self._sort_formats(formats) - poster = response['poster'] - thumbnails = [{ - 'id': 'poster', - 'url': poster['url'], - 'width': poster['width'], - 'height': poster['height'], - }] - for (id, thumb) in poster['thumbs'].items(): + thumbnails = [] + poster = response.get('poster', {}) + thumbs = poster.get('thumbs', {}) + thumbs['poster'] = poster + for thumb_id, thumb in thumbs.items(): + thumb_url = thumb.get('url') + if not thumb_url: + continue thumbnails.append({ - 'id': id, - 'url': thumb['url'], - 'width': thumb['width'], - 'height': thumb['height'], + 'id': thumb_id, + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), }) return { 'id': video_id, - 'title': response['title'], + 'title': title, 'formats': formats, - 'description': response['description'], + 'description': description, 'thumbnails': thumbnails, } + + +class FilmOnChannelIE(InfoExtractor): + IE_NAME = 'filmon:channel' + _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P<id>[a-z0-9-]+)' + _TESTS = [{ + # VOD + 'url': 'http://www.filmon.com/tv/sports-haters', + 'info_dict': { + 'id': '4190', + 'ext': 'mp4', + 'title': 'Sports Haters', + 'description': 'md5:dabcb4c1d9cfc77085612f1a85f8275d', + }, + }, { + # LIVE + 'url': 'https://www.filmon.com/channel/filmon-sports', + 'only_matching': True, + }, { + 'url': 'https://www.filmon.com/tv/2894', + 'only_matching': True, + }] + + _THUMBNAIL_RES = [ + ('logo', 56, 28), + ('big_logo', 106, 106), + ('extra_big_logo', 300, 300), + ] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + try: + channel_data = self._download_json( + 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + raise + + channel_id = compat_str(channel_data['id']) + is_live = not channel_data.get('is_vod') and not channel_data.get('is_vox') + title = channel_data['title'] + + QUALITY = qualities(('low', 'high')) + formats = [] + for stream in channel_data.get('streams', []): + stream_url = stream.get('url') + if not stream_url: + continue + if not is_live: + formats.extend(self._extract_wowza_formats( + stream_url, channel_id, skip_protocols=['dash', 'rtmp', 'rtsp'])) + continue + quality = stream.get('quality') + formats.append({ + 'format_id': quality, + # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats + # because it doesn't have bitrate variants anyway + 'url': stream_url, + 'ext': 'mp4', + 'quality': QUALITY(quality), + }) + self._sort_formats(formats) + + thumbnails = [] + for name, width, height in self._THUMBNAIL_RES: + thumbnails.append({ + 'id': name, + 'url': 'http://static.filmon.com/assets/channels/%s/%s.png' % (channel_id, name), + 'width': width, + 'height': height, + }) + + return { + 'id': channel_id, + 'display_id': channel_data.get('alias'), + 'title': self._live_title(title) if is_live else title, + 'description': channel_data.get('description'), + 'thumbnails': thumbnails, + 'formats': formats, + 'is_live': is_live, + } From daac118bf4e8bf3dc1ec202fe8b21b9319d15dbf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 3 Feb 2017 18:56:40 +0800 Subject: [PATCH 099/137] [ChangeLog] Update after #11901 --- ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChangeLog b/ChangeLog index 487ed3f0f..947590b94 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,10 +1,14 @@ version <unreleased> +Core ++ Add --playlist-random to shuffle playlists (#11889, #11901) + Extractors + [infoq] Add audio only formats (#11565) * [youtube] Fix ytsearch when cookies are provided (#11924) + [bilibili] Support new Bangumi URLs (#11845) + version 2017.02.01 Extractors From f7a10d8cd6d1378d5f8e67b4b3572fa474b47cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 21:25:44 +0700 Subject: [PATCH 100/137] [sportbox] Remove extractor (closes #11954) Covered by generic extractor --- youtube_dl/extractor/sportbox.py | 54 -------------------------------- 1 file changed, 54 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index b512cd20f..05a0b5a80 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -11,60 +11,6 @@ from ..utils import ( ) -class SportBoxIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' - _TESTS = [{ - 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', - 'info_dict': { - 'id': '80822', - 'ext': 'mp4', - 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140928', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', - 'only_matching': True, - }, { - 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - player = self._search_regex( - r'src="/?(vdl/player/[^"]+)"', webpage, 'player') - - title = self._html_search_regex( - [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], - webpage, 'title') - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dateCreated', webpage, 'upload date')) - - return { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, '/%s' % player), - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } - - class SportBoxEmbedIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ From b7cc5f078eca4d90b3e3d31d1247452953dba1fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 21:56:10 +0700 Subject: [PATCH 101/137] [extractors] Remove remnants of sportbox extractor (#11954) --- youtube_dl/extractor/extractors.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aa235bec1..eaf3676df 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -888,10 +888,7 @@ from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import ( - SportBoxIE, - SportBoxEmbedIE, -) +from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE from .sportschau import SportschauIE from .srgssr import ( From f962790ee53c634758021d9fc752ae476c6a142b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 21:56:48 +0700 Subject: [PATCH 102/137] [vine] Fix extraction (closes #11955) --- youtube_dl/extractor/vine.py | 103 +++++++++++++++-------------------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 0183f052a..4957a07f7 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -6,8 +6,9 @@ import itertools from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, - unified_strdate, + unified_timestamp, ) @@ -20,50 +21,16 @@ class VineIE(InfoExtractor): 'id': 'b9KOOWX7HUx', 'ext': 'mp4', 'title': 'Chicken.', - 'alt_title': 'Vine by Jack Dorsey', + 'alt_title': 'Vine by Jack', + 'timestamp': 1368997951, 'upload_date': '20130519', - 'uploader': 'Jack Dorsey', + 'uploader': 'Jack', 'uploader_id': '76', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - }, { - 'url': 'https://vine.co/v/MYxVapFvz2z', - 'md5': '7b9a7cbc76734424ff942eb52c8f1065', - 'info_dict': { - 'id': 'MYxVapFvz2z', - 'ext': 'mp4', - 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Mars Ruiz', - 'upload_date': '20140815', - 'uploader': 'Mars Ruiz', - 'uploader_id': '1102363502380728320', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/v/bxVjBbZlPUH', - 'md5': 'ea27decea3fa670625aac92771a96b73', - 'info_dict': { - 'id': 'bxVjBbZlPUH', - 'ext': 'mp4', - 'title': '#mw3 #ac130 #killcam #angelofdeath', - 'alt_title': 'Vine by Z3k3', - 'upload_date': '20130430', - 'uploader': 'Z3k3', - 'uploader_id': '936470460173008896', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', - 'only_matching': True, }, { 'url': 'https://vine.co/v/e192BnZnZ9V', 'info_dict': { @@ -71,6 +38,7 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'alt_title': 'Vine by Pimry_zaa', + 'timestamp': 1436057405, 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', @@ -82,43 +50,60 @@ class VineIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://vine.co/v/MYxVapFvz2z', + 'only_matching': True, + }, { + 'url': 'https://vine.co/v/bxVjBbZlPUH', + 'only_matching': True, + }, { + 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) - data = self._parse_json( - self._search_regex( - r'window\.POST_DATA\s*=\s*({.+?});\s*</script>', - webpage, 'vine data'), - video_id) + data = self._download_json( + 'https://archive.vine.co/posts/%s.json' % video_id, video_id) - data = data[list(data.keys())[0]] - - formats = [{ - 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f.get('format'), - 'quality': f.get('rate'), - 'url': f['videoUrl'], - } for f in data['videoUrls'] if f.get('videoUrl')] + def video_url(kind): + for url_suffix in ('Url', 'URL'): + format_url = data.get('video%s%s' % (kind, url_suffix)) + if format_url: + return format_url + formats = [] + for quality, format_id in enumerate(('low', '', 'dash')): + format_url = video_url(format_id.capitalize()) + if not format_url: + continue + # DASH link returns plain mp4 + if format_id == 'dash' and determine_ext(format_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id or 'standard', + 'quality': quality, + }) self._sort_formats(formats) username = data.get('username') return { 'id': video_id, - 'title': data.get('description') or self._og_search_title(webpage), - 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'title': data.get('description'), + 'alt_title': 'Vine by %s' % username if username else None, 'thumbnail': data.get('thumbnailUrl'), - 'upload_date': unified_strdate(data.get('created')), + 'timestamp': unified_timestamp(data.get('created')), 'uploader': username, 'uploader_id': data.get('userIdStr'), - 'view_count': int_or_none(data.get('loops', {}).get('count')), - 'like_count': int_or_none(data.get('likes', {}).get('count')), - 'comment_count': int_or_none(data.get('comments', {}).get('count')), - 'repost_count': int_or_none(data.get('reposts', {}).get('count')), + 'view_count': int_or_none(data.get('loops')), + 'like_count': int_or_none(data.get('likes')), + 'comment_count': int_or_none(data.get('comments')), + 'repost_count': int_or_none(data.get('reposts')), 'formats': formats, } From 605fd6392fedd2599115e1f1e12df2a6212df1ae Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 3 Feb 2017 17:59:48 +0100 Subject: [PATCH 103/137] [youtube] add format info for itag 325 and 328 --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f2f751104..76710931a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,6 +329,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, From f65dba7cdb98bb5444ad5656c9626a15d210f6f6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 3 Feb 2017 22:25:19 +0100 Subject: [PATCH 104/137] [myspace] fix extraction and extract hls and http formats --- youtube_dl/extractor/myspace.py | 108 +++++++++++++++++--------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index ab32e632e..f281238c9 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -17,9 +17,10 @@ class MySpaceIE(InfoExtractor): _TESTS = [ { 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', 'info_dict': { 'id': '109594919', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Little Big Town', 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', 'uploader': 'Five Minutes to the Stage', @@ -27,37 +28,30 @@ class MySpaceIE(InfoExtractor): 'timestamp': 1414108751, 'upload_date': '20141023', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, # songs { 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', 'info_dict': { 'id': '93388656', - 'ext': 'flv', + 'ext': 'm4a', 'title': 'Of weakened soul...', 'uploader': 'Killsorrow', 'uploader_id': 'killsorrow', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { - 'add_ie': ['Vevo'], + 'add_ie': ['Youtube'], 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', 'info_dict': { - 'id': 'USZM20600099', - 'ext': 'mp4', - 'title': 'Animal I Have Become', - 'uploader': 'Three Days Grace', - 'timestamp': int, - 'upload_date': '20060502', + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', }, - 'skip': 'VEVO is only available in some countries', }, { 'add_ie': ['Youtube'], 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', @@ -76,24 +70,46 @@ class MySpaceIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( - r'playerSwf":"([^"?]*)', webpage, 'player URL') + r'videoSwf":"([^"?]*)', webpage, 'player URL', fatal=False) - def rtmp_format_from_stream_url(stream_url, width=None, height=None): - rtmp_url, play_path = stream_url.split(';', 1) - return { - 'format_id': 'rtmp', - 'url': rtmp_url, - 'play_path': play_path, - 'player_url': player_url, - 'protocol': 'rtmp', - 'ext': 'flv', - 'width': width, - 'height': height, - } + def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width=None, height=None): + formats = [] + vcodec = 'none' if is_song else None + if hls_stream_url: + formats.append({ + 'format_id': 'hls', + 'url': hls_stream_url, + 'protocol': 'm3u8_native', + 'ext': 'm4a' if is_song else 'mp4', + 'vcodec': vcodec, + }) + if stream_url and player_url: + rtmp_url, play_path = stream_url.split(';', 1) + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': play_path, + 'player_url': player_url, + 'protocol': 'rtmp', + 'ext': 'flv', + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + if http_stream_url: + formats.append({ + 'format_id': 'http', + 'url': http_stream_url, + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + return formats - if mobj.group('mediatype').startswith('music/song'): + if is_song: # songs don't store any useful info in the 'context' variable song_data = self._search_regex( r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id, @@ -108,8 +124,10 @@ class MySpaceIE(InfoExtractor): return self._search_regex( r'''data-%s=([\'"])(?P<data>.*?)\1''' % name, song_data, name, default='', group='data') - stream_url = search_data('stream-url') - if not stream_url: + formats = formats_from_stream_urls( + search_data('stream-url'), search_data('hls-stream-url'), + search_data('http-stream-url')) + if not formats: vevo_id = search_data('vevo-id') youtube_id = search_data('youtube-id') if vevo_id: @@ -121,6 +139,7 @@ class MySpaceIE(InfoExtractor): else: raise ExtractorError( 'Found song but don\'t know how to download it') + self._sort_formats(formats) return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -128,27 +147,16 @@ class MySpaceIE(InfoExtractor): 'uploader_id': search_data('artist-username'), 'thumbnail': self._og_search_thumbnail(webpage), 'duration': int_or_none(search_data('duration')), - 'formats': [rtmp_format_from_stream_url(stream_url)] + 'formats': formats, } else: video = self._parse_json(self._search_regex( r'context = ({.*?});', webpage, 'context'), video_id)['video'] - formats = [] - hls_stream_url = video.get('hlsStreamUrl') - if hls_stream_url: - formats.append({ - 'format_id': 'hls', - 'url': hls_stream_url, - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - stream_url = video.get('streamUrl') - if stream_url: - formats.append(rtmp_format_from_stream_url( - stream_url, - int_or_none(video.get('width')), - int_or_none(video.get('height')))) + formats = formats_from_stream_urls( + video.get('streamUrl'), video.get('hlsStreamUrl'), + video.get('mp4StreamUrl'), int_or_none(video.get('width')), + int_or_none(video.get('height'))) self._sort_formats(formats) return { 'id': video_id, From 2c15db829c1bd8311ed82e2884661271f0cf73ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 08:38:28 +0100 Subject: [PATCH 105/137] [drtv] add support for live and radio sections(closes #1827)(closes #3427) --- youtube_dl/extractor/drtv.py | 74 +++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 88d096b30..e966d7483 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -9,12 +9,13 @@ from ..utils import ( mimetype2ext, parse_iso8601, remove_end, + update_url_query, ) class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' - + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' + IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', 'md5': '25e659cccc9a2ed956110a299fdf5983', @@ -79,9 +80,10 @@ class DRTVIE(InfoExtractor): subtitles = {} for asset in data['Assets']: - if asset.get('Kind') == 'Image': + kind = asset.get('Kind') + if kind == 'Image': thumbnail = asset.get('Uri') - elif asset.get('Kind') == 'VideoResource': + elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' @@ -96,9 +98,13 @@ class DRTVIE(InfoExtractor): preference = -1 format_id += '-spoken-subtitles' if target == 'HDS': - formats.extend(self._extract_f4m_formats( + f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id)) + video_id, preference, f4m_id=format_id) + if kind == 'AudioResource': + for f in f4m_formats: + f['vcodec'] = 'none' + formats.extend(f4m_formats) elif target == 'HLS': formats.extend(self._extract_m3u8_formats( uri, video_id, 'mp4', entry_protocol='m3u8_native', @@ -112,6 +118,7 @@ class DRTVIE(InfoExtractor): 'format_id': format_id, 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), + 'vcodec': 'none' if kind == 'AudioResource' else None, }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): @@ -144,3 +151,58 @@ class DRTVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class DRTVLiveIE(InfoExtractor): + IE_NAME = 'drtv:live' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' + _TEST = { + 'url': 'https://www.dr.dk/tv/live/dr1', + 'info_dict': { + 'id': 'dr1', + 'ext': 'mp4', + 'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_data = self._download_json( + 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, + channel_id) + title = self._live_title(channel_data['Title']) + + formats = [] + for streaming_server in channel_data.get('StreamingServers', []): + server = streaming_server.get('Server') + if not server: + continue + link_type = streaming_server.get('LinkType') + for quality in streaming_server.get('Qualities', []): + for stream in quality.get('Streams', []): + stream_path = stream.get('Stream') + if not stream_path: + continue + stream_url = update_url_query( + '%s/%s' % (server, stream_path), {'b': ''}) + if link_type == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, channel_id, 'mp4', + m3u8_id=link_type, fatal=False, live=True)) + elif link_type == 'HDS': + formats.extend(self._extract_f4m_formats(update_url_query( + '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), + channel_id, f4m_id=link_type, fatal=False)) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': title, + 'thumbnail': channel_data.get('PrimaryImageUri'), + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index eaf3676df..32420937c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -248,7 +248,10 @@ from .dramafever import ( from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE -from .drtv import DRTVIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE From 36fce54816eb1f1d792ac7ed4d07e292d44d62f5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 15:23:46 +0100 Subject: [PATCH 106/137] [turner] fix downloading of secure hls formats using ffmpeg(closes #11358)(closes #11373)(closes #11800) --- youtube_dl/downloader/external.py | 9 +++++++++ youtube_dl/extractor/turner.py | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 138f353ef..41e37261d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -199,6 +199,15 @@ class FFmpegFD(ExternalFD): args = [ffpp.executable, '-y'] + seekable = info_dict.get('_seekable') + if seekable is not None: + # setting -seekable prevents ffmpeg from guessing if the server + # supports seeking(by adding the header `Range: bytes=0-`), which + # can cause problems in some cases + # https://github.com/rg3/youtube-dl/issues/11800#issuecomment-275037127 + # http://trac.ffmpeg.org/ticket/6125#comment:10 + args += ['-seekable', '1' if seekable else '0'] + args += self._configuration_args() # start_time = info_dict.get('start_time') or 0 diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 57ffedb87..1c0be9fc6 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -100,9 +100,13 @@ class TurnerBaseIE(AdobePassIE): formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', - m3u8_id=format_id or 'hls', fatal=False)) + m3u8_id=format_id or 'hls', fatal=False) + if '/secure/' in video_url and '?hdnea=' in video_url: + for f in m3u8_formats: + f['_seekable'] = False + formats.extend(m3u8_formats) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( update_url_query(video_url, {'hdcore': '3.7.0'}), From 643dc0fcfed5e5eb152000190d0c7ba9dd577ef8 Mon Sep 17 00:00:00 2001 From: A Connecticut Princess <bugchecker@dibaby.org> Date: Sat, 4 Feb 2017 13:23:35 +0500 Subject: [PATCH 107/137] [vk] Catch author blocked error message Example link (video in blocked group): https://vk.com/search?c%5Bq%5D=%D0%9F%D1%80%D1%8B%D0%B6%D0%BE%D0%BA%20c%20%D0%BA%D1%80%D0%B0%D0%BD%D0%B0%20%D0%B2%20%D1%81%D1%82%D0%B8%D0%BB%D0%B5%20%D0%A7%D0%B5%D0%BB%D0%BE%D0%B2%D0%B5%D0%BA%D0%B0-%D0%BF%D0%B0%D1%83%D0%BA%D0%B0&c%5Bsection%5D=video&c%5Bsort%5D=2&z=video-10639516_456240611 --- youtube_dl/extractor/vk.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 6e6c3a0e1..7c42a4f54 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -281,6 +281,11 @@ class VKIE(VKBaseIE): { 'url': 'http://new.vk.com/video205387401_165548505', 'only_matching': True, + }, + { + # This video is no longer available, because its author has been blocked. + 'url': 'https://vk.com/video-10639516_456240611', + 'only_matching': True, } ] @@ -328,6 +333,12 @@ class VKIE(VKBaseIE): r'<!>Access denied': 'Access denied to video %s.', + + r'<!>Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', } for error_re, error_msg in ERRORS.items(): From c2521c1ac6bbd24cd5d01ba764f2d084b16c506f Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Sat, 4 Feb 2017 10:23:14 -0500 Subject: [PATCH 108/137] [Piksel] Add another app token regex --- youtube_dl/extractor/piksel.py | 43 ++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py index d44edcdfb..c0c276a50 100644 --- a/youtube_dl/extractor/piksel.py +++ b/youtube_dl/extractor/piksel.py @@ -16,18 +16,33 @@ from ..utils import ( class PikselIE(InfoExtractor): _VALID_URL = r'https?://player\.piksel\.com/v/(?P<id>[a-z0-9]+)' - _TEST = { - 'url': 'http://player.piksel.com/v/nv60p12f', - 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', - 'info_dict': { - 'id': 'nv60p12f', - 'ext': 'mp4', - 'title': 'فن الحياة - الحلقة 1', - 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', - 'timestamp': 1465231790, - 'upload_date': '20160606', + _TESTS = [ + { + 'url': 'http://player.piksel.com/v/nv60p12f', + 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', + 'info_dict': { + 'id': 'nv60p12f', + 'ext': 'mp4', + 'title': 'فن الحياة - الحلقة 1', + 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', + 'timestamp': 1465231790, + 'upload_date': '20160606', + } + }, + { + # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al + 'url': 'https://player.piksel.com/v/v80kqp41', + 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', + 'info_dict': { + 'id': 'v80kqp41', + 'ext': 'mp4', + 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', + 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.', + 'timestamp': 1486171129, + 'upload_date': '20170204', + } } - } + ] @staticmethod def _extract_url(webpage): @@ -40,8 +55,10 @@ class PikselIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - app_token = self._search_regex( - r'clientAPI\s*:\s*"([^"]+)"', webpage, 'app token') + app_token = self._search_regex([ + r'clientAPI\s*:\s*"([^"]+)"', + r'data-de-api-key\s*=\s*"([^"]+)"' + ], webpage, 'app token') response = self._download_json( 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, video_id, query={ From 31487eb9746123b7c4e28be7e48908773beab40c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 22:57:48 +0700 Subject: [PATCH 109/137] release 2017.02.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 1 + docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 8914569b6..11fd56038 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.02.01 +[debug] youtube-dl version 2017.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 947590b94..5323769d8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.02.04 Core + Add --playlist-random to shuffle playlists (#11889, #11901) diff --git a/README.md b/README.md index 2ee00f515..89876bd7a 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo automatically resized from an initial value of SIZE. --playlist-reverse Download playlist videos in reverse order + --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with expected file size (experimental) --hls-prefer-native Use the native HLS downloader instead of diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d900f5e12..50a339bc4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -84,6 +84,7 @@ - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** + - **bangumi.bilibili.com**: BiliBili番剧 - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles @@ -211,7 +212,8 @@ - **DRBonanza** - **Dropbox** - **DrTuber** - - **DRTV** + - **drtv** + - **drtv:live** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -247,6 +249,8 @@ - **fc2:embed** - **Fczenit** - **fernsehkritik.tv** + - **filmon** + - **filmon:channel** - **Firstpost** - **FiveTV** - **Flickr** @@ -703,7 +707,6 @@ - **Spiegeltv** - **Spike** - **Sport5** - - **SportBox** - **SportBoxEmbed** - **SportDeutschland** - **Sportschau** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0f9b6b703..376b31397 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.01' +__version__ = '2017.02.04' From 8e4041cf3f8e769ee2188f3db4747b7133ab5c2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 17:02:12 +0100 Subject: [PATCH 110/137] [radiocanada] fix extraction for toutv rtmp formats --- youtube_dl/extractor/radiocanada.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 321917ad0..3b40002a8 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -54,9 +54,8 @@ class RadioCanadaIE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) device_types = ['ipad'] - if app_code != 'toutv': - device_types.append('flash') if not smuggled_data: + device_types.append('flash') device_types.append('android') formats = [] @@ -103,7 +102,7 @@ class RadioCanadaIE(InfoExtractor): continue f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) protocol = determine_protocol({'url': f_url}) - formats.append({ + f = { 'format_id': '%s-%d' % (protocol, tbr), 'url': f_url, 'ext': 'flv' if protocol == 'rtmp' else ext, @@ -111,7 +110,14 @@ class RadioCanadaIE(InfoExtractor): 'width': int_or_none(url_e.get('width')), 'height': int_or_none(url_e.get('height')), 'tbr': tbr, - }) + } + mobj = re.match(r'(?P<url>rtmp://[^/]+/[^/]+)/(?P<playpath>[^?]+)(?P<auth>\?.+)', f_url) + if mobj: + f.update({ + 'url': mobj.group('url') + mobj.group('auth'), + 'play_path': mobj.group('playpath'), + }) + formats.append(f) if protocol == 'rtsp': base_url = self._search_regex( r'rtsp://([^?]+)', f_url, 'base url', default=None) From 9db8f6c54021a9c809c8ae65a37544ad566ed159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:21:07 +0700 Subject: [PATCH 111/137] [twitch:stream] Improve _VALID_URL (closes #11971) --- youtube_dl/extractor/twitch.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 1ca159a4d..bbba394b0 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -447,7 +447,14 @@ class TwitchHighlightsIE(TwitchVideosBaseIE): class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' - _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?twitch\.tv/| + player\.twitch\.tv/\?.*?\bchannel= + ) + (?P<id>[^/#?]+) + ''' _TESTS = [{ 'url': 'http://www.twitch.tv/shroomztv', @@ -471,8 +478,25 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'http://www.twitch.tv/miracle_doto#profile-0', 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?channel=lotsofs', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False + if any(ie.suitable(url) for ie in ( + TwitchVideoIE, + TwitchChapterIE, + TwitchVodIE, + TwitchProfileIE, + TwitchAllVideosIE, + TwitchUploadsIE, + TwitchPastBroadcastsIE, + TwitchHighlightsIE)) + else super(TwitchStreamIE, cls).suitable(url)) + def _real_extract(self, url): channel_id = self._match_id(url) From 3144eccf551afc4c5e66e06de541c033e6f90681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:22:28 +0700 Subject: [PATCH 112/137] [ChangeLog] Actualize --- ChangeLog | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 5323769d8..fe9cd3440 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,12 +1,38 @@ +version <unreleased> + +Extractors ++ [twitch:stream] Add support for player.twitch.tv (#11971) + + version 2017.02.04 Core + Add --playlist-random to shuffle playlists (#11889, #11901) +* [utils] Improve comments processing in js_to_json (#11947) +* [utils] Handle single-line comments in js_to_json +* [downloader/external:ffmpeg] Minimize the use of aac_adtstoasc filter Extractors ++ [piksel] Add another app token pattern (#11969) ++ [vk] Capture and output author blocked error message (#11965) ++ [turner] Fix secure HLS formats downloading with ffmpeg (#11358, #11373, + #11800) ++ [drtv] Add support for live and radio sections (#1827, #3427) +* [myspace] Fix extraction and extract HLS and HTTP formats ++ [youtube] Add format info for itag 325 and 328 +* [vine] Fix extraction (#11955) +- [sportbox] Remove extractor (#11954) ++ [filmon] Add support for filmon.com (#11187) + [infoq] Add audio only formats (#11565) +* [douyutv] Improve room id regular expression (#11931) +* [iprima] Fix extraction (#11920, #11896) * [youtube] Fix ytsearch when cookies are provided (#11924) +* [go] Relax video id regular expression (#11937) +* [facebook] Fix title extraction (#11941) ++ [youtube:playlist] Recognize TL playlists (#11945) + [bilibili] Support new Bangumi URLs (#11845) ++ [cbc:watch] Extract audio codec for audio only formats (#11893) ++ [elpais] Fix extraction for some URLs (#11765) version 2017.02.01 @@ -18,7 +44,6 @@ Extractors + [vimeo] Extract upload timestamp + [vimeo] Extract license (#8726, #11880) + [nrk:series] Add support for series (#11571, #11711) -+ [elpais] Fix extraction for some URLs (#11765) version 2017.01.31 From 7bccd5fc8ac35b1a3952522c0aa176c982f20206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:23:38 +0700 Subject: [PATCH 113/137] [ChangeLog] Actualize --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index fe9cd3440..76be8dbd9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,6 +2,7 @@ version <unreleased> Extractors + [twitch:stream] Add support for player.twitch.tv (#11971) +* [radiocanada] Fix extraction for toutv rtmp formats version 2017.02.04 From a713a86755ba864a7b765fd2ce9a5ac8a8f4cc63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:26:39 +0700 Subject: [PATCH 114/137] release 2017.02.04.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 11fd56038..15e7d4944 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.02.04 +[debug] youtube-dl version 2017.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 76be8dbd9..23a729559 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.02.04.1 Extractors + [twitch:stream] Add support for player.twitch.tv (#11971) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 376b31397..5dde47a26 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.04' +__version__ = '2017.02.04.1' From 3d2c2752c5cd70fc7f9cebe8c4683a1de626017d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 18:18:03 +0100 Subject: [PATCH 115/137] [afreecatv] extract rtmp formats --- youtube_dl/extractor/afreecatv.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 4f6cdb8a2..e0a0f7c57 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -221,10 +221,23 @@ class AfreecaTVGlobalIE(AfreecaTVIE): s_url = s.get('purl') if not s_url: continue - # TODO: extract rtmp formats - if s.get('stype') == 'HLS': + stype = s.get('stype') + if stype == 'HLS': formats.extend(self._extract_m3u8_formats( - s_url, channel_id, 'mp4', fatal=False)) + s_url, channel_id, 'mp4', m3u8_id=stype, fatal=False)) + elif stype == 'RTMP': + format_id = [stype] + label = s.get('label') + if label: + format_id.append(label) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': s_url, + 'tbr': int_or_none(s.get('bps')), + 'height': int_or_none(s.get('brt')), + 'ext': 'flv', + 'rtmp_live': True, + }) self._sort_formats(formats) info.update({ From 49bd8d5e2e5c4de8c1c409adffc557cb198f7eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 5 Feb 2017 02:41:22 +0700 Subject: [PATCH 116/137] [travis] Add python 3.6 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index c74c9cc12..4833c76e9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" sudo: false script: nosetests test --verbose notifications: From 6fd138bed892ac8ae1714d64f4a53d8ea7a1d5bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 5 Feb 2017 13:36:52 +0700 Subject: [PATCH 117/137] [sportbox] PEP 8 --- youtube_dl/extractor/sportbox.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 05a0b5a80..e7bd5bf91 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,11 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - js_to_json, - unified_strdate, -) +from ..utils import js_to_json class SportBoxEmbedIE(InfoExtractor): From 6ef3e65a7b244d5e432e764772177c7d48cab237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 5 Feb 2017 13:37:27 +0700 Subject: [PATCH 118/137] [videopress] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 22 +++++++ youtube_dl/extractor/videopress.py | 99 ++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+) create mode 100644 youtube_dl/extractor/videopress.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 32420937c..cf608faee 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1095,6 +1095,7 @@ from .videomore import ( VideomoreSeasonIE, ) from .videopremium import VideoPremiumIE +from .videopress import VideoPressIE from .vidio import VidioIE from .vidme import ( VidmeIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a23486620..4156cf27d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -81,6 +81,7 @@ from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE from .openload import OpenloadIE +from .videopress import VideoPressIE class GenericIE(InfoExtractor): @@ -1473,6 +1474,21 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [TwentyMinutenIE.ie_key()], + }, + { + # VideoPress embed + 'url': 'https://en.support.wordpress.com/videopress/', + 'info_dict': { + 'id': 'OcobLTqC', + 'ext': 'm4v', + 'title': 'IMG_5786', + 'timestamp': 1435711927, + 'upload_date': '20150701', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [VideoPressIE.ie_key()], } # { # # TODO: find another test @@ -2438,6 +2454,12 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( openload_urls, ie=OpenloadIE.ie_key()) + # Look for VideoPress embeds + videopress_urls = VideoPressIE._extract_urls(webpage) + if videopress_urls: + return _playlist_from_matches( + videopress_urls, ie=VideoPressIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py new file mode 100644 index 000000000..049db25a5 --- /dev/null +++ b/youtube_dl/extractor/videopress.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + parse_age_limit, + qualities, + try_get, + unified_timestamp, + urljoin, +) + + +class VideoPressIE(InfoExtractor): + _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'https://videopress.com/embed/kUJmAcSf', + 'md5': '706956a6c875873d51010921310e4bc6', + 'info_dict': { + 'id': 'kUJmAcSf', + 'ext': 'mp4', + 'title': 'VideoPress Demo', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 634.6, + 'timestamp': 1434983935, + 'upload_date': '20150622', + 'age_limit': 0, + }, + }, { + # 17+, requires birth_* params + 'url': 'https://videopress.com/embed/iH3gstfZ', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, + video_id, query={ + 'birth_month': random.randint(1, 12), + 'birth_day': random.randint(1, 31), + 'birth_year': random.randint(1950, 1995), + }) + + title = video['title'] + + def base_url(scheme): + return try_get( + video, lambda x: x['file_url_base'][scheme], compat_str) + + base_url = base_url('https') or base_url('http') + + QUALITIES = ('std', 'dvd', 'hd') + quality = qualities(QUALITIES) + + formats = [] + for format_id, f in video['files'].items(): + if not isinstance(f, dict): + continue + for ext, path in f.items(): + if ext in ('mp4', 'ogg'): + formats.append({ + 'url': urljoin(base_url, path), + 'format_id': '%s-%s' % (format_id, ext), + 'ext': determine_ext(path, ext), + 'quality': quality(format_id), + }) + original_url = try_get(video, lambda x: x['original'], compat_str) + if original_url: + formats.append({ + 'url': original_url, + 'format_id': 'original', + 'quality': len(QUALITIES), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': float_or_none(video.get('duration'), 1000), + 'timestamp': unified_timestamp(video.get('upload_date')), + 'age_limit': parse_age_limit(video.get('rating')), + 'formats': formats, + } From e4e50f60b1040a4b6aa8ecb9139f7d5de195f407 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 5 Feb 2017 21:41:08 +0800 Subject: [PATCH 119/137] [googledrive] Fix extraction on Python 3.6 Since Python 3.6, invalid escape sequences are deprecated. It's likely that there are invalid escape sequences somewhere on the webpage, so instead of unescaping the whole webpage, just unescape the URL. See https://bugs.python.org/issue27364. That change was designed for string literals, while it affects the 'unicode_escape' encoding as well. The code path is: str.decode('unicode_escape') codecs.unicode_escape_decode() PyUnicode_DecodeUnicodeEscape() --- ChangeLog | 6 ++++++ youtube_dl/extractor/googledrive.py | 9 +++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 23a729559..a0025ab91 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [googledrive] Fix extraction on Python 3.6 + + version 2017.02.04.1 Extractors diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 766fc26d0..fec36cbbb 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + lowercase_escape, ) @@ -13,12 +14,12 @@ class GoogleDriveIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': '881f7700aec4f538571fa1e0eed4a7b6', + 'md5': 'd109872761f7e7ecf353fa108c0dbe1e', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', - 'duration': 46, + 'duration': 45, } }, { # video id is longer than 28 characters @@ -55,7 +56,7 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + 'http://docs.google.com/file/d/%s' % video_id, video_id) reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) if reason: @@ -74,7 +75,7 @@ class GoogleDriveIE(InfoExtractor): resolution = fmt.split('/')[1] width, height = resolution.split('x') formats.append({ - 'url': fmt_url, + 'url': lowercase_escape(fmt_url), 'format_id': fmt_id, 'resolution': resolution, 'width': int_or_none(width), From caf0f5f8b7d0854caaf6778fe3a646ee0d7668fe Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 5 Feb 2017 21:48:13 +0800 Subject: [PATCH 120/137] [iwara] Fix extraction (closes #11781) --- ChangeLog | 1 + youtube_dl/extractor/iwara.py | 41 +++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/ChangeLog b/ChangeLog index a0025ab91..77286dbef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [iwara] Fix extraction (#11781) * [googledrive] Fix extraction on Python 3.6 diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 8d7e7f472..011274b02 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -3,14 +3,18 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse -from ..utils import remove_end +from ..utils import ( + int_or_none, + mimetype2ext, + remove_end, +) class IwaraIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', - 'md5': '1d53866b2c514b23ed69e4352fdc9839', + # md5 is unstable 'info_dict': { 'id': 'amVwUl1EHpAD9RD', 'ext': 'mp4', @@ -23,17 +27,17 @@ class IwaraIE(InfoExtractor): 'info_dict': { 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', 'ext': 'mp4', - 'title': '[3D Hentai] Kyonyu Ã\x97 Genkai Ã\x97 Emaki Shinobi Girls.mp4', + 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4', 'age_limit': 18, }, 'add_ie': ['GoogleDrive'], }, { 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', - 'md5': '1d85f1e5217d2791626cff5ec83bb189', + # md5 is unstable 'info_dict': { 'id': '6liAP9s2Ojc', 'ext': 'mp4', - 'age_limit': 0, + 'age_limit': 18, 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', 'upload_date': '20160910', @@ -52,9 +56,9 @@ class IwaraIE(InfoExtractor): # ecchi is 'sexy' in Japanese age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 - entries = self._parse_html5_media_entries(url, webpage, video_id) + video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id) - if not entries: + if not video_data: iframe_url = self._html_search_regex( r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, 'iframe URL', group='url') @@ -67,11 +71,24 @@ class IwaraIE(InfoExtractor): title = remove_end(self._html_search_regex( r'<title>([^<]+)', webpage, 'title'), ' | Iwara') - info_dict = entries[0] - info_dict.update({ + formats = [] + for a_format in video_data: + format_id = a_format.get('resolution') + height = int_or_none(self._search_regex( + r'(\d+)p', format_id, 'height', default=None)) + formats.append({ + 'url': a_format['uri'], + 'format_id': format_id, + 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', + 'height': height, + 'quality': 1 if format_id == 'Source' else 0, + }) + + self._sort_formats(formats) + + return { 'id': video_id, 'title': title, 'age_limit': age_limit, - }) - - return info_dict + 'formats': formats, + } From 2ab2c0d1f53f66614eda4fefb042e851e78097f0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 5 Feb 2017 22:30:13 +0800 Subject: [PATCH 121/137] [iwara] Add width (closes #11724) The heuristic is from #11724 --- youtube_dl/extractor/iwara.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 011274b02..a7514fc80 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -81,6 +81,7 @@ class IwaraIE(InfoExtractor): 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, + 'width': int_or_none(height / 9.0 * 16.0 if height else None), 'quality': 1 if format_id == 'Source' else 0, }) From 019f4c03717bfd2b887309e5a4c96ea82cbedf34 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 5 Feb 2017 22:47:04 +0800 Subject: [PATCH 122/137] [bandcamp] Fix extraction for incomplete albums Closes #11727 --- ChangeLog | 1 + youtube_dl/extractor/bandcamp.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 77286dbef..984191925 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [bandcamp] Fix extraction for incomplete albums (#11727) * [iwara] Fix extraction (#11781) * [googledrive] Fix extraction on Python 3.6 diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 88c590e98..056e06376 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor): 'id': 'entropy-ep', }, 'playlist_mincount': 3, + }, { + # not all tracks have songs + 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague', + 'info_dict': { + 'id': 'we-are-the-plague', + 'title': 'WE ARE THE PLAGUE', + 'uploader_id': 'insulters', + }, + 'playlist_count': 2, }] def _real_extract(self, url): @@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor): album_id = mobj.group('album_id') playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - tracks_paths = re.findall(r'