From a0758dfa1afd5b04773ba3b3b17ac71d22054821 Mon Sep 17 00:00:00 2001 From: felix Date: Wed, 5 Aug 2015 22:40:46 +0200 Subject: [PATCH 01/77] [filmon] new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/filmon.py | 144 +++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100644 youtube_dl/extractor/filmon.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 578359a5e..c9b9ebd23 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,6 +287,7 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE +from .filmon import FilmOnIE, FilmOnVODIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE diff --git a/youtube_dl/extractor/filmon.py b/youtube_dl/extractor/filmon.py new file mode 100644 index 000000000..987792fec --- /dev/null +++ b/youtube_dl/extractor/filmon.py @@ -0,0 +1,144 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import qualities +from ..compat import compat_urllib_request + + +_QUALITY = qualities(('low', 'high')) + + +class FilmOnIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P[a-z0-9-]+)' + _TESTS = [{ + 'url': 'https://www.filmon.com/channel/filmon-sports', + 'only_matching': True, + }, { + 'url': 'https://www.filmon.com/tv/2894', + 'only_matching': True, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + request = compat_urllib_request.Request('https://www.filmon.com/channel/%s' % (channel_id)) + request.add_header('X-Requested-With', 'XMLHttpRequest') + channel_info = self._download_json(request, channel_id) + now_playing = channel_info['now_playing'] + + thumbnails = [] + for thumb in now_playing.get('images', ()): + if thumb['type'] != '2': + continue + thumbnails.append({ + 'url': thumb['url'], + 'width': int(thumb['width']), + 'height': int(thumb['height']), + }) + + formats = [] + + for stream in channel_info['streams']: + formats.append({ + 'format_id': str(stream['id']), + # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats + # because 0) it doesn't have bitrate variants anyway, and 1) the ids generated + # by that method are highly unstable (because the bitrate is variable) + 'url': stream['url'], + 'resolution': stream['name'], + 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), + 'ext': 'mp4', + 'quality': _QUALITY(stream['quality']), + 'preference': int(stream['watch-timeout']), + }) + self._sort_formats(formats) + + return { + 'id': str(channel_info['id']), + 'display_id': channel_info['alias'], + 'formats': formats, + # XXX: use the channel description (channel_info['description'])? + 'uploader_id': channel_info['alias'], + 'uploader': channel_info['title'], # XXX: kinda stretching it... + 'title': now_playing.get('programme_name') or channel_info['title'], + 'description': now_playing.get('programme_description'), + 'thumbnails': thumbnails, + 'is_live': True, + } + + +class FilmOnVODIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?filmon\.com/vod/view/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.filmon.com/vod/view/24869-0-plan-9-from-outer-space', + 'info_dict': { + 'id': '24869', + 'ext': 'mp4', + 'title': 'Plan 9 From Outer Space', + 'description': 'Dead human, zombies and vampires', + }, + }, { + 'url': 'https://www.filmon.com/vod/view/2825-1-popeye-series-1', + 'info_dict': { + 'id': '2825', + 'title': 'Popeye Series 1', + }, + 'playlist_count': 8, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + result = self._download_json('https://www.filmon.com/api/vod/movie?id=%s' % (video_id), video_id) + if result['code'] != 200: + raise ExtractorError('FilmOn said: %s' % (result['reason']), expected=True) + + response = result['response'] + + if response.get('episodes'): + return { + '_type': 'playlist', + 'id': video_id, + 'title': response['title'], + 'entries': [{ + '_type': 'url', + 'url': 'https://www.filmon.com/vod/view/%s' % (ep), + } for ep in response['episodes']] + } + + formats = [] + for (id, stream) in response['streams'].items(): + formats.append({ + 'format_id': id, + 'url': stream['url'], + 'resolution': stream['name'], + 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), + 'ext': 'mp4', + 'quality': _QUALITY(stream['quality']), + 'preference': int(stream['watch-timeout']), + }) + self._sort_formats(formats) + + poster = response['poster'] + thumbnails = [{ + 'id': 'poster', + 'url': poster['url'], + 'width': poster['width'], + 'height': poster['height'], + }] + for (id, thumb) in poster['thumbs'].items(): + thumbnails.append({ + 'id': id, + 'url': thumb['url'], + 'width': thumb['width'], + 'height': thumb['height'], + }) + + return { + 'id': video_id, + 'title': response['title'], + 'formats': formats, + 'description': response['description'], + 'thumbnails': thumbnails, + } From 75822ca7909d7f7e15694f73b05b2bf0f1fa61f3 Mon Sep 17 00:00:00 2001 From: Thomas Christlieb Date: Tue, 31 Jan 2017 10:03:31 +0100 Subject: [PATCH 02/77] New parameter --playlist-random to randomize playlist download order. Fixes #11889 --- youtube_dl/YoutubeDL.py | 5 +++++ youtube_dl/__init__.py | 1 + youtube_dl/options.py | 4 ++++ 3 files changed, 10 insertions(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c71e94518..a7bf5a1b0 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -24,6 +24,7 @@ import sys import time import tokenize import traceback +import random from .compat import ( compat_basestring, @@ -159,6 +160,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. playlistreverse: Download playlist items in reverse order. + playlistrandom: Download playlist items in random order. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. @@ -842,6 +844,9 @@ class YoutubeDL(object): if self.params.get('playlistreverse', False): entries = entries[::-1] + if self.params.get('playlistrandom', False): + random.shuffle(entries) + for i, entry in enumerate(entries, 1): self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) extra = { diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 2b156342a..5c5b8094b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -344,6 +344,7 @@ def _real_main(argv=None): 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, + 'playlistrandom': opts.playlist_random, 'noplaylist': opts.noplaylist, 'logtostderr': opts.outtmpl == '-', 'consoletitle': opts.consoletitle, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 3abf621c0..349f44778 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -470,6 +470,10 @@ def parseOpts(overrideArguments=None): '--playlist-reverse', action='store_true', help='Download playlist videos in reverse order') + downloader.add_option( + '--playlist-random', + action='store_true', + help='Download playlist videos in random order') downloader.add_option( '--xattr-set-filesize', dest='xattr_set_filesize', action='store_true', From ae9a173b6421a3fdf70dd50d2dc0386f8861fe71 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 31 Jan 2017 14:47:56 +0100 Subject: [PATCH 03/77] [vimeo] extract both mixed and separated dash formats --- youtube_dl/extractor/vimeo.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c12eeadd4..8b6a5cc3c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -142,10 +142,19 @@ class VimeoBaseInfoExtractor(InfoExtractor): note='Downloading %s m3u8 information' % cdn_name, fatal=False)) elif files_type == 'dash': - formats.extend(self._extract_mpd_formats( - manifest_url.replace('/master.json', '/master.mpd'), video_id, format_id, - 'Downloading %s MPD information' % cdn_name, - fatal=False)) + mpd_pattern = r'/%s/(?:sep/)?video/' % video_id + mpd_manifest_urls = [] + if re.search(mpd_pattern, manifest_url): + for suffix, repl in (('', 'video'), ('_sep', 'sep/video')): + mpd_manifest_urls.append((format_id + suffix, re.sub( + mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url))) + else: + mpd_manifest_urls = [(format_id, manifest_url)] + for f_id, m_url in mpd_manifest_urls: + formats.extend(self._extract_mpd_formats( + m_url.replace('/master.json', '/master.mpd'), video_id, f_id, + 'Downloading %s MPD information' % cdn_name, + fatal=False)) subtitles = {} text_tracks = config['request'].get('text_tracks') From 3c90cc8b6fc069930264b41f5505dc34c1077442 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Jan 2017 22:19:29 +0700 Subject: [PATCH 04/77] [youtube] Fix extraction for domainless player URLs Closes #11890 Closes #11891 Closes #11892 Closes #11894 Closes #11895 Closes #11897 Closes #11900 Closes #11903 Closes #11904 Closes #11906 Closes #11907 Closes #11909 Closes #11913 Closes #11914 Closes #11915 Closes #11916 Closes #11917 Closes #11918 Closes #11919 --- youtube_dl/extractor/youtube.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 630586796..ea398bcc8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1028,8 +1028,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, - 'Initial JS player signature function name') + (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) @@ -1050,6 +1051,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: From 3a528ffd8944417c99b139da18d0dff907ade517 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Jan 2017 22:21:54 +0700 Subject: [PATCH 05/77] [ChangeLog] Actualize --- ChangeLog | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ChangeLog b/ChangeLog index cd7017f6d..e331acacc 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +version + +Core ++ [compat] Add compat_etree_register_namespace + +Extractors +* [youtube] Fix extraction for domainless player URLs (#11890, #11891, #11892, + #11894, #11895, #11897, #11900, #11903, #11904, #11906, #11907, #11909, + #11913, #11914, #11915, #11916, #11917, #11918, #11919) ++ [vimeo] Extract both mixed and separated DASH formats ++ [ruutu] Extract DASH formats +* [itv] Fix extraction for python 2.6 + + version 2017.01.29 Core From d7e215b42dcaf71298a7e1dc953cf93523b3da81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 31 Jan 2017 22:24:45 +0700 Subject: [PATCH 06/77] release 2017.01.31 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 10c982fd0..180013f72 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.29*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.29** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.31** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.29 +[debug] youtube-dl version 2017.01.31 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e331acacc..d5ab0e0a7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.01.31 Core + [compat] Add compat_etree_register_namespace diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a37a65db9..fee0ac7c5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.29' +__version__ = '2017.01.31' From 8fd65faece98139def3a6538e98053bebd400263 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Odd=20Str=C3=A5b=C3=B8?= Date: Sat, 14 Jan 2017 02:36:04 +0100 Subject: [PATCH 07/77] [NRKTV] Added NRKTVSeriesIE [NRKTV] Added season and episode number to metadata. [NRKTV] Added category to metadata. [NRKTV] Added tests to NRKTVSeries. [NRKTV] Fixed whitespace issues (flake8). --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/nrk.py | 49 ++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2590b5e1b..06e6d4620 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -668,6 +668,7 @@ from .nrk import ( NRKTVIE, NRKTVDirekteIE, NRKTVEpisodesIE, + NRKTVSeriesIE, ) from .ntvde import NTVDeIE from .ntvru import NTVRuIE diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index ea7be005a..26604f84f 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -128,6 +128,18 @@ class NRKBaseIE(InfoExtractor): series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + season_number = None + episode_number = None + if data.get('mediaElementType') == 'Episode': + _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ + data.get('relativeOriginUrl', '') + EPISODENUM_RE = [ + r'/s(?P\d+)e(?P\d+)\.', + r'/sesong-(?P\d+)/episode-(?P\d+)', + ] + season_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='season')) + episode_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='episode')) + thumbnails = None images = data.get('images') if images and isinstance(images, dict): @@ -140,11 +152,15 @@ class NRKBaseIE(InfoExtractor): } for image in web_images if image.get('imageUrl')] description = data.get('description') + category = data.get('mediaAnalytics', {}).get('category') common_info = { 'description': description, 'series': series, 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'categories': [category] if category else None, 'age_limit': parse_age_limit(data.get('legalAge')), 'thumbnails': thumbnails, } @@ -360,6 +376,39 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): r'

([^<]+)

', webpage, 'title', fatal=False) +class NRKTVSeriesIE(InfoExtractor): + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+)/?' + _ITEM_RE = r'data-season=["\'](?P\d+)["\']' + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', + 'playlist_count': 1, + }, { + 'url': 'https://tv.nrk.no/serie/saving-the-human-race', + 'playlist_count': 1, + }, { + 'url': 'https://tv.nrk.no/serie/postmann-pat', + 'playlist_count': 3, + }, { + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'playlist_count': 9, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + + webpage = self._download_webpage(url, series_id) + + entries = [ + self.url_result('https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, + season=season_id + )) + for season_id in re.findall(self._ITEM_RE, webpage) + ] + + return self.playlist_result(entries) + + class NRKSkoleIE(InfoExtractor): IE_DESC = 'NRK Skole' _VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P\d+)' From 7c5329e6f4152b48c5476b1b9b8ab931caa10331 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 00:29:29 +0700 Subject: [PATCH 08/77] [nrk] Improve extraction and update tests (closes #11571) --- youtube_dl/extractor/nrk.py | 145 +++++++++++++++++++++++++++--------- 1 file changed, 111 insertions(+), 34 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 26604f84f..fc3c0cd3c 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -134,11 +134,15 @@ class NRKBaseIE(InfoExtractor): _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ data.get('relativeOriginUrl', '') EPISODENUM_RE = [ - r'/s(?P\d+)e(?P\d+)\.', - r'/sesong-(?P\d+)/episode-(?P\d+)', + r'/s(?P\d{,2})e(?P\d{,2})\.', + r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', ] - season_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='season')) - episode_number = int_or_none(self._search_regex(EPISODENUM_RE, _season_episode, "S##E##", fatal=False, group='episode')) + season_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'season number', + default=None, group='season')) + episode_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'episode number', + default=None, group='episode')) thumbnails = None images = data.get('images') @@ -243,54 +247,102 @@ class NRKTVIE(NRKBaseIE): 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, + 'series': '20 spørsmål - TV', + 'episode': '23.05.2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'md5': '43d0be26663d380603a9cf0c24366531', 'info_dict': { 'id': 'MDFP15000514CA', 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, }, }, { # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { 'id': 'MSPO40010515-part2', 'ext': 'flv', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', }, - 'skip': 'Only works from Norway', + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'playlist': [{ - 'md5': '9480285eff92d64f06e02a5367970a7a', 'info_dict': { - 'id': 'MSPO40010515-part1', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, }, }, { - 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, }, }], 'info_dict': { 'id': 'MSPO40010515', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'duration': 6947.52, + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Only works from Norway', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, @@ -377,36 +429,61 @@ class NRKTVEpisodesIE(NRKPlaylistBaseIE): class NRKTVSeriesIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+)/?' - _ITEM_RE = r'data-season=["\'](?P\d+)["\']' + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' + _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 9, + }, { + 'url': 'http://tv.nrksuper.no/serie/labyrint', + 'info_dict': { + 'id': 'labyrint', + 'title': 'Labyrint', + 'description': 'md5:58afd450974c89e27d5a19212eee7115', + }, + 'playlist_mincount': 3, + }, { 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene', - 'playlist_count': 1, + 'only_matching': True, }, { 'url': 'https://tv.nrk.no/serie/saving-the-human-race', - 'playlist_count': 1, + 'only_matching': True, }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', - 'playlist_count': 3, - }, { - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'playlist_count': 9, + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url) + def _real_extract(self, url): series_id = self._match_id(url) webpage = self._download_webpage(url, series_id) entries = [ - self.url_result('https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, - season=season_id - )) + self.url_result( + 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( + series=series_id, season=season_id)) for season_id in re.findall(self._ITEM_RE, webpage) ] - return self.playlist_result(entries) + title = self._html_search_meta( + 'seriestitle', webpage, + 'title', default=None) or self._og_search_title( + webpage, fatal=False) + + description = self._html_search_meta( + 'series_description', webpage, + 'description', default=None) or self._og_search_description(webpage) + + return self.playlist_result(entries, series_id, title, description) class NRKSkoleIE(InfoExtractor): From 363245ad94dfdf0c34b4c2c801e7cf6cea74f39c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 00:30:19 +0700 Subject: [PATCH 09/77] Credit @oddstr13 for nrk:series (#11571) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 3ef2800c9..f2875d504 100644 --- a/AUTHORS +++ b/AUTHORS @@ -200,3 +200,4 @@ Paul Hartmann Stephen Chen Fabian Stahl Bagira +Odd Stråbø From c38a67bcd5df639b9d7e7faa8685e76446803527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 00:49:28 +0700 Subject: [PATCH 10/77] [vimeo] Extract license (closes #11880) --- youtube_dl/extractor/vimeo.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8b6a5cc3c..32179e915 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -218,6 +218,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user7108434', 'uploader': 'Filippo Valsorda', 'duration': 10, + 'license': 'by-sa', }, }, { @@ -486,6 +487,8 @@ class VimeoIE(VimeoBaseInfoExtractor): '%s said: %s' % (self.IE_NAME, seed_status['title']), expected=True) + cc_license = None + # Extract the config JSON try: try: @@ -499,8 +502,9 @@ class VimeoIE(VimeoBaseInfoExtractor): vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json( - vimeo_clip_page_config, video_id)['player']['config_url'] + page_config = self._parse_json(vimeo_clip_page_config, video_id) + config_url = page_config['player']['config_url'] + cc_license = page_config.get('cc_license') config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: @@ -609,6 +613,12 @@ class VimeoIE(VimeoBaseInfoExtractor): info_dict = self._parse_config(config, video_id) formats.extend(info_dict['formats']) self._vimeo_sort_formats(formats) + + if not cc_license: + cc_license = self._search_regex( + r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'license', default=None, group='license') + info_dict.update({ 'id': video_id, 'formats': formats, @@ -618,6 +628,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'view_count': view_count, 'like_count': like_count, 'comment_count': comment_count, + 'license': cc_license, }) return info_dict From c15cd296404e164b72fd7f2666d5875f35057d93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 00:58:02 +0700 Subject: [PATCH 11/77] [vimeo] Extract upload timestamp --- youtube_dl/extractor/vimeo.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 32179e915..8ba222224 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -21,7 +21,9 @@ from ..utils import ( sanitized_Request, smuggle_url, std_headers, + try_get, unified_strdate, + unified_timestamp, unsmuggle_url, urlencode_postdata, unescapeHTML, @@ -213,6 +215,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'timestamp': 1355990239, 'upload_date': '20121220', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', 'uploader_id': 'user7108434', @@ -259,6 +262,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '68375962', 'ext': 'mp4', 'title': 'youtube-dl password protected test video', + 'timestamp': 1371200155, 'upload_date': '20130614', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user18948128', 'uploader_id': 'user18948128', @@ -281,7 +285,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', - 'upload_date': '20130927', + 'timestamp': 1380339469, + 'upload_date': '20130928', 'duration': 187, }, }, @@ -293,6 +298,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'ext': 'mp4', 'title': 'The New Vimeo Player (You Know, For Videos)', 'description': 'md5:2ec900bf97c3f389378a96aee11260ea', + 'timestamp': 1381846109, 'upload_date': '20131015', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/staff', 'uploader_id': 'staff', @@ -324,6 +330,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'The DMCI', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/dmci', 'uploader_id': 'dmci', + 'timestamp': 1324343742, 'upload_date': '20111220', 'description': 'md5:ae23671e82d05415868f7ad1aec21147', }, @@ -339,6 +346,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader': 'Casey Donahue', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', 'uploader_id': 'caseydonahue', + 'timestamp': 1250886430, 'upload_date': '20090821', 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', }, @@ -488,6 +496,7 @@ class VimeoIE(VimeoBaseInfoExtractor): expected=True) cc_license = None + timestamp = None # Extract the config JSON try: @@ -505,6 +514,9 @@ class VimeoIE(VimeoBaseInfoExtractor): page_config = self._parse_json(vimeo_clip_page_config, video_id) config_url = page_config['player']['config_url'] cc_license = page_config.get('cc_license') + timestamp = try_get( + page_config, lambda x: x['clip']['uploaded_on'], + compat_str) config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: @@ -573,10 +585,10 @@ class VimeoIE(VimeoBaseInfoExtractor): self._downloader.report_warning('Cannot find video description') # Extract upload date - video_upload_date = None - mobj = re.search(r']+datetime="([^"]+)"', webpage) - if mobj is not None: - video_upload_date = unified_strdate(mobj.group(1)) + if not timestamp: + timestamp = self._search_regex( + r']+datetime="([^"]+)"', webpage, + 'timestamp', default=None) try: view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count')) @@ -622,7 +634,7 @@ class VimeoIE(VimeoBaseInfoExtractor): info_dict.update({ 'id': video_id, 'formats': formats, - 'upload_date': video_upload_date, + 'timestamp': unified_timestamp(timestamp), 'description': video_description, 'webpage_url': url, 'view_count': view_count, From 26c0f09935d51cc8837230ad48db08acd3744dd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 02:15:52 +0700 Subject: [PATCH 12/77] [vimeo] PEP 8 --- youtube_dl/extractor/vimeo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8ba222224..61cc469bf 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -22,13 +22,11 @@ from ..utils import ( smuggle_url, std_headers, try_get, - unified_strdate, unified_timestamp, unsmuggle_url, urlencode_postdata, unescapeHTML, parse_filesize, - try_get, ) From 2b2d5d319b563a12e26c55966a047fa5bb039cd0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Feb 2017 16:39:32 +0800 Subject: [PATCH 13/77] [crunchyroll] Remove ScaledBorderAndShadow settings See https://github.com/rg3/youtube-dl/pull/9028, especially @lachs0r's comments for the reason behind this change --- youtube_dl/extractor/crunchyroll.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f811c7f33..109d1c5a8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -255,8 +255,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ScaledBorderAndShadow: no - + output += """ [V4+ Styles] Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding """ From 7882f1115e8eca2d2c958e2dbb6be45450e4027c Mon Sep 17 00:00:00 2001 From: Thomas Christlieb Date: Wed, 1 Feb 2017 16:00:41 +0100 Subject: [PATCH 14/77] Added new Regex for prosiebensat1 Extractor Description. Fixes #11810 (#11929) * Added new Regex for prosiebensat1 Extractor Description. Fixes #11810 * Using _og_search_description() as a Fallback for Description-Regex * Using _og_search_description() as a Fallback for Description-Regex - Second try * Also added fallback regex * Using _og_search_description() as a Fallback for Description-Regex - Third try * removed fatal=False from search for description regex. default=None should be preferred only * Using fatal=false for _og_search_description * Revert "Using fatal=false for _og_search_description" This reverts commit 2b7e123f9d0f2bd6ada54fa8e4e6035fece5dbf4. * Deleted default=None Parameter for _og_search_property --- youtube_dl/extractor/prosiebensat1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 03e1b1f7f..6856bacaf 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -375,7 +375,9 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') info = self._extract_video_info(url, clip_id) description = self._html_search_regex( - self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + self._DESCRIPTION_REGEXES, webpage, 'description', default=None) + if description is None: + description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) From fe5aa197b58be1bbf88a152be0e84f24f1711bd7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Feb 2017 23:13:45 +0800 Subject: [PATCH 15/77] [prosiebensat1] PEP8 and update _TESTS --- youtube_dl/extractor/prosiebensat1.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 6856bacaf..5091d8456 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -147,16 +147,12 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge', 'info_dict': { 'id': '2104602', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Episode 18 - Staffel 2', 'description': 'md5:8733c81b702ea472e069bc48bb658fc1', 'upload_date': '20131231', 'duration': 5845.04, }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://www.prosieben.de/videokatalog/Gesellschaft/Leben/Trends/video-Lady-Umstyling-f%C3%BCr-Audrina-Rebekka-Audrina-Fergen-billig-aussehen-Battal-Modica-700544.html', @@ -258,7 +254,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', 'info_dict': { 'id': '2572814', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Andreas Kümmert: Rocket Man', 'description': 'md5:6ddb02b0781c6adf778afea606652e38', 'upload_date': '20131017', @@ -272,7 +268,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html', 'info_dict': { 'id': '2156342', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Kurztrips zum Valentinstag', 'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.', 'duration': 307.24, @@ -289,12 +285,13 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): 'description': 'md5:63b8963e71f481782aeea877658dec84', }, 'playlist_count': 2, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge', 'info_dict': { 'id': '4187506', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Best of Circus HalliGalli', 'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9', 'upload_date': '20151229', @@ -376,7 +373,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): info = self._extract_video_info(url, clip_id) description = self._html_search_regex( self._DESCRIPTION_REGEXES, webpage, 'description', default=None) - if description is None: + if description is None: description = self._og_search_description(webpage) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( From 000f207944e277e63dbec5a60007c30e3187d3fd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 1 Feb 2017 23:16:35 +0800 Subject: [PATCH 16/77] [prosiebensat1] Update ChangeLog --- ChangeLog | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ChangeLog b/ChangeLog index d5ab0e0a7..da5b75b47 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +version + +Extractors +* [prosiebensat1] Fix extraction of descriptions (#11810, #11929) + version 2017.01.31 Core From b83ef507b457e6ea8c52265ea42b6c5d2c500a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 23:15:38 +0700 Subject: [PATCH 17/77] [facebook] Fix extraction (closes #11926) --- youtube_dl/extractor/facebook.py | 36 ++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index c0a7fc7d8..47bcc0dbc 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -12,14 +12,16 @@ from ..compat import ( compat_urllib_parse_unquote_plus, ) from ..utils import ( + clean_html, error_to_compat_str, ExtractorError, + get_element_by_id, int_or_none, + js_to_json, limit_length, sanitized_Request, + try_get, urlencode_postdata, - get_element_by_id, - clean_html, ) @@ -243,14 +245,30 @@ class FacebookIE(InfoExtractor): video_data = None + def extract_video_data(instances): + for item in instances: + if item[1][0] == 'VideoConfig': + video_item = item[2][0] + if video_item.get('video_id') == video_id: + return video_item['videoData'] + server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})(?:\);|,")', webpage, 'server js data', default='{}'), video_id) - for item in server_js_data.get('instances', []): - if item[1][0] == 'VideoConfig': - video_item = item[2][0] - if video_item.get('video_id') == video_id: - video_data = video_item['videoData'] - break + r'handleServerJS\(({.+})(?:\);|,")', webpage, + 'server js data', default='{}'), video_id, fatal=False) + + if server_js_data: + video_data = extract_video_data(server_js_data.get('instances', [])) + + if not video_data: + server_js_data = self._parse_json( + self._search_regex( + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+stream_pagelet', + webpage, 'js data', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if server_js_data: + video_data = extract_video_data(try_get( + server_js_data, lambda x: x['jsmods']['instances'], + list) or []) if not video_data: if not fatal_if_no_video: From b996b8809285c2c8526dfe96f5ea9835ea799fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 23:29:59 +0700 Subject: [PATCH 18/77] [ChangeLog] Actualize --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index da5b75b47..d24169af8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,13 @@ version Extractors ++ [facebook] Add another fallback extraction scenario (#11926) * [prosiebensat1] Fix extraction of descriptions (#11810, #11929) +- [crunchyroll] Remove ScaledBorderAndShadow settings (#9028) ++ [vimeo] Extract upload timestamp ++ [vimeo] Extract license (#8726, #11880) ++ [nrk:series] Add support for series (#11571, #11711) + version 2017.01.31 From 50695949937bf399b611ef7957f44aac9fbee9dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 1 Feb 2017 03:20:09 +0700 Subject: [PATCH 19/77] release 2017.02.01 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 1 + youtube_dl/version.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 180013f72..8914569b6 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.01** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.31 +[debug] youtube-dl version 2017.02.01 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index d24169af8..c1e8f643a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.02.01 Extractors + [facebook] Add another fallback extraction scenario (#11926) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d4231577b..d900f5e12 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -528,6 +528,7 @@ - **NRKTV**: NRK TV and NRK Radio - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte - **NRKTVEpisodes** + - **NRKTVSeries** - **ntv.ru** - **Nuvid** - **NYTimes** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fee0ac7c5..0f9b6b703 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.31' +__version__ = '2017.02.01' From da162c1135febbb653a302b598dba2d24ac4e24e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 1 Feb 2017 20:15:25 +0100 Subject: [PATCH 20/77] [compat] add compat_etree_register_namespace to __all__ list --- youtube_dl/compat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 49e3c90e2..718902019 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2883,6 +2883,7 @@ __all__ = [ 'compat_cookiejar', 'compat_cookies', 'compat_etree_fromstring', + 'compat_etree_register_namespace', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', From 020c5df52d61af0630be8c982282e110a83fc8df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 1 Feb 2017 23:48:34 +0100 Subject: [PATCH 21/77] [elpais] Fix extraction for some URLs (closes #11765) --- ChangeLog | 1 + youtube_dl/extractor/elpais.py | 23 ++++++++++++++++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index c1e8f643a..8e3a04d7d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -7,6 +7,7 @@ Extractors + [vimeo] Extract upload timestamp + [vimeo] Extract license (#8726, #11880) + [nrk:series] Add support for series (#11571, #11711) ++ [elpais] Fix extraction for some URLs (#11765) version 2017.01.31 diff --git a/youtube_dl/extractor/elpais.py b/youtube_dl/extractor/elpais.py index 8c725a4e6..99e00cf3c 100644 --- a/youtube_dl/extractor/elpais.py +++ b/youtube_dl/extractor/elpais.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import strip_jsonp, unified_strdate class ElPaisIE(InfoExtractor): @@ -29,6 +29,16 @@ class ElPaisIE(InfoExtractor): 'description': 'Que sí, que las cápsulas son cómodas. Pero si le pides algo más a la vida, quizá deberías aprender a usar bien la cafetera italiana. No tienes más que ver este vídeo y seguir sus siete normas básicas.', 'upload_date': '20160303', } + }, { + 'url': 'http://elpais.com/elpais/2017/01/26/ciencia/1485456786_417876.html', + 'md5': '9c79923a118a067e1a45789e1e0b0f9c', + 'info_dict': { + 'id': '1485456786_417876', + 'ext': 'mp4', + 'title': 'Hallado un barco de la antigua Roma que naufragó en Baleares hace 1.800 años', + 'description': 'La nave portaba cientos de ánforas y se hundió cerca de la isla de Cabrera por razones desconocidas', + 'upload_date': '20170127', + }, }] def _real_extract(self, url): @@ -37,8 +47,15 @@ class ElPaisIE(InfoExtractor): prefix = self._html_search_regex( r'var\s+url_cache\s*=\s*"([^"]+)";', webpage, 'URL prefix') - video_suffix = self._search_regex( - r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') + id_multimedia = self._search_regex( + r"id_multimedia\s*=\s*'([^']+)'", webpage, 'ID multimedia', default=None) + if id_multimedia: + url_info = self._download_json( + 'http://elpais.com/vdpep/1/?pepid=' + id_multimedia, video_id, transform_source=strip_jsonp) + video_suffix = url_info['mp4'] + else: + video_suffix = self._search_regex( + r"(?:URLMediaFile|urlVideo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", webpage, 'video URL') video_url = prefix + video_suffix thumbnail_suffix = self._search_regex( r"(?:URLMediaStill|urlFotogramaFijo_\d+)\s*=\s*url_cache\s*\+\s*'([^']+)'", From 8bdc149441a86e01c56946090087c005a525260e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 2 Feb 2017 08:05:16 +0100 Subject: [PATCH 22/77] [downloader/external:ffmpeg] minimize the use of aac_adtstoasc filter --- youtube_dl/downloader/external.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 5d3e5d8d3..138f353ef 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -17,6 +17,7 @@ from ..utils import ( encodeArgument, handle_youtubedl_headers, check_executable, + is_outdated_version, ) @@ -264,7 +265,9 @@ class FFmpegFD(ExternalFD): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: - args += ['-f', 'mp4', '-bsf:a', 'aac_adtstoasc'] + args += ['-f', 'mp4'] + if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + args += ['-bsf:a', 'aac_adtstoasc'] elif protocol == 'rtmp': args += ['-f', 'flv'] else: From 81aeafeb44a16b341e47c3bb85d288252a095eda Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 2 Feb 2017 08:07:06 +0100 Subject: [PATCH 23/77] [cbc:watch] extract audio codec for audion only formats(fixes #11893) --- youtube_dl/extractor/cbc.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index a291685bf..cf678e7f8 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -296,6 +296,12 @@ class CBCWatchVideoIE(CBCWatchBaseIE): formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) if len(formats) < 2: formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + for f in formats: + format_id = f.get('format_id') + if format_id.startswith('AAC'): + f['acodec'] = 'aac' + elif format_id.startswith('AC3'): + f['acodec'] = 'ac-3' self._sort_formats(formats) info = { From bd8f48c78b952ebe3bf335185c819e265f63cb50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 2 Feb 2017 21:51:31 +0800 Subject: [PATCH 24/77] [bilibili] Support new Bangumi URLs (closes #11845) To reduce complexity, I don't support old Bangumi URLs directly via _VALID_URL. Instead, I choose to let it go to generic redirection. An example can be found in #10190: http://bangumi.bilibili.com/anime/v/40062 --- ChangeLog | 5 ++ youtube_dl/extractor/bilibili.py | 135 ++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 5 +- 3 files changed, 134 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 8e3a04d7d..c27907f51 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +version + +Extractors ++ [bilibili] Support new Bangumi URLs (#11845) + version 2017.02.01 Extractors diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 85ea5e6ee..80dd8382e 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -5,19 +5,27 @@ import hashlib import re from .common import InfoExtractor -from ..compat import compat_parse_qs +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) from ..utils import ( + ExtractorError, int_or_none, float_or_none, + parse_iso8601, + smuggle_url, + strip_jsonp, unified_timestamp, + unsmuggle_url, urlencode_postdata, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P\d+)' + _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P\d+)/play#)(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { @@ -32,25 +40,61 @@ class BiliBiliIE(InfoExtractor): 'uploader': '菊子桑', 'uploader_id': '156160', }, - } + }, { + # Tested in BiliBiliBangumiIE + 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', + 'only_matching': True, + }, { + 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', + 'md5': '3f721ad1e75030cc06faf73587cfec57', + 'info_dict': { + 'id': '100643', + 'ext': 'mp4', + 'title': 'CHAOS;CHILD', + 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', + }, + 'skip': 'Geo-restricted to China', + }] _APP_KEY = '84956560bc028eb7' _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' + def _report_error(self, result): + if 'message' in result: + raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) + elif 'code' in result: + raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) + else: + raise ExtractorError('Can\'t extract Bangumi episode ID') + def _real_extract(self, url): - video_id = self._match_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + anime_id = mobj.group('anime_id') webpage = self._download_webpage(url, video_id) - if 'anime/v' not in url: + if 'anime/' not in url: cid = compat_parse_qs(self._search_regex( [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], webpage, 'player parameters'))['cid'][0] else: + if 'no_bangumi_tip' not in smuggled_data: + self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % ( + video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) + headers = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + } + headers.update(self.geo_verification_headers()) + js = self._download_json( 'http://bangumi.bilibili.com/web_api/get_source', video_id, data=urlencode_postdata({'episode_id': video_id}), - headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}) + headers=headers) + if 'result' not in js: + self._report_error(js) cid = js['result']['cid'] payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) @@ -58,7 +102,11 @@ class BiliBiliIE(InfoExtractor): video_info = self._download_json( 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page') + video_id, note='Downloading video info page', + headers=self.geo_verification_headers()) + + if 'durl' not in video_info: + self._report_error(video_info) entries = [] @@ -85,7 +133,7 @@ class BiliBiliIE(InfoExtractor): title = self._html_search_regex(']+title="([^"]+)">', webpage, 'title') description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( - r']+datetime="([^"]+)"', webpage, 'upload time', fatal=False)) + r']+datetime="([^"]+)"', webpage, 'upload time', default=None)) thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript @@ -99,7 +147,7 @@ class BiliBiliIE(InfoExtractor): } uploader_mobj = re.search( - r']+href="https?://space\.bilibili\.com/(?P\d+)"[^>]+title="(?P[^"]+)"', + r']+href="(?:https?:)?//space\.bilibili\.com/(?P\d+)"[^>]+title="(?P[^"]+)"', webpage) if uploader_mobj: info.update({ @@ -123,3 +171,70 @@ class BiliBiliIE(InfoExtractor): 'description': description, 'entries': entries, } + + +class BiliBiliBangumiIE(InfoExtractor): + _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P\d+)' + + IE_NAME = 'bangumi.bilibili.com' + IE_DESC = 'BiliBili番剧' + + _TESTS = [{ + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist_count': 26, + }, { + 'url': 'http://bangumi.bilibili.com/anime/1869', + 'info_dict': { + 'id': '1869', + 'title': '混沌武士', + 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', + }, + 'playlist': [{ + 'md5': '91da8621454dd58316851c27c68b0c13', + 'info_dict': { + 'id': '40062', + 'ext': 'mp4', + 'title': '混沌武士', + 'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...', + 'timestamp': 1414538739, + 'upload_date': '20141028', + 'episode': '疾风怒涛 Tempestuous Temperaments', + 'episode_number': 1, + }, + }], + 'params': { + 'playlist_items': '1', + }, + }] + + @classmethod + def suitable(cls, url): + return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url) + + def _real_extract(self, url): + bangumi_id = self._match_id(url) + + # Sometimes this API returns a JSONP response + season_info = self._download_json( + 'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id, + bangumi_id, transform_source=strip_jsonp)['result'] + + entries = [{ + '_type': 'url_transparent', + 'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}), + 'ie_key': BiliBiliIE.ie_key(), + 'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '), + 'episode': episode.get('index_title'), + 'episode_number': int_or_none(episode.get('index')), + } for episode in season_info['episodes']] + + entries = sorted(entries, key=lambda entry: entry.get('episode_number')) + + return self.playlist_result( + entries, bangumi_id, + season_info.get('bangumi_title'), season_info.get('evaluate')) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 06e6d4620..1d1c05d42 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -103,7 +103,10 @@ from .beatport import BeatportIE from .bet import BetIE from .bigflix import BigflixIE from .bild import BildIE -from .bilibili import BiliBiliIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, +) from .biobiochiletv import BioBioChileTVIE from .biqle import BIQLEIE from .bleacherreport import ( From a685751051f277b8ce99ee0949420bca4ea28c28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 2 Feb 2017 22:01:11 +0700 Subject: [PATCH 25/77] [youtube:playlist] Recognize TL playlists (closes #11945) --- youtube_dl/extractor/youtube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ea398bcc8..0e67fdd12 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1857,13 +1857,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' @@ -1985,6 +1985,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, }] def _real_initialize(self): From 5a116e13020813f9f1d952504455043986c28b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Feb 2017 22:45:18 +0700 Subject: [PATCH 26/77] [facebook] Fix title extraction (closes #11941) --- youtube_dl/extractor/facebook.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 47bcc0dbc..b325c8200 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -73,7 +73,7 @@ class FacebookIE(InfoExtractor): 'info_dict': { 'id': '274175099429670', 'ext': 'mp4', - 'title': 'Facebook video #274175099429670', + 'title': 'Asif Nawab Butt posted a video to his Timeline.', 'uploader': 'Asif Nawab Butt', 'upload_date': '20140506', 'timestamp': 1399398998, @@ -318,10 +318,16 @@ class FacebookIE(InfoExtractor): video_title = self._html_search_regex( r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>', webpage, 'alternative title', default=None) - video_title = limit_length(video_title, 80) if not video_title: + video_title = self._html_search_meta( + 'description', webpage, 'title') + if video_title: + video_title = limit_length(video_title, 80) + else: video_title = 'Facebook video #%s' % video_id - uploader = clean_html(get_element_by_id('fbPhotoPageAuthorName', webpage)) + uploader = clean_html(get_element_by_id( + 'fbPhotoPageAuthorName', webpage)) or self._search_regex( + r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False) timestamp = int_or_none(self._search_regex( r'<abbr[^>]+data-utime=["\'](\d+)', webpage, 'timestamp', default=None)) From c54c01f82dba6d3e982c73c81ad71c49f31d8af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 2 Feb 2017 23:03:38 +0700 Subject: [PATCH 27/77] [go] Relax video id regex (closes #11937) --- youtube_dl/extractor/go.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index c7776b186..a34779b16 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -43,7 +43,10 @@ class GoIE(InfoExtractor): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() if not video_id: webpage = self._download_webpage(url, display_id) - video_id = self._search_regex(r'data-video-id=["\']VDKA(\w+)', webpage, 'video id') + video_id = self._search_regex( + # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" + # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood + r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') brand = self._BRANDS[sub_domain] video_data = self._download_json( 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), From a22b2fd19bd8c08d50f884d1903486d4f00f76ec Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 3 Feb 2017 01:28:24 +0800 Subject: [PATCH 28/77] [youtube] Fix ytsearch* when cookies are provided Closes #11924 The API with `page` is no longer used in browsers, and YouTube always returns {'reload': 'now'} when cookies are provided. See http://youtube.github.io/spfjs/documentation/start/ for how SPF works. Basically appending static link with a `spf` parameter yields the corresponding dynamic link. --- ChangeLog | 1 + youtube_dl/extractor/youtube.py | 22 ++++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index c27907f51..c80126cfb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [youtube] Fix ytsearch when cookies are provided (#11924) + [bilibili] Support new Bangumi URLs (#11845) version 2017.02.01 diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0e67fdd12..f2f751104 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2348,18 +2348,18 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos = [] limit = n + url_query = { + 'search_query': query.encode('utf-8'), + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) + for pagenum in itertools.count(1): - url_query = { - 'search_query': query.encode('utf-8'), - 'page': pagenum, - 'spf': 'navigate', - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, - errnote='Unable to download API page') + errnote='Unable to download API page', + query={'spf': 'navigate'}) html_content = data[1]['body']['content'] if 'class="search-message' in html_content: @@ -2371,6 +2371,12 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos += new_videos if not new_videos or len(videos) > limit: break + next_link = self._html_search_regex( + r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + html_content, 'next link', default=None) + if next_link is None: + break + result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) if len(videos) > n: videos = videos[:n] From b3ee552e4b918fb720111b23147e24fa5475a74b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= <michal@cihar.com> Date: Tue, 31 Jan 2017 07:54:53 +0100 Subject: [PATCH 29/77] [utils] Handle single-line comments in js_to_json --- test/test_utils.py | 3 +++ youtube_dl/utils.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index a74d59f34..954bb7d8b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -791,6 +791,9 @@ class TestUtil(unittest.TestCase): on = js_to_json('{ 0: /* " \n */ ",]" , }') self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ 0: // comment\n1 }') + self.assertEqual(json.loads(on), {'0': 1}) + on = js_to_json(r'["<p>x<\/p>"]') self.assertEqual(json.loads(on), ['<p>x</p>']) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index cf46711b9..6c462625b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2107,7 +2107,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v == ',': return "" if v[0] in ("'", '"'): @@ -2134,7 +2134,7 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|,(?=\s*[\]}])| + /\*.*?\*/|//[^\n]*|,(?=\s*[\]}])| [a-zA-Z_][.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| [0-9]+(?=\s*:) From 0bbcc8a10a4bd339540bf149dd263419fd8b6e66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20=C4=8Ciha=C5=99?= <michal@cihar.com> Date: Tue, 31 Jan 2017 07:59:55 +0100 Subject: [PATCH 30/77] [iprima] Fix extraction (closes #11920, closes #11896) --- youtube_dl/extractor/iprima.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iprima.py b/youtube_dl/extractor/iprima.py index da2cdc656..0fe576883 100644 --- a/youtube_dl/extractor/iprima.py +++ b/youtube_dl/extractor/iprima.py @@ -65,7 +65,7 @@ class IPrimaIE(InfoExtractor): options = self._parse_json( self._search_regex( - r'(?s)var\s+playerOptions\s*=\s*({.+?});', + r'(?s)(?:TDIPlayerOptions|playerOptions)\s*=\s*({.+?});\s*\]\]', playerpage, 'player options', default='{}'), video_id, transform_source=js_to_json, fatal=False) if options: From 4195096ea8da8237a63e1ba3876dc8856b8605c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 02:55:06 +0700 Subject: [PATCH 31/77] [utils] Improve comments processing in js_to_json (closes #11947) --- test/test_utils.py | 24 ++++++++++++++++++++++++ youtube_dl/utils.py | 20 +++++++++++--------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 954bb7d8b..edc712f07 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -785,12 +785,24 @@ class TestUtil(unittest.TestCase): on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) + on = js_to_json('[/*comment\n*/"abc"/*comment\n*/,/*comment\n*/"def",/*comment\n*/]') + self.assertEqual(json.loads(on), ['abc', 'def']) + + on = js_to_json('[//comment\n"abc" //comment\n,//comment\n"def",//comment\n]') + self.assertEqual(json.loads(on), ['abc', 'def']) + on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{/*comment\n*/"abc"/*comment\n*/:/*comment\n*/"def"/*comment\n*/,/*comment\n*/}') + self.assertEqual(json.loads(on), {'abc': 'def'}) + on = js_to_json('{ 0: /* " \n */ ",]" , }') self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ /*comment\n*/0/*comment\n*/: /* " \n */ ",]" , }') + self.assertEqual(json.loads(on), {'0': ',]'}) + on = js_to_json('{ 0: // comment\n1 }') self.assertEqual(json.loads(on), {'0': 1}) @@ -803,15 +815,27 @@ class TestUtil(unittest.TestCase): on = js_to_json("['a\\\nb']") self.assertEqual(json.loads(on), ['ab']) + on = js_to_json("/*comment\n*/[/*comment\n*/'a\\\nb'/*comment\n*/]/*comment\n*/") + self.assertEqual(json.loads(on), ['ab']) + on = js_to_json('{0xff:0xff}') self.assertEqual(json.loads(on), {'255': 255}) + on = js_to_json('{/*comment\n*/0xff/*comment\n*/:/*comment\n*/0xff/*comment\n*/}') + self.assertEqual(json.loads(on), {'255': 255}) + on = js_to_json('{077:077}') self.assertEqual(json.loads(on), {'63': 63}) + on = js_to_json('{/*comment\n*/077/*comment\n*/:/*comment\n*/077/*comment\n*/}') + self.assertEqual(json.loads(on), {'63': 63}) + on = js_to_json('{42:42}') self.assertEqual(json.loads(on), {'42': 42}) + on = js_to_json('{/*comment\n*/42/*comment\n*/:/*comment\n*/42/*comment\n*/}') + self.assertEqual(json.loads(on), {'42': 42}) + def test_extract_attributes(self): self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'}) self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6c462625b..67a847eba 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2103,6 +2103,13 @@ def strip_jsonp(code): def js_to_json(code): + COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*' + SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + INTEGER_TABLE = ( + (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), + (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + ) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): @@ -2118,11 +2125,6 @@ def js_to_json(code): '\\x': '\\u00', }.get(m.group(0), m.group(0)), v[1:-1]) - INTEGER_TABLE = ( - (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16), - (r'^(0+[0-7]+)\s*:?$', 8), - ) - for regex, base in INTEGER_TABLE: im = re.match(regex, v) if im: @@ -2134,11 +2136,11 @@ def js_to_json(code): return re.sub(r'''(?sx) "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - /\*.*?\*/|//[^\n]*|,(?=\s*[\]}])| + {comment}|,(?={skip}[\]}}])| [a-zA-Z_][.a-zA-Z_0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?| - [0-9]+(?=\s*:) - ''', fix_kv, code) + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| + [0-9]+(?={skip}:) + '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) def qualities(quality_ids): From 33da98f4933ddc54c944bae985cfcc7b53563208 Mon Sep 17 00:00:00 2001 From: Justsoos <justso@gmail.com> Date: Wed, 1 Feb 2017 21:30:01 +0800 Subject: [PATCH 32/77] [douyutv] Improve room id regex http://www.douyu.com/t/lpl source get extra '\' with "room_id\" (from js coding) --- youtube_dl/extractor/douyutv.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 2f3c5113e..911594413 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -18,7 +18,7 @@ from ..utils import ( class DouyuTVIE(InfoExtractor): IE_DESC = '斗鱼' - _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(?:[^/]+/)*(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.douyutv.com/iseven', 'info_dict': { @@ -68,6 +68,10 @@ class DouyuTVIE(InfoExtractor): }, { 'url': 'http://www.douyu.com/xiaocang', 'only_matching': True, + }, { + # \"room_id\" + 'url': 'http://www.douyu.com/t/lpl', + 'only_matching': True, }] # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf @@ -82,7 +86,7 @@ class DouyuTVIE(InfoExtractor): else: page = self._download_webpage(url, video_id) room_id = self._html_search_regex( - r'"room_id"\s*:\s*(\d+),', page, 'room id') + r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') room = self._download_json( 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, From 45024183aea169dc898902388f782485de02cbac Mon Sep 17 00:00:00 2001 From: Mattias Wadman <mattias.wadman@gmail.com> Date: Fri, 3 Feb 2017 05:10:13 +0100 Subject: [PATCH 33/77] [infoq] Add audio only format if available (#11565) * [infoq] Add audio only format if available Refactor cookie code into a function. Renamed formats to http_video, http_audio, rtmp_video Renamed extract functions to video instead of videos as they return one or no video. * [infoq] Rename to _extract_cookies as it more than one * [infoq] Remove redundant determine_ext * [infoq] Add comment about hardcoded URL * [infoq] Use _hidden_inputs instead of messy regex * [infoq] Probe if audio URL is valid Make it possible to pass headers to _is_valid_url * [infoq] Add audio only test --- youtube_dl/extractor/common.py | 4 +-- youtube_dl/extractor/infoq.py | 63 ++++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a15a9536..2c8ec1417 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1025,13 +1025,13 @@ class InfoExtractor(object): unique_formats.append(f) formats[:] = unique_formats - def _is_valid_url(self, url, video_id, item='video'): + def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid if not (url.startswith('http://') or url.startswith('https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True except ExtractorError as e: if isinstance(e.cause, compat_urllib_error.URLError): diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index cca0b8a93..9fb71e8ef 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import base64 -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import determine_ext from .bokecc import BokeCCBaseIE @@ -33,9 +36,21 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + }, { + 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', + 'md5': '0e34642d4d9ef44bf86f66f6399672db', + 'info_dict': { + 'id': 'Simple-Made-Easy', + 'title': 'Simple Made Easy', + 'ext': 'mp3', + 'description': 'md5:3e0e213a8bbd074796ef89ea35ada25b', + }, + 'params': { + 'format': 'bestaudio', + }, }] - def _extract_rtmp_videos(self, webpage): + def _extract_rtmp_video(self, webpage): # The server URL is hardcoded video_url = 'rtmpe://video.infoq.com/cfx/st/' @@ -47,28 +62,53 @@ class InfoQIE(BokeCCBaseIE): playpath = 'mp4:' + real_id return [{ - 'format_id': 'rtmp', + 'format_id': 'rtmp_video', 'url': video_url, 'ext': determine_ext(playpath), 'play_path': playpath, }] - def _extract_http_videos(self, webpage): - http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') - + def _extract_cookies(self, webpage): policy = self._search_regex(r'InfoQConstants.scp\s*=\s*\'([^\']+)\'', webpage, 'policy') signature = self._search_regex(r'InfoQConstants.scs\s*=\s*\'([^\']+)\'', webpage, 'signature') key_pair_id = self._search_regex(r'InfoQConstants.sck\s*=\s*\'([^\']+)\'', webpage, 'key-pair-id') + return 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( + policy, signature, key_pair_id) + def _extract_http_video(self, webpage): + http_video_url = self._search_regex(r'P\.s\s*=\s*\'([^\']+)\'', webpage, 'video URL') return [{ - 'format_id': 'http', + 'format_id': 'http_video', 'url': http_video_url, 'http_headers': { - 'Cookie': 'CloudFront-Policy=%s; CloudFront-Signature=%s; CloudFront-Key-Pair-Id=%s' % ( - policy, signature, key_pair_id), + 'Cookie': self._extract_cookies(webpage) }, }] + def _extract_http_audio(self, webpage, video_id): + fields = self._hidden_inputs(webpage) + http_audio_url = fields['filename'] + if http_audio_url is None: + return [] + + cookies_header = {'Cookie': self._extract_cookies(webpage)} + + # base URL is found in the Location header in the response returned by + # GET https://www.infoq.com/mp3download.action?filename=... when logged in. + http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + + # audio file seem to be missing some times even if there is a download link + # so probe URL to make sure + if not self._is_valid_url(http_audio_url, video_id, headers=cookies_header): + return [] + + return [{ + 'format_id': 'http_audio', + 'url': http_audio_url, + 'vcodec': 'none', + 'http_headers': cookies_header, + }] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -80,7 +120,10 @@ class InfoQIE(BokeCCBaseIE): # for China videos, HTTP video URL exists but always fails with 403 formats = self._extract_bokecc_formats(webpage, video_id) else: - formats = self._extract_rtmp_videos(webpage) + self._extract_http_videos(webpage) + formats = ( + self._extract_rtmp_video(webpage) + + self._extract_http_video(webpage) + + self._extract_http_audio(webpage, video_id)) self._sort_formats(formats) From d7f9242e301fa7c08542932c9348140cf2e07172 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 3 Feb 2017 12:13:24 +0800 Subject: [PATCH 34/77] [ChangeLog] Update after #11565 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index c80126cfb..487ed3f0f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors ++ [infoq] Add audio only formats (#11565) * [youtube] Fix ytsearch when cookies are provided (#11924) + [bilibili] Support new Bangumi URLs (#11845) From 4ce3407d089ae8c34341e6d68267910683d4b500 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 3 Feb 2017 10:15:03 +0100 Subject: [PATCH 35/77] [filmon] improve extraction --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/filmon.py | 236 +++++++++++++++++------------ 2 files changed, 139 insertions(+), 102 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9b9ebd23..e4ee43ee3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,7 +287,10 @@ from .fc2 import ( FC2EmbedIE, ) from .fczenit import FczenitIE -from .filmon import FilmOnIE, FilmOnVODIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE diff --git a/youtube_dl/extractor/filmon.py b/youtube_dl/extractor/filmon.py index 987792fec..f775fe0ba 100644 --- a/youtube_dl/extractor/filmon.py +++ b/youtube_dl/extractor/filmon.py @@ -2,74 +2,21 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import qualities -from ..compat import compat_urllib_request - - -_QUALITY = qualities(('low', 'high')) +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + qualities, + strip_or_none, + int_or_none, + ExtractorError, +) class FilmOnIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P<id>[a-z0-9-]+)' - _TESTS = [{ - 'url': 'https://www.filmon.com/channel/filmon-sports', - 'only_matching': True, - }, { - 'url': 'https://www.filmon.com/tv/2894', - 'only_matching': True, - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - - request = compat_urllib_request.Request('https://www.filmon.com/channel/%s' % (channel_id)) - request.add_header('X-Requested-With', 'XMLHttpRequest') - channel_info = self._download_json(request, channel_id) - now_playing = channel_info['now_playing'] - - thumbnails = [] - for thumb in now_playing.get('images', ()): - if thumb['type'] != '2': - continue - thumbnails.append({ - 'url': thumb['url'], - 'width': int(thumb['width']), - 'height': int(thumb['height']), - }) - - formats = [] - - for stream in channel_info['streams']: - formats.append({ - 'format_id': str(stream['id']), - # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats - # because 0) it doesn't have bitrate variants anyway, and 1) the ids generated - # by that method are highly unstable (because the bitrate is variable) - 'url': stream['url'], - 'resolution': stream['name'], - 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), - 'ext': 'mp4', - 'quality': _QUALITY(stream['quality']), - 'preference': int(stream['watch-timeout']), - }) - self._sort_formats(formats) - - return { - 'id': str(channel_info['id']), - 'display_id': channel_info['alias'], - 'formats': formats, - # XXX: use the channel description (channel_info['description'])? - 'uploader_id': channel_info['alias'], - 'uploader': channel_info['title'], # XXX: kinda stretching it... - 'title': now_playing.get('programme_name') or channel_info['title'], - 'description': now_playing.get('programme_description'), - 'thumbnails': thumbnails, - 'is_live': True, - } - - -class FilmOnVODIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?filmon\.com/vod/view/(?P<id>\d+)' + IE_NAME = 'filmon' + _VALID_URL = r'(?:https?://(?:www\.)?filmon\.com/vod/view/|filmon:)(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.filmon.com/vod/view/24869-0-plan-9-from-outer-space', 'info_dict': { @@ -83,62 +30,149 @@ class FilmOnVODIE(InfoExtractor): 'info_dict': { 'id': '2825', 'title': 'Popeye Series 1', + 'description': 'The original series of Popeye.', }, - 'playlist_count': 8, + 'playlist_mincount': 8, }] def _real_extract(self, url): video_id = self._match_id(url) - result = self._download_json('https://www.filmon.com/api/vod/movie?id=%s' % (video_id), video_id) - if result['code'] != 200: - raise ExtractorError('FilmOn said: %s' % (result['reason']), expected=True) + try: + response = self._download_json( + 'https://www.filmon.com/api/vod/movie?id=%s' % video_id, + video_id)['response'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errmsg = self._parse_json(e.cause.read().decode(), video_id)['reason'] + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + raise - response = result['response'] + title = response['title'] + description = strip_or_none(response.get('description')) - if response.get('episodes'): - return { - '_type': 'playlist', - 'id': video_id, - 'title': response['title'], - 'entries': [{ - '_type': 'url', - 'url': 'https://www.filmon.com/vod/view/%s' % (ep), - } for ep in response['episodes']] - } + if response.get('type_id') == 1: + entries = [self.url_result('filmon:' + episode_id) for episode_id in response.get('episodes', [])] + return self.playlist_result(entries, video_id, title, description) + QUALITY = qualities(('low', 'high')) formats = [] - for (id, stream) in response['streams'].items(): + for format_id, stream in response.get('streams', {}).items(): + stream_url = stream.get('url') + if not stream_url: + continue formats.append({ - 'format_id': id, - 'url': stream['url'], - 'resolution': stream['name'], - 'format_note': 'expires after %u seconds' % int(stream['watch-timeout']), + 'format_id': format_id, + 'url': stream_url, 'ext': 'mp4', - 'quality': _QUALITY(stream['quality']), - 'preference': int(stream['watch-timeout']), + 'quality': QUALITY(stream.get('quality')), + 'protocol': 'm3u8_native', }) self._sort_formats(formats) - poster = response['poster'] - thumbnails = [{ - 'id': 'poster', - 'url': poster['url'], - 'width': poster['width'], - 'height': poster['height'], - }] - for (id, thumb) in poster['thumbs'].items(): + thumbnails = [] + poster = response.get('poster', {}) + thumbs = poster.get('thumbs', {}) + thumbs['poster'] = poster + for thumb_id, thumb in thumbs.items(): + thumb_url = thumb.get('url') + if not thumb_url: + continue thumbnails.append({ - 'id': id, - 'url': thumb['url'], - 'width': thumb['width'], - 'height': thumb['height'], + 'id': thumb_id, + 'url': thumb_url, + 'width': int_or_none(thumb.get('width')), + 'height': int_or_none(thumb.get('height')), }) return { 'id': video_id, - 'title': response['title'], + 'title': title, 'formats': formats, - 'description': response['description'], + 'description': description, 'thumbnails': thumbnails, } + + +class FilmOnChannelIE(InfoExtractor): + IE_NAME = 'filmon:channel' + _VALID_URL = r'https?://(?:www\.)?filmon\.com/(?:tv|channel)/(?P<id>[a-z0-9-]+)' + _TESTS = [{ + # VOD + 'url': 'http://www.filmon.com/tv/sports-haters', + 'info_dict': { + 'id': '4190', + 'ext': 'mp4', + 'title': 'Sports Haters', + 'description': 'md5:dabcb4c1d9cfc77085612f1a85f8275d', + }, + }, { + # LIVE + 'url': 'https://www.filmon.com/channel/filmon-sports', + 'only_matching': True, + }, { + 'url': 'https://www.filmon.com/tv/2894', + 'only_matching': True, + }] + + _THUMBNAIL_RES = [ + ('logo', 56, 28), + ('big_logo', 106, 106), + ('extra_big_logo', 300, 300), + ] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + try: + channel_data = self._download_json( + 'http://www.filmon.com/api-v2/channel/' + channel_id, channel_id)['data'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + errmsg = self._parse_json(e.cause.read().decode(), channel_id)['message'] + raise ExtractorError('%s said: %s' % (self.IE_NAME, errmsg), expected=True) + raise + + channel_id = compat_str(channel_data['id']) + is_live = not channel_data.get('is_vod') and not channel_data.get('is_vox') + title = channel_data['title'] + + QUALITY = qualities(('low', 'high')) + formats = [] + for stream in channel_data.get('streams', []): + stream_url = stream.get('url') + if not stream_url: + continue + if not is_live: + formats.extend(self._extract_wowza_formats( + stream_url, channel_id, skip_protocols=['dash', 'rtmp', 'rtsp'])) + continue + quality = stream.get('quality') + formats.append({ + 'format_id': quality, + # this is an m3u8 stream, but we are deliberately not using _extract_m3u8_formats + # because it doesn't have bitrate variants anyway + 'url': stream_url, + 'ext': 'mp4', + 'quality': QUALITY(quality), + }) + self._sort_formats(formats) + + thumbnails = [] + for name, width, height in self._THUMBNAIL_RES: + thumbnails.append({ + 'id': name, + 'url': 'http://static.filmon.com/assets/channels/%s/%s.png' % (channel_id, name), + 'width': width, + 'height': height, + }) + + return { + 'id': channel_id, + 'display_id': channel_data.get('alias'), + 'title': self._live_title(title) if is_live else title, + 'description': channel_data.get('description'), + 'thumbnails': thumbnails, + 'formats': formats, + 'is_live': is_live, + } From daac118bf4e8bf3dc1ec202fe8b21b9319d15dbf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 3 Feb 2017 18:56:40 +0800 Subject: [PATCH 36/77] [ChangeLog] Update after #11901 --- ChangeLog | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ChangeLog b/ChangeLog index 487ed3f0f..947590b94 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,10 +1,14 @@ version <unreleased> +Core ++ Add --playlist-random to shuffle playlists (#11889, #11901) + Extractors + [infoq] Add audio only formats (#11565) * [youtube] Fix ytsearch when cookies are provided (#11924) + [bilibili] Support new Bangumi URLs (#11845) + version 2017.02.01 Extractors From f7a10d8cd6d1378d5f8e67b4b3572fa474b47cde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 21:25:44 +0700 Subject: [PATCH 37/77] [sportbox] Remove extractor (closes #11954) Covered by generic extractor --- youtube_dl/extractor/sportbox.py | 54 -------------------------------- 1 file changed, 54 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index b512cd20f..05a0b5a80 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -11,60 +11,6 @@ from ..utils import ( ) -class SportBoxIE(InfoExtractor): - _VALID_URL = r'https?://news\.sportbox\.ru/(?:[^/]+/)+spbvideo_NI\d+_(?P<display_id>.+)' - _TESTS = [{ - 'url': 'http://news.sportbox.ru/Vidy_sporta/Avtosport/Rossijskij/spbvideo_NI483529_Gonka-2-zaezd-Obyedinenniy-2000-klassi-Turing-i-S', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', - 'info_dict': { - 'id': '80822', - 'ext': 'mp4', - 'title': 'Гонка 2 заезд ««Объединенный 2000»: классы Туринг и Супер-продакшн', - 'description': 'md5:3d72dc4a006ab6805d82f037fdc637ad', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140928', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://news.sportbox.ru/Vidy_sporta/billiard/spbvideo_NI486287_CHempionat-mira-po-dinamichnoy-piramide-4', - 'only_matching': True, - }, { - 'url': 'http://news.sportbox.ru/video/no_ads/spbvideo_NI536574_V_Novorossijske_proshel_detskij_turnir_Pole_slavy_bojevoj?ci=211355', - 'only_matching': True, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - player = self._search_regex( - r'src="/?(vdl/player/[^"]+)"', webpage, 'player') - - title = self._html_search_regex( - [r'"nodetitle"\s*:\s*"([^"]+)"', r'class="node-header_{1,2}title">([^<]+)'], - webpage, 'title') - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) - upload_date = unified_strdate(self._html_search_meta( - 'dateCreated', webpage, 'upload date')) - - return { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, '/%s' % player), - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - } - - class SportBoxEmbedIE(InfoExtractor): _VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' _TESTS = [{ From b7cc5f078eca4d90b3e3d31d1247452953dba1fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 21:56:10 +0700 Subject: [PATCH 38/77] [extractors] Remove remnants of sportbox extractor (#11954) --- youtube_dl/extractor/extractors.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aa235bec1..eaf3676df 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -888,10 +888,7 @@ from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .stitcher import StitcherIE from .sport5 import Sport5IE -from .sportbox import ( - SportBoxIE, - SportBoxEmbedIE, -) +from .sportbox import SportBoxEmbedIE from .sportdeutschland import SportDeutschlandIE from .sportschau import SportschauIE from .srgssr import ( From f962790ee53c634758021d9fc752ae476c6a142b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 3 Feb 2017 21:56:48 +0700 Subject: [PATCH 39/77] [vine] Fix extraction (closes #11955) --- youtube_dl/extractor/vine.py | 103 +++++++++++++++-------------------- 1 file changed, 44 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 0183f052a..4957a07f7 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -6,8 +6,9 @@ import itertools from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, - unified_strdate, + unified_timestamp, ) @@ -20,50 +21,16 @@ class VineIE(InfoExtractor): 'id': 'b9KOOWX7HUx', 'ext': 'mp4', 'title': 'Chicken.', - 'alt_title': 'Vine by Jack Dorsey', + 'alt_title': 'Vine by Jack', + 'timestamp': 1368997951, 'upload_date': '20130519', - 'uploader': 'Jack Dorsey', + 'uploader': 'Jack', 'uploader_id': '76', 'view_count': int, 'like_count': int, 'comment_count': int, 'repost_count': int, }, - }, { - 'url': 'https://vine.co/v/MYxVapFvz2z', - 'md5': '7b9a7cbc76734424ff942eb52c8f1065', - 'info_dict': { - 'id': 'MYxVapFvz2z', - 'ext': 'mp4', - 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Mars Ruiz', - 'upload_date': '20140815', - 'uploader': 'Mars Ruiz', - 'uploader_id': '1102363502380728320', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/v/bxVjBbZlPUH', - 'md5': 'ea27decea3fa670625aac92771a96b73', - 'info_dict': { - 'id': 'bxVjBbZlPUH', - 'ext': 'mp4', - 'title': '#mw3 #ac130 #killcam #angelofdeath', - 'alt_title': 'Vine by Z3k3', - 'upload_date': '20130430', - 'uploader': 'Z3k3', - 'uploader_id': '936470460173008896', - 'view_count': int, - 'like_count': int, - 'comment_count': int, - 'repost_count': int, - }, - }, { - 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', - 'only_matching': True, }, { 'url': 'https://vine.co/v/e192BnZnZ9V', 'info_dict': { @@ -71,6 +38,7 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'alt_title': 'Vine by Pimry_zaa', + 'timestamp': 1436057405, 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', @@ -82,43 +50,60 @@ class VineIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://vine.co/v/MYxVapFvz2z', + 'only_matching': True, + }, { + 'url': 'https://vine.co/v/bxVjBbZlPUH', + 'only_matching': True, + }, { + 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id) - data = self._parse_json( - self._search_regex( - r'window\.POST_DATA\s*=\s*({.+?});\s*</script>', - webpage, 'vine data'), - video_id) + data = self._download_json( + 'https://archive.vine.co/posts/%s.json' % video_id, video_id) - data = data[list(data.keys())[0]] - - formats = [{ - 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f.get('format'), - 'quality': f.get('rate'), - 'url': f['videoUrl'], - } for f in data['videoUrls'] if f.get('videoUrl')] + def video_url(kind): + for url_suffix in ('Url', 'URL'): + format_url = data.get('video%s%s' % (kind, url_suffix)) + if format_url: + return format_url + formats = [] + for quality, format_id in enumerate(('low', '', 'dash')): + format_url = video_url(format_id.capitalize()) + if not format_url: + continue + # DASH link returns plain mp4 + if format_id == 'dash' and determine_ext(format_url) == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id or 'standard', + 'quality': quality, + }) self._sort_formats(formats) username = data.get('username') return { 'id': video_id, - 'title': data.get('description') or self._og_search_title(webpage), - 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'title': data.get('description'), + 'alt_title': 'Vine by %s' % username if username else None, 'thumbnail': data.get('thumbnailUrl'), - 'upload_date': unified_strdate(data.get('created')), + 'timestamp': unified_timestamp(data.get('created')), 'uploader': username, 'uploader_id': data.get('userIdStr'), - 'view_count': int_or_none(data.get('loops', {}).get('count')), - 'like_count': int_or_none(data.get('likes', {}).get('count')), - 'comment_count': int_or_none(data.get('comments', {}).get('count')), - 'repost_count': int_or_none(data.get('reposts', {}).get('count')), + 'view_count': int_or_none(data.get('loops')), + 'like_count': int_or_none(data.get('likes')), + 'comment_count': int_or_none(data.get('comments')), + 'repost_count': int_or_none(data.get('reposts')), 'formats': formats, } From 605fd6392fedd2599115e1f1e12df2a6212df1ae Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 3 Feb 2017 17:59:48 +0100 Subject: [PATCH 40/77] [youtube] add format info for itag 325 and 328 --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index f2f751104..76710931a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -329,6 +329,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, From f65dba7cdb98bb5444ad5656c9626a15d210f6f6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 3 Feb 2017 22:25:19 +0100 Subject: [PATCH 41/77] [myspace] fix extraction and extract hls and http formats --- youtube_dl/extractor/myspace.py | 108 +++++++++++++++++--------------- 1 file changed, 58 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index ab32e632e..f281238c9 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -17,9 +17,10 @@ class MySpaceIE(InfoExtractor): _TESTS = [ { 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', 'info_dict': { 'id': '109594919', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Little Big Town', 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', 'uploader': 'Five Minutes to the Stage', @@ -27,37 +28,30 @@ class MySpaceIE(InfoExtractor): 'timestamp': 1414108751, 'upload_date': '20141023', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, # songs { 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', 'info_dict': { 'id': '93388656', - 'ext': 'flv', + 'ext': 'm4a', 'title': 'Of weakened soul...', 'uploader': 'Killsorrow', 'uploader_id': 'killsorrow', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { - 'add_ie': ['Vevo'], + 'add_ie': ['Youtube'], 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', 'info_dict': { - 'id': 'USZM20600099', - 'ext': 'mp4', - 'title': 'Animal I Have Become', - 'uploader': 'Three Days Grace', - 'timestamp': int, - 'upload_date': '20060502', + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', }, - 'skip': 'VEVO is only available in some countries', }, { 'add_ie': ['Youtube'], 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', @@ -76,24 +70,46 @@ class MySpaceIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( - r'playerSwf":"([^"?]*)', webpage, 'player URL') + r'videoSwf":"([^"?]*)', webpage, 'player URL', fatal=False) - def rtmp_format_from_stream_url(stream_url, width=None, height=None): - rtmp_url, play_path = stream_url.split(';', 1) - return { - 'format_id': 'rtmp', - 'url': rtmp_url, - 'play_path': play_path, - 'player_url': player_url, - 'protocol': 'rtmp', - 'ext': 'flv', - 'width': width, - 'height': height, - } + def formats_from_stream_urls(stream_url, hls_stream_url, http_stream_url, width=None, height=None): + formats = [] + vcodec = 'none' if is_song else None + if hls_stream_url: + formats.append({ + 'format_id': 'hls', + 'url': hls_stream_url, + 'protocol': 'm3u8_native', + 'ext': 'm4a' if is_song else 'mp4', + 'vcodec': vcodec, + }) + if stream_url and player_url: + rtmp_url, play_path = stream_url.split(';', 1) + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': play_path, + 'player_url': player_url, + 'protocol': 'rtmp', + 'ext': 'flv', + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + if http_stream_url: + formats.append({ + 'format_id': 'http', + 'url': http_stream_url, + 'width': width, + 'height': height, + 'vcodec': vcodec, + }) + return formats - if mobj.group('mediatype').startswith('music/song'): + if is_song: # songs don't store any useful info in the 'context' variable song_data = self._search_regex( r'''<button.*data-song-id=(["\'])%s\1.*''' % video_id, @@ -108,8 +124,10 @@ class MySpaceIE(InfoExtractor): return self._search_regex( r'''data-%s=([\'"])(?P<data>.*?)\1''' % name, song_data, name, default='', group='data') - stream_url = search_data('stream-url') - if not stream_url: + formats = formats_from_stream_urls( + search_data('stream-url'), search_data('hls-stream-url'), + search_data('http-stream-url')) + if not formats: vevo_id = search_data('vevo-id') youtube_id = search_data('youtube-id') if vevo_id: @@ -121,6 +139,7 @@ class MySpaceIE(InfoExtractor): else: raise ExtractorError( 'Found song but don\'t know how to download it') + self._sort_formats(formats) return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -128,27 +147,16 @@ class MySpaceIE(InfoExtractor): 'uploader_id': search_data('artist-username'), 'thumbnail': self._og_search_thumbnail(webpage), 'duration': int_or_none(search_data('duration')), - 'formats': [rtmp_format_from_stream_url(stream_url)] + 'formats': formats, } else: video = self._parse_json(self._search_regex( r'context = ({.*?});', webpage, 'context'), video_id)['video'] - formats = [] - hls_stream_url = video.get('hlsStreamUrl') - if hls_stream_url: - formats.append({ - 'format_id': 'hls', - 'url': hls_stream_url, - 'protocol': 'm3u8_native', - 'ext': 'mp4', - }) - stream_url = video.get('streamUrl') - if stream_url: - formats.append(rtmp_format_from_stream_url( - stream_url, - int_or_none(video.get('width')), - int_or_none(video.get('height')))) + formats = formats_from_stream_urls( + video.get('streamUrl'), video.get('hlsStreamUrl'), + video.get('mp4StreamUrl'), int_or_none(video.get('width')), + int_or_none(video.get('height'))) self._sort_formats(formats) return { 'id': video_id, From 2c15db829c1bd8311ed82e2884661271f0cf73ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 08:38:28 +0100 Subject: [PATCH 42/77] [drtv] add support for live and radio sections(closes #1827)(closes #3427) --- youtube_dl/extractor/drtv.py | 74 +++++++++++++++++++++++++++--- youtube_dl/extractor/extractors.py | 5 +- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 88d096b30..e966d7483 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -9,12 +9,13 @@ from ..utils import ( mimetype2ext, parse_iso8601, remove_end, + update_url_query, ) class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' - + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)' + IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', 'md5': '25e659cccc9a2ed956110a299fdf5983', @@ -79,9 +80,10 @@ class DRTVIE(InfoExtractor): subtitles = {} for asset in data['Assets']: - if asset.get('Kind') == 'Image': + kind = asset.get('Kind') + if kind == 'Image': thumbnail = asset.get('Uri') - elif asset.get('Kind') == 'VideoResource': + elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' @@ -96,9 +98,13 @@ class DRTVIE(InfoExtractor): preference = -1 format_id += '-spoken-subtitles' if target == 'HDS': - formats.extend(self._extract_f4m_formats( + f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id)) + video_id, preference, f4m_id=format_id) + if kind == 'AudioResource': + for f in f4m_formats: + f['vcodec'] = 'none' + formats.extend(f4m_formats) elif target == 'HLS': formats.extend(self._extract_m3u8_formats( uri, video_id, 'mp4', entry_protocol='m3u8_native', @@ -112,6 +118,7 @@ class DRTVIE(InfoExtractor): 'format_id': format_id, 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), + 'vcodec': 'none' if kind == 'AudioResource' else None, }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): @@ -144,3 +151,58 @@ class DRTVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class DRTVLiveIE(InfoExtractor): + IE_NAME = 'drtv:live' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P<id>[\da-z-]+)' + _TEST = { + 'url': 'https://www.dr.dk/tv/live/dr1', + 'info_dict': { + 'id': 'dr1', + 'ext': 'mp4', + 'title': 're:^DR1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_data = self._download_json( + 'https://www.dr.dk/mu-online/api/1.0/channel/' + channel_id, + channel_id) + title = self._live_title(channel_data['Title']) + + formats = [] + for streaming_server in channel_data.get('StreamingServers', []): + server = streaming_server.get('Server') + if not server: + continue + link_type = streaming_server.get('LinkType') + for quality in streaming_server.get('Qualities', []): + for stream in quality.get('Streams', []): + stream_path = stream.get('Stream') + if not stream_path: + continue + stream_url = update_url_query( + '%s/%s' % (server, stream_path), {'b': ''}) + if link_type == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, channel_id, 'mp4', + m3u8_id=link_type, fatal=False, live=True)) + elif link_type == 'HDS': + formats.extend(self._extract_f4m_formats(update_url_query( + '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), + channel_id, f4m_id=link_type, fatal=False)) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': title, + 'thumbnail': channel_data.get('PrimaryImageUri'), + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index eaf3676df..32420937c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -248,7 +248,10 @@ from .dramafever import ( from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE -from .drtv import DRTVIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE From 36fce54816eb1f1d792ac7ed4d07e292d44d62f5 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 15:23:46 +0100 Subject: [PATCH 43/77] [turner] fix downloading of secure hls formats using ffmpeg(closes #11358)(closes #11373)(closes #11800) --- youtube_dl/downloader/external.py | 9 +++++++++ youtube_dl/extractor/turner.py | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 138f353ef..41e37261d 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -199,6 +199,15 @@ class FFmpegFD(ExternalFD): args = [ffpp.executable, '-y'] + seekable = info_dict.get('_seekable') + if seekable is not None: + # setting -seekable prevents ffmpeg from guessing if the server + # supports seeking(by adding the header `Range: bytes=0-`), which + # can cause problems in some cases + # https://github.com/rg3/youtube-dl/issues/11800#issuecomment-275037127 + # http://trac.ffmpeg.org/ticket/6125#comment:10 + args += ['-seekable', '1' if seekable else '0'] + args += self._configuration_args() # start_time = info_dict.get('start_time') or 0 diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 57ffedb87..1c0be9fc6 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -100,9 +100,13 @@ class TurnerBaseIE(AdobePassIE): formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', - m3u8_id=format_id or 'hls', fatal=False)) + m3u8_id=format_id or 'hls', fatal=False) + if '/secure/' in video_url and '?hdnea=' in video_url: + for f in m3u8_formats: + f['_seekable'] = False + formats.extend(m3u8_formats) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( update_url_query(video_url, {'hdcore': '3.7.0'}), From 643dc0fcfed5e5eb152000190d0c7ba9dd577ef8 Mon Sep 17 00:00:00 2001 From: A Connecticut Princess <bugchecker@dibaby.org> Date: Sat, 4 Feb 2017 13:23:35 +0500 Subject: [PATCH 44/77] [vk] Catch author blocked error message Example link (video in blocked group): https://vk.com/search?c%5Bq%5D=%D0%9F%D1%80%D1%8B%D0%B6%D0%BE%D0%BA%20c%20%D0%BA%D1%80%D0%B0%D0%BD%D0%B0%20%D0%B2%20%D1%81%D1%82%D0%B8%D0%BB%D0%B5%20%D0%A7%D0%B5%D0%BB%D0%BE%D0%B2%D0%B5%D0%BA%D0%B0-%D0%BF%D0%B0%D1%83%D0%BA%D0%B0&c%5Bsection%5D=video&c%5Bsort%5D=2&z=video-10639516_456240611 --- youtube_dl/extractor/vk.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 6e6c3a0e1..7c42a4f54 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -281,6 +281,11 @@ class VKIE(VKBaseIE): { 'url': 'http://new.vk.com/video205387401_165548505', 'only_matching': True, + }, + { + # This video is no longer available, because its author has been blocked. + 'url': 'https://vk.com/video-10639516_456240611', + 'only_matching': True, } ] @@ -328,6 +333,12 @@ class VKIE(VKBaseIE): r'<!>Access denied': 'Access denied to video %s.', + + r'<!>Видеозапись недоступна, так как её автор был заблокирован.': + 'Video %s is no longer available, because its author has been blocked.', + + r'<!>This video is no longer available, because its author has been blocked.': + 'Video %s is no longer available, because its author has been blocked.', } for error_re, error_msg in ERRORS.items(): From c2521c1ac6bbd24cd5d01ba764f2d084b16c506f Mon Sep 17 00:00:00 2001 From: John Hawkinson <jhawk@mit.edu> Date: Sat, 4 Feb 2017 10:23:14 -0500 Subject: [PATCH 45/77] [Piksel] Add another app token regex --- youtube_dl/extractor/piksel.py | 43 ++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py index d44edcdfb..c0c276a50 100644 --- a/youtube_dl/extractor/piksel.py +++ b/youtube_dl/extractor/piksel.py @@ -16,18 +16,33 @@ from ..utils import ( class PikselIE(InfoExtractor): _VALID_URL = r'https?://player\.piksel\.com/v/(?P<id>[a-z0-9]+)' - _TEST = { - 'url': 'http://player.piksel.com/v/nv60p12f', - 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', - 'info_dict': { - 'id': 'nv60p12f', - 'ext': 'mp4', - 'title': 'فن الحياة - الحلقة 1', - 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', - 'timestamp': 1465231790, - 'upload_date': '20160606', + _TESTS = [ + { + 'url': 'http://player.piksel.com/v/nv60p12f', + 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', + 'info_dict': { + 'id': 'nv60p12f', + 'ext': 'mp4', + 'title': 'فن الحياة - الحلقة 1', + 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', + 'timestamp': 1465231790, + 'upload_date': '20160606', + } + }, + { + # Original source: http://www.uscourts.gov/cameras-courts/state-washington-vs-donald-j-trump-et-al + 'url': 'https://player.piksel.com/v/v80kqp41', + 'md5': '753ddcd8cc8e4fa2dda4b7be0e77744d', + 'info_dict': { + 'id': 'v80kqp41', + 'ext': 'mp4', + 'title': 'WAW- State of Washington vs. Donald J. Trump, et al', + 'description': 'State of Washington vs. Donald J. Trump, et al, Case Number 17-CV-00141-JLR, TRO Hearing, Civil Rights Case, 02/3/2017, 1:00 PM (PST), Seattle Federal Courthouse, Seattle, WA, Judge James L. Robart presiding.', + 'timestamp': 1486171129, + 'upload_date': '20170204', + } } - } + ] @staticmethod def _extract_url(webpage): @@ -40,8 +55,10 @@ class PikselIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - app_token = self._search_regex( - r'clientAPI\s*:\s*"([^"]+)"', webpage, 'app token') + app_token = self._search_regex([ + r'clientAPI\s*:\s*"([^"]+)"', + r'data-de-api-key\s*=\s*"([^"]+)"' + ], webpage, 'app token') response = self._download_json( 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, video_id, query={ From 31487eb9746123b7c4e28be7e48908773beab40c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 22:57:48 +0700 Subject: [PATCH 46/77] release 2017.02.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 1 + docs/supportedsites.md | 7 +++++-- youtube_dl/version.py | 2 +- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 8914569b6..11fd56038 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.02.01 +[debug] youtube-dl version 2017.02.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 947590b94..5323769d8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.02.04 Core + Add --playlist-random to shuffle playlists (#11889, #11901) diff --git a/README.md b/README.md index 2ee00f515..89876bd7a 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo automatically resized from an initial value of SIZE. --playlist-reverse Download playlist videos in reverse order + --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with expected file size (experimental) --hls-prefer-native Use the native HLS downloader instead of diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d900f5e12..50a339bc4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -84,6 +84,7 @@ - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** + - **bangumi.bilibili.com**: BiliBili番剧 - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles @@ -211,7 +212,8 @@ - **DRBonanza** - **Dropbox** - **DrTuber** - - **DRTV** + - **drtv** + - **drtv:live** - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -247,6 +249,8 @@ - **fc2:embed** - **Fczenit** - **fernsehkritik.tv** + - **filmon** + - **filmon:channel** - **Firstpost** - **FiveTV** - **Flickr** @@ -703,7 +707,6 @@ - **Spiegeltv** - **Spike** - **Sport5** - - **SportBox** - **SportBoxEmbed** - **SportDeutschland** - **Sportschau** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0f9b6b703..376b31397 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.01' +__version__ = '2017.02.04' From 8e4041cf3f8e769ee2188f3db4747b7133ab5c2d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 17:02:12 +0100 Subject: [PATCH 47/77] [radiocanada] fix extraction for toutv rtmp formats --- youtube_dl/extractor/radiocanada.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 321917ad0..3b40002a8 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -54,9 +54,8 @@ class RadioCanadaIE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) device_types = ['ipad'] - if app_code != 'toutv': - device_types.append('flash') if not smuggled_data: + device_types.append('flash') device_types.append('android') formats = [] @@ -103,7 +102,7 @@ class RadioCanadaIE(InfoExtractor): continue f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) protocol = determine_protocol({'url': f_url}) - formats.append({ + f = { 'format_id': '%s-%d' % (protocol, tbr), 'url': f_url, 'ext': 'flv' if protocol == 'rtmp' else ext, @@ -111,7 +110,14 @@ class RadioCanadaIE(InfoExtractor): 'width': int_or_none(url_e.get('width')), 'height': int_or_none(url_e.get('height')), 'tbr': tbr, - }) + } + mobj = re.match(r'(?P<url>rtmp://[^/]+/[^/]+)/(?P<playpath>[^?]+)(?P<auth>\?.+)', f_url) + if mobj: + f.update({ + 'url': mobj.group('url') + mobj.group('auth'), + 'play_path': mobj.group('playpath'), + }) + formats.append(f) if protocol == 'rtsp': base_url = self._search_regex( r'rtsp://([^?]+)', f_url, 'base url', default=None) From 9db8f6c54021a9c809c8ae65a37544ad566ed159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:21:07 +0700 Subject: [PATCH 48/77] [twitch:stream] Improve _VALID_URL (closes #11971) --- youtube_dl/extractor/twitch.py | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 1ca159a4d..bbba394b0 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -447,7 +447,14 @@ class TwitchHighlightsIE(TwitchVideosBaseIE): class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' - _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?twitch\.tv/| + player\.twitch\.tv/\?.*?\bchannel= + ) + (?P<id>[^/#?]+) + ''' _TESTS = [{ 'url': 'http://www.twitch.tv/shroomztv', @@ -471,8 +478,25 @@ class TwitchStreamIE(TwitchBaseIE): }, { 'url': 'http://www.twitch.tv/miracle_doto#profile-0', 'only_matching': True, + }, { + 'url': 'https://player.twitch.tv/?channel=lotsofs', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False + if any(ie.suitable(url) for ie in ( + TwitchVideoIE, + TwitchChapterIE, + TwitchVodIE, + TwitchProfileIE, + TwitchAllVideosIE, + TwitchUploadsIE, + TwitchPastBroadcastsIE, + TwitchHighlightsIE)) + else super(TwitchStreamIE, cls).suitable(url)) + def _real_extract(self, url): channel_id = self._match_id(url) From 3144eccf551afc4c5e66e06de541c033e6f90681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:22:28 +0700 Subject: [PATCH 49/77] [ChangeLog] Actualize --- ChangeLog | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 5323769d8..fe9cd3440 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,12 +1,38 @@ +version <unreleased> + +Extractors ++ [twitch:stream] Add support for player.twitch.tv (#11971) + + version 2017.02.04 Core + Add --playlist-random to shuffle playlists (#11889, #11901) +* [utils] Improve comments processing in js_to_json (#11947) +* [utils] Handle single-line comments in js_to_json +* [downloader/external:ffmpeg] Minimize the use of aac_adtstoasc filter Extractors ++ [piksel] Add another app token pattern (#11969) ++ [vk] Capture and output author blocked error message (#11965) ++ [turner] Fix secure HLS formats downloading with ffmpeg (#11358, #11373, + #11800) ++ [drtv] Add support for live and radio sections (#1827, #3427) +* [myspace] Fix extraction and extract HLS and HTTP formats ++ [youtube] Add format info for itag 325 and 328 +* [vine] Fix extraction (#11955) +- [sportbox] Remove extractor (#11954) ++ [filmon] Add support for filmon.com (#11187) + [infoq] Add audio only formats (#11565) +* [douyutv] Improve room id regular expression (#11931) +* [iprima] Fix extraction (#11920, #11896) * [youtube] Fix ytsearch when cookies are provided (#11924) +* [go] Relax video id regular expression (#11937) +* [facebook] Fix title extraction (#11941) ++ [youtube:playlist] Recognize TL playlists (#11945) + [bilibili] Support new Bangumi URLs (#11845) ++ [cbc:watch] Extract audio codec for audio only formats (#11893) ++ [elpais] Fix extraction for some URLs (#11765) version 2017.02.01 @@ -18,7 +44,6 @@ Extractors + [vimeo] Extract upload timestamp + [vimeo] Extract license (#8726, #11880) + [nrk:series] Add support for series (#11571, #11711) -+ [elpais] Fix extraction for some URLs (#11765) version 2017.01.31 From 7bccd5fc8ac35b1a3952522c0aa176c982f20206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:23:38 +0700 Subject: [PATCH 50/77] [ChangeLog] Actualize --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index fe9cd3440..76be8dbd9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -2,6 +2,7 @@ version <unreleased> Extractors + [twitch:stream] Add support for player.twitch.tv (#11971) +* [radiocanada] Fix extraction for toutv rtmp formats version 2017.02.04 From a713a86755ba864a7b765fd2ce9a5ac8a8f4cc63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 4 Feb 2017 23:26:39 +0700 Subject: [PATCH 51/77] release 2017.02.04.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 11fd56038..15e7d4944 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.02.04.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.02.04.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.02.04 +[debug] youtube-dl version 2017.02.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 76be8dbd9..23a729559 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version <unreleased> +version 2017.02.04.1 Extractors + [twitch:stream] Add support for player.twitch.tv (#11971) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 376b31397..5dde47a26 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.04' +__version__ = '2017.02.04.1' From 3d2c2752c5cd70fc7f9cebe8c4683a1de626017d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Sat, 4 Feb 2017 18:18:03 +0100 Subject: [PATCH 52/77] [afreecatv] extract rtmp formats --- youtube_dl/extractor/afreecatv.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 4f6cdb8a2..e0a0f7c57 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -221,10 +221,23 @@ class AfreecaTVGlobalIE(AfreecaTVIE): s_url = s.get('purl') if not s_url: continue - # TODO: extract rtmp formats - if s.get('stype') == 'HLS': + stype = s.get('stype') + if stype == 'HLS': formats.extend(self._extract_m3u8_formats( - s_url, channel_id, 'mp4', fatal=False)) + s_url, channel_id, 'mp4', m3u8_id=stype, fatal=False)) + elif stype == 'RTMP': + format_id = [stype] + label = s.get('label') + if label: + format_id.append(label) + formats.append({ + 'format_id': '-'.join(format_id), + 'url': s_url, + 'tbr': int_or_none(s.get('bps')), + 'height': int_or_none(s.get('brt')), + 'ext': 'flv', + 'rtmp_live': True, + }) self._sort_formats(formats) info.update({ From 49bd8d5e2e5c4de8c1c409adffc557cb198f7eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 5 Feb 2017 02:41:22 +0700 Subject: [PATCH 53/77] [travis] Add python 3.6 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index c74c9cc12..4833c76e9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,7 @@ python: - "3.3" - "3.4" - "3.5" + - "3.6" sudo: false script: nosetests test --verbose notifications: From 6fd138bed892ac8ae1714d64f4a53d8ea7a1d5bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 5 Feb 2017 13:36:52 +0700 Subject: [PATCH 54/77] [sportbox] PEP 8 --- youtube_dl/extractor/sportbox.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py index 05a0b5a80..e7bd5bf91 100644 --- a/youtube_dl/extractor/sportbox.py +++ b/youtube_dl/extractor/sportbox.py @@ -4,11 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - js_to_json, - unified_strdate, -) +from ..utils import js_to_json class SportBoxEmbedIE(InfoExtractor): From 6ef3e65a7b244d5e432e764772177c7d48cab237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 5 Feb 2017 13:37:27 +0700 Subject: [PATCH 55/77] [videopress] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 22 +++++++ youtube_dl/extractor/videopress.py | 99 ++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+) create mode 100644 youtube_dl/extractor/videopress.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 32420937c..cf608faee 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1095,6 +1095,7 @@ from .videomore import ( VideomoreSeasonIE, ) from .videopremium import VideoPremiumIE +from .videopress import VideoPressIE from .vidio import VidioIE from .vidme import ( VidmeIE, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a23486620..4156cf27d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -81,6 +81,7 @@ from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE from .openload import OpenloadIE +from .videopress import VideoPressIE class GenericIE(InfoExtractor): @@ -1473,6 +1474,21 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [TwentyMinutenIE.ie_key()], + }, + { + # VideoPress embed + 'url': 'https://en.support.wordpress.com/videopress/', + 'info_dict': { + 'id': 'OcobLTqC', + 'ext': 'm4v', + 'title': 'IMG_5786', + 'timestamp': 1435711927, + 'upload_date': '20150701', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [VideoPressIE.ie_key()], } # { # # TODO: find another test @@ -2438,6 +2454,12 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( openload_urls, ie=OpenloadIE.ie_key()) + # Look for VideoPress embeds + videopress_urls = VideoPressIE._extract_urls(webpage) + if videopress_urls: + return _playlist_from_matches( + videopress_urls, ie=VideoPressIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py new file mode 100644 index 000000000..049db25a5 --- /dev/null +++ b/youtube_dl/extractor/videopress.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import random +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + parse_age_limit, + qualities, + try_get, + unified_timestamp, + urljoin, +) + + +class VideoPressIE(InfoExtractor): + _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'https://videopress.com/embed/kUJmAcSf', + 'md5': '706956a6c875873d51010921310e4bc6', + 'info_dict': { + 'id': 'kUJmAcSf', + 'ext': 'mp4', + 'title': 'VideoPress Demo', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 634.6, + 'timestamp': 1434983935, + 'upload_date': '20150622', + 'age_limit': 0, + }, + }, { + # 17+, requires birth_* params + 'url': 'https://videopress.com/embed/iH3gstfZ', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + webpage) + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, + video_id, query={ + 'birth_month': random.randint(1, 12), + 'birth_day': random.randint(1, 31), + 'birth_year': random.randint(1950, 1995), + }) + + title = video['title'] + + def base_url(scheme): + return try_get( + video, lambda x: x['file_url_base'][scheme], compat_str) + + base_url = base_url('https') or base_url('http') + + QUALITIES = ('std', 'dvd', 'hd') + quality = qualities(QUALITIES) + + formats = [] + for format_id, f in video['files'].items(): + if not isinstance(f, dict): + continue + for ext, path in f.items(): + if ext in ('mp4', 'ogg'): + formats.append({ + 'url': urljoin(base_url, path), + 'format_id': '%s-%s' % (format_id, ext), + 'ext': determine_ext(path, ext), + 'quality': quality(format_id), + }) + original_url = try_get(video, lambda x: x['original'], compat_str) + if original_url: + formats.append({ + 'url': original_url, + 'format_id': 'original', + 'quality': len(QUALITIES), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('poster'), + 'duration': float_or_none(video.get('duration'), 1000), + 'timestamp': unified_timestamp(video.get('upload_date')), + 'age_limit': parse_age_limit(video.get('rating')), + 'formats': formats, + } From e4e50f60b1040a4b6aa8ecb9139f7d5de195f407 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 5 Feb 2017 21:41:08 +0800 Subject: [PATCH 56/77] [googledrive] Fix extraction on Python 3.6 Since Python 3.6, invalid escape sequences are deprecated. It's likely that there are invalid escape sequences somewhere on the webpage, so instead of unescaping the whole webpage, just unescape the URL. See https://bugs.python.org/issue27364. That change was designed for string literals, while it affects the 'unicode_escape' encoding as well. The code path is: str.decode('unicode_escape') codecs.unicode_escape_decode() PyUnicode_DecodeUnicodeEscape() --- ChangeLog | 6 ++++++ youtube_dl/extractor/googledrive.py | 9 +++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 23a729559..a0025ab91 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version <unreleased> + +Extractors +* [googledrive] Fix extraction on Python 3.6 + + version 2017.02.04.1 Extractors diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 766fc26d0..fec36cbbb 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + lowercase_escape, ) @@ -13,12 +14,12 @@ class GoogleDriveIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', - 'md5': '881f7700aec4f538571fa1e0eed4a7b6', + 'md5': 'd109872761f7e7ecf353fa108c0dbe1e', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', - 'duration': 46, + 'duration': 45, } }, { # video id is longer than 28 characters @@ -55,7 +56,7 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + 'http://docs.google.com/file/d/%s' % video_id, video_id) reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) if reason: @@ -74,7 +75,7 @@ class GoogleDriveIE(InfoExtractor): resolution = fmt.split('/')[1] width, height = resolution.split('x') formats.append({ - 'url': fmt_url, + 'url': lowercase_escape(fmt_url), 'format_id': fmt_id, 'resolution': resolution, 'width': int_or_none(width), From caf0f5f8b7d0854caaf6778fe3a646ee0d7668fe Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 5 Feb 2017 21:48:13 +0800 Subject: [PATCH 57/77] [iwara] Fix extraction (closes #11781) --- ChangeLog | 1 + youtube_dl/extractor/iwara.py | 41 +++++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/ChangeLog b/ChangeLog index a0025ab91..77286dbef 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version <unreleased> Extractors +* [iwara] Fix extraction (#11781) * [googledrive] Fix extraction on Python 3.6 diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 8d7e7f472..011274b02 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -3,14 +3,18 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse -from ..utils import remove_end +from ..utils import ( + int_or_none, + mimetype2ext, + remove_end, +) class IwaraIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', - 'md5': '1d53866b2c514b23ed69e4352fdc9839', + # md5 is unstable 'info_dict': { 'id': 'amVwUl1EHpAD9RD', 'ext': 'mp4', @@ -23,17 +27,17 @@ class IwaraIE(InfoExtractor): 'info_dict': { 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', 'ext': 'mp4', - 'title': '[3D Hentai] Kyonyu Ã\x97 Genkai Ã\x97 Emaki Shinobi Girls.mp4', + 'title': '[3D Hentai] Kyonyu × Genkai × Emaki Shinobi Girls.mp4', 'age_limit': 18, }, 'add_ie': ['GoogleDrive'], }, { 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', - 'md5': '1d85f1e5217d2791626cff5ec83bb189', + # md5 is unstable 'info_dict': { 'id': '6liAP9s2Ojc', 'ext': 'mp4', - 'age_limit': 0, + 'age_limit': 18, 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', 'upload_date': '20160910', @@ -52,9 +56,9 @@ class IwaraIE(InfoExtractor): # ecchi is 'sexy' in Japanese age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 - entries = self._parse_html5_media_entries(url, webpage, video_id) + video_data = self._download_json('http://www.iwara.tv/api/video/%s' % video_id, video_id) - if not entries: + if not video_data: iframe_url = self._html_search_regex( r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, 'iframe URL', group='url') @@ -67,11 +71,24 @@ class IwaraIE(InfoExtractor): title = remove_end(self._html_search_regex( r'<title>([^<]+)', webpage, 'title'), ' | Iwara') - info_dict = entries[0] - info_dict.update({ + formats = [] + for a_format in video_data: + format_id = a_format.get('resolution') + height = int_or_none(self._search_regex( + r'(\d+)p', format_id, 'height', default=None)) + formats.append({ + 'url': a_format['uri'], + 'format_id': format_id, + 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', + 'height': height, + 'quality': 1 if format_id == 'Source' else 0, + }) + + self._sort_formats(formats) + + return { 'id': video_id, 'title': title, 'age_limit': age_limit, - }) - - return info_dict + 'formats': formats, + } From 2ab2c0d1f53f66614eda4fefb042e851e78097f0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 5 Feb 2017 22:30:13 +0800 Subject: [PATCH 58/77] [iwara] Add width (closes #11724) The heuristic is from #11724 --- youtube_dl/extractor/iwara.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py index 011274b02..a7514fc80 100644 --- a/youtube_dl/extractor/iwara.py +++ b/youtube_dl/extractor/iwara.py @@ -81,6 +81,7 @@ class IwaraIE(InfoExtractor): 'format_id': format_id, 'ext': mimetype2ext(a_format.get('mime')) or 'mp4', 'height': height, + 'width': int_or_none(height / 9.0 * 16.0 if height else None), 'quality': 1 if format_id == 'Source' else 0, }) From 019f4c03717bfd2b887309e5a4c96ea82cbedf34 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 5 Feb 2017 22:47:04 +0800 Subject: [PATCH 59/77] [bandcamp] Fix extraction for incomplete albums Closes #11727 --- ChangeLog | 1 + youtube_dl/extractor/bandcamp.py | 19 ++++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 77286dbef..984191925 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [bandcamp] Fix extraction for incomplete albums (#11727) * [iwara] Fix extraction (#11781) * [googledrive] Fix extraction on Python 3.6 diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 88c590e98..056e06376 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor): 'id': 'entropy-ep', }, 'playlist_mincount': 3, + }, { + # not all tracks have songs + 'url': 'https://insulters.bandcamp.com/album/we-are-the-plague', + 'info_dict': { + 'id': 'we-are-the-plague', + 'title': 'WE ARE THE PLAGUE', + 'uploader_id': 'insulters', + }, + 'playlist_count': 2, }] def _real_extract(self, url): @@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor): album_id = mobj.group('album_id') playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - tracks_paths = re.findall(r'