From 79a2e94e79e65cdf4898bc2dedb6a1bb4ca9af3c Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Mon, 2 May 2016 13:21:39 +1000 Subject: [PATCH 01/18] Instead of replacing accented characters with an underscore when sanitizing file names in restricted mode, replace them with their non-accented equivalents fixes #9347 --- test/test_utils.py | 9 +++++++-- youtube_dl/utils.py | 9 +++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index e16a6761b..0072ba241 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase): self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True)) self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True)) - tests = 'a\xe4b\u4e2d\u56fd\u7684c' - self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c') + tests = 'aäb\u4e2d\u56fd\u7684c' + self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c') self.assertTrue(sanitize_filename('\xf6', restricted=True) != '') # No empty filename forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#' @@ -155,6 +155,11 @@ class TestUtil(unittest.TestCase): self.assertTrue(sanitize_filename('-', restricted=True) != '') self.assertTrue(sanitize_filename(':', restricted=True) != '') + self.assertEqual(sanitize_filename( + 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), + 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') + pass + def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7bcc85e2b..f74f62268 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -14,8 +14,8 @@ import email.utils import errno import functools import gzip -import itertools import io +import itertools import json import locale import math @@ -24,8 +24,8 @@ import os import pipes import platform import re -import ssl import socket +import ssl import struct import subprocess import sys @@ -365,6 +365,11 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): + accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + if restricted and char in accents: + return accents[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': From 31c4448f6ea8ec4236e0b67af33b53db1a93ef13 Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Mon, 2 May 2016 13:25:12 +1000 Subject: [PATCH 02/18] Instead of replacing accented characters with an underscore when sanitizing file names in restricted mode, replace them with their non-accented equivalents fixes #9347 --- test/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0072ba241..00ada95ec 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -158,7 +158,6 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_filename( 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') - pass def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') From c587cbb793a6eda4fc7b1c7b4163e236abec1a00 Mon Sep 17 00:00:00 2001 From: Adam Thalhammer Date: Tue, 3 May 2016 10:40:30 +1000 Subject: [PATCH 03/18] improved performance by extracting accented chars to top level --- youtube_dl/utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f74f62268..a5922b2b5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = ( 'wav', 'f4f', 'f4m', 'm3u8', 'smil') +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + def preferredencoding(): """Get preferred encoding. @@ -365,11 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): - accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) - if restricted and char in accents: - return accents[char] + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': From e960c3c223acadb2fac81fb68595d902cf21e349 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:25:39 +0600 Subject: [PATCH 04/18] [yandexmusic:playlist] Improve extraction (Closes #6801) --- youtube_dl/extractor/yandexmusic.py | 38 +++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index ce3723b55..22050add3 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -177,7 +177,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_NAME = 'yandexmusic:playlist' IE_DESC = 'Яндекс.Музыка - Плейлист' - _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' + _VALID_URL = r'https?://music\.yandex\.(?Pru|kz|ua|by)/users/(?P[^/]+)/playlists/(?P\d+)' _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', @@ -201,19 +201,32 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + tld = mobj.group('tld') + user = mobj.group('user') + playlist_id = mobj.group('id') - webpage = self._download_webpage(url, playlist_id) + playlist = self._download_json( + 'https://music.yandex.%s/handlers/playlist.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + 'X-Retpath-Y': url, + }, + query={ + 'owner': user, + 'kinds': playlist_id, + 'light': 'true', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, + 'overembed': 'false', + })['playlist'] - mu = self._parse_json( - self._search_regex( - r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), - playlist_id) - - playlist = mu['pageData']['playlist'] tracks, track_ids = playlist['tracks'], playlist['trackIds'] - # tracks dictionary shipped with webpage is limited to 150 tracks, + # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) @@ -222,10 +235,9 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'https://music.yandex.ru/handlers/track-entries.jsx', urlencode_postdata({ 'entries': ','.join(missing_track_ids), - 'lang': mu.get('settings', {}).get('lang', 'en'), - 'external-domain': 'music.yandex.ru', + 'lang': tld, + 'external-domain': 'music.yandex.%s' % tld, 'overembed': 'false', - 'sign': mu.get('authData', {}).get('user', {}).get('sign'), 'strict': 'true', })) request.add_header('Referer', url) From 15fc0658f75403e76f9cb29dd4ce5d1e514d4bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:33:29 +0600 Subject: [PATCH 05/18] [yandexmusic:playlist] Modernize --- youtube_dl/extractor/yandexmusic.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 22050add3..0dda901a8 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -231,20 +231,21 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): if len(tracks) < len(track_ids): present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) - request = sanitized_Request( - 'https://music.yandex.ru/handlers/track-entries.jsx', - urlencode_postdata({ + missing_tracks = self._download_json( + 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, + playlist_id, 'Downloading missing tracks JSON', + fatal=False, + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, + query={ 'entries': ','.join(missing_track_ids), 'lang': tld, 'external-domain': 'music.yandex.%s' % tld, 'overembed': 'false', 'strict': 'true', - })) - request.add_header('Referer', url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - - missing_tracks = self._download_json( - request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + }) if missing_tracks: tracks.extend(missing_tracks) From d36724cca47c0dbbe691d5467c0c88e28a6627db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:34:37 +0600 Subject: [PATCH 06/18] [yandexmusic:playlist] Remove unused imports --- youtube_dl/extractor/yandexmusic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 0dda901a8..a9ebbbc04 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -10,8 +10,6 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, - sanitized_Request, - urlencode_postdata, ) From 203a3c0e6a589627f9b0f0c4c6ec56e24c27272c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:35:28 +0600 Subject: [PATCH 07/18] [yandexmusic:playlist] Make title optional --- youtube_dl/extractor/yandexmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index a9ebbbc04..6e7a23782 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -250,4 +250,4 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): return self.playlist_result( self._build_playlist(tracks), compat_str(playlist_id), - playlist['title'], playlist.get('description')) + playlist.get('title'), playlist.get('description')) From 2a48e6f01a20545cd117cacb78463dba87d97b21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 22:45:01 +0600 Subject: [PATCH 08/18] [yandexmusic:playlist] Respect track order for long (>150) playlists --- youtube_dl/extractor/yandexmusic.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 6e7a23782..283b55a91 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -222,13 +222,17 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks, track_ids = playlist['tracks'], playlist['trackIds'] + tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. if len(tracks) < len(track_ids): - present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) - missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) + present_track_ids = set([ + compat_str(track['id']) + for track in tracks if track.get('id')]) + missing_track_ids = [ + track_id for track_id in track_ids + if track_id not in present_track_ids] missing_tracks = self._download_json( 'https://music.yandex.%s/handlers/track-entries.jsx' % tld, playlist_id, 'Downloading missing tracks JSON', From aabdc83d6e9ea1e6a28c48cb228a644c55fc11b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 23:03:44 +0600 Subject: [PATCH 09/18] [udemy] Fix course enroll (Closes #9393) --- youtube_dl/extractor/udemy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index d1e6f2703..7ebd231f0 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -84,7 +84,8 @@ class UdemyIE(InfoExtractor): if enroll_url: webpage = self._download_webpage( combine_url(base_url, enroll_url), - course_id, 'Enrolling in the course') + course_id, 'Enrolling in the course', + headers={'Referer': base_url}) if '>You have enrolled in' in webpage: self.to_screen('%s: Successfully enrolled in the course' % course_id) From 75b81df3af31bdbe794b55924dd9ab85e1314d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 23:14:12 +0600 Subject: [PATCH 10/18] [udemy] Modernize --- youtube_dl/extractor/udemy.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 7ebd231f0..13e0cd237 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, ) @@ -91,12 +90,12 @@ class UdemyIE(InfoExtractor): def _download_lecture(self, course_id, lecture_id): return self._download_json( - 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % ( - course_id, lecture_id, compat_urllib_parse_urlencode({ - 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', - })), - lecture_id, 'Downloading lecture JSON') + 'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?' + % (course_id, lecture_id), + lecture_id, 'Downloading lecture JSON', query={ + 'fields[lecture]': 'title,description,view_html,asset', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + }) def _handle_error(self, response): if not isinstance(response, dict): @@ -156,13 +155,13 @@ class UdemyIE(InfoExtractor): 'password': password, }) - request = sanitized_Request( - self._LOGIN_URL, urlencode_postdata(login_form)) - request.add_header('Referer', self._ORIGIN_URL) - request.add_header('Origin', self._ORIGIN_URL) - response = self._download_webpage( - request, None, 'Logging in as %s' % username) + self._LOGIN_URL, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._ORIGIN_URL, + 'Origin': self._ORIGIN_URL, + }) if not is_logged(response): error = self._html_search_regex( From 9da526aae75fc7bbc07e791b9f4eab78c007636c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 4 May 2016 23:18:48 +0600 Subject: [PATCH 11/18] [yandexmusic:playlist] Update test --- youtube_dl/extractor/yandexmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 283b55a91..0f78466e6 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -194,7 +194,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'id': '1036', 'title': 'Музыка 90-х', }, - 'playlist_count': 310, + 'playlist_mincount': 300, 'skip': 'Travis CI servers blocked by YandexMusic', }] From 4f8c56eb4e52d0a61a5facc6f22e36a2e420f4c9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 5 May 2016 17:55:37 +0800 Subject: [PATCH 12/18] [fczenit] Fix extraction and update test Closes #9359 --- youtube_dl/extractor/fczenit.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py index f1f150ef2..8d1010b88 100644 --- a/youtube_dl/extractor/fczenit.py +++ b/youtube_dl/extractor/fczenit.py @@ -1,20 +1,19 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse class FczenitIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P[0-9]+)' _TEST = { - 'url': 'http://fc-zenit.ru/video/gl6785/', - 'md5': '458bacc24549173fe5a5aa29174a5606', + 'url': 'http://fc-zenit.ru/video/41044/', + 'md5': '0e3fab421b455e970fa1aa3891e57df0', 'info_dict': { - 'id': '6785', + 'id': '41044', 'ext': 'mp4', - 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + 'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»', }, } @@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'
([^<]+)', webpage, 'title') + video_title = self._html_search_regex( + r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title') - bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') - bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + video_items = self._parse_json(self._search_regex( + r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'), + video_id) + + def merge_dicts(*dicts): + ret = {} + for a_dict in dicts: + ret.update(a_dict) + return ret formats = [{ - 'url': furl, - 'tbr': tbr, - } for furl, tbr in bitrates] + 'url': compat_urlparse.urljoin(url, video_url), + 'tbr': int(tbr), + } for tbr, video_url in merge_dicts(*video_items).items()] self._sort_formats(formats) From 758a05924180ad2d1197ee1a293f28363ace54aa Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 5 May 2016 13:12:28 +0100 Subject: [PATCH 13/18] [dailymail] Add new extractor(closes #2667) --- youtube_dl/extractor/dailymail.py | 61 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 62 insertions(+) create mode 100644 youtube_dl/extractor/dailymail.py diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py new file mode 100644 index 000000000..b60a1d813 --- /dev/null +++ b/youtube_dl/extractor/dailymail.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_protocol, +) + + +class DailyMailIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P[0-9]+)' + _TEST = { + 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', + 'md5': '2f639d446394f53f3a33658b518b6615', + 'info_dict': { + 'id': '1288527', + 'ext': 'mp4', + 'title': 'Turn any video into an impressionist masterpiece', + 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._search_regex( + r"data-opts='({.+?})'", webpage, 'video data'), video_id) + title = video_data['title'] + video_sources = self._download_json(video_data.get( + 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + formats = [] + for rendition in video_sources['renditions']: + rendition_url = rendition.get('url') + if not rendition_url: + continue + tbr = int_or_none(rendition.get('encodingRate'), 1000) + container = rendition.get('videoContainer') + is_hls = container == 'M2TS' + protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url}) + formats.append({ + 'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''), + 'url': rendition_url, + 'width': int_or_none(rendition.get('frameWidth')), + 'height': int_or_none(rendition.get('frameHeight')), + 'tbr': tbr, + 'vcodec': rendition.get('videoCodec'), + 'container': container, + 'protocol': protocol, + 'ext': 'mp4' if is_hls else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('descr'), + 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef4431364..aac85066f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -157,6 +157,7 @@ from .cspan import CSpanIE from .ctsnews import CtsNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE +from .dailymail import DailyMailIE from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, From 7d08f6073d42c8623031615c4b7e0bb26136c128 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 5 May 2016 20:06:59 +0800 Subject: [PATCH 14/18] [kuwo:category] Update test --- youtube_dl/extractor/kuwo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 616ed19e1..11b31a699 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor): 'info_dict': { 'id': '86375', 'title': '八十年代精选', + 'description': '这些都是属于八十年代的回忆!', }, 'playlist_mincount': 24, } From b1c6a5bac8e3daf233cc79f0172d907905290f24 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 5 May 2016 20:50:39 +0800 Subject: [PATCH 15/18] [Makefile] Remove more media files in `make clean` --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 06cffcb71..c9ce216d1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites clean: - rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe find . -name "*.pyc" -delete find . -name "*.class" -delete From ac12e888f9d76c75666822a79c125db8577e5fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 5 May 2016 21:02:54 +0600 Subject: [PATCH 16/18] [redtube] Extract all formats, duration, upload date and view count (Closes #9397) --- youtube_dl/extractor/redtube.py | 59 +++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 7ba41ba59..1e532286c 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -1,7 +1,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + str_to_int, + unified_strdate, +) class RedTubeIE(InfoExtractor): @@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor): 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', + 'upload_date': '20120831', + 'duration': 596, + 'view_count': int, 'age_limit': 18, } } @@ -24,12 +32,40 @@ class RedTubeIE(InfoExtractor): if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']): raise ExtractorError('Video %s has been removed' % video_id, expected=True) - video_url = self._html_search_regex( - r'', webpage, 'video URL') - video_title = self._html_search_regex( - r'

(.+?)

', - webpage, 'title') - video_thumbnail = self._og_search_thumbnail(webpage) + title = self._html_search_regex( + (r'

(?P.+?)</h1>', + r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), + webpage, 'title', group='title') + + formats = [] + sources = self._parse_json( + self._search_regex( + r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'), + video_id, fatal=False) + if sources and isinstance(sources, dict): + for format_id, format_url in sources.items(): + if format_url: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + else: + video_url = self._html_search_regex( + r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') + formats.append({'url': video_url}) + self._sort_formats(formats) + + thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._search_regex( + r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + webpage, 'upload date', fatal=False)) + duration = int_or_none(self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + webpage, 'view count', fatal=False)) + # No self-labeling, but they describe themselves as # "Home of Videos Porno" @@ -37,9 +73,12 @@ class RedTubeIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, 'age_limit': age_limit, + 'formats': formats, } From 915620fd6894d92f89f9e5c9362d20f94e787e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 May 2016 21:34:06 +0600 Subject: [PATCH 17/18] [redtube] PEP 8 --- youtube_dl/extractor/redtube.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index 1e532286c..721fc3a9e 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -66,7 +66,6 @@ class RedTubeIE(InfoExtractor): r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', webpage, 'view count', fatal=False)) - # No self-labeling, but they describe themselves as # "Home of Videos Porno" age_limit = 18 From 6f59aa934bd5842a57d4c27ace0f3735be18ac27 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 6 May 2016 02:14:39 +0800 Subject: [PATCH 18/18] [periscope:user] Add new extractor for user pages Closes #9388 --- youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/periscope.py | 37 ++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aac85066f..c9d1422e5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -561,7 +561,10 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .people import PeopleIE -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 514e9b433..0a4bc761d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -7,6 +7,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' + IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ @@ -79,3 +80,39 @@ class PeriscopeIE(InfoExtractor): 'thumbnails': thumbnails, 'formats': formats, } + + +class PeriscopeUserIE(InfoExtractor): + _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$' + IE_DESC = 'Periscope user videos' + IE_NAME = 'periscope:user' + + _TEST = { + 'url': 'https://www.periscope.tv/LularoeHusbandMike/', + 'info_dict': { + 'id': 'LularoeHusbandMike', + 'title': 'LULAROE HUSBAND MIKE', + }, + # Periscope only shows videos in the last 24 hours, so it's possible to + # get 0 videos + 'playlist_mincount': 0, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + broadcast_data = self._parse_json(self._html_search_meta( + 'broadcast-data', webpage, default='{}'), user_id) + username = broadcast_data.get('user', {}).get('display_name') + user_broadcasts = self._parse_json( + self._html_search_meta('user-broadcasts', webpage, default='{}'), + user_id) + + entries = [ + self.url_result( + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) + for broadcast in user_broadcasts.get('broadcasts', [])] + + return self.playlist_result(entries, user_id, username)