From 233c1c0e76d64c9e13dc8968bfd8a014c49e66a8 Mon Sep 17 00:00:00 2001 From: Antti Ajanki Date: Sun, 3 May 2015 11:04:14 +0300 Subject: [PATCH 001/208] [downloader/f4m] Fragment filenames must be sanitized because the fragment was written to a file with a sanitized name by http_dl.download() --- youtube_dl/downloader/f4m.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b1a858c45..3cb07e15f 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -396,18 +396,19 @@ class F4mFD(FileDownloader): success = http_dl.download(frag_filename, {'url': url}) if not success: return False - with open(frag_filename, 'rb') as down: - down_data = down.read() - reader = FlvReader(down_data) - while True: - _, box_type, box_data = reader.read_box_info() - if box_type == b'mdat': - dest_stream.write(box_data) - break + (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') + down_data = down.read() + down.close() + reader = FlvReader(down_data) + while True: + _, box_type, box_data = reader.read_box_info() + if box_type == b'mdat': + dest_stream.write(box_data) + break if live: - os.remove(frag_filename) + os.remove(encodeFilename(frag_sanitized)) else: - frags_filenames.append(frag_filename) + frags_filenames.append(frag_sanitized) except (compat_urllib_error.HTTPError, ) as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. Continue @@ -430,7 +431,7 @@ class F4mFD(FileDownloader): elapsed = time.time() - start self.try_rename(tmpfilename, filename) for frag_file in frags_filenames: - os.remove(frag_file) + os.remove(encodeFilename(frag_file)) fsize = os.path.getsize(encodeFilename(filename)) self._hook_progress({ From bfed4813b224f720e716de4d4f27471ca9053a0b Mon Sep 17 00:00:00 2001 From: linhua55 <5linhua5@gmail.com> Date: Sat, 18 Jul 2015 23:33:42 +0800 Subject: [PATCH 002/208] fix extraction for http://www.tudou.com/albumplay/cJAHGih4yYg.html --- youtube_dl/extractor/tudou.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c89de5ba4..8095e18d2 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -9,7 +9,7 @@ from .common import InfoExtractor class TudouIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/.*?/(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/?.*/(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': '140a49ed444bd22f93330985d8475fcb', From 061f62da54cb4184a039108e40dee8e9eb2611c1 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 20 Aug 2015 12:56:11 +0800 Subject: [PATCH 003/208] [vlive] New extractor for vlive.tv --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/vlive.py | 94 ++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 youtube_dl/extractor/vlive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1c53a5632..6bee5b63c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -735,6 +735,7 @@ from .vk import ( VKIE, VKUserVideosIE, ) +from .vlive import VLiveIE from .vodlocker import VodlockerIE from .voicerepublic import VoiceRepublicIE from .vporn import VpornIE diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py new file mode 100644 index 000000000..b3bbd80fb --- /dev/null +++ b/youtube_dl/extractor/vlive.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import hmac +from hashlib import sha1 +from base64 import b64encode +from time import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + determine_ext +) +from ..compat import compat_urllib_parse + + +class VLiveIE(InfoExtractor): + IE_NAME = 'vlive' + _VALID_URL = r'https?://(?:(www|m)\.)?vlive\.tv/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://m.vlive.tv/video/1326', + 'md5': 'cc7314812855ce56de70a06a27314983', + 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': '[V] Girl\'s Day\'s Broadcast', + 'creator': 'Girl\'s Day', + 'upload_date': '20150817', + }, + } + _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://m.vlive.tv/video/%s' % video_id, + video_id, note='Download video page') + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + creator = self._html_search_regex( + r'([^<>]+)', webpage, 'creator') + upload_date = self._html_search_regex( + r'(\d{4}\.\d{2}\.\d{2})', webpage, + 'upload date', default=None, fatal=False) + if upload_date: + upload_date = upload_date.replace('.', '') + + url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id + msgpad = {'msgpad': '%.0f' % (time() * 1000)} + md = { + 'md': b64encode( + hmac.new(self._SECRET.encode('ascii'), + (url[:255] + msgpad['msgpad']).encode('ascii'), sha1).digest()) + } + url += '&' + compat_urllib_parse.urlencode(msgpad) + '&' + compat_urllib_parse.urlencode(md) + + playinfo = self._download_json(url, video_id, 'Downloading video json') + + if playinfo.get('message', '') != 'success': + raise ExtractorError(playinfo['message']) + + if not playinfo.get('result'): + raise ExtractorError('No videos found.') + + formats = [] + for vid in playinfo['result'].get('videos', {}).get('list', []): + formats.append({ + 'url': vid['source'], + 'ext': 'mp4', + 'abr': vid.get('bitrate', {}).get('audio'), + 'vbr': vid.get('bitrate', {}).get('video'), + 'format_id': vid['encodingOption']['name'], + 'height': vid.get('height'), + 'width': vid.get('width'), + }) + self._sort_formats(formats) + + subtitles = {} + for caption in playinfo['result'].get('captions', {}).get('list', []): + subtitles[caption['language']] = [ + {'ext': determine_ext(caption['source'], default_ext='vtt'), + 'url': caption['source']}] + + return { + 'id': video_id, + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + 'formats': formats, + 'upload_date': upload_date, + 'subtitles': subtitles, + } From eba470f2f22389ab32164e4eb39067ceecf900f5 Mon Sep 17 00:00:00 2001 From: ping Date: Mon, 24 Aug 2015 16:30:00 +0800 Subject: [PATCH 004/208] [vlive] Remove upload_date extraction & cleanup --- youtube_dl/extractor/vlive.py | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index b3bbd80fb..6a403cc64 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -25,7 +25,6 @@ class VLiveIE(InfoExtractor): 'ext': 'mp4', 'title': '[V] Girl\'s Day\'s Broadcast', 'creator': 'Girl\'s Day', - 'upload_date': '20150817', }, } _SECRET = 'rFkwZet6pqk1vQt6SxxUkAHX7YL3lmqzUMrU4IDusTo4jEBdtOhNfT4BYYAdArwH' @@ -41,21 +40,14 @@ class VLiveIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) creator = self._html_search_regex( r'([^<>]+)', webpage, 'creator') - upload_date = self._html_search_regex( - r'(\d{4}\.\d{2}\.\d{2})', webpage, - 'upload date', default=None, fatal=False) - if upload_date: - upload_date = upload_date.replace('.', '') - + url = 'http://global.apis.naver.com/globalV/globalV/vod/%s/playinfo?' % video_id - msgpad = {'msgpad': '%.0f' % (time() * 1000)} - md = { - 'md': b64encode( - hmac.new(self._SECRET.encode('ascii'), - (url[:255] + msgpad['msgpad']).encode('ascii'), sha1).digest()) - } - url += '&' + compat_urllib_parse.urlencode(msgpad) + '&' + compat_urllib_parse.urlencode(md) - + msgpad = '%.0f' % (time() * 1000) + md = b64encode( + hmac.new(self._SECRET.encode('ascii'), + (url[:255] + msgpad).encode('ascii'), sha1).digest() + ) + url += '&' + compat_urllib_parse.urlencode({'msgpad': msgpad, 'md': md}) playinfo = self._download_json(url, video_id, 'Downloading video json') if playinfo.get('message', '') != 'success': @@ -89,6 +81,5 @@ class VLiveIE(InfoExtractor): 'creator': creator, 'thumbnail': thumbnail, 'formats': formats, - 'upload_date': upload_date, 'subtitles': subtitles, } From 95e431e9ec2477694d368a050222d6381a6f88ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 25 Aug 2015 21:08:38 +0600 Subject: [PATCH 005/208] [mailru] Skip tests --- youtube_dl/extractor/mailru.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 54a14cb94..ab1300185 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -25,6 +25,7 @@ class MailRuIE(InfoExtractor): 'uploader_id': 'sonypicturesrus@mail.ru', 'duration': 184, }, + 'skip': 'Not accessible from Travis CI server', }, { 'url': 'http://my.mail.ru/corp/hitech/video/news_hi-tech_mail_ru/1263.html', @@ -39,6 +40,7 @@ class MailRuIE(InfoExtractor): 'uploader_id': 'hitech@corp.mail.ru', 'duration': 245, }, + 'skip': 'Not accessible from Travis CI server', }, ] From ebbf078c7df575903ceb1be53e53533508c79dad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 25 Aug 2015 21:19:21 +0600 Subject: [PATCH 006/208] [krasview] Skip download for test --- youtube_dl/extractor/krasview.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/krasview.py b/youtube_dl/extractor/krasview.py index 96f95979a..0ae8ebd68 100644 --- a/youtube_dl/extractor/krasview.py +++ b/youtube_dl/extractor/krasview.py @@ -25,6 +25,9 @@ class KrasViewIE(InfoExtractor): 'duration': 27, 'thumbnail': 're:^https?://.*\.jpg', }, + 'params': { + 'skip_download': 'Not accessible from Travis CI server', + }, } def _real_extract(self, url): From 6d53cdd6ce441dd7bc1d93bf1445f0594cfdffef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 25 Aug 2015 23:29:02 +0600 Subject: [PATCH 007/208] [yandexmusic] Skip removed tracks (#6666) --- youtube_dl/extractor/yandexmusic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 91829be1c..166cbf344 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -67,7 +67,7 @@ class YandexMusicPlaylistBaseIE(InfoExtractor): return [ self.url_result( 'http://music.yandex.ru/album/%s/track/%s' % (track['albums'][0]['id'], track['id'])) - for track in tracks] + for track in tracks if track.get('albums') and isinstance(track.get('albums'), list)] class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): From baf510bf8cb296d2ed2a2f742ec9387d094623e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 00:11:15 +0600 Subject: [PATCH 008/208] [yandexmusic:playlist] Handle playlists with more than 150 tracks (Closes #6666) --- youtube_dl/extractor/yandexmusic.py | 51 +++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 166cbf344..4098e4629 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -5,7 +5,11 @@ import re import hashlib from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urllib_parse, + compat_urllib_request, +) from ..utils import ( int_or_none, float_or_none, @@ -106,7 +110,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): IE_DESC = 'Яндекс.Музыка - Плейлист' _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://music.yandex.ru/users/music.partners/playlists/1245', 'info_dict': { 'id': '1245', @@ -114,19 +118,54 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, - } + }, { + # playlist exceeding the limit of 150 tracks shipped with webpage (see + # https://github.com/rg3/youtube-dl/issues/6666) + 'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036', + 'info_dict': { + 'id': '1036', + 'title': 'Музыка 90-х', + }, + 'playlist_count': 310, + }] def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - playlist = self._parse_json( + mu = self._parse_json( self._search_regex( r'var\s+Mu\s*=\s*({.+?});\s*', webpage, 'player'), - playlist_id)['pageData']['playlist'] + playlist_id) + + playlist = mu['pageData']['playlist'] + tracks, track_ids = playlist['tracks'], playlist['trackIds'] + + # tracks dictionary shipped with webpage is limited to 150 tracks, + # missing tracks should be retrieved manually. + if len(tracks) < len(track_ids): + present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')]) + missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids) + request = compat_urllib_request.Request( + 'https://music.yandex.ru/handlers/track-entries.jsx', + compat_urllib_parse.urlencode({ + 'entries': ','.join(missing_track_ids), + 'lang': mu.get('settings', {}).get('lang', 'en'), + 'external-domain': 'music.yandex.ru', + 'overembed': 'false', + 'sign': mu.get('authData', {}).get('user', {}).get('sign'), + 'strict': 'true', + }).encode('utf-8')) + request.add_header('Referer', url) + request.add_header('X-Requested-With', 'XMLHttpRequest') + + missing_tracks = self._download_json( + request, playlist_id, 'Downloading missing tracks JSON', fatal=False) + if missing_tracks: + tracks.extend(missing_tracks) return self.playlist_result( - self._build_playlist(playlist['tracks']), + self._build_playlist(tracks), compat_str(playlist_id), playlist['title'], playlist.get('description')) From 4bc8eec4ebf5ffcca3b2e17c864be08df5215f44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 26 Aug 2015 15:21:55 +0200 Subject: [PATCH 009/208] [youtube] Adapt player version regex to handle urls ending in '/html5player-new.js' It was always extracting 'new' as the version, breaking the cache system. --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8e2da46e3..ab6754154 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -660,7 +660,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1289,7 +1289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - r'html5player-([^/]+?)(?:/html5player)?\.js', + r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version From 2f72e83bbd915054cac0e8f70df0c2cab4b9c116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 20:47:57 +0600 Subject: [PATCH 010/208] [crunchyroll] Detect required login (#6677) --- youtube_dl/extractor/crunchyroll.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 33a033a7f..98d1881ae 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -237,7 +237,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage_url = 'http://www.' + mobj.group('url') webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') - note_m = self._html_search_regex(r'
(.+?)
', webpage, 'trailer-notice', default='') + note_m = self._html_search_regex( + r'
(.+?)
', + webpage, 'trailer-notice', default='') if note_m: raise ExtractorError(note_m) @@ -247,6 +249,12 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if msg.get('type') == 'error': raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) + if 'To view this, please log in to verify you are 18 or older.' in webpage: + raise ExtractorError( + 'This video is only available for registered users, ' + 'use --username and --password options to provide account credentials.', + expected=True) + video_title = self._html_search_regex(r']*>(.+?)', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') From 43e7d3c9453338ae29552311b1447fe95be05db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:24:47 +0600 Subject: [PATCH 011/208] [extractor/common] Add raise_login_required --- youtube_dl/extractor/common.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5d24bcb6a..39cef9c5b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -510,6 +510,12 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): From bbb43a39fd11c2fdf28ae593eaa994f22ce663bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:25:04 +0600 Subject: [PATCH 012/208] [crunchyroll] Use raise_login_required --- youtube_dl/extractor/crunchyroll.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 98d1881ae..801b9b48e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -250,10 +250,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) if 'To view this, please log in to verify you are 18 or older.' in webpage: - raise ExtractorError( - 'This video is only available for registered users, ' - 'use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required(video_id) video_title = self._html_search_regex(r']*>(.+?)', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) From 3c53455d15035a94bcd2bc915f565420e1a4279f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:25:37 +0600 Subject: [PATCH 013/208] [eroprofile] Use raise_login_required --- youtube_dl/extractor/eroprofile.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eroprofile.py b/youtube_dl/extractor/eroprofile.py index 316033cf1..7fcd0151d 100644 --- a/youtube_dl/extractor/eroprofile.py +++ b/youtube_dl/extractor/eroprofile.py @@ -71,8 +71,7 @@ class EroProfileIE(InfoExtractor): m = re.search(r'You must be logged in to view this video\.', webpage) if m: - raise ExtractorError( - 'This video requires login. Please specify a username and password and try again.', expected=True) + self.raise_login_required('This video requires login') video_id = self._search_regex( [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'], From 62984e4584c2962e622514c7d6a475636a8c21d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:25:53 +0600 Subject: [PATCH 014/208] [lynda] Use raise_login_required --- youtube_dl/extractor/lynda.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5b9157ed4..378117270 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -118,9 +118,7 @@ class LyndaIE(LyndaBaseIE): 'lynda returned error: %s' % video_json['Message'], expected=True) if video_json['HasAccess'] is False: - raise ExtractorError( - 'Video %s is only available for members. ' - % video_id + self._ACCOUNT_CREDENTIALS_HINT, expected=True) + self.raise_login_required('Video %s is only available for members' % video_id) video_id = compat_str(video_json['ID']) duration = video_json['DurationInSeconds'] From e7ddaef5bd209dd8d24b0025631cde1f5969e71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:09 +0600 Subject: [PATCH 015/208] [pluralsight] Use raise_login_required --- youtube_dl/extractor/pluralsight.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 7ba396aef..fd32836cc 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -41,9 +41,7 @@ class PluralsightIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Pluralsight account is required, use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Pluralsight account is required') login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') From e269d3ae7dbebb22d5b51bd5e6d477a69ae4f3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:24 +0600 Subject: [PATCH 016/208] [safari] Use raise_login_required --- youtube_dl/extractor/safari.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index f3c80708c..a602af692 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -20,7 +20,6 @@ from ..utils import ( class SafariBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/' _SUCCESSFUL_LOGIN_REGEX = r']*>Sign Out' - _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com' _NETRC_MACHINE = 'safari' _API_BASE = 'https://www.safaribooksonline.com/api/v1/book' @@ -37,9 +36,7 @@ class SafariBaseIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - self._ACCOUNT_CREDENTIALS_HINT, - expected=True) + self.raise_login_required('safaribooksonline.com account is required') headers = std_headers if 'Referer' not in headers: From 42e7373bd3c819ee7cebf5898e4bdd33730dde6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:35 +0600 Subject: [PATCH 017/208] [smotri] Use raise_login_required --- youtube_dl/extractor/smotri.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 93a7cfe15..35a81ee87 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -330,10 +330,7 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Erotic broadcasts allowed only for registered users, ' - 'use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', From 61a7ff16222accdb259f771d0a6f0adb229b34dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:26:47 +0600 Subject: [PATCH 018/208] [tubitv] Use raise_login_required --- youtube_dl/extractor/tubitv.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 2c4b21807..4f86b3ee9 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -60,9 +60,7 @@ class TubiTvIE(InfoExtractor): webpage = self._download_webpage(url, video_id) if re.search(r"<(?:DIV|div) class='login-required-screen'>", webpage): - raise ExtractorError( - 'This video requires login, use --username and --password ' - 'options to provide account credentials.', expected=True) + self.raise_login_required('This video requires login') title = self._og_search_title(webpage) description = self._og_search_description(webpage) From a882c5f4747c527bb50d87828ea4cceae6d12533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:27:07 +0600 Subject: [PATCH 019/208] [udemy] Use raise_login_required --- youtube_dl/extractor/udemy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 4a0eaf65f..365d8b4bf 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -70,9 +70,7 @@ class UdemyIE(InfoExtractor): def _login(self): (username, password) = self._get_login_info() if username is None: - raise ExtractorError( - 'Udemy account is required, use --username and --password options to provide account credentials.', - expected=True) + self.raise_login_required('Udemy account is required') login_popup = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') From 39affb5aa427a3a1830e97523470d11bfdbd067e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 21:27:57 +0600 Subject: [PATCH 020/208] [crunchyroll] Fix typo --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 801b9b48e..c2162aa68 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -250,7 +250,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) if 'To view this, please log in to verify you are 18 or older.' in webpage: - self.raise_login_required(video_id) + self.raise_login_required() video_title = self._html_search_regex(r']*>(.+?)', webpage, 'video_title', flags=re.DOTALL) video_title = re.sub(r' {2,}', ' ', video_title) From 3d8132f5e20b7cbdaa8f69aca482553b2c02bed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:03:58 +0600 Subject: [PATCH 021/208] [shared] Extend _VALID_URL to support vivo.sx (Closes #6681) --- youtube_dl/extractor/shared.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index a07677686..000ef1a07 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -14,7 +14,7 @@ from ..utils import ( class SharedIE(InfoExtractor): - _VALID_URL = r'http://shared\.sx/(?P[\da-z]{10})' + _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' _TEST = { 'url': 'http://shared.sx/0060718775', From 70113c38c9e551d7d9ab2a4d1f7e76c81b68ae76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:04:39 +0600 Subject: [PATCH 022/208] [shared] Clarify IE_DESC --- youtube_dl/extractor/shared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 000ef1a07..cf0a3bfef 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -14,6 +14,7 @@ from ..utils import ( class SharedIE(InfoExtractor): + IE_DESC = 'shared.sx and vivo.sx' _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' _TEST = { From f62e02c24f1f0e0488b40df178ddb9bb5fdf9fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:05:45 +0600 Subject: [PATCH 023/208] [shared] Add test for vivo --- youtube_dl/extractor/shared.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index cf0a3bfef..4fa991dff 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -17,7 +17,7 @@ class SharedIE(InfoExtractor): IE_DESC = 'shared.sx and vivo.sx' _VALID_URL = r'http://(?:shared|vivo)\.sx/(?P[\da-z]{10})' - _TEST = { + _TESTS = [{ 'url': 'http://shared.sx/0060718775', 'md5': '106fefed92a8a2adb8c98e6a0652f49b', 'info_dict': { @@ -25,7 +25,16 @@ class SharedIE(InfoExtractor): 'ext': 'mp4', 'title': 'Bmp4', }, - } + }, { + 'url': 'http://vivo.sx/d7ddda0e78', + 'md5': '15b3af41be0b4fe01f4df075c2678b2c', + 'info_dict': { + 'id': 'd7ddda0e78', + 'ext': 'mp4', + 'title': 'Chicken', + 'filesize': 528031, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) From f11c316347bea41d9148d1c8d5d7738a594a06d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 26 Aug 2015 22:06:10 +0600 Subject: [PATCH 024/208] [shared] Add filesize to test --- youtube_dl/extractor/shared.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index 4fa991dff..c5636e8e9 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -24,6 +24,7 @@ class SharedIE(InfoExtractor): 'id': '0060718775', 'ext': 'mp4', 'title': 'Bmp4', + 'filesize': 1720110, }, }, { 'url': 'http://vivo.sx/d7ddda0e78', From d7e8264517d29156697f82b7761dc99d13994c21 Mon Sep 17 00:00:00 2001 From: nmrugg Date: Thu, 27 Aug 2015 23:24:13 +0800 Subject: [PATCH 025/208] Make FoxBusiness work. --- youtube_dl/extractor/foxnews.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 917f76b1e..7de88ab66 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -8,7 +10,7 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): - _VALID_URL = r'https?://video\.foxnews\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://video\.fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -47,8 +49,10 @@ class FoxNewsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + m = re.match(r'^https?://video\.fox(news|business)', url) + video = self._download_json( - 'http://video.foxnews.com/v/feed/video/%s.js?template=fox' % video_id, video_id) + 'http://video.fox' + m.group(1) + '.com/v/feed/video/%s.js?template=fox' % video_id, video_id) item = video['channel']['item'] title = item['title'] From 8df8c278b6d5e2b5a350446690873dc9f5f48aff Mon Sep 17 00:00:00 2001 From: nmrugg Date: Thu, 27 Aug 2015 23:24:28 +0800 Subject: [PATCH 026/208] Added matching test for FoxBusiness. --- youtube_dl/extractor/foxnews.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 7de88ab66..a8902c960 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -44,6 +44,10 @@ class FoxNewsIE(InfoExtractor): 'url': 'http://video.foxnews.com/v/video-embed.html?video_id=3937480&d=video.foxnews.com', 'only_matching': True, }, + { + 'url': 'http://video.foxbusiness.com/v/4442309889001', + 'only_matching': True, + }, ] def _real_extract(self, url): From 1b660cce120c733f2bb195ef1cfe2ff2421b439f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 27 Aug 2015 21:48:03 +0600 Subject: [PATCH 027/208] [foxnews] Simplify (Closes #6694) --- youtube_dl/extractor/foxnews.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index a8902c960..244c75f0b 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -10,7 +10,7 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): - _VALID_URL = r'https?://video\.fox(?:news|business)\.com/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -51,12 +51,12 @@ class FoxNewsIE(InfoExtractor): ] def _real_extract(self, url): - video_id = self._match_id(url) - - m = re.match(r'^https?://video\.fox(news|business)', url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') video = self._download_json( - 'http://video.fox' + m.group(1) + '.com/v/feed/video/%s.js?template=fox' % video_id, video_id) + 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) item = video['channel']['item'] title = item['title'] From 5307c332329d6a1f3eec240b66a4f11905889f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 27 Aug 2015 21:48:47 +0600 Subject: [PATCH 028/208] [foxnews] Clarify IE_DESC --- youtube_dl/extractor/foxnews.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 244c75f0b..3a4a59135 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -10,6 +10,7 @@ from ..utils import ( class FoxNewsIE(InfoExtractor): + IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { From a4962b80d668de704fc347d5e76587be0e95dfef Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 28 Aug 2015 05:04:39 +0200 Subject: [PATCH 029/208] release 2015.08.28 --- docs/supportedsites.md | 4 ++-- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 8d9db53a6..328a819b3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -166,7 +166,7 @@ - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Foxgay** - - **FoxNews** + - **FoxNews**: Fox News and Fox Business Video - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** @@ -465,7 +465,7 @@ - **Sexu** - **SexyKarma**: Sexy Karma and Watch Indian Porn - **Shahid** - - **Shared** + - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** - **Slideshare** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 394951ca7..a07bc9233 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.08.23' +__version__ = '2015.08.28' From 071c10137b6b17b79ecfc8676736d5cc243022f6 Mon Sep 17 00:00:00 2001 From: Paul Hartmann Date: Wed, 26 Aug 2015 00:06:44 +0200 Subject: [PATCH 030/208] [MTV] move German mtv site to new class --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mtv.py | 37 ++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d59882598..66422b005 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -340,6 +340,7 @@ from .mtv import ( MTVIE, MTVServicesEmbeddedIE, MTVIggyIE, + MTVDEIE, ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index b48fac5e3..15df62649 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -288,3 +288,40 @@ class MTVIggyIE(MTVServicesInfoExtractor): } } _FEED_URL = 'http://all.mtvworldverticals.com/feed-xml/' + +class MTVDEIE(MTVServicesInfoExtractor): + IE_NAME = 'mtv.de' + _VALID_URL = r'''(?x)^https?://(?:www\.)?mtv\.de(?P/artists/.*)''' + _TESTS = [ + { + 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', + 'info_dict': { + 'id': 'a50bc5f0b3aa4b3190aa', + 'ext': 'mp4', + 'title': 'cro-traum', + 'description': 'Cro - Traum', + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + return self._get_videos_info(url, mobj.group('video_path')) + + def _get_videos_info(self, url, video_path): + webpage = self._download_webpage(url, video_path) + playlist_js = self._search_regex(r'|$)', + webpage, 'videoplayer applet', default=None) + if config_json: + config = self._parse_json(config_json, display_id, fatal=False) + if config: + sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi') + if sapi: + return self._extract_info(display_id, sapi, webpage) + items_json = self._search_regex( r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, default=None) @@ -190,22 +217,10 @@ class YahooIE(InfoExtractor): video_id = info['id'] return self._get_info(video_id, display_id, webpage) - def _get_info(self, video_id, display_id, webpage): - region = self._search_regex( - r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', - webpage, 'region', fatal=False, default='US') - data = compat_urllib_parse.urlencode({ - 'protocol': 'http', - 'region': region, - }) - query_url = ( - 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' - '{id}?{data}'.format(id=video_id, data=data)) - query_result = self._download_json( - query_url, display_id, 'Downloading video info') - - info = query_result['query']['results']['mediaObj'][0] + def _extract_info(self, display_id, query, webpage): + info = query['query']['results']['mediaObj'][0] meta = info.get('meta') + video_id = info.get('id') if not meta: msg = info['status'].get('msg') @@ -231,6 +246,9 @@ class YahooIE(InfoExtractor): 'ext': 'flv', }) else: + if s.get('format') == 'm3u8_playlist': + format_info['protocol'] = 'm3u8_native' + format_info['ext'] = 'mp4' format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url formats.append(format_info) @@ -264,6 +282,21 @@ class YahooIE(InfoExtractor): 'subtitles': subtitles, } + def _get_info(self, video_id, display_id, webpage): + region = self._search_regex( + r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"', + webpage, 'region', fatal=False, default='US') + data = compat_urllib_parse.urlencode({ + 'protocol': 'http', + 'region': region, + }) + query_url = ( + 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + '{id}?{data}'.format(id=video_id, data=data)) + query_result = self._download_json( + query_url, display_id, 'Downloading video info') + return self._extract_info(display_id, query_result, webpage) + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From 1721fef28b89ac4264db978ab7fee3b4dd154056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 10 Sep 2015 02:58:40 +0600 Subject: [PATCH 165/208] [yahoo] Fix test --- youtube_dl/extractor/yahoo.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 1d9b98750..fca5ddc69 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -147,6 +147,7 @@ class YahooIE(InfoExtractor): }, { # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', + 'md5': '4fbafb9c9b6f07aa8f870629f6671b35', 'info_dict': { 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 'ext': 'mp4', @@ -154,10 +155,6 @@ class YahooIE(InfoExtractor): 'description': 'md5:8fc39608213295748e1e289807838c97', 'duration': 1646, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, } ] From 73eb13dfc74132b8f0e5c1ac1ea75f66e0aca6bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 Sep 2015 20:43:05 +0600 Subject: [PATCH 166/208] [extractor/common] Case insensitive inputs extraction --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5eeeda08d..835f6f368 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -732,7 +732,7 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): hidden_inputs = {} - for input in re.findall(r']+)>', html): + for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): continue name = re.search(r'name=(["\'])(?P.+?)\1', input) @@ -746,7 +746,7 @@ class InfoExtractor(object): def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) From 586f1cc532d167c28e733779cbf132b94d8f76e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 11 Sep 2015 21:07:32 +0600 Subject: [PATCH 167/208] [extractor/common] Skip html comment tags (Closes #6822) --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 835f6f368..d694e818e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -731,6 +731,7 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): + html = re.sub(r'', '', html) hidden_inputs = {} for input in re.findall(r'(?i)]+)>', html): if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): From 60ed60353b9ca57e8181f0b14d525ce487e673ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 12 Sep 2015 20:34:48 +0600 Subject: [PATCH 168/208] [openfilm] Remove extractor OpenFilm has been shut down --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/openfilm.py | 70 -------------------------------- 2 files changed, 71 deletions(-) delete mode 100644 youtube_dl/extractor/openfilm.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..2e7272931 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -432,7 +432,6 @@ from .ooyala import ( OoyalaIE, OoyalaExternalIE, ) -from .openfilm import OpenFilmIE from .orf import ( ORFTVthekIE, ORFOE1IE, diff --git a/youtube_dl/extractor/openfilm.py b/youtube_dl/extractor/openfilm.py deleted file mode 100644 index d2ceedd01..000000000 --- a/youtube_dl/extractor/openfilm.py +++ /dev/null @@ -1,70 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus -from ..utils import ( - parse_iso8601, - parse_age_limit, - int_or_none, -) - - -class OpenFilmIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)openfilm\.com/videos/(?P.+)' - _TEST = { - 'url': 'http://www.openfilm.com/videos/human-resources-remastered', - 'md5': '42bcd88c2f3ec13b65edf0f8ad1cac37', - 'info_dict': { - 'id': '32736', - 'display_id': 'human-resources-remastered', - 'ext': 'mp4', - 'title': 'Human Resources (Remastered)', - 'description': 'Social Engineering in the 20th Century.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 7164, - 'timestamp': 1334756988, - 'upload_date': '20120418', - 'uploader_id': '41117', - 'view_count': int, - 'age_limit': 0, - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player = compat_urllib_parse_unquote_plus( - self._og_search_video_url(webpage)) - - video = json.loads(self._search_regex( - r'\bp=({.+?})(?:&|$)', player, 'video JSON')) - - video_url = '%s1.mp4' % video['location'] - video_id = video.get('video_id') - display_id = video.get('alias') or display_id - title = video.get('title') - description = video.get('description') - thumbnail = video.get('main_thumb') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('dt_published'), ' ') - uploader_id = video.get('user_id') - view_count = int_or_none(video.get('views_count')) - age_limit = parse_age_limit(video.get('age_limit')) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'age_limit': age_limit, - } From 41ebd6530b124b9265a3df9d7d09aef02041b088 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:42:57 +0800 Subject: [PATCH 169/208] [tudou] Add the test case (#6273) --- youtube_dl/extractor/tudou.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index e800477e2..950c42afb 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -27,6 +27,9 @@ class TudouIE(InfoExtractor): 'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012', 'thumbnail': 're:^https?://.*\.jpg$', } + }, { + 'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html', + 'only_matching': True, }] _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' From 94e507aea798dac6974237cc44257dda45d5fa5a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:45:09 +0800 Subject: [PATCH 170/208] [tudou] A more comprehensive _VALID_URL --- youtube_dl/extractor/tudou.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 950c42afb..68712cb4a 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -9,7 +9,7 @@ from .common import InfoExtractor class TudouIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/?.*/(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P[^/?#]+?)(?:\.html)?/?(?:$|[?#])' _TESTS = [{ 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html', 'md5': '140a49ed444bd22f93330985d8475fcb', From 141ba36996f77a420df69903a59792f6f93ae314 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:51:49 +0800 Subject: [PATCH 171/208] [tudou] Modernize --- youtube_dl/extractor/tudou.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 68712cb4a..c9d80a7ef 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -2,9 +2,6 @@ from __future__ import unicode_literals -import re -import json - from .common import InfoExtractor @@ -46,13 +43,10 @@ class TudouIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m = re.search(r'vcode:\s*[\'"](.+?)[\'"]', webpage) - if m and m.group(1): - return { - '_type': 'url', - 'url': 'youku:' + m.group(1), - 'ie_key': 'Youku' - } + youku_vcode = self._search_regex( + r'vcode:\s*[\'"](.+?)[\'"]', webpage, 'youku vcode', default=None) + if youku_vcode: + return self.url_result('youku:' + youku_vcode, ie='Youku') title = self._search_regex( r",kw:\s*['\"](.+?)[\"']", webpage, 'title') @@ -63,8 +57,8 @@ class TudouIE(InfoExtractor): r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", webpage, 'player URL', default=self._PLAYER_URL) - segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments') - segments = json.loads(segs_json) + segments = self._parse_json(self._search_regex( + r'segs: \'(.*)\'', webpage, 'segments'), video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). From aab135516b288f24c55178b024976fd3e130c7b8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 12 Sep 2015 22:52:51 +0800 Subject: [PATCH 172/208] [tudou] Avoid shadowing builtin names --- youtube_dl/extractor/tudou.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index c9d80a7ef..6116b209d 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -31,11 +31,11 @@ class TudouIE(InfoExtractor): _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' - def _url_for_id(self, id, quality=None): - info_url = "http://v2.tudou.com/f?id=" + str(id) + def _url_for_id(self, video_id, quality=None): + info_url = "http://v2.tudou.com/f?id=" + str(video_id) if quality: info_url += '&hd' + quality - webpage = self._download_webpage(info_url, id, "Opening the info webpage") + webpage = self._download_webpage(info_url, video_id, "Opening the info webpage") final_url = self._html_search_regex('>(.+?)', webpage, 'video url') return final_url From 87813a857009dc3c3dfcc421679e5e806d363863 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 02:36:51 +0800 Subject: [PATCH 173/208] [tudou] Use _download_xml --- youtube_dl/extractor/tudou.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 6116b209d..3b993192c 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -35,8 +35,8 @@ class TudouIE(InfoExtractor): info_url = "http://v2.tudou.com/f?id=" + str(video_id) if quality: info_url += '&hd' + quality - webpage = self._download_webpage(info_url, video_id, "Opening the info webpage") - final_url = self._html_search_regex('>(.+?)', webpage, 'video url') + xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") + final_url = xml_data.text return final_url def _real_extract(self, url): From 349b3a2ea0d6c264facacd92508516e8530108b2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 02:51:20 +0800 Subject: [PATCH 174/208] [tudou] Improve regexs --- youtube_dl/extractor/tudou.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 3b993192c..53ba8511f 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -44,21 +44,21 @@ class TudouIE(InfoExtractor): webpage = self._download_webpage(url, video_id) youku_vcode = self._search_regex( - r'vcode:\s*[\'"](.+?)[\'"]', webpage, 'youku vcode', default=None) + r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None) if youku_vcode: return self.url_result('youku:' + youku_vcode, ie='Youku') title = self._search_regex( - r",kw:\s*['\"](.+?)[\"']", webpage, 'title') + r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title') thumbnail_url = self._search_regex( - r",pic:\s*[\"'](.+?)[\"']", webpage, 'thumbnail URL', fatal=False) + r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False) player_url = self._search_regex( - r"playerUrl\s*:\s*['\"](.+?\.swf)[\"']", + r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]', webpage, 'player URL', default=self._PLAYER_URL) segments = self._parse_json(self._search_regex( - r'segs: \'(.*)\'', webpage, 'segments'), video_id) + r'segs: \'([^\']+)\'', webpage, 'segments'), video_id) # It looks like the keys are the arguments that have to be passed as # the hd field in the request url, we pick the higher # Also, filter non-number qualities (see issue #3643). From b264c2130221912adfc7cc35d73c2a88d79eafeb Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 13 Sep 2015 02:57:14 +0800 Subject: [PATCH 175/208] [tudou] Use single quotes and compat_str --- youtube_dl/extractor/tudou.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py index 53ba8511f..5f7ac4b35 100644 --- a/youtube_dl/extractor/tudou.py +++ b/youtube_dl/extractor/tudou.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str class TudouIE(InfoExtractor): @@ -32,7 +33,7 @@ class TudouIE(InfoExtractor): _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf' def _url_for_id(self, video_id, quality=None): - info_url = "http://v2.tudou.com/f?id=" + str(video_id) + info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id) if quality: info_url += '&hd' + quality xml_data = self._download_xml(info_url, video_id, "Opening the info XML page") From 2ffe3bc14b5e65c902fe5ddd610143c791edaa52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 13 Sep 2015 04:15:49 +0600 Subject: [PATCH 176/208] [ndr] Rework and cover with tests --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ndr.py | 469 ++++++++++++++++++++++--------- 2 files changed, 334 insertions(+), 136 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 44ab7ce3c..fadba905d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -367,6 +367,7 @@ from .nbc import ( from .ndr import ( NDRIE, NJoyIE, + NDREmbedBaseIE, NDREmbedIE, NJoyEmbedIE, ) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 87f3edbbe..e3cc6fde8 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -1,183 +1,380 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - ExtractorError, + determine_ext, int_or_none, + parse_iso8601, qualities, ) -preference = qualities(['xs', 's', 'm','l', 'xl']) - - class NDRBaseIE(InfoExtractor): - - def extract_video_info(self, playlist, video_id): - formats = [] - streamType = playlist.get('config').get('streamType') - if streamType == 'httpVideo': - for key, f in playlist.items(): - if key != 'config': - src = f['src'] - if '.f4m' in src: - formats.extend(self._extract_f4m_formats(src, video_id)) - elif '.m3u8' in src: - formats.extend(self._extract_m3u8_formats(src, video_id, fatal=False)) - else: - quality = f.get('quality') - formats.append({ - 'url': src, - 'format_id': quality, - 'preference': preference(quality), - }) - elif streamType == 'httpAudio': - for key, f in playlist.items(): - if key != 'config': - formats.append({ - 'url': f['src'], - 'format_id': 'mp3', - 'vcodec': 'none', - }) - else: - raise ExtractorError('No media links available for %s' % video_id) - - self._sort_formats(formats) - - config = playlist.get('config') - - title = config['title'] - duration = int_or_none(config.get('duration')) - thumbnails = [{ - 'id': thumbnail.get('quality'), - 'url': thumbnail.get('src'), - 'preference': preference(thumbnail.get('quality')) - } for thumbnail in config.get('poster').values()] - - return { - 'id': video_id, - 'title': title, - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - } - def _real_extract(self, url): - video_id = self._match_id(url) - - json_data = self._download_json('http://www.ndr.de/%s-ppjson.json' % video_id, video_id, fatal=False) - - if not json_data: - webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_regex(r']+id="pp_\w+"[^>]+src="(/.*)"', webpage, 'embed url', None, False) - if not embed_url: - embed_url = self._html_search_meta('embedURL', webpage, fatal=False) - if embed_url: - if embed_url.startswith('/'): - return self.url_result('http://www.ndr.de%s' % embed_url, 'NDREmbed') - else: - return self.url_result(embed_url, 'NDREmbed') - raise ExtractorError('No media links available for %s' % video_id) - - return self.extract_video_info(json_data['playlist'], video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._extract_embed(webpage, display_id) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' - IE_DESC = 'NDR.de - Mediathek' - _VALID_URL = r'https?://www\.ndr\.de/.+?,(?P\w+)\.html' + IE_DESC = 'NDR.de - Norddeutscher Rundfunk' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P[^/?#]+),[\da-z]+\.html' + _TESTS = [{ + # httpVideo, same content id + 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': '6515bc255dc5c5f8c85bbc38e035a659', + 'info_dict': { + 'id': 'hafengeburtstag988', + 'display_id': 'Party-Poette-und-Parade', + 'ext': 'mp4', + 'title': 'Party, Pötte und Parade', + 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', + 'uploader': 'ndrtv', + 'timestamp': 1431108900, + 'upload_date': '20150510', + 'duration': 3498, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpVideo, different content id + 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', + 'md5': '1043ff203eab307f0c51702ec49e9a71', + 'info_dict': { + 'id': 'osna272', + 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', + 'ext': 'mp4', + 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', + 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', + 'uploader': 'ndrtv', + 'timestamp': 1442059200, + 'upload_date': '20150912', + 'duration': 510, + }, + 'params': { + 'skip_download': True, + }, + }, { + # httpAudio, same content id + 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', + 'info_dict': { + 'id': 'audio51535', + 'display_id': 'La-Valette-entgeht-der-Hinrichtung', + 'ext': 'mp3', + 'title': 'La Valette entgeht der Hinrichtung', + 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', + 'uploader': 'ndrinfo', + 'timestamp': 1290626100, + 'upload_date': '20140729', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }] - _TESTS = [ - { - 'url': 'http://www.ndr.de/fernsehen/sendungen/nordmagazin/Kartoffeltage-in-der-Lewitz,nordmagazin25866.html', - 'md5': '5bc5f5b92c82c0f8b26cddca34f8bb2c', - 'note': 'Video file', - 'info_dict': { - 'id': 'nordmagazin25866', - 'ext': 'mp4', - 'title': 'Kartoffeltage in der Lewitz', - 'duration': 166, - }, - 'skip': '404 Not found', - }, - { - 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', - 'md5': 'dadc003c55ae12a5d2f6bd436cd73f59', - 'info_dict': { - 'id': 'hafengeburtstag988', - 'ext': 'mp4', - 'title': 'Party, Pötte und Parade', - 'duration': 3498, - }, - }, - { - 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', - 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', - 'note': 'Audio file', - 'info_dict': { - 'id': 'audio51535', - 'ext': 'mp3', - 'title': 'La Valette entgeht der Hinrichtung', - 'duration': 884, - } + def _extract_embed(self, webpage, display_id): + embed_url = self._html_search_meta( + 'embedURL', webpage, 'embed URL', fatal=True) + description = self._search_regex( + r']+itemprop="description">([^<]+)

', + webpage, 'description', fatal=False) + timestamp = parse_iso8601( + self._search_regex( + r'