From f57f84f606b246db4f102fc5bc55e64e4f7a3d60 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:38:40 -0500 Subject: [PATCH 01/83] Twitter: get and describe video from status urls --- youtube_dl/extractor/twitter.py | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1aaa06305..a65252cc6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -70,3 +70,47 @@ class TwitterCardIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class TwitterIE(TwitterCardIE): + _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + + _TESTS = [{ + 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', + 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'info_dict': { + 'id': '614301758345490432', + 'ext': 'mp4', + 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 29.5, + 'description': 'banksy on Twitter: "This time lapse is so pretty \U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', + 'uploader': 'banksy', + 'uploader_id': 'thereaIbanksy', + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() + name = username + url = re.sub(r'https?://(m|mobile)\.', 'https://', url) + webpage = self._download_webpage(url, 'tweet: ' + url) + description = unescapeHTML(self._search_regex('\s*(.+?)\s*', webpage, 'title')) + title = description.replace('\n', ' ') + splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) + if splitdesc: + name, title = splitdesc.groups() + title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'uploader_id': username, + 'uploader': name, + 'url': card_url, + 'webpage_url': url, + 'description': description, + 'title': username + ' - ' + title, + } From c3dea3f878133f3cbdad9e548609d3077572af66 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:45:36 -0500 Subject: [PATCH 02/83] Twittercard: support vmapurl method --- youtube_dl/extractor/twitter.py | 47 ++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index a65252cc6..1dd43ff3c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -12,17 +12,30 @@ from ..utils import ( class TwitterCardIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P\d+)' - _TEST = { - 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', - 'info_dict': { - 'id': '560070183650213889', - 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 30.033, + _TESTS = [ + { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + 'md5': 'a74f50b310c83170319ba16de6955192', + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 30.033, + } }, - } + { + 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', + 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'info_dict': { + 'id': '623160978427936768', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 80.155, + }, + } + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -44,6 +57,20 @@ class TwitterCardIE(InfoExtractor): unescapeHTML(self._search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config')), video_id) + if 'playlist' not in config: + if 'vmapUrl' in config: + webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') + video_url = self._search_regex( + r'\s*', webpage, 'data player config (xml)') + f = { + 'url': video_url, + } + ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) + if ext: + f['ext'] = ext.group(1) + formats.append(f) + break # same video regardless of UA + continue video_url = config['playlist'][0]['source'] From 9e7e0dffd5e3e3c959e8d99a5e236b9099886fe9 Mon Sep 17 00:00:00 2001 From: fnord Date: Tue, 21 Jul 2015 16:56:35 -0500 Subject: [PATCH 03/83] Actually add the extractor --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 50da08830..5c03bf8e8 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -651,7 +651,7 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, From ee2edd838a1e8770488e695c380943ded44d0983 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 13 Oct 2015 00:53:05 +0200 Subject: [PATCH 04/83] release 2015.10.13 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0908e963d..aaa43d315 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.12' +__version__ = '2015.10.13' From 5946cda7c6f2e4a7eb90fff6f10c66af0ff2a0d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 13 Oct 2015 21:04:39 +0600 Subject: [PATCH 05/83] [beeg] Fix extraction (Closes #7155) --- youtube_dl/extractor/beeg.py | 68 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b38057f2f..e6c928699 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,65 +1,67 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '1bff67111adb785c51d1b42959ec10e5', + 'md5': '46c384def73b33dbc581262e5ee67cef', 'info_dict': { 'id': '5416503', 'ext': 'mp4', 'title': 'Sultry Striptease', - 'description': 'md5:6db3c6177972822aaba18652ff59c773', - 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - quality_arr = self._search_regex( - r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats') - - formats = [{ - 'url': fmt[1], - 'format_id': fmt[0], - 'height': int(fmt[0][:-1]), - } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)] + video = self._download_json( + 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + formats = [] + for format_id, video_url in video.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'format_id': format_id, + 'height': int(height), + }) self._sort_formats(formats) - title = self._html_search_regex( - r'([^<]+)\s*-\s*beeg\.?', webpage, 'title') + title = video['title'] + video_id = video.get('id') or video_id + display_id = video.get('code') + description = video.get('desc') - description = self._html_search_regex( - r' Date: Tue, 13 Oct 2015 16:29:16 +0700 Subject: [PATCH 06/83] Extract thumbnail url --- youtube_dl/extractor/yandexmusic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 4098e4629..6842f834f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -46,6 +46,14 @@ class YandexMusicTrackIE(InfoExtractor): % (data['host'], key, data['ts'] + data['path'], storage[1])) def _get_track_info(self, track): + album = track['albums'][0] + a_thumb = None + + if 'coverUri' in album: + a_thumb = album['coverUri'] + if a_thumb: + a_thumb = 'http://' + a_thumb.replace('%%', '1000x1000') + return { 'id': track['id'], 'ext': 'mp3', @@ -53,6 +61,7 @@ class YandexMusicTrackIE(InfoExtractor): 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), + 'thumbnail': a_thumb, } def _real_extract(self, url): From b30c4992a93d411f4f89faf2af153fc580138a90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 13 Oct 2015 21:14:33 +0200 Subject: [PATCH 07/83] [channel9] Return a single dictionary for single videos (closes #7086) Returning a list is deprecated. --- youtube_dl/extractor/channel9.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3dfc24f5b..79fd0a30e 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -224,12 +224,12 @@ class Channel9IE(InfoExtractor): if contents is None: return contents - authors = self._extract_authors(html) + if len(contents) > 1: + raise ExtractorError('Got more than one entry') + result = contents[0] + result['authors'] = self._extract_authors(html) - for content in contents: - content['authors'] = authors - - return contents + return result def _extract_session(self, html, content_path): contents = self._extract_content(html, content_path) From 506e261d2073d8c00d5b43d272e8173cb0d63728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 13 Oct 2015 21:18:30 +0200 Subject: [PATCH 08/83] [channel9] strip 'session_day' --- youtube_dl/extractor/channel9.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 79fd0a30e..1ce004932 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -158,7 +158,7 @@ class Channel9IE(InfoExtractor): def _extract_session_day(self, html): m = re.search(r'
  • \s*(?P[^<]+)\s*
  • ', html) - return m.group('day') if m is not None else None + return m.group('day').strip() if m is not None else None def _extract_session_room(self, html): m = re.search(r'
  • \s*(?P.+?)\s*
  • ', html) From 3dc582e5ea69af4ad7f51d30c1d87cf93aa6b72b Mon Sep 17 00:00:00 2001 From: kaspi Date: Mon, 12 Oct 2015 01:25:57 -0400 Subject: [PATCH 09/83] [fczenit] Add extractor Closes #7143. --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/fczenit.py | 41 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 youtube_dl/extractor/fczenit.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 75720843c..f6d185818 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -167,6 +167,7 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py new file mode 100644 index 000000000..f1f150ef2 --- /dev/null +++ b/youtube_dl/extractor/fczenit.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class FczenitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P[0-9]+)' + _TEST = { + 'url': 'http://fc-zenit.ru/video/gl6785/', + 'md5': '458bacc24549173fe5a5aa29174a5606', + 'info_dict': { + 'id': '6785', + 'ext': 'mp4', + 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._html_search_regex(r'
    ([^<]+)', webpage, 'title') + + bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') + bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + + formats = [{ + 'url': furl, + 'tbr': tbr, + } for furl, tbr in bitrates] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + } From 26669ea3cf596f2ea4bce9e21ce73c1d8fc3ff72 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 10 Oct 2015 16:51:37 +0100 Subject: [PATCH 10/83] [5min] extract more video info and formats Closes #7124. --- youtube_dl/extractor/fivemin.py | 84 ++++++++++++++++++++++++++++----- 1 file changed, 71 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 157094e8c..2955965d9 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_parse, + compat_parse_qs, + compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( ExtractorError, + parse_duration, + replace_extension, ) @@ -28,6 +32,7 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'duration': 177, }, }, { @@ -38,9 +43,52 @@ class FiveMinIE(InfoExtractor): 'id': '518086247', 'ext': 'mp4', 'title': 'How to Make a Next-Level Fruit Salad', + 'duration': 184, }, }, ] + _ERRORS = { + 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', + 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', + 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', + 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', + 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + } + _QUALITIES = { + 1: { + 'width': 640, + 'height': 360, + }, + 2: { + 'width': 854, + 'height': 480, + }, + 4: { + 'width': 1280, + 'height': 720, + }, + 8: { + 'width': 1920, + 'height': 1080, + }, + 16: { + 'width': 640, + 'height': 360, + }, + 32: { + 'width': 854, + 'height': 480, + }, + 64: { + 'width': 1280, + 'height': 720, + }, + 128: { + 'width': 640, + 'height': 360, + }, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -59,26 +107,36 @@ class FiveMinIE(InfoExtractor): 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, video_id) if not response['success']: - err_msg = response['errorMessage'] - if err_msg == 'ErrorVideoUserNotGeo': - msg = 'Video not available from your location' - else: - msg = 'Aol said: %s' % err_msg - raise ExtractorError(msg, expected=True, video_id=video_id) + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS.get(response['errorMessage'], response['errorMessage'])), + expected=True) info = response['binding'][0] - second_id = compat_str(int(video_id[:-2]) + 1) formats = [] - for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]: - if any(r['ID'] == quality for r in info['Renditions']): + parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( + compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) + for rendition in info['Renditions']: + if rendition['RenditionType'] == 'm3u8': + formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) + elif rendition['RenditionType'] == 'aac': + continue + else: + rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) + quality = self._QUALITIES.get(rendition['ID'], {}) formats.append({ - 'format_id': compat_str(quality), - 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality), - 'height': height, + 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), + 'url': rendition_url, + 'width': quality.get('width'), + 'height': quality.get('height'), }) + self._sort_formats(formats) return { 'id': video_id, 'title': info['Title'], + 'thumbnail': info.get('ThumbURL'), + 'duration': parse_duration(info.get('Duration')), 'formats': formats, } From 1f36085df94c2addd1175e7e299f6235aca3ac68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 14 Oct 2015 13:41:39 +0200 Subject: [PATCH 11/83] [vimeo] Fix extraction of password protected videos (fixes #7169) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 027f47ee3..fa1b22049 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -212,7 +212,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = url.replace('http://', 'https://') password_request = compat_urllib_request.Request(url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Cookie', 'clip_v=1; vuid=%s' % vuid) + password_request.add_header('Cookie', 'clip_test2=1; vuid=%s' % vuid) password_request.add_header('Referer', url) return self._download_webpage( password_request, video_id, From 36bb63fad19df5ee419979f875e2265936511644 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 14 Oct 2015 14:13:53 +0100 Subject: [PATCH 12/83] [criterion] fix description extraction --- youtube_dl/extractor/criterion.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index 4fb178165..dedb810a0 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor): final_url = self._search_regex( r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') title = self._og_search_title(webpage) - description = self._html_search_regex( - r'', - webpage, 'video description') + description = self._html_search_meta('description', webpage) thumbnail = self._search_regex( r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') From 7a6d76a64d8a89a08bb79791506fc18b993c4580 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 20:49:39 +0600 Subject: [PATCH 13/83] [extractor/common] Require closing quote in _og_regexes (Closes #7174) E.g. do not match `property='og:video:type'` when `og:video` is requested. --- youtube_dl/extractor/common.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0082a4c84..a0c4af92f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -646,7 +646,8 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = r'(?:name|property)=[\'"]?og:%s[\'"]?' % re.escape(prop) + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), From 1c29e81e620241b9013b23e7acd9d6ab06587fb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 20:58:52 +0600 Subject: [PATCH 14/83] [test_InfoExtractor] Add test for 7a6d76a64d8a89a08bb79791506fc18b993c4580 --- test/test_InfoExtractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index be8d12997..4ce5b5a35 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,10 +35,12 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + self.assertEqual(ie._og_search_video_url(html, default=None), None) def test_html_search_meta(self): ie = self.ie From db0a8ad97993cb3f0c398d3a5dc55389565e0ffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 21:11:06 +0600 Subject: [PATCH 15/83] [test_InfoExtractor] Add test for unquoted attribute --- test/test_InfoExtractor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 4ce5b5a35..2a00d09a5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -36,11 +36,13 @@ class TestInfoExtractor(unittest.TestCase): + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') self.assertEqual(ie._og_search_video_url(html, default=None), None) + self.assertEqual(ie._og_search_property('foobar', html), 'Foo') def test_html_search_meta(self): ie = self.ie From ab953c64a0e8b8558e95d0318110c0885a4eec3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 21:15:29 +0600 Subject: [PATCH 16/83] [yandexmusic:track] Extract original size thumbnail (Closes #7160) --- youtube_dl/extractor/yandexmusic.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 6842f834f..08dc81f3a 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -46,14 +46,12 @@ class YandexMusicTrackIE(InfoExtractor): % (data['host'], key, data['ts'] + data['path'], storage[1])) def _get_track_info(self, track): - album = track['albums'][0] - a_thumb = None - - if 'coverUri' in album: - a_thumb = album['coverUri'] - if a_thumb: - a_thumb = 'http://' + a_thumb.replace('%%', '1000x1000') - + thumbnail = None + cover_uri = track.get('albums', [{}])[0].get('coverUri') + if cover_uri: + thumbnail = cover_uri.replace('%%', 'orig') + if not thumbnail.startswith('http'): + thumbnail = 'http://' + thumbnail return { 'id': track['id'], 'ext': 'mp3', @@ -61,7 +59,7 @@ class YandexMusicTrackIE(InfoExtractor): 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), - 'thumbnail': a_thumb, + 'thumbnail': thumbnail, } def _real_extract(self, url): From 9fb66c780cee8668b1bb07f70e70ae1161e13320 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 14 Oct 2015 17:25:07 +0100 Subject: [PATCH 17/83] [megavideoz] remove extractor --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/megavideoz.py | 56 ------------------------------ 2 files changed, 57 deletions(-) delete mode 100644 youtube_dl/extractor/megavideoz.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f6d185818..462717b1e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -319,7 +319,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE -from .megavideoz import MegaVideozIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE diff --git a/youtube_dl/extractor/megavideoz.py b/youtube_dl/extractor/megavideoz.py deleted file mode 100644 index af7ff07ea..000000000 --- a/youtube_dl/extractor/megavideoz.py +++ /dev/null @@ -1,56 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - xpath_text, -) - - -class MegaVideozIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P[^/]+)(?:/(?P[^/]+))?' - _TEST = { - 'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader', - 'info_dict': { - 'id': '48723', - 'display_id': 'SMPTE-Universal-Film-Leader', - 'ext': 'mp4', - 'title': 'SMPTE Universal Film Leader', - 'thumbnail': 're:https?://.*?\.jpg', - 'duration': 10.93, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - if any(p in webpage for p in ('>Video Not Found<', '>404 Error<')): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - config = self._download_xml( - self._search_regex( - r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url'), - display_id) - - video_url = xpath_text(config, './file', 'video url', fatal=True) - title = xpath_text(config, './title', 'title', fatal=True) - thumbnail = xpath_text(config, './image', 'thumbnail') - duration = float_or_none(xpath_text(config, './duration', 'duration')) - video_id = xpath_text(config, './mediaid', 'video id') or video_id - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration - } From 1812afb7b396f4954d5d1ca1cec1c3f2d67550c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 22:35:01 +0600 Subject: [PATCH 18/83] [utils] Do not fail in int_or_none on non-numeric data (Closes #7175) --- youtube_dl/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1dc3153fd..86c693358 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1371,7 +1371,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): v = getattr(v, get_attr, None) if v == '': v = None - return default if v is None else (int(v) * invscale // scale) + if v is None: + return default + try: + return int(v) * invscale // scale + except ValueError: + pass def str_or_none(v, default=None): From caf80631f0c57b29187e2aa909fa1a3a6325d6e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 22:36:37 +0600 Subject: [PATCH 19/83] [utils] Do not fail in float_or_none on non-numeric data --- youtube_dl/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 86c693358..83b44caaa 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1392,7 +1392,12 @@ def str_to_int(int_str): def float_or_none(v, scale=1, invscale=1, default=None): - return default if v is None else (float(v) * invscale / scale) + if v is None: + return default + try: + return float(v) * invscale / scale + except ValueError: + return default def parse_duration(s): From af98f8ff37b3a0d9d1f743f4fc6c646333501eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 14 Oct 2015 22:37:03 +0600 Subject: [PATCH 20/83] [utils] Return default on fail in int_or_none --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 83b44caaa..7dbe25661 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1376,7 +1376,7 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): try: return int(v) * invscale // scale except ValueError: - pass + return default def str_or_none(v, default=None): From 1db82381e38181aafbd78c65c58f005ad84cc08a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Oct 2015 01:52:25 +0600 Subject: [PATCH 21/83] [channel9] Add low quality formats and modernize --- youtube_dl/extractor/channel9.py | 35 ++++++++++++-------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 1ce004932..3a88181d8 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_filesize, + qualities, +) class Channel9IE(InfoExtractor): @@ -52,23 +56,6 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - # Sorted by quality - _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] - - def _restore_bytes(self, formatted_size): - if not formatted_size: - return 0 - m = re.match(r'^(?P\d+(?:\.\d+)?)\s+(?P[a-zA-Z]+)', formatted_size) - if not m: - return 0 - units = m.group('units') - try: - exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper()) - except ValueError: - return 0 - size = float(m.group('size')) - return int(size * (1024 ** exponent)) - def _formats_from_html(self, html): FORMAT_REGEX = r''' (?x) @@ -78,16 +65,20 @@ class Channel9IE(InfoExtractor):

    File\s+size

    \s*(?P.*?)\s*
    )? # File size part may be missing ''' - # Extract known formats + quality = qualities(( + 'MP3', 'MP4', + 'Low Quality WMV', 'Low Quality MP4', + 'Mid Quality WMV', 'Mid Quality MP4', + 'High Quality WMV', 'High Quality MP4')) formats = [{ 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - 'preference': self._known_formats.index(x.group('quality')), + 'filesize_approx': parse_filesize(x.group('filesize')), + 'quality': quality(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + } for x in list(re.finditer(FORMAT_REGEX, html))] self._sort_formats(formats) From a13d06de420f6968425d48030c37e1150ff9ed6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Oct 2015 01:57:59 +0600 Subject: [PATCH 22/83] [channel9] Add test for low quality mp4 --- youtube_dl/extractor/channel9.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3a88181d8..554399787 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -51,6 +51,21 @@ class Channel9IE(InfoExtractor): 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', 'authors': ['Mike Wilmot'], }, + }, + { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'duration': 5646, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, } ] From fafc7950e2230bf25ac7c7563f1704cf8f134f64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 15 Oct 2015 01:59:11 +0600 Subject: [PATCH 23/83] [channel9] Update tests' thumbnails --- youtube_dl/extractor/channel9.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 554399787..c74553dcf 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -32,7 +32,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -48,7 +48,7 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, }, From 6744f36db710eebe2ccc633e7f4f6132b968b0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 16 Oct 2015 08:44:19 +0600 Subject: [PATCH 24/83] [jeuxvideo] Fallback on og:title (Closes #7186, closes #7190) --- youtube_dl/extractor/jeuxvideo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 1df084d87..eef7daa29 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -28,7 +28,7 @@ class JeuxVideoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group(1) webpage = self._download_webpage(url, title) - title = self._html_search_meta('name', webpage) + title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( r'data-src="(/contenu/medias/video.php.*?)"', webpage, 'config URL') From 8daeeedc06f420e2a87ba4755b56e721391cedba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 00:26:45 +0600 Subject: [PATCH 25/83] [bbc] Fix FutureWarning --- youtube_dl/extractor/bbc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 68995f81e..1b3a33e4e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -421,7 +421,7 @@ class BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) - description = description_el.text if description_el else None + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): From 1e52776ac3ebbafc2ec4697f3bc6ba05b7e5a9f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 00:46:38 +0600 Subject: [PATCH 26/83] [bandcamp] Prepend download URL with scheme when necessary (2) (#7077) --- youtube_dl/extractor/bandcamp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index a27f3e748..f19e19001 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -52,7 +52,7 @@ class BandcampIE(InfoExtractor): ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': self._proto_relative_url(format_url, 'http:'), 'ext': ext, 'vcodec': 'none', 'acodec': ext, From ba717dca97925a21870fedcb46358d06cd5485ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 00:51:35 +0600 Subject: [PATCH 27/83] [bandcamp] Modernize --- youtube_dl/extractor/bandcamp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f19e19001..c1ef8051d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,6 +10,8 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, ) @@ -56,7 +58,7 @@ class BandcampIE(InfoExtractor): 'ext': ext, 'vcodec': 'none', 'acodec': ext, - 'abr': int(abr_str), + 'abr': int_or_none(abr_str), }) self._sort_formats(formats) @@ -65,7 +67,7 @@ class BandcampIE(InfoExtractor): 'id': compat_str(data['id']), 'title': data['title'], 'formats': formats, - 'duration': float(data['duration']), + 'duration': float_or_none(data.get('duration')), } else: raise ExtractorError('No free songs found') From 246ce1085804ead9126328cb1ec761d308f561c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 01:08:23 +0600 Subject: [PATCH 28/83] [rte] Update _VALID_URL (Closes #7198) --- youtube_dl/extractor/rte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 04158b993..427c70866 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -9,7 +9,7 @@ from ..utils import ( class RteIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/(?P[0-9]+)/' + _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' _TEST = { 'url': 'http://www.rte.ie/player/de/show/10363114/', 'info_dict': { From 2ccb37beb9e35ebbf2cdf65a4c1641e5286de1e8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 16 Oct 2015 21:40:38 +0200 Subject: [PATCH 29/83] release 2015.10.16 --- docs/supportedsites.md | 2 +- youtube_dl/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dc0354095..47f7da86d 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -159,6 +159,7 @@ - **facebook** - **faz.net** - **fc2** + - **Fczenit** - **fernsehkritik.tv** - **Firstpost** - **FiveTV** @@ -281,7 +282,6 @@ - **Malemotion** - **MDR** - **media.ccc.de** - - **MegaVideoz** - **metacafe** - **Metacritic** - **Mgoon** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index aaa43d315..31d2a9dc0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.13' +__version__ = '2015.10.16' From 36eb802baffda9930e8c821e1adf94b0b53b5ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 17 Oct 2015 11:49:51 +0200 Subject: [PATCH 30/83] [rte] Replace expired test According to their webpage it should be available until October 2035. --- youtube_dl/extractor/rte.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 427c70866..2811767b9 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -11,14 +11,14 @@ from ..utils import ( class RteIE(InfoExtractor): _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' _TEST = { - 'url': 'http://www.rte.ie/player/de/show/10363114/', + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', 'info_dict': { - 'id': '10363114', + 'id': '10478715', 'ext': 'mp4', - 'title': 'One News', + 'title': 'Watch iWitness online', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'The One O\'Clock News followed by Weather.', - 'duration': 436.844, + 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, }, 'params': { 'skip_download': 'f4m fails with --test atm' From 6df7179e6c3c7df165b5788ecb5e712da122356f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 17 Oct 2015 11:53:59 +0200 Subject: [PATCH 31/83] [rte] Actually recognize https urls There was a missing 's' before the '?'. --- youtube_dl/extractor/rte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 2811767b9..d9cfbf180 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -9,7 +9,7 @@ from ..utils import ( class RteIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' _TEST = { 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', 'info_dict': { From fbd9f6ea804328d536aafd2b20a8afb72968e351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 18:28:21 +0600 Subject: [PATCH 32/83] [twitch] Improve authentication --- youtube_dl/extractor/twitch.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 023911c41..891499a1f 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -15,6 +15,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, int_or_none, parse_duration, @@ -27,8 +28,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'http://usher.twitch.tv' - _LOGIN_URL = 'https://secure.twitch.tv/login' - _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new' + _LOGIN_URL = 'http://www.twitch.tv/login' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -61,26 +61,28 @@ class TwitchBaseIE(InfoExtractor): if username is None: return - login_page = self._download_webpage( + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') login_form = self._hidden_inputs(login_page) login_form.update({ - 'login': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'username': username, + 'password': password, }) + redirect_url = handle.geturl() + post_url = self._search_regex( r']+action=(["\'])(?P.+?)\1', login_page, - 'post url', default=self._LOGIN_POST_URL, group='url') + 'post url', default=redirect_url, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + post_url = compat_urlparse.urljoin(redirect_url, post_url) request = compat_urllib_request.Request( - post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) - request.add_header('Referer', self._LOGIN_URL) + post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8')) + request.add_header('Referer', redirect_url) response = self._download_webpage( request, None, 'Logging in as %s' % username) From e5e9966199c00a6b89f1f25e1c7b85effb032537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 18:29:54 +0600 Subject: [PATCH 33/83] [twitch:vod] Improve extraction --- youtube_dl/extractor/twitch.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 891499a1f..21ea836ea 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -240,14 +240,24 @@ class TwitchVodIE(TwitchItemBaseIE): def _real_extract(self, url): item_id = self._match_id(url) + info = self._download_info(self._ITEM_SHORTCUT, item_id) access_token = self._download_json( - '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, - 'Downloading %s access token' % self._ITEM_TYPE) + '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, + 'Downloading %s access token' % self._ITEM_TYPE) + formats = self._extract_m3u8_formats( - '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true' - % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']), - item_id, 'mp4') + '%s/vod/%s?%s' % ( + self._USHER_BASE, item_id, + compat_urllib_parse.urlencode({ + 'allow_source': 'true', + 'allow_spectre': 'true', + 'player': 'twitchweb', + 'nauth': access_token['token'], + 'nauthsig': access_token['sig'], + })), + item_id, 'mp4') + self._prefer_source(formats) info['formats'] = formats From 350c9481336ac981eadc982b67ccdbc7e28ca0e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 18:43:12 +0600 Subject: [PATCH 34/83] [twitch:vod] Formatting --- youtube_dl/extractor/twitch.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 21ea836ea..3ec08b674 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -243,20 +243,20 @@ class TwitchVodIE(TwitchItemBaseIE): info = self._download_info(self._ITEM_SHORTCUT, item_id) access_token = self._download_json( - '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, - 'Downloading %s access token' % self._ITEM_TYPE) + '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, + 'Downloading %s access token' % self._ITEM_TYPE) formats = self._extract_m3u8_formats( - '%s/vod/%s?%s' % ( - self._USHER_BASE, item_id, - compat_urllib_parse.urlencode({ + '%s/vod/%s?%s' % ( + self._USHER_BASE, item_id, + compat_urllib_parse.urlencode({ 'allow_source': 'true', 'allow_spectre': 'true', 'player': 'twitchweb', 'nauth': access_token['token'], 'nauthsig': access_token['sig'], })), - item_id, 'mp4') + item_id, 'mp4') self._prefer_source(formats) info['formats'] = formats From 41a7b00f183844e93ae2ba46fb4021f257f3ce79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 18:18:40 +0200 Subject: [PATCH 35/83] [vimeo] Extract config URL from (new?) React-based Vimeo's page --- youtube_dl/extractor/vimeo.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fa1b22049..88e462a4d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -286,7 +286,14 @@ class VimeoIE(VimeoBaseInfoExtractor): try: try: config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, 'config URL') + r' data-config-url="(.+?)"', webpage, + 'config URL', default=None) + if not config_url: + # New react-based page + vimeo_clip_page_config = self._search_regex( + r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, + 'vimeo clip page config') + config_url = self._parse_json(vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: From dd8417526b13c541e6db8f4200e717b8922a1620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 22:48:14 +0600 Subject: [PATCH 36/83] [vimeo] Clarify new react+flux website fallback --- youtube_dl/extractor/vimeo.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 88e462a4d..0f84656c0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -289,11 +289,14 @@ class VimeoIE(VimeoBaseInfoExtractor): r' data-config-url="(.+?)"', webpage, 'config URL', default=None) if not config_url: - # New react-based page + # Sometimes new react-based page is served instead of old one that require + # different config URL extraction approach (see + # https://github.com/rg3/youtube-dl/pull/7209) vimeo_clip_page_config = self._search_regex( r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'vimeo clip page config') - config_url = self._parse_json(vimeo_clip_page_config, video_id)['player']['config_url'] + config_url = self._parse_json( + vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: From 59fe4824f80b7e266ea9918ae1b2e49a456b869f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 18:52:25 +0200 Subject: [PATCH 37/83] [vidme] Better error message for suspended vidme videos --- youtube_dl/extractor/vidme.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 078d283b2..81dcaa231 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -114,6 +114,12 @@ class VidmeIE(InfoExtractor): video = response['video'] + if video.get('state') == 'user-disabled': + raise ExtractorError( + 'Vidme said: This video has been suspended either due to a copyright claim, ' + 'or for violating the terms of use.', + expected=True) + formats = [{ 'format_id': f.get('type'), 'url': f['uri'], From 9eb31b265f65ec6b04a508702af1a6feddafb8fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 17 Oct 2015 23:01:24 +0600 Subject: [PATCH 38/83] [vidme] Add user-disabled test --- youtube_dl/extractor/vidme.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 81dcaa231..382517a4a 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -93,6 +93,10 @@ class VidmeIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # nsfw, user-disabled + 'url': 'https://vid.me/dzGJ', + 'only_matching': True, }] def _real_extract(self, url): From 583882fdce19f8c565402f42523b275f96c91575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= Date: Sat, 17 Oct 2015 19:26:30 +0200 Subject: [PATCH 39/83] [dailymotion] Report errors from player v5 --- youtube_dl/extractor/dailymotion.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 80a05cfee..ea1edceb1 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'HotWaves1012', 'age_limit': 18, } + }, + # geo-restricted, player v5 + { + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, } ] @@ -124,6 +129,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): if player_v5: player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + + self._check_error(metadata) + formats = [] for quality, media_list in metadata['qualities'].items(): for media in media_list: @@ -201,9 +209,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'video info', flags=re.MULTILINE), video_id) - if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + self._check_error(info) formats = [] for (key, format_id) in self._FORMATS: @@ -246,6 +252,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'duration': info['duration'] } + def _check_error(self, info): + if info.get('error') is not None: + msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] + raise ExtractorError(msg, expected=True) + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( From 648e6a1ffe45ceae2995c3f9ec6a9413aad55640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 00:11:34 +0600 Subject: [PATCH 40/83] [youtube] Generalize playlist entries extraction (Closes #6699, closes #6992) --- youtube_dl/extractor/youtube.py | 121 ++++++++++++++------------------ 1 file changed, 52 insertions(+), 69 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b252e36e1..08e821362 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return +class YoutubePlaylistBaseInfoExtractor(InfoExtractor): + # Extract the video ids from the playlist pages + def _entries(self, page, playlist_id): + more_widget_html = content_html = page + for page_num in itertools.count(1): + for video_id, video_title in self.extract_videos_from_page(content_html): + yield self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + for mobj in re.finditer(self._VIDEO_RE, page): + # The link with index 0 is not the first video of the playlist (not sure if still actual) + if 'index' in mobj.groupdict() and mobj.group('id') == '0': + continue + video_id = mobj.group('id') + video_title = unescapeHTML(mobj.group('title')) + if video_title: + video_title = video_title.strip() + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + return zip(ids_in_page, titles_in_page) + + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ @@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? @@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.report_warning('Youtube gives an alert message: ' + match) - # Extract the video ids from the playlist pages - def _entries(): - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - for vid_id in new_ids: - yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', page, 'title') - return self.playlist_result(_entries(), playlist_id, playlist_title) + return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) def _real_extract(self, url): # Extract playlist id @@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self._extract_playlist(playlist_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' + _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', @@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor): } }] - @staticmethod - def extract_videos_from_page(page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): - video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - return zip(ids_in_page, titles_in_page) - def _real_extract(self, url): channel_id = self._match_id(url) @@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) - def _entries(): - more_widget_html = content_html = channel_page - for pagenum in itertools.count(1): - - for video_id, video_title in self.extract_videos_from_page(content_html): - yield self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - - mobj = re.search( - r'data-uix-load-more-href="/?(?P<more>[^"]+)"', - more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), channel_id, - 'Downloading page #%s' % (pagenum + 1), - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - return self.playlist_result(_entries(), channel_id) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): From 8e5b1219489be399de55566090e145c89007fa48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 00:27:06 +0600 Subject: [PATCH 41/83] [test_youtube_lists] Add test flat playlist entries' titles --- test/test_youtube_lists.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c889b6f15..26aadb34f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -57,5 +57,14 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) + def test_youtube_flat_playlist_titles(self): + dl = FakeYDL() + dl.params['extract_flat'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertIsPlaylist(result) + for entry in result['entries']: + self.assertTrue(entry.get('title')) + if __name__ == '__main__': unittest.main() From 7593fbaa126f8bf14eecff7f103cb497e3d31de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 01:00:37 +0600 Subject: [PATCH 42/83] [dailymotion] Error spelling --- youtube_dl/extractor/dailymotion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index ea1edceb1..9cd9ff17d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -254,8 +254,8 @@ class DailymotionIE(DailymotionBaseInfoExtractor): def _check_error(self, info): if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True) def _get_subtitles(self, video_id, webpage): try: From 5a11b793fe70beb6b0c7a74a489db9e52c4a742b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 01:36:03 +0600 Subject: [PATCH 43/83] [lynda] Extract all prioritized streams --- youtube_dl/extractor/lynda.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 378117270..5c973e75c 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -140,13 +140,14 @@ class LyndaIE(LyndaBaseIE): prioritized_streams = video_json.get('PrioritizedStreams') if prioritized_streams: - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': format_id, - } for format_id, video_url in prioritized_streams['0'].items() - ]) + for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): + formats.extend([ + { + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items() + ]) self._check_formats(formats, video_id) self._sort_formats(formats) From 80f48920c8a909ba55d13932524e55ed970f1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 06:57:57 +0600 Subject: [PATCH 44/83] [crunchyroll] Bypass maturity wall (Closes #7202) --- youtube_dl/extractor/crunchyroll.py | 59 ++++++++++++++++++----------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 95952bc29..aa258bbc2 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -32,6 +32,26 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): + _NETRC_MACHINE = 'crunchyroll' + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + login_url = 'https://www.crunchyroll.com/?a=formhandler' + data = urlencode_postdata({ + 'formname': 'RpcApiUser_Login', + 'name': username, + 'password': password, + }) + login_request = compat_urllib_request.Request(login_url, data) + login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else compat_urllib_request.Request(url_or_request)) @@ -46,10 +66,22 @@ class CrunchyrollBaseIE(InfoExtractor): return super(CrunchyrollBaseIE, self)._download_webpage( request, video_id, note, errnote, fatal, tries, timeout, encoding) + @staticmethod + def _add_skip_wall(url): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_urlparse.parse_qs(parsed_url.query) + # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: + # > This content may be inappropriate for some people. + # > Are you sure you want to continue? + # since it's not disabled by default in crunchyroll account's settings. + # See https://github.com/rg3/youtube-dl/issues/7202. + qs['skip_wall'] = ['1'] + return compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)' - _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -81,7 +113,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, - }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, @@ -94,24 +125,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): '1080': ('80', '108'), } - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - self.report_login() - login_url = 'https://www.crunchyroll.com/?a=formhandler' - data = urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'name': username, - 'password': password, - }) - login_request = compat_urllib_request.Request(login_url, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(login_request, None, False, 'Wrong login info') - - def _real_initialize(self): - self._login() - def _decrypt_subtitles(self, data, iv, id): data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) @@ -254,7 +267,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') + webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -352,7 +365,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$' + _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', @@ -366,7 +379,7 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + webpage = self._download_webpage(self._add_skip_wall(url), show_id) title = self._html_search_regex( r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>', webpage, 'title') From 49941c4e4f6e33785a3be1e0d103bd81657d8a0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 07:06:47 +0600 Subject: [PATCH 45/83] [crunchyroll] Add maturity wall reference tests (#7202) --- youtube_dl/extractor/crunchyroll.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index aa258bbc2..cecd0c784 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -116,6 +116,10 @@ class CrunchyrollIE(CrunchyrollBaseIE): }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', + 'only_matching': True, }] _FORMAT_IDS = { @@ -374,6 +378,19 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' }, 'playlist_count': 13, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', + 'info_dict': { + 'id': 'cosplay-complex-ova', + 'title': 'Cosplay Complex OVA' + }, + 'playlist_count': 3, + 'skip': 'Georestricted', + }, { + # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 + 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', + 'only_matching': True, }] def _real_extract(self, url): From 448ef1f31c8bcc1550cf907fd46e31026ec981b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:11:02 +0600 Subject: [PATCH 46/83] [extractor/common] Allow angle brackets in attributes in _og_regexes (#7215) --- test/test_InfoExtractor.py | 4 ++++ youtube_dl/extractor/common.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 2a00d09a5..938466a80 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -37,12 +37,16 @@ class TestInfoExtractor(unittest.TestCase): <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&key2=val2'/> <meta content='application/x-shockwave-flash' property='og:video:type'> <meta content='Foo' property=og:foobar> + <meta name="og:test1" content='foo > < bar'/> + <meta name="og:test2" content="foo >//< bar"/> ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') self.assertEqual(ie._og_search_video_url(html, default=None), None) self.assertEqual(ie._og_search_property('foobar', html), 'Foo') + self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') + self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') def test_html_search_meta(self): ie = self.ie diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a0c4af92f..4365077f1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -645,7 +645,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' % {'prop': re.escape(prop)}) template = r'<meta[^>]+?%s[^>]+?%s' From 94a773feb94a20be66526348a57ebe20495eba3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Sat, 17 Oct 2015 22:25:08 +0200 Subject: [PATCH 47/83] [vine] Use JS data to get title/alt_title --- youtube_dl/extractor/vine.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c733a48fa..d80b580a0 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -51,6 +51,21 @@ class VineIE(InfoExtractor): }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', 'only_matching': True, + }, { + 'url': 'https://vine.co/v/e192BnZnZ9V', + 'info_dict': { + 'id': 'e192BnZnZ9V', + 'ext': 'mp4', + 'title': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'alt_title': 'Vine by Pimry_zaa', + 'description': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'upload_date': '20150705', + 'uploader': 'Pimry_zaa', + 'uploader_id': '1135760698325307392', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -74,8 +89,8 @@ class VineIE(InfoExtractor): return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'alt_title': self._og_search_description(webpage, default=None), + 'title': data['description'], + 'alt_title': 'Vine by %s' % data['username'], 'description': data['description'], 'thumbnail': data['thumbnailUrl'], 'upload_date': unified_strdate(data['created']), From 10c38c7ca248d06c2c0f069c5a810e27e207c61e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Lalinsk=C3=BD?= <lukas@oxygene.sk> Date: Sat, 17 Oct 2015 22:29:49 +0200 Subject: [PATCH 48/83] [vine] Fix download tests --- youtube_dl/extractor/vine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index d80b580a0..d1dbec893 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -29,10 +29,10 @@ class VineIE(InfoExtractor): 'id': 'MYxVapFvz2z', 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Luna', + 'alt_title': 'Vine by Mars Ruiz', 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'upload_date': '20140815', - 'uploader': 'Luna', + 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', }, }, { From 91816e8f16408a3a2753fb254a9e963ad9429ced Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:32:08 +0600 Subject: [PATCH 49/83] [vine] Remove duplicate metadata, make more robust and modernize (Closes #7215) --- youtube_dl/extractor/vine.py | 39 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index d1dbec893..6e72cc253 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,10 +1,14 @@ +# coding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + unified_strdate, +) class VineIE(InfoExtractor): @@ -17,7 +21,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Chicken.', 'alt_title': 'Vine by Jack Dorsey', - 'description': 'Chicken.', 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', @@ -30,7 +33,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'alt_title': 'Vine by Mars Ruiz', - 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', @@ -43,7 +45,6 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': '#mw3 #ac130 #killcam #angelofdeath', 'alt_title': 'Vine by Z3k3', - 'description': '#mw3 #ac130 #killcam #angelofdeath', 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', @@ -56,9 +57,8 @@ class VineIE(InfoExtractor): 'info_dict': { 'id': 'e192BnZnZ9V', 'ext': 'mp4', - 'title': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'alt_title': 'Vine by Pimry_zaa', - 'description': u'\u0e22\u0e34\u0e49\u0e21~ \u0e40\u0e02\u0e34\u0e19~ \u0e2d\u0e32\u0e22~ \u0e19\u0e48\u0e32\u0e23\u0e49\u0e32\u0e01\u0e2d\u0e49\u0e30 >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', @@ -80,25 +80,26 @@ class VineIE(InfoExtractor): formats = [{ 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f['format'], - 'quality': f['rate'], + 'vcodec': f.get('format'), + 'quality': f.get('rate'), 'url': f['videoUrl'], - } for f in data['videoUrls']] + } for f in data['videoUrls'] if f.get('videoUrl')] self._sort_formats(formats) + username = data.get('username') + return { 'id': video_id, - 'title': data['description'], - 'alt_title': 'Vine by %s' % data['username'], - 'description': data['description'], - 'thumbnail': data['thumbnailUrl'], - 'upload_date': unified_strdate(data['created']), - 'uploader': data['username'], - 'uploader_id': data['userIdStr'], - 'like_count': data['likes']['count'], - 'comment_count': data['comments']['count'], - 'repost_count': data['reposts']['count'], + 'title': data.get('description') or self._og_search_title(webpage), + 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'thumbnail': data.get('thumbnailUrl'), + 'upload_date': unified_strdate(data.get('created')), + 'uploader': username, + 'uploader_id': data.get('userIdStr'), + 'like_count': int_or_none(data.get('likes', {}).get('count')), + 'comment_count': int_or_none(data.get('comments', {}).get('count')), + 'repost_count': int_or_none(data.get('reposts', {}).get('count')), 'formats': formats, } From 02835c6bf4403a907c058d43220a83b3b427e181 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:34:54 +0600 Subject: [PATCH 50/83] [extractor/common] Document repost_count --- youtube_dl/extractor/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4365077f1..6169fbbeb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -172,6 +172,7 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following From 2e022397c45fbcfd2ef6da43d14b0770221aabd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 18 Oct 2015 09:36:19 +0600 Subject: [PATCH 51/83] [vine] Add counters to tests --- youtube_dl/extractor/vine.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 6e72cc253..be72f3147 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -24,6 +24,9 @@ class VineIE(InfoExtractor): 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/MYxVapFvz2z', @@ -36,6 +39,9 @@ class VineIE(InfoExtractor): 'upload_date': '20140815', 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/bxVjBbZlPUH', @@ -48,6 +54,9 @@ class VineIE(InfoExtractor): 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', @@ -62,6 +71,9 @@ class VineIE(InfoExtractor): 'upload_date': '20150705', 'uploader': 'Pimry_zaa', 'uploader_id': '1135760698325307392', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, From 1e399778ee870ee583135e65458268cd7c0fb923 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 22 Jul 2015 20:03:05 +0800 Subject: [PATCH 52/83] [letv] Fix extraction Using data URIs for passing the decrypted M3U8 manifest, which is supported by ffmpeg only. --- youtube_dl/extractor/letv.py | 70 ++++++++++++++++++++++++++---------- youtube_dl/utils.py | 5 +++ 2 files changed, 57 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index a28abb0f0..9ebbc8089 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -9,13 +9,14 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, - compat_urlparse, + compat_ord, ) from ..utils import ( determine_ext, ExtractorError, parse_iso8601, int_or_none, + encode_data_uri, ) @@ -25,15 +26,16 @@ class LetvIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.letv.com/ptv/vplay/22005890.html', - 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', + 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', 'ext': 'mp4', 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'timestamp': 1424747397, - 'upload_date': '20150224', 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - } + }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'url': 'http://www.letv.com/ptv/vplay/1415246.html', 'info_dict': { @@ -42,16 +44,22 @@ class LetvIE(InfoExtractor): 'title': '美人天下01', 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', 'url': 'http://www.letv.com/ptv/vplay/1118082.html', - 'md5': 'f80936fbe20fb2f58648e81386ff7927', + 'md5': '2424c74948a62e5f31988438979c5ad1', 'info_dict': { 'id': '1118082', 'ext': 'mp4', 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, + 'params': { + 'hls_prefer_native': True, + }, 'skip': 'Only available in China', }] @@ -74,6 +82,27 @@ class LetvIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # see M3U8Encryption class in KLetvPlayer.swf + @staticmethod + def decrypt_m3u8(encrypted_data): + if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': + return encrypted_data + encrypted_data = encrypted_data[5:] + + _loc4_ = bytearray() + while encrypted_data: + b = compat_ord(encrypted_data[0]) + _loc4_.extend([b // 16, b & 0x0f]) + encrypted_data = encrypted_data[1:] + idx = len(_loc4_) - 11 + _loc4_ = _loc4_[idx:] + _loc4_[:idx] + _loc7_ = bytearray() + while _loc4_: + _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) + _loc4_ = _loc4_[2:] + + return bytes(_loc7_) + def _real_extract(self, url): media_id = self._match_id(url) page = self._download_webpage(url, media_id) @@ -115,23 +144,28 @@ class LetvIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - - # Mimic what flvxz.com do - url_parts = list(compat_urlparse.urlparse(media_url)) - qs = dict(compat_urlparse.parse_qs(url_parts[4])) - qs.update({ - 'platid': '14', - 'splatid': '1401', - 'tss': 'no', - 'retry': 1 + media_url += '&' + compat_urllib_parse.urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, }) - url_parts[4] = compat_urllib_parse.urlencode(qs) - media_url = compat_urlparse.urlunparse(url_parts) + + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) + + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) + + m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': media_url, + 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, + 'protocol': 'm3u8', } if format_id[-1:] == 'p': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7dbe25661..db5b3698e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -1795,6 +1796,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ From 985e4fdc07f00a3fdc8e7b7b4119471ee97f3890 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 17 Oct 2015 22:49:05 +0800 Subject: [PATCH 53/83] [downloader/hls] Add headers only for http(s) URLs ffmpeg 2.8.1 raises an error with -headers and non-http input files. --- youtube_dl/downloader/hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a62d2047b..9a83a73dd 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,7 +30,7 @@ class HlsFD(FileDownloader): args = [ffpp.executable, '-y'] - if info_dict['http_headers']: + if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. args += [ From 0a67a3632bb9cf76f64658986defc1947090ef50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 17 Oct 2015 23:15:01 +0800 Subject: [PATCH 54/83] [compat] Add compat_urllib_request_DataHandler --- youtube_dl/compat.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 192e1c515..d103ab9ad 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals +import binascii import collections +import email import getpass +import io import optparse import os import re @@ -38,6 +41,11 @@ try: except ImportError: # Python 2 import urlparse as compat_urlparse +try: + import urllib.response as compat_urllib_response +except ImportError: # Python 2 + import urllib as compat_urllib_response + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -155,6 +163,40 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.request import DataHandler as compat_urllib_request_DataHandler +except ImportError: # Python < 3.4 + # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py + class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. + # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.get_full_url() + + scheme, data = url.split(":", 1) + mediatype, data = data.split(",", 1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = compat_urllib_parse_unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = binascii.a2b_base64(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string( + "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data))) + + return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) + try: compat_basestring = basestring # Python 2 except NameError: @@ -489,6 +531,8 @@ __all__ = [ 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', From 8b172c2e10fb38c62c213673304c7e8dcd17b768 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 17 Oct 2015 23:16:40 +0800 Subject: [PATCH 55/83] [YoutubeDL] Use DataHandler --- youtube_dl/YoutubeDL.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index adf70d658..12977bf80 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -37,6 +37,7 @@ from .compat import ( compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, + compat_urllib_request_DataHandler, ) from .utils import ( ContentTooShortError, @@ -1967,8 +1968,9 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + data_handler = compat_urllib_request_DataHandler() opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play From 48aae2d2cf49843d0efa227fa393a0c783fc3c1e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:07:48 +0800 Subject: [PATCH 56/83] [twitter] Update tests --- youtube_dl/extractor/twitter.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1dd43ff3c..b2fff73b9 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -15,7 +16,7 @@ class TwitterCardIE(InfoExtractor): _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', + 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', @@ -103,17 +104,17 @@ class TwitterIE(TwitterCardIE): _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' _TESTS = [{ - 'url': 'https://m.twitter.com/thereaIbanksy/status/614301758345490432', - 'md5': '8bbccb487bd7a31349b775915fcd412f', + 'url': 'https://twitter.com/freethenipple/status/643211948184596480', + 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', 'info_dict': { - 'id': '614301758345490432', + 'id': '643211948184596480', 'ext': 'mp4', - 'title': 'thereaIbanksy - This time lapse is so pretty \U0001f60d\U0001f60d', + 'title': 'freethenipple - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 29.5, - 'description': 'banksy on Twitter: "This time lapse is so pretty \U0001f60d\U0001f60d http://t.co/QB8DDbqiR1"', - 'uploader': 'banksy', - 'uploader_id': 'thereaIbanksy', + 'duration': 12.922, + 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'uploader': 'FREE THE NIPPLE', + 'uploader_id': 'freethenipple', }, }] From 01d22d47039dedace1c5414c83e9fecfca41b5a5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:11:55 +0800 Subject: [PATCH 57/83] [twitter] Use _download_xml --- youtube_dl/extractor/twitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index b2fff73b9..37a9fd5fd 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -8,6 +8,7 @@ from ..compat import compat_urllib_request from ..utils import ( float_or_none, unescapeHTML, + xpath_text, ) @@ -60,9 +61,8 @@ class TwitterCardIE(InfoExtractor): video_id) if 'playlist' not in config: if 'vmapUrl' in config: - webpage = self._download_webpage(config['vmapUrl'], video_id + ' (xml)') - video_url = self._search_regex( - r'<MediaFile>\s*<!\[CDATA\[(https?://.+?)\]\]>', webpage, 'data player config (xml)') + vmap_data = self._download_xml(config['vmapUrl'], video_id) + video_url = xpath_text(vmap_data, './/MediaFile').strip() f = { 'url': video_url, } From 014e880372e896cdd63f9075864d2a3bba60e706 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:13:58 +0800 Subject: [PATCH 58/83] [twitter] Add IE_NAMEs --- youtube_dl/extractor/twitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 37a9fd5fd..5f697782e 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -13,6 +13,7 @@ from ..utils import ( class TwitterCardIE(InfoExtractor): + IE_NAME = 'twitter:card' _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)' _TESTS = [ { @@ -101,6 +102,7 @@ class TwitterCardIE(InfoExtractor): class TwitterIE(TwitterCardIE): + IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' _TESTS = [{ From f322bfb0638aeeb527459ebcf00f8a3dde26280c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:15:47 +0800 Subject: [PATCH 59/83] [twitter:card] Remove unneeded 'ext' --- youtube_dl/extractor/twitter.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 5f697782e..48bef5d80 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -64,13 +64,9 @@ class TwitterCardIE(InfoExtractor): if 'vmapUrl' in config: vmap_data = self._download_xml(config['vmapUrl'], video_id) video_url = xpath_text(vmap_data, './/MediaFile').strip() - f = { + formats.append({ 'url': video_url, - } - ext = re.search(r'\.([a-z0-9]{2,4})(\?.+)?$', video_url) - if ext: - f['ext'] = ext.group(1) - formats.append(f) + }) break # same video regardless of UA continue From e04edad621efe56347e155b6dc59a0c3d589b3bd Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:16:57 +0800 Subject: [PATCH 60/83] [twitter] Inherit from InfoExtractor directly --- youtube_dl/extractor/twitter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 48bef5d80..c9b783745 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -97,11 +97,11 @@ class TwitterCardIE(InfoExtractor): } -class TwitterIE(TwitterCardIE): +class TwitterIE(InfoExtractor): IE_NAME = 'twitter' _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P<id>[^/]+/status/\d+)' - _TESTS = [{ + _TEST = { 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', 'info_dict': { @@ -114,7 +114,7 @@ class TwitterIE(TwitterCardIE): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, - }] + } def _real_extract(self, url): id = self._match_id(url) From f6dfd6603a9e9bb88ebcdcd52490974a34d1bd11 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 18 Oct 2015 17:18:01 +0800 Subject: [PATCH 61/83] [twitter] Use _html_search_regex --- youtube_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index c9b783745..6ff15369c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -122,7 +122,7 @@ class TwitterIE(InfoExtractor): name = username url = re.sub(r'https?://(m|mobile)\.', 'https://', url) webpage = self._download_webpage(url, 'tweet: ' + url) - description = unescapeHTML(self._search_regex('<title>\s*(.+?)\s*', webpage, 'title')) + description = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') title = description.replace('\n', ' ') splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) if splitdesc: From 575036b40504bc921b18f05bde64e0e7dceacec6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:04:13 +0800 Subject: [PATCH 62/83] [twitter] Simplify and improve --- youtube_dl/extractor/twitter.py | 41 +++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6ff15369c..6b3b39aee 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, unescapeHTML, xpath_text, + remove_end, ) @@ -99,7 +100,8 @@ class TwitterCardIE(InfoExtractor): class TwitterIE(InfoExtractor): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www|m|mobile)?\.?twitter\.com/(?P[^/]+/status/\d+)' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' _TEST = { 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -107,7 +109,7 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', - 'title': 'freethenipple - FTN supporters on Hollywood Blvd today!', + 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 12.922, 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', @@ -117,26 +119,31 @@ class TwitterIE(InfoExtractor): } def _real_extract(self, url): - id = self._match_id(url) - username, twid = re.match(r'([^/]+)/status/(\d+)', id).groups() - name = username - url = re.sub(r'https?://(m|mobile)\.', 'https://', url) - webpage = self._download_webpage(url, 'tweet: ' + url) - description = self._html_search_regex('\s*(.+?)\s*', webpage, 'title') - title = description.replace('\n', ' ') - splitdesc = re.match(r'^(.+?)\s*on Twitter:\s* "(.+?)"$', title) - if splitdesc: - name, title = splitdesc.groups() - title = re.sub(r'\s*https?://[^ ]+', '', title) # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - card_id = self._search_regex(r'["\']/i/cards/tfw/v1/(\d+)', webpage, '/i/card/...') + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + twid = mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) + + username = remove_end(self._og_search_title(webpage), ' on Twitter') + + title = self._og_search_description(webpage).strip('').replace('\n', ' ') + + # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) + title, short_url = mobj.groups() + + card_id = self._search_regex( + r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + return { '_type': 'url_transparent', 'ie_key': 'TwitterCard', - 'uploader_id': username, - 'uploader': name, + 'uploader_id': user_id, + 'uploader': username, 'url': card_url, 'webpage_url': url, - 'description': description, + 'description': '%s on Twitter: "%s %s"' % (username, title, short_url), 'title': username + ' - ' + title, } From 77a54b6a658059a11de415d793588fdbfec14194 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:08:24 +0800 Subject: [PATCH 63/83] [twitter:card] Use _html_search_regex --- youtube_dl/extractor/twitter.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 6b3b39aee..1cdca544c 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ( float_or_none, - unescapeHTML, xpath_text, remove_end, ) @@ -57,9 +56,8 @@ class TwitterCardIE(InfoExtractor): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) - config = self._parse_json( - unescapeHTML(self._search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config')), + config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) if 'playlist' not in config: if 'vmapUrl' in config: From c88aec845a680ef9404b637b3dbcf706dcf00b68 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 18:23:56 +0800 Subject: [PATCH 64/83] [twitter] Fix short URL extraction --- youtube_dl/extractor/twitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1cdca544c..1472f22a7 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -128,7 +128,7 @@ class TwitterIE(InfoExtractor): title = self._og_search_description(webpage).strip('').replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) + mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title) title, short_url = mobj.groups() card_id = self._search_regex( From 4a7b79038425f614af49116edab7897f0db13e5a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:07:37 +0800 Subject: [PATCH 65/83] [twitter:card] Support YouTube embeds --- youtube_dl/extractor/twitter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1472f22a7..9d3e46b94 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -37,6 +37,19 @@ class TwitterCardIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'upload_date': '20111013', + 'uploader': 'OMG! Ubuntu!', + 'uploader_id': 'omgubuntu', + }, } ] @@ -56,6 +69,12 @@ class TwitterCardIE(InfoExtractor): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) + youtube_url = self._html_search_regex( + r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', + webpage, 'youtube iframe', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) From 05a3879f1c142cc2bf0287cde4690d8ccadcdc8f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 18 Oct 2015 19:19:46 +0800 Subject: [PATCH 66/83] [letv] Update M3U8's MIME type The new MIME type appears in the following places: https://www.iana.org/assignments/media-types/media-types.xhtml#application https://hg.python.org/cpython/file/tip/Lib/mimetypes.py --- youtube_dl/extractor/letv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index 9ebbc8089..effd9eb92 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -162,7 +162,7 @@ class LetvIE(InfoExtractor): m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/x-mpegURL'), + 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, 'protocol': 'm3u8', From dd67702a3ea007369109ee8e4b67043064e1f759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 18 Oct 2015 14:13:06 +0200 Subject: [PATCH 67/83] [imdb] Fix extraction (fixes #7220) --- youtube_dl/extractor/imdb.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bb574cf3..02e1e428e 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -4,8 +4,8 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_urlparse, +from ..utils import ( + qualities, ) @@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor): descr = self._html_search_regex( r'(?s)(.*?)', webpage, 'description', fatal=False) - available_formats = re.findall( - r'case \'(?P.*?)\' :$\s+url = \'(?P.*?)\'', webpage, - flags=re.MULTILINE) + player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id + player_page = self._download_webpage( + player_url, video_id, 'Downloading player page') + # the player page contains the info for the default format, we have to + # fetch other pages for the rest of the formats + extra_formats = re.findall(r'href="(?P%s.*?)".*?>(?P.*?)<' % re.escape(player_url), player_page) + format_pages = [ + self._download_webpage( + f_url, video_id, 'Downloading info for %s format' % f_name) + for f_url, f_name in extra_formats] + format_pages.append(player_page) + + quality = qualities(['SD', '480p', '720p']) formats = [] - for f_id, f_path in available_formats: - f_path = f_path.strip() - format_page = self._download_webpage( - compat_urlparse.urljoin(url, f_path), - 'Downloading info for %s format' % f_id) + for format_page in format_pages: json_data = self._search_regex( r']+class="imdb-player-data"[^>]*?>(.*?)', format_page, 'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] + f_id = format_info['ffname'] formats.append({ 'format_id': f_id, 'url': format_info['videoInfoList'][0]['videoUrl'], + 'quality': quality(f_id), }) + self._sort_formats(formats) return { 'id': video_id, From b0f001a6cbd220c8b10c0ce359f17072d6347a8f Mon Sep 17 00:00:00 2001 From: remitamine Date: Mon, 21 Sep 2015 15:52:36 +0100 Subject: [PATCH 68/83] [canalc2] fix info extraction --- youtube_dl/extractor/canalc2.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index c4fefefe4..66a9ff093 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -8,34 +8,40 @@ from .common import InfoExtractor class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P\d+)' + _VALID_URL = r'https?://(www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { - 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'url': 'http://www.canalc2.tv/video/12163', 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', 'ext': 'mp4', 'title': 'Terrasses du Numérique' + }, + 'params': { + 'skip_download': True, # Requires rtmpdump } } def _real_extract(self, url): - video_id = re.match(self._VALID_URL, url).group('id') - # We need to set the voir field for getting the file name - url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - file_name = self._search_regex( - r"so\.addVariable\('file','(.*?)'\);", - webpage, 'file name') - video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + video_url = self._search_regex( + r'jwplayer\("Player"\).setup\({[^}]*file: "([^"]+)"', + webpage, 'video_url') + formats = [{'url': video_url}] + if video_url.startswith('rtmp://'): + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) + formats[0].update({ + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + }) title = self._html_search_regex( - r'class="evenement8">(.*?)', webpage, 'title') + r'(?s)class="[^"]*col_description[^"]*">.*?

    (.*?)

    ', webpage, 'title') return { 'id': video_id, - 'ext': 'mp4', - 'url': video_url, + 'formats': formats, 'title': title, } From 6682049dee5e73b98e99e1359b959240d0920d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:19:43 +0600 Subject: [PATCH 69/83] [canalc2] Improve rtmp extraction --- youtube_dl/extractor/canalc2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 66a9ff093..648af2e18 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -31,10 +31,12 @@ class Canalc2IE(InfoExtractor): webpage, 'video_url') formats = [{'url': video_url}] if video_url.startswith('rtmp://'): - rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?Pmp4:.+)$', video_url) + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) formats[0].update({ + 'url': rtmp.group('url'), 'app': rtmp.group('app'), 'play_path': rtmp.group('play_path'), + 'page_url': url, }) title = self._html_search_regex( From ef6c868f23f2fe0d493831e0d4cba71c735bd160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:23:31 +0600 Subject: [PATCH 70/83] [canalc2] Improve some regexes --- youtube_dl/extractor/canalc2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 648af2e18..d9137e2ef 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class Canalc2IE(InfoExtractor): IE_NAME = 'canalc2.tv' - _VALID_URL = r'https?://(www\.)?canalc2\.tv/video/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { 'url': 'http://www.canalc2.tv/video/12163', @@ -27,8 +27,8 @@ class Canalc2IE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r'jwplayer\("Player"\).setup\({[^}]*file: "([^"]+)"', - webpage, 'video_url') + r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P.+?)\2', + webpage, 'video_url', group='file') formats = [{'url': video_url}] if video_url.startswith('rtmp://'): rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) From 14bddf35fbe8253e283042630e24b134996b2575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:23:52 +0600 Subject: [PATCH 71/83] [canalc2] Add ext --- youtube_dl/extractor/canalc2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index d9137e2ef..ba82bb2b7 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -34,6 +34,7 @@ class Canalc2IE(InfoExtractor): rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) formats[0].update({ 'url': rtmp.group('url'), + 'ext': 'flv', 'app': rtmp.group('app'), 'play_path': rtmp.group('play_path'), 'page_url': url, From b1bf063503893192637f95e929d1a9147de59a7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:27:05 +0600 Subject: [PATCH 72/83] [canalc2] Extract duration --- youtube_dl/extractor/canalc2.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index ba82bb2b7..e326b8fbd 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import parse_duration class Canalc2IE(InfoExtractor): @@ -42,9 +43,13 @@ class Canalc2IE(InfoExtractor): title = self._html_search_regex( r'(?s)class="[^"]*col_description[^"]*">.*?

    (.*?)

    ', webpage, 'title') + duration = parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': video_id, - 'formats': formats, 'title': title, + 'duration': duration, + 'formats': formats, } From 608945d44a7e47fa5115295839c993af545936eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:27:22 +0600 Subject: [PATCH 73/83] [canalc2] Fix test --- youtube_dl/extractor/canalc2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index e326b8fbd..f6a1ff381 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,8 +16,9 @@ class Canalc2IE(InfoExtractor): 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'mp4', - 'title': 'Terrasses du Numérique' + 'ext': 'flv', + 'title': 'Terrasses du Numérique', + 'duration': 122, }, 'params': { 'skip_download': True, # Requires rtmpdump From dedd35c6bc33eb88f19b16eeb37498cee076c47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 19:59:18 +0600 Subject: [PATCH 74/83] [viewster] Fix failing m3u8 --- youtube_dl/extractor/viewster.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 632e57fb4..7cf930d69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -131,10 +131,11 @@ class ViewsterIE(InfoExtractor): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds')) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False # m3u8 sometimes fail - )) + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) else: format_id = media.get('Bitrate') f = { From e36963e0eb57294f156a98c38df891dec41ebaa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Oct 2015 20:24:33 +0600 Subject: [PATCH 75/83] [eagleplatform] Identify hls formats --- youtube_dl/extractor/eagleplatform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index e529b9b96..7bbf617d4 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -87,7 +87,7 @@ class EaglePlatformIE(InfoExtractor): m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') formats = self._extract_m3u8_formats( m3u8_url, video_id, - 'mp4', entry_protocol='m3u8_native') + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') mp4_url = self._get_video_url( # Secure mp4 URL is constructed according to Player.prototype.mp4 from From a6e0afa2bbc93d145b31911b8ce40c502994e2a1 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 18 Oct 2015 19:23:40 +0200 Subject: [PATCH 76/83] release 2015.10.18 --- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 47f7da86d..cfa665d88 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -588,7 +588,8 @@ - **twitch:stream** - **twitch:video** - **twitch:vod** - - **TwitterCard** + - **twitter** + - **twitter:card** - **Ubu** - **udemy** - **udemy:course** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 31d2a9dc0..660b0050b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.16' +__version__ = '2015.10.18' From 264b23e1a42378d52f8774a07c1d906cd1cff96c Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 19:56:22 +0200 Subject: [PATCH 77/83] adds thumbnail support for ZDF Mediathek extractor --- youtube_dl/extractor/zdf.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 98f15177b..f376025e1 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -70,6 +70,23 @@ def extract_from_xml_url(ie, video_id, xml_url): '_available': is_available, } + def xml_to_thumbnails(fnode): + thumbnails = list() + for node in fnode: + width_x_height = node.attrib['key'] + thumbnail = { + 'url': node.text, + 'width': int(width_x_height.split('x')[0]), + 'height': int(width_x_height.split('x')[1]) + } + thumbnails.append(thumbnail) + return thumbnails + + + thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') + thumbnails = xml_to_thumbnails(thumbnail_nodes) + thumbnail = thumbnails[-1]['url'] + format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( lambda f: f['_available'], @@ -81,6 +98,8 @@ def extract_from_xml_url(ie, video_id, xml_url): 'title': title, 'description': description, 'duration': duration, + 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, 'upload_date': upload_date, From d762f86e940ad656e8f7e7b93636292e4cf36de5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Oct 2015 00:11:16 +0600 Subject: [PATCH 78/83] [ok] Extend _VALID_URL --- youtube_dl/extractor/odnoklassniki.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index ccc88cfb1..184c7a323 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -13,7 +13,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -66,6 +66,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://www.ok.ru/video/20648036891', 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/videoembed/20648036891', + 'only_matching': True, }] def _real_extract(self, url): From 8cc83d301dd0e8029aff804e362860d36e3d7e7a Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 20:47:42 +0200 Subject: [PATCH 79/83] use int_or_none, check if attrib exists, remove thumbnail --- youtube_dl/extractor/zdf.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index f376025e1..d41c4e712 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -73,19 +73,17 @@ def extract_from_xml_url(ie, video_id, xml_url): def xml_to_thumbnails(fnode): thumbnails = list() for node in fnode: - width_x_height = node.attrib['key'] - thumbnail = { - 'url': node.text, - 'width': int(width_x_height.split('x')[0]), - 'height': int(width_x_height.split('x')[1]) - } + thumbnail = {'url': node.text} + if 'key' in node.attrib: + width_x_height = node.attrib['key'] + thumbnail['width'] = int_or_none(width_x_height.split('x')[0]) + thumbnail['height'] = int_or_none(width_x_height.split('x')[1]) thumbnails.append(thumbnail) return thumbnails thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') thumbnails = xml_to_thumbnails(thumbnail_nodes) - thumbnail = thumbnails[-1]['url'] format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( @@ -98,7 +96,6 @@ def extract_from_xml_url(ie, video_id, xml_url): 'title': title, 'description': description, 'duration': duration, - 'thumbnail': thumbnail, 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, From b243340f0ce311443a15a2dfd4356a9504e18c04 Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 21:07:52 +0200 Subject: [PATCH 80/83] check if key attrib matches resolution pattern --- youtube_dl/extractor/zdf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index d41c4e712..ed385450c 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -75,9 +75,9 @@ def extract_from_xml_url(ie, video_id, xml_url): for node in fnode: thumbnail = {'url': node.text} if 'key' in node.attrib: - width_x_height = node.attrib['key'] - thumbnail['width'] = int_or_none(width_x_height.split('x')[0]) - thumbnail['height'] = int_or_none(width_x_height.split('x')[1]) + if re.match("^[0-9]+x[0-9]+$", node.attrib['key']): + thumbnail['width'] = int_or_none(node.attrib['key'].split('x')[0]) + thumbnail['height'] = int_or_none(node.attrib['key'].split('x')[1]) thumbnails.append(thumbnail) return thumbnails From 2038ad6ee71c842420b83cb6c5ce3c6898e8e380 Mon Sep 17 00:00:00 2001 From: "Sergey M." Date: Mon, 19 Oct 2015 01:12:41 +0600 Subject: [PATCH 81/83] [README.md] Add uploader extraction sample in example extractor --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cf4aebf3d..a6ec9619c 100644 --- a/README.md +++ b/README.md @@ -710,12 +710,13 @@ If you want to add support for a new site, you can follow this quick list (assum webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

    (.*?)

    ', webpage, 'title') + title = self._html_search_regex(r'

    (.+?)

    ', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` From b7cedb16043c60d4032b206a83539acbd39f994f Mon Sep 17 00:00:00 2001 From: kennell Date: Sun, 18 Oct 2015 21:25:26 +0200 Subject: [PATCH 82/83] simplify thumbnail dict building --- youtube_dl/extractor/zdf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index ed385450c..c2b196504 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -75,9 +75,10 @@ def extract_from_xml_url(ie, video_id, xml_url): for node in fnode: thumbnail = {'url': node.text} if 'key' in node.attrib: - if re.match("^[0-9]+x[0-9]+$", node.attrib['key']): - thumbnail['width'] = int_or_none(node.attrib['key'].split('x')[0]) - thumbnail['height'] = int_or_none(node.attrib['key'].split('x')[1]) + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) thumbnails.append(thumbnail) return thumbnails From 7b091c370c0f187545df8b1b1cc990fcf95df108 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 19 Oct 2015 01:48:05 +0600 Subject: [PATCH 83/83] [zdf] Modernize and PEP 8 --- youtube_dl/extractor/zdf.py | 43 +++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index c2b196504..a795f56b3 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, unified_strdate, OnDemandPagedList, + xpath_text, ) @@ -19,13 +20,11 @@ def extract_from_xml_url(ie, video_id, xml_url): errnote='Failed to download video info') title = doc.find('.//information/title').text - description = doc.find('.//information/detail').text - duration = int(doc.find('.//details/lengthSec').text) - uploader_node = doc.find('.//details/originChannelTitle') - uploader = None if uploader_node is None else uploader_node.text - uploader_id_node = doc.find('.//details/originChannelId') - uploader_id = None if uploader_id_node is None else uploader_id_node.text - upload_date = unified_strdate(doc.find('.//details/airtime').text) + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) def xml_to_format(fnode): video_url = fnode.find('url').text @@ -40,15 +39,14 @@ def extract_from_xml_url(ie, video_id, xml_url): ext = format_m.group('container') proto = format_m.group('proto').lower() - quality = fnode.find('./quality').text - abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr_node = fnode.find('./videoBitrate') - vbr = None if vbr_node is None else int(vbr_node.text) // 1000 + quality = xpath_text(fnode, './quality', 'quality') + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - width_node = fnode.find('./width') - width = None if width_node is None else int_or_none(width_node.text) - height_node = fnode.find('./height') - height = None if height_node is None else int_or_none(height_node.text) + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) format_note = '' if not format_note: @@ -64,16 +62,21 @@ def extract_from_xml_url(ie, video_id, xml_url): 'vbr': vbr, 'width': width, 'height': height, - 'filesize': int_or_none(fnode.find('./filesize').text), + 'filesize': filesize, 'format_note': format_note, 'protocol': proto, '_available': is_available, } def xml_to_thumbnails(fnode): - thumbnails = list() + thumbnails = [] for node in fnode: - thumbnail = {'url': node.text} + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } if 'key' in node.attrib: m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) if m: @@ -82,9 +85,7 @@ def extract_from_xml_url(ie, video_id, xml_url): thumbnails.append(thumbnail) return thumbnails - - thumbnail_nodes = doc.findall('.//teaserimages/teaserimage') - thumbnails = xml_to_thumbnails(thumbnail_nodes) + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter(