From 0d0d5d37174ed611dd823c5be025c49e73d83d1d Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 13:59:37 +0800 Subject: [PATCH 01/70] [qqmusic] Add support for playlists --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/qqmusic.py | 34 ++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6fdaf90b2..d03577cdf 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -427,6 +427,7 @@ from .qqmusic import ( QQMusicSingerIE, QQMusicAlbumIE, QQMusicToplistIE, + QQMusicPlaylistIE, ) from .quickvid import QuickVidIE from .r7 import R7IE diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bafa81c21..f9aafcd28 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,6 +9,7 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, + clean_html, ) from ..compat import compat_urllib_request @@ -243,3 +244,36 @@ class QQMusicToplistIE(QQPlaylistBaseIE): list_name = topinfo.get('ListName') list_description = topinfo.get('info') return self.playlist_result(entries, list_id, list_name, list_description) + + +class QQMusicPlaylistIE(QQPlaylistBaseIE): + IE_NAME = 'qqmusic:playlist' + _VALID_URL = r'http://y\.qq\.com/#type=taoge&id=(?P[0-9]+)' + + _TEST = { + 'url': 'http://y.qq.com/#type=taoge&id=3462654915', + 'info_dict': { + 'id': '3462654915', + 'title': '韩国5月新歌精选下旬', + 'description': 'md5:d2c9d758a96b9888cf4fe82f603121d4', + }, + 'playlist_count': 40, + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + list_json = self._download_json( + 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s' + % list_id, list_id, 'Download list page', + transform_source=strip_jsonp)['cdlist'][0] + + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + ) for song in list_json['songlist'] + ] + + list_name = list_json['dissname'] + list_description = clean_html(unescapeHTML(list_json.get('desc'))) + return self.playlist_result(entries, list_id, list_name, list_description) From 29b809de68aeefb5e991c75929ed3d03fb40c1f1 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 15:52:04 +0800 Subject: [PATCH 02/70] [qqmusic] Fix album extraction --- youtube_dl/extractor/qqmusic.py | 37 ++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bafa81c21..d9a783f8a 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -163,31 +163,38 @@ class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' _VALID_URL = r'http://y.qq.com/#type=album&mid=(?P[0-9A-Za-z]+)' - _TEST = { - 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1&play=0', + _TESTS = [{ + 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', 'info_dict': { 'id': '000gXCTb2AhRR1', 'title': '我们都是这样长大的', - 'description': 'md5:d216c55a2d4b3537fe4415b8767d74d6', + 'description': 'md5:712f0cdbfc7e776820d08150e6df593d', }, 'playlist_count': 4, - } + }, { + 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3', + 'info_dict': { + 'id': '002Y5a3b3AlCu3', + 'title': '그리고...', + 'description': 'md5:b1d133b8c9bac8fed4e1a97df759f4cf', + }, + 'playlist_count': 8, + }] def _real_extract(self, url): mid = self._match_id(url) - album_page = self._download_webpage( - self.qq_static_url('album', mid), mid, 'Download album page') + album = self._download_json( + 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_album_info_cp.fcg?albummid=%s&format=json' % mid, + mid, 'Download album page')['data'] - entries = self.get_entries_from_page(album_page) - - album_name = self._html_search_regex( - r"albumname\s*:\s*'([^']+)',", album_page, 'album name', - default=None) - - album_detail = self._html_search_regex( - r'
\s*

((?:[^<>]+(?:
)?)+)

', - album_page, 'album details', default=None) + entries = [ + self.url_result( + 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid'] + ) for song in album['list'] + ] + album_name = album['name'] + album_detail = album.get('desc') return self.playlist_result(entries, mid, album_name, album_detail) From 5e3915cbe3fabd2dbb633131b851b1158c0bba7b Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 21:06:25 +0800 Subject: [PATCH 03/70] [qqmusic] Fix song extraction when certain formats are unavailable --- youtube_dl/extractor/qqmusic.py | 55 +++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index bafa81c21..7ddc4ca25 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,8 +9,13 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, + HEADRequest, + ExtractorError, +) +from ..compat import ( + compat_urllib_request, + compat_HTTPError, ) -from ..compat import compat_urllib_request class QQMusicIE(InfoExtractor): @@ -26,6 +31,20 @@ class QQMusicIE(InfoExtractor): 'upload_date': '20141227', 'creator': '林俊杰', 'description': 'md5:d327722d0361576fde558f1ac68a7065', + 'thumbnail': 'http://i.gtimg.cn/music/photo/mid_album_500/7/p/001IV22P1RDX7p.jpg', + } + }, { + 'note': 'There is no mp3-320 version of this song.', + 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV', + 'md5': 'fa3926f0c585cda0af8fa4f796482e3e', + 'info_dict': { + 'id': '004MsGEo3DdNxV', + 'ext': 'mp3', + 'title': '如果', + 'upload_date': '20050626', + 'creator': '李季美', + 'description': 'md5:46857d5ed62bc4ba84607a805dccf437', + 'thumbnail': 'http://i.gtimg.cn/music/photo/mid_album_500/r/Q/0042owYj46IxrQ.jpg', } }] @@ -68,6 +87,13 @@ class QQMusicIE(InfoExtractor): if lrc_content: lrc_content = lrc_content.replace('\\n', '\n') + thumbnail_url = None + albummid = self._search_regex( + [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) + if albummid: + thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ + % (albummid[-2:-1], albummid[-1], albummid) + guid = self.m_r_get_ruin() vkey = self._download_json( @@ -77,14 +103,24 @@ class QQMusicIE(InfoExtractor): formats = [] for format_id, details in self._FORMATS.items(): - formats.append({ - 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' - % (details['prefix'], mid, details['ext'], vkey, guid), - 'format': format_id, - 'format_id': format_id, - 'preference': details['preference'], - 'abr': details.get('abr'), - }) + video_url = 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' \ + % (details['prefix'], mid, details['ext'], vkey, guid) + req = HEADRequest(video_url) + try: + res = self._request_webpage( + req, mid, note='Testing %s video URL' % format_id, fatal=False) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in [400, 404]: + self.report_warning('Invalid %s video URL' % format_id, mid) + else: + if res: + formats.append({ + 'url': video_url, + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), + }) self._sort_formats(formats) return { @@ -94,6 +130,7 @@ class QQMusicIE(InfoExtractor): 'upload_date': publish_time, 'creator': singer, 'description': lrc_content, + 'thumbnail': thumbnail_url, } From 0392ac98d2c5c5a6fd2ab51c51096f82312a287c Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 21:13:03 +0800 Subject: [PATCH 04/70] [qqmusic] Fix code formatting --- youtube_dl/extractor/qqmusic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 7ddc4ca25..5a18191bc 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -89,7 +89,8 @@ class QQMusicIE(InfoExtractor): thumbnail_url = None albummid = self._search_regex( - [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], detail_info_page, 'album mid', default=None) + [r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'], + detail_info_page, 'album mid', default=None) if albummid: thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \ % (albummid[-2:-1], albummid[-1], albummid) From 4d58b24c15ea0efc699a7ad7ee468245029da4e3 Mon Sep 17 00:00:00 2001 From: ping Date: Thu, 18 Jun 2015 23:09:04 +0800 Subject: [PATCH 05/70] [qqmusic] Use _check_formats instead --- youtube_dl/extractor/qqmusic.py | 34 ++++++++++----------------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index 5a18191bc..7183c2bb1 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -9,13 +9,8 @@ from .common import InfoExtractor from ..utils import ( strip_jsonp, unescapeHTML, - HEADRequest, - ExtractorError, -) -from ..compat import ( - compat_urllib_request, - compat_HTTPError, ) +from ..compat import compat_urllib_request class QQMusicIE(InfoExtractor): @@ -104,24 +99,15 @@ class QQMusicIE(InfoExtractor): formats = [] for format_id, details in self._FORMATS.items(): - video_url = 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' \ - % (details['prefix'], mid, details['ext'], vkey, guid) - req = HEADRequest(video_url) - try: - res = self._request_webpage( - req, mid, note='Testing %s video URL' % format_id, fatal=False) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in [400, 404]: - self.report_warning('Invalid %s video URL' % format_id, mid) - else: - if res: - formats.append({ - 'url': video_url, - 'format': format_id, - 'format_id': format_id, - 'preference': details['preference'], - 'abr': details.get('abr'), - }) + formats.append({ + 'url': 'http://cc.stream.qqmusic.qq.com/%s%s.%s?vkey=%s&guid=%s&fromtag=0' + % (details['prefix'], mid, details['ext'], vkey, guid), + 'format': format_id, + 'format_id': format_id, + 'preference': details['preference'], + 'abr': details.get('abr'), + }) + self._check_formats(formats, mid) self._sort_formats(formats) return { From d8d24a922adb0646f9f78b2fc4a287a72fa7ff73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 00:36:23 +0600 Subject: [PATCH 06/70] [youtube] Extract formats from multiple DASH manifests (Closes #6093) DASH manifest pointed by dashmpd from the video webpage and one pointed by get_video_info may be different (namely different itag set) - some itags are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH manifest pointed by get_video_info's dashmpd). The general idea is to take a union of itags of both DASH manifests (for example video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093). --- youtube_dl/extractor/youtube.py | 77 ++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3da56c14..fa1a2e544 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -853,6 +853,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_url = None + dash_mpds = [] + + def add_dash_mpd(video_info): + dash_mpd = video_info.get('dashmpd') + if dash_mpd and dash_mpd[0] not in dash_mpds: + dash_mpds.append(dash_mpd[0]) + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -873,34 +880,40 @@ class YoutubeIE(YoutubeBaseInfoExtractor): note='Refetching age-gated info webpage', errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(video_info) else: age_gate = False - try: - # Try looking directly into the video webpage - mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) - if not mobj: - raise ValueError('Could not find ytplayer.config') # caught below + # Try looking directly into the video webpage + mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) + if mobj: json_code = uppercase_escape(mobj.group(1)) ytplayer_config = json.loads(json_code) args = ytplayer_config['args'] - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - if not args.get('url_encoded_fmt_stream_map'): - raise ValueError('No stream_map present') # caught below - except ValueError: - # We fallback to the get_video_info pages (used by the embed page) - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) - video_info_webpage = self._download_webpage( - video_info_url, - video_id, note=False, - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break + if args.get('url_encoded_fmt_stream_map'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) + self.report_video_info_webpage_download(video_id) + for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ( + '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (proto, video_id, el_type)) + video_info_webpage = self._download_webpage( + video_info_url, + video_id, note=False, + errnote='unable to download video info webpage') + get_video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(get_video_info) + if not video_info: + video_info = get_video_info + if 'token' in get_video_info: + break if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError( @@ -1118,24 +1131,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Look for the DASH manifest if self._downloader.params.get('youtube_include_dash_manifest', True): - dash_mpd = video_info.get('dashmpd') - if dash_mpd: - dash_manifest_url = dash_mpd[0] + for dash_manifest_url in dash_mpds: + dash_formats = {} try: - dash_formats = self._parse_dash_manifest( - video_id, dash_manifest_url, player_url, age_gate) + for df in self._parse_dash_manifest( + video_id, dash_manifest_url, player_url, age_gate): + # Do not overwrite DASH format found in some previous DASH manifest + if df['format_id'] not in dash_formats: + dash_formats[df['format_id']] = df except (ExtractorError, KeyError) as e: self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) - else: + if dash_formats: # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See # https://github.com/rg3/youtube-dl/issues/5774 for an # example. - dash_keys = set(df['format_id'] for df in dash_formats) + dash_keys = set(df['format_id'] for df in dash_formats.values()) formats = [f for f in formats if f['format_id'] not in dash_keys] - formats.extend(dash_formats) + formats.extend(dash_formats.values()) # Check for malformed aspect ratio stretched_m = re.search( From 1b5a1ae257c6d64fd69ea91eaf69328d4b6d17df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 00:41:26 +0600 Subject: [PATCH 07/70] [youtube] Pick up codecs info from DASH manifest when not set explicitly --- youtube_dl/extractor/youtube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a3da56c14..c9d8e5125 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -824,6 +824,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except StopIteration: full_info = self._formats.get(format_id, {}).copy() full_info.update(f) + codecs = r.attrib.get('codecs') + if codecs: + if full_info.get('acodec') == 'none' and 'vcodec' not in full_info: + full_info['vcodec'] = codecs + elif full_info.get('vcodec') == 'none' and 'acodec' not in full_info: + full_info['acodec'] = codecs formats.append(full_info) else: existing_format.update(f) From d80265ccd6ff08ef273d83aff847a457f051071f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 02:48:50 +0600 Subject: [PATCH 08/70] [youtube] Simplify non-DASH formats exclusion --- youtube_dl/extractor/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index fa1a2e544..46841617a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1148,8 +1148,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # fixed values (for example the resolution). See # https://github.com/rg3/youtube-dl/issues/5774 for an # example. - dash_keys = set(df['format_id'] for df in dash_formats.values()) - formats = [f for f in formats if f['format_id'] not in dash_keys] + formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] formats.extend(dash_formats.values()) # Check for malformed aspect ratio From bc93bdb5bbf48b121977a5088ab9607f0fbeb83e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 13:19:46 +0600 Subject: [PATCH 09/70] [youtube] Fix reference before assignment for video_info --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 46841617a..45e5cb80e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -883,6 +883,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): add_dash_mpd(video_info) else: age_gate = False + video_info = None # Try looking directly into the video webpage mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) if mobj: From 0a3cf9ad3dc1a038018ff347473c0e2d8fc10a69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 14:31:18 +0600 Subject: [PATCH 10/70] [youtube] Skip get_video_info requests when --youtube-skip-dash-manifest is specified --- youtube_dl/extractor/youtube.py | 43 +++++++++++++++++---------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 45e5cb80e..2c9ad5e92 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -894,27 +894,28 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) - # We also try looking in get_video_info since it may contain different dashmpd - # URL that points to a DASH manifest with possibly different itag set (some itags - # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH - # manifest pointed by get_video_info's dashmpd). - # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ( - '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (proto, video_id, el_type)) - video_info_webpage = self._download_webpage( - video_info_url, - video_id, note=False, - errnote='unable to download video info webpage') - get_video_info = compat_parse_qs(video_info_webpage) - add_dash_mpd(get_video_info) - if not video_info: - video_info = get_video_info - if 'token' in get_video_info: - break + if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + # We also try looking in get_video_info since it may contain different dashmpd + # URL that points to a DASH manifest with possibly different itag set (some itags + # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH + # manifest pointed by get_video_info's dashmpd). + # The general idea is to take a union of itags of both DASH manifests (for example + # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) + self.report_video_info_webpage_download(video_id) + for el_type in ['&el=info', '&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ( + '%s://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (proto, video_id, el_type)) + video_info_webpage = self._download_webpage( + video_info_url, + video_id, note=False, + errnote='unable to download video info webpage') + get_video_info = compat_parse_qs(video_info_webpage) + add_dash_mpd(get_video_info) + if not video_info: + video_info = get_video_info + if 'token' in get_video_info: + break if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError( From b2575b38e76e14b7552a43cd707a03fe78748f40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 14:38:41 +0600 Subject: [PATCH 11/70] [options] Clarify --youtube-skip-dash-manifest --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 6aeca61ee..e3dfb7af9 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -346,7 +346,7 @@ def parseOpts(overrideArguments=None): video_format.add_option( '--youtube-skip-dash-manifest', action='store_false', dest='youtube_include_dash_manifest', - help='Do not download the DASH manifest on YouTube videos') + help='Do not download the DASH manifests and related data on YouTube videos') video_format.add_option( '--merge-output-format', action='store', dest='merge_output_format', metavar='FORMAT', default=None, From da77d856a1310b52975abbad82121a1b4c6597a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Jun 2015 14:55:46 +0600 Subject: [PATCH 12/70] [youtube] Add test for #6093 --- youtube_dl/extractor/youtube.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2c9ad5e92..20e1781f8 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -518,6 +518,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': 'requires avconv', } }, + # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) + { + 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', + 'info_dict': { + 'id': 'FIl7x6_3R5Y', + 'ext': 'mp4', + 'title': 'md5:7b81415841e02ecd4313668cde88737a', + 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'upload_date': '20150625', + 'uploader_id': 'dorappi2000', + 'uploader': 'dorappi2000', + 'formats': 'mincount:33', + }, + } ] def __init__(self, *args, **kwargs): From c4bd188da46a837ddf8f8f8d4766eb799fa2b484 Mon Sep 17 00:00:00 2001 From: Anders Einar Hilden Date: Mon, 29 Jun 2015 00:11:31 +0200 Subject: [PATCH 13/70] NRK now supports / requires HTTPS Add s? to regexp to support new urls. Update testcases to use HTTPS. --- youtube_dl/extractor/nrk.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index cc70c2950..9e4581cf9 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -13,7 +13,7 @@ from ..utils import ( class NRKIE(InfoExtractor): - _VALID_URL = r'(?:nrk:|http://(?:www\.)?nrk\.no/video/PS\*)(?P\d+)' + _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P\d+)' _TESTS = [ { @@ -76,7 +76,7 @@ class NRKIE(InfoExtractor): class NRKPlaylistIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video)(?:[^/]+/)+(?P[^/]+)' _TESTS = [{ 'url': 'http://www.nrk.no/troms/gjenopplev-den-historiske-solformorkelsen-1.12270763', @@ -116,11 +116,11 @@ class NRKPlaylistIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'(?Phttp://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' + _VALID_URL = r'(?Phttps?://tv\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P\d+))?' _TESTS = [ { - 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': 'adf2c5454fa2bf032f47a9f8fb351342', 'info_dict': { 'id': 'MUHH48000314', @@ -132,7 +132,7 @@ class NRKTVIE(InfoExtractor): }, }, { - 'url': 'http://tv.nrk.no/program/mdfp15000514', + 'url': 'https://tv.nrk.no/program/mdfp15000514', 'md5': '383650ece2b25ecec996ad7b5bb2a384', 'info_dict': { 'id': 'mdfp15000514', @@ -145,7 +145,7 @@ class NRKTVIE(InfoExtractor): }, { # single playlist video - 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'md5': 'adbd1dbd813edaf532b0a253780719c2', 'info_dict': { 'id': 'MSPO40010515-part2', @@ -157,7 +157,7 @@ class NRKTVIE(InfoExtractor): 'skip': 'Only works from Norway', }, { - 'url': 'http://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'playlist': [ { 'md5': '9480285eff92d64f06e02a5367970a7a', From bea41c7f3fa4f9072ad2f5354938ab1c8cef0a6d Mon Sep 17 00:00:00 2001 From: corone17 Date: Mon, 29 Jun 2015 00:59:18 +0200 Subject: [PATCH 14/70] Update rtlnl.py Better to extract 'http://manifest.us.rtl.nl' from the json, I'd say. And I think it's better to use the default json-url to make it more futureproof. Succesfully tested with tarball. --- youtube_dl/extractor/rtlnl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 41d202c28..e708e0093 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -51,7 +51,7 @@ class RtlNlIE(InfoExtractor): def _real_extract(self, url): uuid = self._match_id(url) info = self._download_json( - 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=flash/' % uuid, + 'http://www.rtl.nl/system/s4m/vfd/version=2/uuid=%s/fmt=adaptive/' % uuid, uuid) material = info['material'][0] @@ -60,8 +60,8 @@ class RtlNlIE(InfoExtractor): description = material.get('synopsis') or info['episodes'][0]['synopsis'] # Use unencrypted m3u8 streams (See https://github.com/rg3/youtube-dl/issues/4118) - videopath = material['videopath'].replace('.f4m', '.m3u8') - m3u8_url = 'http://manifest.us.rtl.nl' + videopath + videopath = material['videopath'].replace('adaptive', 'flash') + m3u8_url = info['meta']['videohost'] + videopath formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') From 738b92632296a9fee3eb7e0c915f6ea6b395125f Mon Sep 17 00:00:00 2001 From: nawl Date: Sun, 28 Jun 2015 17:24:00 -0600 Subject: [PATCH 15/70] [hentaistigma] Fix video extractor --- youtube_dl/extractor/hentaistigma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py index 63d87b74c..225af8cb3 100644 --- a/youtube_dl/extractor/hentaistigma.py +++ b/youtube_dl/extractor/hentaistigma.py @@ -32,7 +32,7 @@ class HentaiStigmaIE(InfoExtractor): wrap_webpage = self._download_webpage(wrap_url, video_id) video_url = self._html_search_regex( - r'clip:\s*{\s*url: "([^"]*)"', wrap_webpage, 'video url') + r'file:"([^"]+)"', wrap_webpage, 'video url') return { 'id': video_id, From cf386750c9194839e419a0412f45f25f28236c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 29 Jun 2015 22:21:09 +0600 Subject: [PATCH 16/70] [hentaistigma] Modernize --- youtube_dl/extractor/hentaistigma.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/hentaistigma.py b/youtube_dl/extractor/hentaistigma.py index 225af8cb3..f5aa73d18 100644 --- a/youtube_dl/extractor/hentaistigma.py +++ b/youtube_dl/extractor/hentaistigma.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor @@ -19,20 +17,19 @@ class HentaiStigmaIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'

]*>([^<]+)', + r']+class="posttitle"[^>]*>]*>([^<]+)', webpage, 'title') wrap_url = self._html_search_regex( - r'