From f9e828ab263a11105f5415058af34c1fe7eda4be Mon Sep 17 00:00:00 2001 From: Adam Porter <adam@alphapapa.net> Date: Sat, 26 Sep 2015 22:25:12 -0500 Subject: [PATCH 1/5] YouTube: Get titles from playlist (fix #6699) Parse video titles from playlist pages' HTML/JSON. This broke a while back when YouTube changed something. Now running "-j --flat-playlist" will return both IDs and titles of videos again. --- youtube_dl/extractor/youtube.py | 36 ++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b252e36e1..8471cbe8c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1440,7 +1440,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)[^>]+>(?P<title>[^<]+)' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1537,12 +1537,16 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, title) def _extract_playlist(self, playlist_id): + """Return YoutubeBaseInfoExtractor.playlist_result() for YouTube playlist ID.""" + url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): + # Check YouTube alert messages match = match.strip() - # Check if the playlist exists or is private + + # Check for problems if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): raise ExtractorError( 'The playlist doesn\'t exist or is private, use --username or ' @@ -1553,25 +1557,37 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'Invalid parameters. 
Maybe URL is incorrect.', expected=True) elif re.match(r'[^<]*Choose your language[^<]*', match): + # Looks good; continue continue else: self.report_warning('Youtube gives an alert message: ' + match) - # Extract the video ids from the playlist pages def _entries(): + # Extract the video ids from the playlist pages more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - for vid_id in new_ids: - yield self.url_result(vid_id, 'Youtube', video_id=vid_id) + for page_num in itertools.count(1): + # Loop to find videos until break + + matches = re.finditer(self._VIDEO_RE, content_html) + + # Make list of videos + new_videos = [{'id': m.group('id'), + 'title': m.group('title').strip()} + for m in matches + if m.group('index') != '0' # Ignore link with index 0 + if m.group('title').strip()] # Ignore links without titles, which also prevents duplicates + + for video in new_videos: + yield self.url_result(video['id'], 'Youtube', video_id=video['id'], video_title=video['title']) + + # Find link to load more videos mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not mobj: + # No more videos break + # Download JSON to get more videos more = self._download_json( 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num, From d2a5684445c0b45bc20c723171a22528193736d2 Mon Sep 17 00:00:00 2001 From: Adam Porter <adam@alphapapa.net> Date: Sun, 27 Sep 2015 21:12:42 -0500 Subject: [PATCH 2/5] Handle videos without titles, avoiding duplicates --- youtube_dl/extractor/youtube.py | 61 +++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 
8471cbe8c..ab968caa2 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1440,7 +1440,23 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)[^>]+>(?P<title>[^<]+)' + _VIDEO_RE = re.compile( + r"""href="\s*/watch\? + + # Video ID + v=(?P<id>[0-9A-Za-z_-]{11})& + + [^"]*? + + # Index + index=(?P<index>\d+)[^>]+ + + # End of <a> tag + > + + # Video title (optional) + (?P<title>[^<]+)? + """, re.VERBOSE) IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,7 +1573,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'Invalid parameters. Maybe URL is incorrect.', expected=True) elif re.match(r'[^<]*Choose your language[^<]*', match): - # Looks good; continue continue else: self.report_warning('Youtube gives an alert message: ' + match) @@ -1568,18 +1583,44 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): for page_num in itertools.count(1): # Loop to find videos until break + matches = self._VIDEO_RE.finditer(content_html) - matches = re.finditer(self._VIDEO_RE, content_html) + # Get videos from current page. Using OrderedDict to + # avoid duplicates would make this much simpler. 
+ new_videos = {} + for m in matches: + video_index = m.group('index') + if video_index == '0': + # Ignore link with index 0 + continue - # Make list of videos - new_videos = [{'id': m.group('id'), - 'title': m.group('title').strip()} - for m in matches - if m.group('index') != '0' # Ignore link with index 0 - if m.group('title').strip()] # Ignore links without titles, which also prevents duplicates + video_id = m.group('id') + if m.group('title'): + video_title = m.group('title').strip() + else: + video_title = None + + if video_id in new_videos: + # Duplicate video + + if video_title and not new_videos[video_id]['title']: + # Set missing title + new_videos[video_id]['title'] = video_title + + new_videos[video_id]['index'] = video_index + + else: + # New video + new_videos[video_id] = {'index': int(video_index), + 'title': video_title} + + # Sort videos by index + new_videos = sorted(new_videos.iteritems(), key=lambda v: v[1]['index']) + + # Yield current list of videos for video in new_videos: - yield self.url_result(video['id'], 'Youtube', video_id=video['id'], video_title=video['title']) + yield self.url_result(video[0], 'Youtube', video_id=video[0], video_title=video[1]['title']) # Find link to load more videos mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) From b5b683bc517056560d79354911d6bc65a8a3f7f7 Mon Sep 17 00:00:00 2001 From: Adam Porter <adam@alphapapa.net> Date: Sun, 27 Sep 2015 21:13:22 -0500 Subject: [PATCH 3/5] Add test for extracting video titles from playlist --- test/test_youtube_lists.py | 54 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c889b6f15..49cc3bf9f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -11,16 +11,59 @@ from test.helper import FakeYDL from youtube_dl.extractor import ( + gen_extractors, YoutubePlaylistIE, YoutubeIE, ) class TestYoutubeLists(unittest.TestCase): + 
ies = gen_extractors() + def assertIsPlaylist(self, info): """Make sure the info has '_type' set to 'playlist'""" self.assertEqual(info['_type'], 'playlist') + def assertPlaylistHasVideos(self, playlist_url, videos): + """Assert that playlist contains the given videos with matching IDs + and titles. + + playlist_url: Playlist URL + videos: List of dicts with the following entries: + "id": Video ID + "title": Video title + """ + + # Get suitable InfoExtractor + ie = [ie for ie in self.ies if ie.suitable(playlist_url)][0] + + # This results in "TypeError: 'YoutubeUserIE' object is + # not callable", so it's necessary to use the + # "ie.set_downloader(FakeYDL())" workaround. + # YoutubeUserIE inherits from YoutubeChannelIE, which + # inherits from InfoExtractor, which is callable, but it + # doesn't work. Even making YoutubeChannelIE inherit from + # YoutubeBaseInfoExtractor doesn't make YoutubeUserIE + # callable here. + + # ie = ie(FakeYDL()) + ie.set_downloader(FakeYDL()) + + # Get playlist + result = ie._real_extract(playlist_url) + if result['_type'] == 'url': + # Get actual playlist from canonical URL + result = YoutubePlaylistIE(FakeYDL()).extract(result['url']) + + # Save generator output + playlist = [v for v in result['entries']] + + for video in videos: + matching_videos = [v for v in playlist if v['id'] == video['id']] + + self.assertEqual(len(matching_videos), 1) + self.assertEqual(matching_videos[0]['title'], video['title']) + def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True @@ -57,5 +100,16 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) + def test_youtube_extract_video_titles_from_playlists(self): + self.assertPlaylistHasVideos("https://www.youtube.com/user/RhettandLink/videos", + [ + {'id': 'uhKejRHODOM', 'title': 'The Overly Complicated Coffee Order'}, + {'id': 'f7eIWlA6Sh8', 'title': 'Burgaz Megatator Commercial'} + ]) + 
self.assertPlaylistHasVideos("https://www.youtube.com/playlist?list=PLJ49NV73ttrvgyM4n5o-txRnMXH3pNnjK", + [ + {'id': 'x9CH3RtbW_M', 'title': 'The Secret Life of a Hamster Song - Animated Song Biscuits'} + ]) + if __name__ == '__main__': unittest.main() From edb0e97b2efcfeaab8a1b4fba20256833f8804f2 Mon Sep 17 00:00:00 2001 From: Adam Porter <adam@alphapapa.net> Date: Mon, 28 Sep 2015 02:04:24 -0500 Subject: [PATCH 4/5] Fix playlist order The "index" attribute does not seem to correspond to the playlist order (at least, not exactly). An OrderedDict would really help here. If support for Python 2.6 is ever dropped... :) --- test/test_youtube_lists.py | 9 +++++++++ youtube_dl/extractor/youtube.py | 31 ++++++++++++++++++------------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 49cc3bf9f..e00ba8030 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -58,12 +58,21 @@ class TestYoutubeLists(unittest.TestCase): # Save generator output playlist = [v for v in result['entries']] + # Find videos in playlist for video in videos: matching_videos = [v for v in playlist if v['id'] == video['id']] self.assertEqual(len(matching_videos), 1) self.assertEqual(matching_videos[0]['title'], video['title']) + # TODO: It would be good to check that the videos are returned + # in the correct order (not necessarily back-to-back), which, + # of course, requires creating the test data in the correct + # order. The reason is that simple mistakes (like forgetting + # that dicts don't keep insertion order) can result in the + # order being wrong. This could be in a separate test, or it + # could go here. 
+ def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ab968caa2..6a7916442 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1586,11 +1586,19 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): matches = self._VIDEO_RE.finditer(content_html) # Get videos from current page. Using OrderedDict to - # avoid duplicates would make this much simpler. + # avoid duplicates would make this much + # simpler. Lacking that, we store the order of the + # videos as video_num so we can sort the dict, keeping + # the order of the playlist. We have to avoid + # duplicates because it seems that every video in the + # playlist shows up in the HTML/JSON twice: once + # without a title, and once with a title. Maybe using + # something like bs4 instead of regexps would also be a + # good idea. new_videos = {} + num = 0 for m in matches: - video_index = m.group('index') - if video_index == '0': + if m.group('index') == '0': # Ignore link with index 0 continue @@ -1602,21 +1610,18 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): video_title = None if video_id in new_videos: - # Duplicate video - + # Video is already in dict if video_title and not new_videos[video_id]['title']: # Set missing title new_videos[video_id]['title'] = video_title - - new_videos[video_id]['index'] = video_index - else: - # New video - new_videos[video_id] = {'index': int(video_index), - 'title': video_title} + # Video not in dict + new_videos[video_id] = {'num': num, 'title': video_title} - # Sort videos by index - new_videos = sorted(new_videos.iteritems(), key=lambda v: v[1]['index']) + num += 1 + + # Sort videos by playlist order + new_videos = sorted(new_videos.iteritems(), key=lambda v: v[1]['num']) # Yield current list of videos for video in new_videos: From e56066a1c527c649d906efd3f5c9fc7841314373 Mon Sep 17 00:00:00 2001 From: Adam Porter 
<adam@alphapapa.net> Date: Tue, 29 Sep 2015 04:55:30 -0500 Subject: [PATCH 5/5] Reformat regexp --- youtube_dl/extractor/youtube.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6a7916442..004154dcb 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1440,23 +1440,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = re.compile( - r"""href="\s*/watch\? - - # Video ID - v=(?P<id>[0-9A-Za-z_-]{11})& - - [^"]*? - - # Index - index=(?P<index>\d+)[^>]+ - - # End of <a> tag - > - - # Video title (optional) - (?P<title>[^<]+)? - """, re.VERBOSE) + _VIDEO_RE = re.compile(r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)[^>]+>(?P<title>[^<]+)?') IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',