commit 65697b3bf3

Merge branch 'paged-lists'

Conflicts:
	test/test_utils.py
	youtube_dl/extractor/youtube.py
test/test_utils.py
@@ -19,6 +19,7 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     get_meta_content,
     orderedSet,
+    PagedList,
     parse_duration,
     sanitize_filename,
     shell_quote,
@@ -214,5 +215,26 @@ class TestUtil(unittest.TestCase):
             fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
         self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
 
+    def test_paged_list(self):
+        def testPL(size, pagesize, sliceargs, expected):
+            def get_page(pagenum):
+                firstid = pagenum * pagesize
+                upto = min(size, pagenum * pagesize + pagesize)
+                for i in range(firstid, upto):
+                    yield i
+
+            pl = PagedList(get_page, pagesize)
+            got = pl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
+        testPL(5, 2, (), [0, 1, 2, 3, 4])
+        testPL(5, 2, (1,), [1, 2, 3, 4])
+        testPL(5, 2, (2,), [2, 3, 4])
+        testPL(5, 2, (4,), [4])
+        testPL(5, 2, (0, 3), [0, 1, 2])
+        testPL(5, 2, (1, 4), [1, 2, 3])
+        testPL(5, 2, (2, 99), [2, 3, 4])
+        testPL(5, 2, (20, 99), [])
+
 if __name__ == '__main__':
     unittest.main()
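The slice cases are easiest to read with the page layout in mind: for size 5 and page size 2, get_page yields the pages [0, 1], [2, 3], [4], and getslice(*sliceargs) maps () to the whole list, (1,) to everything from index 1, and (1, 4) to indices 1 through 3. For orientation, a standalone sketch of that layout (plain Python, not part of the commit):

    # The pages the test's get_page produces for size=5, pagesize=2; the
    # short last page is what exercises PagedList's early-break logic.
    def get_page(pagenum, size=5, pagesize=2):
        return list(range(pagenum * pagesize, min(size, (pagenum + 1) * pagesize)))

    print([get_page(n) for n in range(3)])  # [[0, 1], [2, 3], [4]]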
youtube_dl/YoutubeDL.py
@@ -39,6 +39,7 @@ from .utils import (
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    PagedList,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -578,19 +579,27 @@ class YoutubeDL(object):
 
             playlist_results = []
 
-            n_all_entries = len(ie_result['entries'])
             playliststart = self.params.get('playliststart', 1) - 1
             playlistend = self.params.get('playlistend', None)
             # For backwards compatibility, interpret -1 as whole list
             if playlistend == -1:
                 playlistend = None
 
-            entries = ie_result['entries'][playliststart:playlistend]
-            n_entries = len(entries)
-
-            self.to_screen(
-                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            if isinstance(ie_result['entries'], list):
+                n_all_entries = len(ie_result['entries'])
+                entries = ie_result['entries'][playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            else:
+                assert isinstance(ie_result['entries'], PagedList)
+                entries = ie_result['entries'].getslice(
+                    playliststart, playlistend)
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Downloading %d videos" %
+                    (ie_result['extractor'], playlist, n_entries))
 
             for i, entry in enumerate(entries, 1):
                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
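The point of the new else branch is that a PagedList never materializes the whole playlist: getslice starts at the first page that overlaps playliststart and stops as soon as the requested slice is filled. A quick demonstration of that laziness (a sketch that assumes a youtube-dl checkout containing this commit is importable; fetched is only an illustrative counter):

    from youtube_dl.utils import PagedList

    fetched = []

    def get_page(pagenum):
        fetched.append(pagenum)  # record which pages were actually generated
        for i in range(pagenum * 10, (pagenum + 1) * 10):
            yield i

    pl = PagedList(get_page, 10)
    print(pl.getslice(25, 35))  # [25, 26, ..., 34]
    print(fetched)              # [2, 3]: pages 0 and 1 were never built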
youtube_dl/extractor/youtube.py
@@ -28,6 +28,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     int_or_none,
+    PagedList,
     RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
@@ -1626,44 +1627,35 @@ class YoutubeUserIE(InfoExtractor):
         # page by page until there are no video ids - it means we got
         # all of them.
 
-        url_results = []
-
-        for pagenum in itertools.count(0):
+        def download_page(pagenum):
             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
 
             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(gdata_url, username,
-                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+            page = self._download_webpage(
+                gdata_url, username,
+                u'Downloading video ids from %d to %d' % (
+                    start_index, start_index + self._GDATA_PAGE_SIZE))
 
             try:
                 response = json.loads(page)
             except ValueError as err:
                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
             if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
-                break
+                return
 
             # Extract video identifiers
             entries = response['feed']['entry']
             for entry in entries:
                 title = entry['title']['$t']
                 video_id = entry['id']['$t'].split('/')[-1]
-                url_results.append({
+                yield {
                     '_type': 'url',
                     'url': video_id,
                     'ie_key': 'Youtube',
                     'id': 'video_id',
                     'title': title,
-                })
-
-            # A little optimization - if current page is not
-            # "full", ie. does not contain PAGE_SIZE video ids then
-            # we can assume that this page is the last one - there
-            # are no more ids on further pages - no need to query
-            # again.
-
-            if len(entries) < self._GDATA_PAGE_SIZE:
-                break
+                }
+        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
 
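The extractor side shows the intended pattern: download_page is a generator that yields one url result per video on a page and simply returns when the feed has no entries, and PagedList glues the pages together. In miniature (a sketch; DATA stands in for the GData API responses and is not part of the commit):

    from youtube_dl.utils import PagedList

    DATA = [['a1', 'b2'], ['c3']]  # two "API pages", the second one short

    def download_page(pagenum):
        if pagenum >= len(DATA):
            return  # the extractor's missing-'entry' case
        for video_id in DATA[pagenum]:
            yield {'_type': 'url', 'url': video_id, 'ie_key': 'Youtube'}

    url_results = PagedList(download_page, 2)
    print(url_results.getslice())  # all three url dicts, fetched page by page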
youtube_dl/utils.py
@@ -6,6 +6,7 @@ import datetime
 import email.utils
 import errno
 import gzip
+import itertools
 import io
 import json
 import locale
@@ -1164,3 +1165,46 @@ def check_executable(exe, args=[]):
     except OSError:
         return False
     return exe
+
+
+class PagedList(object):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        for pagenum in itertools.count(start // self._pagesize):
+            firstid = pagenum * self._pagesize
+            nextfirstid = pagenum * self._pagesize + self._pagesize
+            if start >= nextfirstid:
+                continue
+
+            page_results = list(self._pagefunc(pagenum))
+
+            startv = (
+                start % self._pagesize
+                if firstid <= start < nextfirstid
+                else 0)
+
+            endv = (
+                ((end - 1) % self._pagesize) + 1
+                if (end is not None and firstid <= end <= nextfirstid)
+                else None)
+
+            if startv != 0 or endv is not None:
+                page_results = page_results[startv:endv]
+            res.extend(page_results)
+
+            # A little optimization - if current page is not "full", ie. does
+            # not contain page_size videos then we can assume that this page
+            # is the last one - there are no more ids on further pages -
+            # i.e. no need to query again.
+            if len(page_results) + startv < self._pagesize:
+                break
+
+            # If we got the whole page, but the next page is not interesting,
+            # break out early as well
+            if end == nextfirstid:
+                break
+        return res
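The startv/endv arithmetic maps the requested slice onto offsets within each fetched page. A worked trace (again assuming the commit is importable) for getslice(3, 8) with a page size of 5:

    # page 0: firstid=0, nextfirstid=5, startv = 3 % 5 = 3, endv = None;
    #         page[3:] contributes [3, 4]; 2 + 3 == 5, the page was full,
    #         so the loop continues
    # page 1: firstid=5, nextfirstid=10, startv = 0,
    #         endv = ((8 - 1) % 5) + 1 = 3; page[:3] contributes [5, 6, 7];
    #         3 + 0 < 5 looks like a short page, so the loop breaks
    from youtube_dl.utils import PagedList

    pl = PagedList(lambda n: iter(range(n * 5, n * 5 + 5)), 5)
    assert pl.getslice(3, 8) == [3, 4, 5, 6, 7]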