Merge branch 'paged-lists'
Conflicts:
	test/test_utils.py
	youtube_dl/extractor/youtube.py
commit 65697b3bf3
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -19,6 +19,7 @@ from youtube_dl.utils import (
     fix_xml_ampersands,
     get_meta_content,
     orderedSet,
+    PagedList,
     parse_duration,
     sanitize_filename,
     shell_quote,
@@ -214,5 +215,26 @@ class TestUtil(unittest.TestCase):
             fix_xml_ampersands('&#1234;&#x1abC;'), '&#1234;&#x1abC;')
         self.assertEqual(fix_xml_ampersands('&#&#'), '&amp;#&amp;#')
 
+    def test_paged_list(self):
+        def testPL(size, pagesize, sliceargs, expected):
+            def get_page(pagenum):
+                firstid = pagenum * pagesize
+                upto = min(size, pagenum * pagesize + pagesize)
+                for i in range(firstid, upto):
+                    yield i
+
+            pl = PagedList(get_page, pagesize)
+            got = pl.getslice(*sliceargs)
+            self.assertEqual(got, expected)
+
+        testPL(5, 2, (), [0, 1, 2, 3, 4])
+        testPL(5, 2, (1,), [1, 2, 3, 4])
+        testPL(5, 2, (2,), [2, 3, 4])
+        testPL(5, 2, (4,), [4])
+        testPL(5, 2, (0, 3), [0, 1, 2])
+        testPL(5, 2, (1, 4), [1, 2, 3])
+        testPL(5, 2, (2, 99), [2, 3, 4])
+        testPL(5, 2, (20, 99), [])
+
 if __name__ == '__main__':
     unittest.main()
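The assertions above pin down the slice results; what they imply but never check directly is that only the pages overlapping the requested slice are fetched. A standalone sketch of that (assuming a checkout with this patch is on the import path; get_page and calls are illustrative names, not part of the patch):

from youtube_dl.utils import PagedList

calls = []

def get_page(pagenum):
    # Same shape as the test's page function (pages of size 2 over
    # [0..4]), but recording which pages actually get fetched.
    calls.append(pagenum)
    firstid = pagenum * 2
    for i in range(firstid, min(5, firstid + 2)):
        yield i

pl = PagedList(get_page, 2)
print(pl.getslice(2, 4))  # [2, 3]
print(calls)              # [1] - pages 0 and 2 were never requested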
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
@@ -39,6 +39,7 @@ from .utils import (
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
+    PagedList,
     PostProcessingError,
     platform_name,
     preferredencoding,
@@ -578,19 +579,27 @@ class YoutubeDL(object):
 
             playlist_results = []
 
-            n_all_entries = len(ie_result['entries'])
             playliststart = self.params.get('playliststart', 1) - 1
             playlistend = self.params.get('playlistend', None)
             # For backwards compatibility, interpret -1 as whole list
             if playlistend == -1:
                 playlistend = None
 
-            entries = ie_result['entries'][playliststart:playlistend]
-            n_entries = len(entries)
-
-            self.to_screen(
-                "[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
-                (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            if isinstance(ie_result['entries'], list):
+                n_all_entries = len(ie_result['entries'])
+                entries = ie_result['entries'][playliststart:playlistend]
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
+                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
+            else:
+                assert isinstance(ie_result['entries'], PagedList)
+                entries = ie_result['entries'].getslice(
+                    playliststart, playlistend)
+                n_entries = len(entries)
+                self.to_screen(
+                    "[%s] playlist %s: Downloading %d videos" %
+                    (ie_result['extractor'], playlist, n_entries))
 
             for i, entry in enumerate(entries, 1):
                 self.to_screen('[download] Downloading video #%s of %s' % (i, n_entries))
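The new branch keeps the eager slicing path for plain lists and routes PagedList entries through getslice(), so only the pages covering --playlist-start/--playlist-end are ever downloaded. A rough sketch of that dispatch in isolation (download_page, the 8-entry feed, and the page size of 3 are made up for illustration):

from youtube_dl.utils import PagedList

def download_page(pagenum):
    # Pretend the backing API exposes 8 entries in pages of 3.
    for i in range(pagenum * 3, min(8, pagenum * 3 + 3)):
        yield {'_type': 'url', 'url': 'video%d' % i, 'ie_key': 'Youtube'}

entries_field = PagedList(download_page, 3)  # what an extractor would return

# Mirrors the isinstance() dispatch added above:
playliststart, playlistend = 1, 5  # e.g. --playlist-start 2 --playlist-end 5
if isinstance(entries_field, list):
    entries = entries_field[playliststart:playlistend]
else:
    entries = entries_field.getslice(playliststart, playlistend)
print([e['url'] for e in entries])  # ['video1', 'video2', 'video3', 'video4']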
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -28,6 +28,7 @@ from ..utils import (
     get_element_by_attribute,
     ExtractorError,
     int_or_none,
+    PagedList,
     RegexNotFoundError,
     unescapeHTML,
     unified_strdate,
@@ -1626,44 +1627,35 @@ class YoutubeUserIE(InfoExtractor):
         # page by page until there are no video ids - it means we got
         # all of them.
 
-        url_results = []
-
-        for pagenum in itertools.count(0):
+        def download_page(pagenum):
             start_index = pagenum * self._GDATA_PAGE_SIZE + 1
 
             gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
-            page = self._download_webpage(gdata_url, username,
-                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
+            page = self._download_webpage(
+                gdata_url, username,
+                u'Downloading video ids from %d to %d' % (
+                    start_index, start_index + self._GDATA_PAGE_SIZE))
 
             try:
                 response = json.loads(page)
             except ValueError as err:
                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
             if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
-                break
+                return
 
             # Extract video identifiers
             entries = response['feed']['entry']
             for entry in entries:
                 title = entry['title']['$t']
                 video_id = entry['id']['$t'].split('/')[-1]
-                url_results.append({
+                yield {
                     '_type': 'url',
                     'url': video_id,
                     'ie_key': 'Youtube',
                     'id': 'video_id',
                     'title': title,
-                })
-
-            # A little optimization - if current page is not
-            # "full", ie. does not contain PAGE_SIZE video ids then
-            # we can assume that this page is the last one - there
-            # are no more ids on further pages - no need to query
-            # again.
-
-            if len(entries) < self._GDATA_PAGE_SIZE:
-                break
+                }
+
+        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)
 
         return self.playlist_result(url_results, playlist_title=username)
 
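A subtlety in the rewrite above: download_page is now a generator, so the bare return on a missing 'entry' key just ends the iteration and PagedList sees an empty, not-full page, which stops the paging (the old explicit len(entries) check moved into PagedList itself; see the utils.py hunk below). A minimal sketch of that termination behavior (FEED and the page size are invented for the example):

from youtube_dl.utils import PagedList

FEED = [['a', 'b'], ['c', 'd']]  # two full pages, then an empty feed

def download_page(pagenum):
    if pagenum >= len(FEED):  # stands in for "'entry' not in response['feed']"
        return                # bare return in a generator: yields nothing
    for video_id in FEED[pagenum]:
        yield video_id

pl = PagedList(download_page, 2)
print(pl.getslice())  # ['a', 'b', 'c', 'd'] - stops after the empty page 2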
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -6,6 +6,7 @@ import datetime
 import email.utils
 import errno
 import gzip
+import itertools
 import io
 import json
 import locale
@@ -1164,3 +1165,46 @@ def check_executable(exe, args=[]):
     except OSError:
         return False
     return exe
+
+
+class PagedList(object):
+    def __init__(self, pagefunc, pagesize):
+        self._pagefunc = pagefunc
+        self._pagesize = pagesize
+
+    def getslice(self, start=0, end=None):
+        res = []
+        for pagenum in itertools.count(start // self._pagesize):
+            firstid = pagenum * self._pagesize
+            nextfirstid = pagenum * self._pagesize + self._pagesize
+            if start >= nextfirstid:
+                continue
+
+            page_results = list(self._pagefunc(pagenum))
+
+            startv = (
+                start % self._pagesize
+                if firstid <= start < nextfirstid
+                else 0)
+
+            endv = (
+                ((end - 1) % self._pagesize) + 1
+                if (end is not None and firstid <= end <= nextfirstid)
+                else None)
+
+            if startv != 0 or endv is not None:
+                page_results = page_results[startv:endv]
+            res.extend(page_results)
+
+            # A little optimization - if current page is not "full", ie. does
+            # not contain page_size videos then we can assume that this page
+            # is the last one - there are no more ids on further pages -
+            # i.e. no need to query again.
+            if len(page_results) + startv < self._pagesize:
+                break
+
+            # If we got the whole page, but the next page is not interesting,
+            # break out early as well
+            if end == nextfirstid:
+                break
+        return res
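To make the arithmetic concrete: itertools.count(start // pagesize) skips whole pages before the slice, startv/endv trim the partially needed boundary pages, and a trimmed final page then trips the not-full-page break. A small sketch against an always-full source (the page size of 10 and the slice bounds are arbitrary illustrative values):

from youtube_dl.utils import PagedList

fetched = []

def page(pagenum):
    # An unbounded source: every page is full, so only the slice bounds
    # can stop the paging loop. Records which pages get requested.
    fetched.append(pagenum)
    return range(pagenum * 10, pagenum * 10 + 10)

pl = PagedList(page, 10)
print(pl.getslice(25, 42) == list(range(25, 42)))  # True
print(fetched)  # [2, 3, 4] - pages 0-1 skipped, paging stops after page 4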