Extractor for npo.nl programs (as opposed to episodes of programs).

Retrieves only the most recent episodes of the program in question (hence the name). Some programs have so many episodes available that it doesn't make any practical sense to retrieve all, as discussed in issue #7947.
This commit is contained in:
Jan Hoek 2017-04-03 17:49:50 +02:00
parent b022f4f600
commit d53104f923
3 changed files with 118 additions and 0 deletions

43
testnporecent.ps1 Normal file
View File

@ -0,0 +1,43 @@
# Lint check: flake8 must report nothing for the NPO extractor module.
Describe 'Flake8' {
    It 'Does not return any errors' {
        # Resolve the module relative to this script instead of through a
        # user-specific absolute path, so the check runs from any checkout.
        # Assumes this script sits in the repository root - TODO confirm.
        $npoModule = Join-Path $PSScriptRoot 'youtube_dl/extractor/npo.py'
        & flake8 $npoModule | Should BeNullOrEmpty
    }
}
# Runs the three NPORecents download tests under each supported Python interpreter.
Describe 'Tests' {
    # Locate the test driver relative to this script instead of through a
    # user-specific absolute path, so the suite runs from any checkout.
    # Assumes this script sits in the repository root - TODO confirm.
    $testDownload = Join-Path $PSScriptRoot 'test/test_download.py'

    # One entry per interpreter: the label used in the test name, the
    # executable to invoke, and the exact `--version` banner expected from it.
    $interpreters = @(
        @{ Label = '2.6'; Command = 'python2.6'; Version = 'Python 2.6.9' },
        @{ Label = '2.7'; Command = 'python'; Version = 'Python 2.7.13' },
        @{ Label = '3.5'; Command = 'python3'; Version = 'Python 3.5.2' },
        @{ Label = '3.6'; Command = 'python3.6'; Version = 'Python 3.6.1' }
    )

    foreach ($interpreter in $interpreters) {
        It "Should work in Python $($interpreter.Label)" {
            $python = $interpreter.Command

            # Python 2 prints its version banner on stderr, hence the redirect;
            # it is harmless for interpreters that print it on stdout.
            & $python '--version' 2>&1 | Should Be $interpreter.Version

            # The suffixes select the first, second and third NPORecents test case.
            '', '_1', '_2' | ForEach-Object {
                & $python $testDownload "TestDownload.test_NPORecents$($_)" 2>&1
                $LASTEXITCODE | Should Be 0
            }
        }
    }
}

View File

@ -686,6 +686,7 @@ from .npo import (
NPORadioFragmentIE, NPORadioFragmentIE,
SchoolTVIE, SchoolTVIE,
HetKlokhuisIE, HetKlokhuisIE,
NPORecentsIE,
VPROIE, VPROIE,
WNLIE, WNLIE,
) )

View File

@ -478,6 +478,80 @@ class HetKlokhuisIE(NPODataMidEmbedIE):
} }
class NPORecentsIE(NPOIE):
    """Playlist extractor for npo.nl program pages (as opposed to episodes).

    Only the episodes listed on the first page of the program's search
    results are returned, not the full back catalogue.
    """

    # The extractor framework reads ``IE_NAME`` (all caps). The previous
    # ``IE_Name`` spelling was never picked up, so this name was not applied.
    IE_NAME = 'npo:recents'

    # Both patterns capture the episode href in group 1 and, in group 2, the
    # optional "program-not-available" marker; group 2 is None for episodes
    # that can be played.
    npo12_regex = r"""<div class='span4'>\s*<div class='image-container'>\s*<a href="(.*?)">\s*(<div class="program-not-available">)?"""
    npo3_regex = r"""<div class='span4 image'>\s*<a href="(.*?)">\s*<div class="meta-container">\s*<div class="meta first">\s*<div class="md-label"><span class="npo-glyph triangle-right"></span></div>\s*<div class="md-value">.*?</div>\s*</div>\s*</div>\s*(<div class="program-not-available">)?"""

    _VALID_URL = r'(?:https?://)?(?:www\.)?npo\.nl/(?P<alt_id>[^/]+)/(?P<program_id>\w+_\d+)'

    _TESTS = [{
        # Example of an npo3 program
        'url': 'https://www.npo.nl/keuringsdienst-van-waarde/KN_1678993',
        'info_dict': {
            'title': 'Keuringsdienst van Waarde',
            'id': 'KN_1678993',
            'description': 'md5:5ffaf131f175d8a771e7a7884833dad2'
        },
        'playlist_mincount': 8
    }, {
        # Example of an npo1/npo2 program
        'url': 'https://www.npo.nl/jinek/KN_1676589',
        'info_dict': {
            'title': 'Jinek',
            'id': 'KN_1676589',
            'description': 'md5:6998986899b4903395f0cdd0670cedaf'
        },
        'playlist_mincount': 8
    }, {
        # Example of a program for which there will be only one available episode (if any)
        'url': 'https://www.npo.nl/midsomer-murders/POW_00828660',
        'info_dict': {
            'title': 'Midsomer murders',
            'id': 'POW_00828660',
            'description': 'md5:a8b6e9d3e3bd367be88766e3ce8e8362'
        },
        'playlist_maxcount': 1
    }]

    def _extract_entries(self, webpage, program_id, program_url):
        """Yield a ``url_result`` for every available episode of a program.

        webpage     -- HTML of the program page; used to detect the channel
                       layout and to read the program title.
        program_id  -- identifier passed to the download helper for logging.
        program_url -- URL of the program page; the episode search URL is
                       derived from it.

        This is a generator, so the episode listing is only downloaded once
        the caller starts iterating.
        """
        # npo3 pages are told apart from npo1/npo2 pages by the presence of
        # the npo3 channel logo; the two layouts need different search URLs
        # and different regexes.
        is_npo3 = 'www-assets.npo.nl/uploads/tv_channel/265/logo/smaller_npo3-logo.png' in webpage
        if is_npo3:
            # NOTE(review): the double slash before "search" looks like a
            # typo, but it is kept as-is because the single-slash form has
            # not been verified against the live site.
            episodes_url = '%s//search?category=broadcasts&page=1' % program_url
            regex = self.npo3_regex
        else:
            episodes_url = '%s/search?media_type=broadcast&start=0&rows=8' % program_url
            regex = self.npo12_regex

        episodes = self._download_webpage(episodes_url, program_id, note='Retrieving episodes')

        # The title does not change between episodes, so look it up once.
        # Non-fatal to match _real_extract, which also tolerates a page
        # without an og:title; a missing title simply leaves it unset.
        program_title = self._og_search_title(webpage, fatal=False)

        for match in re.finditer(regex, episodes):
            # Group 2 matched means the "program-not-available" marker is
            # present, i.e. the episode cannot be played: skip it.
            if match.group(2) is not None:
                continue
            yield self.url_result(
                url='http://npo.nl%s' % match.group(1),
                video_title=program_title)

    def _real_extract(self, url):
        """Return a playlist info dict for the program page at ``url``."""
        mobj = re.match(self._VALID_URL, url)
        alt_id = mobj.group('alt_id')
        program_id = mobj.group('program_id')

        webpage = self._download_webpage(url, program_id)

        # Fall back to the URL slug when the page carries no og:title.
        title = self._og_search_title(webpage, fatal=False) or alt_id
        # Prefer og:description, then the plain "description" meta tag.
        description = (
            self._og_search_description(webpage)
            or self._html_search_meta('description', webpage, 'description', fatal=False))

        entries = self._extract_entries(webpage, program_id, url)

        return {
            '_type': 'playlist',
            'id': program_id,
            'display_id': alt_id,
            'title': title,
            'description': description,
            'entries': entries
        }
class NPOPlaylistBaseIE(NPOIE): class NPOPlaylistBaseIE(NPOIE):
def _real_extract(self, url): def _real_extract(self, url):
playlist_id = self._match_id(url) playlist_id = self._match_id(url)