diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f3fe0d432..d15267d7e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -28,7 +28,7 @@ So please elaborate on what feature you are requesting, or what bug you want to - How it could be fixed - How your proposed solution would look like -If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. diff --git a/README.md b/README.md index 7002f45e0..3a4707227 100644 --- a/README.md +++ b/README.md @@ -830,7 +830,7 @@ So please elaborate on what feature you are requesting, or what bug you want to - How it could be fixed - How your proposed solution would look like -If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a commiter myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. +If your report is shorter than two lines, it is almost certainly missing some of these, which makes it hard for us to respond to it. We're often too polite to close the issue outright, but the missing info makes misinterpretation likely. As a committer myself, I often get frustrated by these issues, since the only possible way for me to move forward on them is to ask for clarification over and over. For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the `-v` flag. The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 299bc5e72..1a5c7cde9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -23,7 +23,6 @@ - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - - **Aftenposten** - **Aftonbladet** - **AirMozilla** - **AlJazeera** @@ -34,7 +33,8 @@ - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 - - **AppleTrailers** + - **appletrailers** + - **appletrailers:section** - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** @@ -502,8 +502,6 @@ - **SnagFilmsEmbed** - **Snotr** - **Sohu** - - **soompi** - - **soompi:show** - **soundcloud** - **soundcloud:playlist** - **soundcloud:search**: Soundcloud search @@ -627,7 +625,7 @@ - **Vessel** - **Vesti**: Вести.Ru - **Vevo** - - **VGTV**: VGTV and BTTV + - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **Vice** - **Viddler** diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 760b65441..365c0b86f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -15,7 +15,6 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE @@ -26,7 +25,10 @@ from .aol import AolIE from .allocine import AllocineIE from .aparat import AparatIE from .appleconnect import AppleConnectIE -from .appletrailers import AppleTrailersIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) from .archiveorg import ArchiveOrgIE from .ard import ( ARDIE, @@ -591,10 +593,6 @@ from .snagfilms import ( ) from .snotr import SnotrIE from .sohu import SohuIE -from .soompi import ( - SoompiIE, - SoompiShowIE, -) from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -663,6 +661,7 @@ from .tenplay import TenPlayIE from .testurl import TestURLIE from .testtube import TestTubeIE from .tf1 import TF1IE +from .theintercept import TheInterceptIE from .theonion import TheOnionIE from .theplatform import ( ThePlatformIE, diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P\d+)' - _TEST = { - 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': 'fd828cd29774a729bf4d4425fe192972', - 'info_dict': { - 'id': '21039', - 'ext': 'mov', - 'title': 'TRAILER: "Sweatshop" - I can´t take any more', - 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', - 'timestamp': 1416927969, - 'upload_date': '20141125', - } - } - - def _real_extract(self, url): - return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index f68dc3236..62ed0c918 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -11,6 +11,7 @@ from ..utils import ( class AppleTrailersIE(InfoExtractor): + IE_NAME = 'appletrailers' _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', @@ -63,6 +64,12 @@ class AppleTrailersIE(InfoExtractor): }, }, ] + }, { + 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', + 'info_dict': { + 'id': 'blackthorn', + }, + 'playlist_mincount': 2, }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, @@ -79,7 +86,7 @@ class AppleTrailersIE(InfoExtractor): def fix_html(s): s = re.sub(r'(?s).*?', '', s) - s = re.sub(r'', r'', s) + s = re.sub(r'', r'', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ @@ -96,6 +103,9 @@ class AppleTrailersIE(InfoExtractor): trailer_info_json = self._search_regex(self._JSON_RE, on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) + first_url = trailer_info.get('url') + if not first_url: + continue title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] @@ -107,7 +117,6 @@ class AppleTrailersIE(InfoExtractor): if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) - first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') @@ -144,3 +153,76 @@ class AppleTrailersIE(InfoExtractor): 'id': movie, 'entries': playlist, } + + +class AppleTrailersSectionIE(InfoExtractor): + IE_NAME = 'appletrailers:section' + _SECTIONS = { + 'justadded': { + 'feed_path': 'just_added', + 'title': 'Just Added', + }, + 'exclusive': { + 'feed_path': 'exclusive', + 'title': 'Exclusive', + }, + 'justhd': { + 'feed_path': 'just_hd', + 'title': 'Just HD', + }, + 'mostpopular': { + 'feed_path': 'most_pop', + 'title': 'Most Popular', + }, + 'moviestudios': { + 'feed_path': 'studios', + 'title': 'Movie Studios', + }, + } + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P%s)' % '|'.join(_SECTIONS) + _TESTS = [{ + 'url': 'http://trailers.apple.com/#section=justadded', + 'info_dict': { + 'title': 'Just Added', + 'id': 'justadded', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=exclusive', + 'info_dict': { + 'title': 'Exclusive', + 'id': 'exclusive', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=justhd', + 'info_dict': { + 'title': 'Just HD', + 'id': 'justhd', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=mostpopular', + 'info_dict': { + 'title': 'Most Popular', + 'id': 'mostpopular', + }, + 'playlist_mincount': 80, + }, { + 'url': 'http://trailers.apple.com/#section=moviestudios', + 'info_dict': { + 'title': 'Movie Studios', + 'id': 'moviestudios', + }, + 'playlist_mincount': 80, + }] + + def _real_extract(self, url): + section = self._match_id(url) + section_data = self._download_json( + 'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'], + section) + entries = [ + self.url_result('http://trailers.apple.com' + e['location']) + for e in section_data] + return self.playlist_result(entries, section, self._SECTIONS[section]['title']) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 2a00da3ee..10301a8ea 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -68,9 +68,13 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_url_info(cls, url): mobj = re.match(cls._VALID_URL, url) lang = mobj.group('lang') - # This is not a real id, it can be for example AJT for the news - # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal - video_id = mobj.group('id') + query = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + if 'vid' in query: + video_id = query['vid'][0] + else: + # This is not a real id, it can be for example AJT for the news + # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal + video_id = mobj.group('id') return video_id, lang def _real_extract(self, url): @@ -79,9 +83,15 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): + patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']') + ids = (video_id, '') + # some pages contain multiple videos (like + # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D), + # so we first try to look for json URLs that contain the video id from + # the 'vid' parameter. + patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates] json_url = self._html_search_regex( - [r'arte_vp_url=["\'](.*?)["\']', r'data-url=["\']([^"]+)["\']'], - webpage, 'json vp url', default=None) + patterns, webpage, 'json vp url', default=None) if not json_url: iframe_url = self._html_search_regex( r']+src=(["\'])(?P.+\bjson_url=.+?)\1', diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index bd2a6340b..38bda3af5 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -90,7 +90,7 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'md5': 'f0ca220af012d4df857b54f792c586bb', + 'md5': '8c2c12e3af7805152675446c905d159b', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', 'ext': 'flv', diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py index 81f3d7697..2efa200b5 100644 --- a/youtube_dl/extractor/comcarcoff.py +++ b/youtube_dl/extractor/comcarcoff.py @@ -1,10 +1,12 @@ # encoding: utf-8 from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, +) class ComCarCoffIE(InfoExtractor): @@ -16,6 +18,7 @@ class ComCarCoffIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20141127', 'timestamp': 1417107600, + 'duration': 1232, 'title': 'Happy Thanksgiving Miranda', 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.', 'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg', @@ -31,9 +34,10 @@ class ComCarCoffIE(InfoExtractor): display_id = 'comediansincarsgettingcoffee.com' webpage = self._download_webpage(url, display_id) - full_data = json.loads(self._search_regex( - r'', - webpage, 'full data json')) + full_data = self._parse_json( + self._search_regex( + r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'), + display_id)['videoData'] video_id = full_data['activeVideo']['video'] video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id] @@ -45,12 +49,18 @@ class ComCarCoffIE(InfoExtractor): formats = self._extract_m3u8_formats( video_data['mediaUrl'], video_id, ext='mp4') + timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601( + video_data.get('pubDate')) + duration = int_or_none(video_data.get('durationSeconds')) or parse_duration( + video_data.get('duration')) + return { 'id': video_id, 'display_id': display_id, 'title': video_data['title'], 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('pubDate')), + 'timestamp': timestamp, + 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))), diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..9a94cf361 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -24,6 +24,18 @@ class DaumIE(InfoExtractor): 'upload_date': '20130831', 'duration': 3868, }, + }, { + # Test for https://github.com/rg3/youtube-dl/issues/7949 + 'url': 'http://tvpot.daum.net/mypot/View.do?ownerid=M1O35s8HPOo0&clipid=73147290', + 'md5': 'c92d78bcee4424451f1667f275c1dc97', + 'info_dict': { + 'id': '73147290', + 'ext': 'mp4', + 'title': '싸이 - 나팔바지 [유희열의 스케치북] 299회 20151218', + 'description': '싸이 - 나팔바지', + 'upload_date': '20151219', + 'duration': 232, + }, }, { 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', 'only_matching': True, @@ -37,9 +49,11 @@ class DaumIE(InfoExtractor): video_id = mobj.group('id') canonical_url = 'http://tvpot.daum.net/v/%s' % video_id webpage = self._download_webpage(canonical_url, video_id) + og_url = self._og_search_url(webpage, default=None) or self._search_regex( + r']+rel=(["\'])canonical\1[^>]+href=(["\'])(?P.+?)\2', + webpage, 'canonical url', group='url') full_id = self._search_regex( - r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', - webpage, 'full id') + r'tvpot\.daum\.net/v/([^/]+)', og_url, 'full id') query = compat_urllib_parse.urlencode({'vid': full_id}) info = self._download_xml( 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 6613ee17a..fdc51f44f 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -23,8 +21,7 @@ class FranceInterIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -33,7 +30,7 @@ class FranceInterIE(InfoExtractor): video_url = 'http://www.franceinter.fr/' + path title = self._html_search_regex( - r'(.+?)', webpage, 'title') + r'(.+?)', webpage, 'title') description = self._html_search_regex( r'(.*?)', webpage, 'description', fatal=False) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 70c8ca64e..85e9344aa 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -13,7 +13,7 @@ from ..utils import ( class ImgurIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!gallery)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$' _TESTS = [{ 'url': 'https://i.imgur.com/A61SaA1.gifv', @@ -21,7 +21,7 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, }, { 'url': 'https://imgur.com/A61SaA1', @@ -29,8 +29,20 @@ class ImgurIE(InfoExtractor): 'id': 'A61SaA1', 'ext': 'mp4', 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$', - 'description': 're:The origin of the Internet\'s most viral images$|The Internet\'s visual storytelling community\. Explore, share, and discuss the best visual stories the Internet has to offer\.$', + 'description': 'Imgur: The most awesome images on the Internet.', }, + }, { + 'url': 'https://imgur.com/gallery/YcAQlkx', + 'info_dict': { + 'id': 'YcAQlkx', + 'ext': 'mp4', + 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....', + 'description': 'Imgur: The most awesome images on the Internet.' + + } + }, { + 'url': 'http://imgur.com/topic/Funny/N8rOudd', + 'only_matching': True, }] def _real_extract(self, url): @@ -100,25 +112,38 @@ class ImgurIE(InfoExtractor): class ImgurAlbumIE(InfoExtractor): - _VALID_URL = r'https?://(?:i\.)?imgur\.com/gallery/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P[a-zA-Z0-9]{5})(?:[/?#&]+)?$' - _TEST = { + _TESTS = [{ 'url': 'http://imgur.com/gallery/Q95ko', 'info_dict': { 'id': 'Q95ko', }, 'playlist_count': 25, - } + }, { + 'url': 'http://imgur.com/a/j6Orj', + 'only_matching': True, + }, { + 'url': 'http://imgur.com/topic/Aww/ll5Vk', + 'only_matching': True, + }] def _real_extract(self, url): album_id = self._match_id(url) album_images = self._download_json( 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id, - album_id)['data']['images'] + album_id, fatal=False) - entries = [ - self.url_result('http://imgur.com/%s' % image['hash']) - for image in album_images if image.get('hash')] + if album_images: + data = album_images.get('data') + if data and isinstance(data, dict): + images = data.get('images') + if images and isinstance(images, list): + entries = [ + self.url_result('http://imgur.com/%s' % image['hash']) + for image in images if image.get('hash')] + return self.playlist_result(entries, album_id) - return self.playlist_result(entries, album_id) + # Fallback to single video + return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key()) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c158f2064..e5e16ca3b 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -47,7 +47,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https://instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 744e4a09a..97e8ffc97 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -16,7 +16,7 @@ from ..utils import ( class PBSIE(InfoExtractor): _STATIONS = ( - (r'(?:video|www)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ + (r'(?:video|www|player)\.pbs\.org', 'PBS: Public Broadcasting Service'), # http://www.pbs.org/ (r'video\.aptv\.org', 'APT - Alabama Public Television (WBIQ)'), # http://aptv.org/ (r'video\.gpb\.org', 'GPB/Georgia Public Broadcasting (WGTV)'), # http://www.gpb.org/ (r'video\.mpbonline\.org', 'Mississippi Public Broadcasting (WMPN)'), # http://www.mpbonline.org diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 63cc764bb..514e9b433 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -31,9 +31,8 @@ class PeriscopeIE(InfoExtractor): }] def _call_api(self, method, value): - attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( - 'https://api.periscope.tv/api/v2/%s?%s=%s' % (method, attribute, value), value) + 'https://api.periscope.tv/api/v2/%s?broadcast_id=%s' % (method, value), value) def _real_extract(self, url): token = self._match_id(url) diff --git a/youtube_dl/extractor/soompi.py b/youtube_dl/extractor/soompi.py deleted file mode 100644 index 5da66ca9e..000000000 --- a/youtube_dl/extractor/soompi.py +++ /dev/null @@ -1,146 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .crunchyroll import CrunchyrollIE - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - remove_start, - xpath_text, -) - - -class SoompiBaseIE(InfoExtractor): - def _get_episodes(self, webpage, episode_filter=None): - episodes = self._parse_json( - self._search_regex( - r'VIDEOS\s*=\s*(\[.+?\]);', webpage, 'episodes JSON'), - None) - return list(filter(episode_filter, episodes)) - - -class SoompiIE(SoompiBaseIE, CrunchyrollIE): - IE_NAME = 'soompi' - _VALID_URL = r'https?://tv\.soompi\.com/(?:en/)?watch/(?P[0-9]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/watch/29235', - 'info_dict': { - 'id': '29235', - 'ext': 'mp4', - 'title': 'Episode 1096', - 'description': '2015-05-20' - }, - 'params': { - 'skip_download': True, - }, - }] - - def _get_episode(self, webpage, video_id): - return self._get_episodes(webpage, lambda x: x['id'] == video_id)[0] - - def _get_subtitles(self, config, video_id): - sub_langs = {} - for subtitle in config.findall('./{default}preload/subtitles/subtitle'): - sub_langs[subtitle.attrib['id']] = subtitle.attrib['title'] - - subtitles = {} - for s in config.findall('./{default}preload/subtitle'): - lang_code = sub_langs.get(s.attrib['id']) - if not lang_code: - continue - sub_id = s.get('id') - data = xpath_text(s, './data', 'data') - iv = xpath_text(s, './iv', 'iv') - if not id or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sub_id).decode('utf-8') - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - webpage = self._download_webpage( - url, video_id, 'Downloading episode page') - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - webpage = ee.cause.read() - block_message = self._html_search_regex( - r'(?s)
(.+?)
', webpage, - 'block message', default=None) - if block_message: - raise ExtractorError(block_message, expected=True) - raise - - formats = [] - config = None - for format_id in re.findall(r'\?quality=([0-9a-zA-Z]+)', webpage): - config = self._download_xml( - 'http://tv.soompi.com/en/show/_/%s-config.xml?mode=hls&quality=%s' % (video_id, format_id), - video_id, 'Downloading %s XML' % format_id) - m3u8_url = xpath_text( - config, './{default}preload/stream_info/file', - '%s m3u8 URL' % format_id) - if not m3u8_url: - continue - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id=format_id)) - self._sort_formats(formats) - - episode = self._get_episode(webpage, video_id) - - title = episode['name'] - description = episode.get('description') - duration = int_or_none(episode.get('duration')) - - thumbnails = [{ - 'id': thumbnail_id, - 'url': thumbnail_url, - } for thumbnail_id, thumbnail_url in episode.get('img_url', {}).items()] - - subtitles = self.extract_subtitles(config, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles - } - - -class SoompiShowIE(SoompiBaseIE): - IE_NAME = 'soompi:show' - _VALID_URL = r'https?://tv\.soompi\.com/en/shows/(?P[0-9a-zA-Z\-_]+)' - _TESTS = [{ - 'url': 'http://tv.soompi.com/en/shows/liar-game', - 'info_dict': { - 'id': 'liar-game', - 'title': 'Liar Game', - 'description': 'md5:52c02bce0c1a622a95823591d0589b66', - }, - 'playlist_count': 14, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - url, show_id, 'Downloading show page') - - title = remove_start(self._og_search_title(webpage), 'SoompiTV | ') - description = self._og_search_description(webpage) - - entries = [ - self.url_result('http://tv.soompi.com/en/watch/%s' % episode['id'], 'Soompi') - for episode in self._get_episodes(webpage)] - - return self.playlist_result(entries, show_id, title, description) diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py new file mode 100644 index 000000000..8cb3c3669 --- /dev/null +++ b/youtube_dl/extractor/theintercept.py @@ -0,0 +1,49 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + int_or_none, + ExtractorError, +) + + +class TheInterceptIE(InfoExtractor): + _VALID_URL = r'https://theintercept.com/fieldofvision/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', + 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', + 'info_dict': { + 'id': '46214', + 'ext': 'mp4', + 'title': '#ThisIsACoup – Episode Four: Surrender or Die', + 'description': 'md5:74dd27f0e2fbd50817829f97eaa33140', + 'timestamp': 1450429239, + 'upload_date': '20151218', + 'comment_count': int, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._parse_json(self._search_regex( + r'initialStoreTree\s*=\s*(?P{.+})', webpage, + 'initialStoreTree'), display_id) + + for post in json_data['resources']['posts'].values(): + if post['slug'] == display_id: + return { + '_type': 'url_transparent', + 'url': 'jwplatform:%s' % post['fov_videoid'], + 'id': compat_str(post['ID']), + 'display_id': display_id, + 'title': post['title'], + 'description': post.get('excerpt'), + 'timestamp': parse_iso8601(post.get('date')), + 'comment_count': int_or_none(post.get('comments_number')), + } + raise ExtractorError('Unable to find the current post') diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index c1ee1decc..e03e2dbaa 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( parse_iso8601, int_or_none, + xpath_attr, + xpath_element, ) @@ -15,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'd041af8b5b4246ea466226a0d6693345', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', 'info_dict': { 'id': '1044982', 'ext': 'mp4', @@ -64,33 +66,24 @@ class TwentyFourVideoIE(InfoExtractor): r'
(\d+) комментари', webpage, 'comment count', fatal=False)) - formats = [] + # Sets some cookies + self._download_xml( + r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + video_id, 'Downloading init XML') - pc_video = self._download_xml( + video_xml = self._download_xml( 'http://www.24video.net/video/xml/%s?mode=play' % video_id, - video_id, 'Downloading PC video URL').find('.//video') + video_id, 'Downloading video XML') - formats.append({ - 'url': pc_video.attrib['url'], - 'format_id': 'pc', - 'quality': 1, - }) + video = xpath_element(video_xml, './/video', 'video', fatal=True) - like_count = int_or_none(pc_video.get('ratingPlus')) - dislike_count = int_or_none(pc_video.get('ratingMinus')) - age_limit = 18 if pc_video.get('adult') == 'true' else 0 + formats = [{ + 'url': xpath_attr(video, '', 'url', 'video URL', fatal=True), + }] - mobile_video = self._download_xml( - 'http://www.24video.net/video/xml/%s' % video_id, - video_id, 'Downloading mobile video URL').find('.//video') - - formats.append({ - 'url': mobile_video.attrib['url'], - 'format_id': 'mobile', - 'quality': 0, - }) - - self._sort_formats(formats) + like_count = int_or_none(video.get('ratingPlus')) + dislike_count = int_or_none(video.get('ratingMinus')) + age_limit = 18 if video.get('adult') == 'true' else 0 return { 'id': video_id, diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..811ee197d 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -4,26 +4,48 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, ) -class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV and BTTV' +class VGTVIE(XstreamIE): + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' + + _HOST_TO_APPNAME = { + 'vgtv.no': 'vgtv', + 'bt.no/tv': 'bttv', + 'aftenbladet.no/tv': 'satv', + 'fvn.no/fvntv': 'fvntv', + 'aftenposten.no/webtv': 'aptv', + } + + _APP_NAME_TO_VENDOR = { + 'vgtv': 'vgtv', + 'bttv': 'bt', + 'satv': 'sa', + 'fvntv': 'fvn', + 'aptv': 'ap', + } + _VALID_URL = r'''(?x) - (?: - vgtv:| - http://(?:www\.)? + (?:https?://(?:www\.)? + (?P + %s ) - (?Pvgtv|bt) + / (?: - :| - \.no/(?:tv/)?\#!/(?:video|live)/ - ) - (?P[0-9]+) - ''' + \#!/(?:video|live)/| + embed?.*id= + )| + (?P + %s + ):) + (?P\d+) + ''' % ('|'.join(_HOST_TO_APPNAME.keys()), '|'.join(_APP_NAME_TO_VENDOR.keys())) + _TESTS = [ { # streamType: vod @@ -59,17 +81,18 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Video is no longer available', }, { - # streamType: live + # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', 'info_dict': { 'id': '113063', - 'ext': 'flv', - 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'ext': 'mp4', + 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', 'view_count': int, @@ -79,6 +102,20 @@ class VGTVIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', + 'md5': 'fd828cd29774a729bf4d4425fe192972', + 'info_dict': { + 'id': '21039', + 'ext': 'mov', + 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', + 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', + 'duration': 66, + 'timestamp': 1417002452, + 'upload_date': '20141126', + 'view_count': int, + } + }, { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, @@ -89,21 +126,27 @@ class VGTVIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') - - HOST_WEBSITES = { - 'vgtv': 'vgtv', - 'bt': 'bttv', - } + appname = self._HOST_TO_APPNAME[host] if host else mobj.group('appname') + vendor = self._APP_NAME_TO_VENDOR[appname] data = self._download_json( 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (host, video_id, HOST_WEBSITES[host]), + % (vendor, video_id, appname), video_id, 'Downloading media JSON') if data.get('status') == 'inactive': raise ExtractorError( 'Video %s is no longer available' % video_id, expected=True) + info = { + 'formats': [], + } + if len(video_id) == 5: + if appname == 'bttv': + info = self._extract_video_info('btno', video_id) + elif appname == 'aptv': + info = self._extract_video_info('ap', video_id) + streams = data['streamUrls'] stream_type = data.get('streamType') @@ -111,48 +154,53 @@ class VGTVIE(InfoExtractor): hls_url = streams.get('hls') if hls_url: - formats.extend(self._extract_m3u8_formats( - hls_url, video_id, 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) hds_url = streams.get('hds') # wasLive hds are always 404 if hds_url and stream_type != 'wasLive': - formats.extend(self._extract_f4m_formats( - hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats( + hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + mp4_urls = streams.get('pseudostreaming') or [] mp4_url = streams.get('mp4') if mp4_url: - _url = hls_url or hds_url - MP4_URL_TEMPLATE = '%s/%%s.%s' % (mp4_url.rpartition('/')[0], mp4_url.rpartition('.')[-1]) - for mp4_format in _url.split(','): - m = re.search('(?P\d+)_(?P\d+)_(?P\d+)', mp4_format) - if not m: - continue - width = int(m.group('width')) - height = int(m.group('height')) - vbr = int(m.group('vbr')) - formats.append({ - 'url': MP4_URL_TEMPLATE % mp4_format, - 'format_id': 'mp4-%s' % vbr, - 'width': width, - 'height': height, - 'vbr': vbr, - 'preference': 1, + mp4_urls.append(mp4_url) + for mp4_url in mp4_urls: + format_info = { + 'url': mp4_url, + } + mobj = re.search('(\d+)_(\d+)_(\d+)', mp4_url) + if mobj: + tbr = int(mobj.group(3)) + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + 'tbr': tbr, + 'format_id': 'mp4-%s' % tbr, }) - self._sort_formats(formats) + formats.append(format_info) - return { + info['formats'].extend(formats) + + self._sort_formats(info['formats']) + + info.update({ 'id': video_id, - 'title': self._live_title(data['title']), + 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], 'duration': float_or_none(data['duration'], 1000), 'view_count': data['displays'], - 'formats': formats, 'is_live': True if stream_type == 'live' else False, - } + }) + return info class BTArticleIE(InfoExtractor): @@ -161,7 +209,7 @@ class BTArticleIE(InfoExtractor): _VALID_URL = 'http://(?:www\.)?bt\.no/(?:[^/]+/)+(?P[^/]+)-\d+\.html' _TEST = { 'url': 'http://www.bt.no/nyheter/lokalt/Kjemper-for-internatet-1788214.html', - 'md5': 'd055e8ee918ef2844745fcfd1a4175fb', + 'md5': '2acbe8ad129b3469d5ae51b1158878df', 'info_dict': { 'id': '23199', 'ext': 'mp4', @@ -178,15 +226,15 @@ class BTArticleIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( - r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') - return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') + r']+data-id="(\d+)"', webpage, 'video id') + return self.url_result('bttv:%s' % video_id, 'VGTV') class BTVestlendingenIE(InfoExtractor): IE_NAME = 'bt:vestlendingen' IE_DESC = 'Bergens Tidende - Vestlendingen' _VALID_URL = 'http://(?:www\.)?bt\.no/spesial/vestlendingen/#!/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86588', 'md5': 'd7d17e3337dc80de6d3a540aefbe441b', 'info_dict': { @@ -197,7 +245,19 @@ class BTVestlendingenIE(InfoExtractor): 'timestamp': 1430473209, 'upload_date': '20150501', }, - } + 'skip': '404 Error', + }, { + 'url': 'http://www.bt.no/spesial/vestlendingen/#!/86255', + 'md5': 'a2893f8632e96389f4bdf36aa9463ceb', + 'info_dict': { + 'id': '86255', + 'ext': 'mov', + 'title': 'Du må tåle å fryse og være sulten', + 'description': 'md5:b8046f4d022d5830ddab04865791d063', + 'upload_date': '20150321', + 'timestamp': 1426942023, + }, + }] def _real_extract(self, url): - return self.url_result('xstream:btno:%s' % self._match_id(url), 'Xstream') + return self.url_result('bttv:%s' % self._match_id(url), 'VGTV') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index a63c23617..ca3f20a3d 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -30,6 +30,12 @@ class VikiBaseIE(InfoExtractor): _token = None + _ERRORS = { + 'geo': 'Sorry, this content is not available in your region.', + 'upcoming': 'Sorry, this content is not yet available.', + # 'paywall': 'paywall', + } + def _prepare_call(self, path, timestamp=None, post_data=None): path += '?' if '?' not in path else '&' if not timestamp: @@ -67,6 +73,12 @@ class VikiBaseIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error), expected=True) + def _check_errors(self, data): + for reason, status in data.get('blocking', {}).items(): + if status and reason in self._ERRORS: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, self._ERRORS[reason]), expected=True) + def _real_initialize(self): self._login() @@ -193,6 +205,7 @@ class VikiIE(VikiBaseIE): 'timestamp': 1321985454, 'description': 'md5:44b1e46619df3a072294645c770cef36', 'title': 'Love In Magic', + 'age_limit': 13, }, }] @@ -202,6 +215,8 @@ class VikiIE(VikiBaseIE): video = self._call_api( 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + self._check_errors(video) + title = self.dict_selection(video.get('titles', {}), 'en') if not title: title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id @@ -262,8 +277,11 @@ class VikiIE(VikiBaseIE): r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': - formats = self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', m3u8_id='m3u8-%s' % protocol) + m3u8_formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', 'm3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': format_dict['url'], @@ -315,6 +333,8 @@ class VikiChannelIE(VikiBaseIE): 'containers/%s.json' % channel_id, channel_id, 'Downloading channel JSON') + self._check_errors(channel) + title = self.dict_selection(channel['titles'], 'en') description = self.dict_selection(channel['descriptions'], 'en') diff --git a/youtube_dl/extractor/xstream.py b/youtube_dl/extractor/xstream.py index 71584c291..76c91bd92 100644 --- a/youtube_dl/extractor/xstream.py +++ b/youtube_dl/extractor/xstream.py @@ -42,11 +42,7 @@ class XstreamIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - + def _extract_video_info(self, partner_id, video_id): data = self._download_xml( 'http://frontend.xstream.dk/%s/feed/video/?platform=web&id=%s' % (partner_id, video_id), @@ -97,6 +93,7 @@ class XstreamIE(InfoExtractor): formats.append({ 'url': link.get('href'), 'format_id': link.get('rel'), + 'preference': 1, }) thumbnails = [{ @@ -113,3 +110,10 @@ class XstreamIE(InfoExtractor): 'formats': formats, 'thumbnails': thumbnails, } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + partner_id = mobj.group('partner_id') + video_id = mobj.group('id') + + return self._extract_video_info(partner_id, video_id) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 7095033c5..255d64269 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.12.21' +__version__ = '2015.12.23'