diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index eeac09d5d..f9a1aa990 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.16** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.06.27*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.06.27** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.06.16 +[debug] youtube-dl version 2016.06.27 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index cdf655c39..bdd29687d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -175,3 +175,4 @@ Tomáš Čech Déstin Reed Roman Tsiupa Artur Krysiak +Jakub Adam Wieczorek diff --git a/README.md b/README.md index f1e59542d..c6feef116 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Or with 
[MacPorts](https://www.macports.org/): Alternatively, refer to the [developer instructions](#developer-instructions) for how to check out and work with the git repository. For further options, including PGP signatures, see the [youtube-dl Download Page](https://rg3.github.io/youtube-dl/download.html). # DESCRIPTION -**youtube-dl** is a small command-line program to download videos from +**youtube-dl** is a command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version 2.6, 2.7, or 3.2+, and it is not platform specific. It should work on your Unix box, on Windows or on Mac OS X. It is released to the public domain, diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index b5a8b9190..9a79c2bc5 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -14,15 +14,17 @@ if os.path.exists(lazy_extractors_filename): os.remove(lazy_extractors_filename) from youtube_dl.extractor import _ALL_CLASSES -from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor.common import InfoExtractor, SearchInfoExtractor with open('devscripts/lazy_load_template.py', 'rt') as f: module_template = f.read() -module_contents = [module_template + '\n' + getsource(InfoExtractor.suitable)] +module_contents = [ + module_template + '\n' + getsource(InfoExtractor.suitable) + '\n', + 'class LazyLoadSearchExtractor(LazyLoadExtractor):\n pass\n'] ie_template = ''' -class {name}(LazyLoadExtractor): +class {name}({bases}): _VALID_URL = {valid_url!r} _module = '{module}' ''' @@ -34,10 +36,20 @@ make_valid_template = ''' ''' +def get_base_name(base): + if base is InfoExtractor: + return 'LazyLoadExtractor' + elif base is SearchInfoExtractor: + return 'LazyLoadSearchExtractor' + else: + return base.__name__ + + def build_lazy_ie(ie, name): valid_url = getattr(ie, '_VALID_URL', None) s = ie_template.format( name=name, + bases=', '.join(map(get_base_name, 
ie.__bases__)), valid_url=valid_url, module=ie.__module__) if ie.suitable.__func__ is not InfoExtractor.suitable.__func__: @@ -47,12 +59,35 @@ def build_lazy_ie(ie, name): s += make_valid_template.format(valid_url=ie._make_valid_url()) return s +# find the correct sorting and add the required base classes so that subclasses +# can be correctly created +classes = _ALL_CLASSES[:-1] +ordered_cls = [] +while classes: + for c in classes[:]: + bases = set(c.__bases__) - set((object, InfoExtractor, SearchInfoExtractor)) + stop = False + for b in bases: + if b not in classes and b not in ordered_cls: + if b.__name__ == 'GenericIE': + exit() + classes.insert(0, b) + stop = True + if stop: + break + if all(b in ordered_cls for b in bases): + ordered_cls.append(c) + classes.remove(c) + break +ordered_cls.append(_ALL_CLASSES[-1]) + names = [] -for ie in list(sorted(_ALL_CLASSES[:-1], key=lambda cls: cls.ie_key())) + _ALL_CLASSES[-1:]: - name = ie.ie_key() + 'IE' +for ie in ordered_cls: + name = ie.__name__ src = build_lazy_ie(ie, name) module_contents.append(src) - names.append(name) + if ie in _ALL_CLASSES: + names.append(name) module_contents.append( '_ALL_CLASSES = [{0}]'.format(', '.join(names))) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 13315f4f4..2a94f4feb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -45,7 +45,6 @@ - **archive.org**: archive.org videos - **ARD** - **ARD:mediathek** - - **ARD:mediathek**: Saarländischer Rundfunk - **arte.tv** - **arte.tv:+7** - **arte.tv:cinema** @@ -74,6 +73,8 @@ - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - **bbc.co.uk:article**: BBC articles + - **bbc.co.uk:iplayer:playlist** + - **bbc.co.uk:playlist** - **BeatportPro** - **Beeg** - **BehindKink** @@ -104,6 +105,8 @@ - **canalc2.tv** - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv - **Canvas** + - **CarambaTV** + - **CarambaTVPage** - **CBC** - **CBCPlayer** - **CBS** @@ -124,6 +127,7 @@ - **cliphunter** - **ClipRs** - 
**Clipsyndicate** + - **CloserToTruth** - **cloudtime**: CloudTime - **Cloudy** - **Clubic** @@ -243,7 +247,6 @@ - **Gamersyde** - **GameSpot** - **GameStar** - - **Gametrailers** - **Gazeta** - **GDCVault** - **generic**: Generic downloader that works on some sites @@ -381,7 +384,7 @@ - **MovieFap** - **Moviezine** - **MPORA** - - **MSNBC** + - **MSN** - **MTV** - **mtv.de** - **mtviggy.com** @@ -432,6 +435,7 @@ - **nhl.com:videocenter** - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** + - **nick.de** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** - **njoy**: N-JOY @@ -497,6 +501,7 @@ - **plus.google**: Google Plus - **pluzz.francetv.fr** - **podomatic** + - **PolskieRadio** - **PornHd** - **PornHub** - **PornHubPlaylist** @@ -516,6 +521,7 @@ - **qqmusic:singer**: QQ音乐 - 歌手 - **qqmusic:toplist**: QQ音乐 - 排行榜 - **R7** + - **R7Article** - **radio.de** - **radiobremen** - **radiocanada** @@ -581,7 +587,7 @@ - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** - - **skynewsarabia:video** + - **skynewsarabia:article** - **skynewsarabia:video** - **Slideshare** - **Slutload** @@ -614,6 +620,7 @@ - **SportBoxEmbed** - **SportDeutschland** - **Sportschau** + - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - **SSA** @@ -731,6 +738,7 @@ - **vh1.com** - **Vice** - **ViceShow** + - **Vidbit** - **Viddler** - **video.google:search**: Google Video search - **video.mit.edu** diff --git a/setup.py b/setup.py index c1e923f71..508b27f37 100644 --- a/setup.py +++ b/setup.py @@ -21,25 +21,37 @@ try: import py2exe except ImportError: if len(sys.argv) >= 2 and sys.argv[1] == 'py2exe': - print("Cannot import py2exe", file=sys.stderr) + print('Cannot import py2exe', file=sys.stderr) exit(1) py2exe_options = { - "bundle_files": 1, - "compressed": 1, - "optimize": 2, - "dist_dir": '.', - "dll_excludes": ['w9xpopen.exe', 'crypt32.dll'], + 'bundle_files': 1, + 'compressed': 
1, + 'optimize': 2, + 'dist_dir': '.', + 'dll_excludes': ['w9xpopen.exe', 'crypt32.dll'], } +# Get the version from youtube_dl/version.py without importing the package +exec(compile(open('youtube_dl/version.py').read(), + 'youtube_dl/version.py', 'exec')) + +DESCRIPTION = 'YouTube video downloader' +LONG_DESCRIPTION = 'Command-line program to download videos from YouTube.com and other video sites' + py2exe_console = [{ - "script": "./youtube_dl/__main__.py", - "dest_base": "youtube-dl", + 'script': './youtube_dl/__main__.py', + 'dest_base': 'youtube-dl', + 'version': __version__, + 'description': DESCRIPTION, + 'comments': LONG_DESCRIPTION, + 'product_name': 'youtube-dl', + 'product_version': __version__, }] py2exe_params = { 'console': py2exe_console, - 'options': {"py2exe": py2exe_options}, + 'options': {'py2exe': py2exe_options}, 'zipfile': None } @@ -72,7 +84,7 @@ else: params['scripts'] = ['bin/youtube-dl'] class build_lazy_extractors(Command): - description = "Build the extractor lazy loading module" + description = 'Build the extractor lazy loading module' user_options = [] def initialize_options(self): @@ -87,16 +99,11 @@ class build_lazy_extractors(Command): dry_run=self.dry_run, ) -# Get the version from youtube_dl/version.py without importing the package -exec(compile(open('youtube_dl/version.py').read(), - 'youtube_dl/version.py', 'exec')) - setup( name='youtube_dl', version=__version__, - description='YouTube video downloader', - long_description='Small command-line program to download videos from' - ' YouTube.com and other video sites.', + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, url='https://github.com/rg3/youtube-dl', author='Ricardo Garcia', author_email='ytdl@yt-dl.org', @@ -112,17 +119,17 @@ setup( # test_requires = ['nosetest'], classifiers=[ - "Topic :: Multimedia :: Video", - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "License :: Public Domain", - "Programming Language :: Python :: 2.6", - 
"Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.2", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", + 'Topic :: Multimedia :: Video', + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'License :: Public Domain', + 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.2', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', ], cmdclass={'build_lazy_extractors': build_lazy_extractors}, diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6404ac89f..88e8ff904 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,7 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor -from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError class TestIE(InfoExtractor): @@ -66,6 +66,11 @@ class TestInfoExtractor(unittest.TestCase): self.assertEqual(ie._html_search_meta('d', html), '4') self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + self.assertEqual(ie._html_search_meta(('a', 'b', 'c'), html), '1') + self.assertEqual(ie._html_search_meta(('c', 'b', 'a'), html), '3') + self.assertEqual(ie._html_search_meta(('z', 'x', 'c'), html), '3') + self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) + self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) def 
test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') diff --git a/test/test_all_urls.py b/test/test_all_urls.py index f5af184e6..1f6079c29 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import os import sys import unittest +import collections sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -130,6 +131,15 @@ class TestAllURLsMatching(unittest.TestCase): 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', ['Yahoo']) + def test_no_duplicated_ie_names(self): + name_accu = collections.defaultdict(list) + for ie in self.ies: + name_accu[ie.IE_NAME.lower()].append(type(ie).__name__) + for (ie_name, ie_list) in name_accu.items(): + self.assertEqual( + len(ie_list), 1, + 'Multiple extractors with the same IE_NAME "%s" (%s)' % (ie_name, ', '.join(ie_list))) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 6b15920a6..1f8e8fb92 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -60,11 +60,13 @@ from youtube_dl.utils import ( timeconvert, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, uppercase_escape, lowercase_escape, url_basename, urlencode_postdata, + urshift, update_url_query, version_tuple, xpath_with_ns, @@ -283,8 +285,28 @@ class TestUtil(unittest.TestCase): '20150202') self.assertEqual(unified_strdate('Feb 14th 2016 5:45PM'), '20160214') self.assertEqual(unified_strdate('25-09-2014'), '20140925') + self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) + def test_unified_timestamps(self): + self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) + self.assertEqual(unified_timestamp('8/7/2009'), 1247011200) + self.assertEqual(unified_timestamp('Dec 14, 2012'), 1355443200) + self.assertEqual(unified_timestamp('2012/10/11 01:56:38 
+0000'), 1349920598) + self.assertEqual(unified_timestamp('1968 12 10'), -33436800) + self.assertEqual(unified_timestamp('1968-12-10'), -33436800) + self.assertEqual(unified_timestamp('28/01/2014 21:00:00 +0100'), 1390939200) + self.assertEqual( + unified_timestamp('11/26/2014 11:30:00 AM PST', day_first=False), + 1417001400) + self.assertEqual( + unified_timestamp('2/2/2015 6:47:40 PM', day_first=False), + 1422902860) + self.assertEqual(unified_timestamp('Feb 14th 2016 5:45PM'), 1455471900) + self.assertEqual(unified_timestamp('25-09-2014'), 1411603200) + self.assertEqual(unified_timestamp('27.02.2016 17:30'), 1456594200) + self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None) + def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') self.assertEqual(determine_ext('http://example.com/foo/bar/?download', None), None) @@ -995,5 +1017,9 @@ The first line self.assertRaises(ValueError, encode_base_n, 0, 70) self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table) + def test_urshift(self): + self.assertEqual(urshift(3, 1), 1) + self.assertEqual(urshift(-3, 1), 2147483646) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 54f2108e9..3b7bb3508 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -2,14 +2,24 @@ from __future__ import unicode_literals import os.path import re +import binascii +try: + from Crypto.Cipher import AES + can_decrypt_frag = True +except ImportError: + can_decrypt_frag = False from .fragment import FragmentFD from .external import FFmpegFD -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_struct_pack, +) from ..utils import ( encodeFilename, sanitize_open, + parse_m3u8_attributes, ) @@ -21,7 +31,7 @@ class HlsFD(FragmentFD): @staticmethod def can_download(manifest): UNSUPPORTED_FEATURES = ( - r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted 
streams [1] + r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1] r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] # Live streams heuristic does not always work (e.g. geo restricted to Germany @@ -39,7 +49,9 @@ class HlsFD(FragmentFD): # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 ) - return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) + check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] + check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest) + return all(check_results) def real_download(self, filename, info_dict): man_url = info_dict['url'] @@ -57,36 +69,60 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - fragment_urls = [] + total_frags = 0 for line in s.splitlines(): line = line.strip() if line and not line.startswith('#'): - segment_url = ( - line - if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) - fragment_urls.append(segment_url) - # We only download the first fragment during the test - if self.params.get('test', False): - break + total_frags += 1 ctx = { 'filename': filename, - 'total_frags': len(fragment_urls), + 'total_frags': total_frags, } self._prepare_and_start_frag_download(ctx) + i = 0 + media_sequence = 0 + decrypt_info = {'METHOD': 'NONE'} frags_filenames = [] - for i, frag_url in enumerate(fragment_urls): - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: - return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - ctx['dest_stream'].write(down.read()) - down.close() - frags_filenames.append(frag_sanitized) + for line in s.splitlines(): + line = line.strip() + if line: + if not line.startswith('#'): + frag_url 
= ( + line + if re.match(r'^https?://', line) + else compat_urlparse.urljoin(man_url, line)) + frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + if decrypt_info['METHOD'] == 'AES-128': + iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + ctx['dest_stream'].write(frag_content) + frags_filenames.append(frag_sanitized) + # We only download the first fragment during the test + if self.params.get('test', False): + break + i += 1 + media_sequence += 1 + elif line.startswith('#EXT-X-KEY'): + decrypt_info = parse_m3u8_attributes(line[11:]) + if decrypt_info['METHOD'] == 'AES-128': + if 'IV' in decrypt_info: + decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:]) + if not re.match(r'^https?://', decrypt_info['URI']): + decrypt_info['URI'] = compat_urlparse.urljoin( + man_url, decrypt_info['URI']) + decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() + elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): + media_sequence = int(line[22:]) self._finish_frag_download(ctx) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 8753ee2cf..5ae16fa16 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -156,7 +156,10 @@ class AdobeTVVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json(url + '?format=json', video_id) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id) formats = [{ 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), diff --git 
a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1bbfe2641..1376dd70f 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -7,18 +7,118 @@ from ..utils import ( smuggle_url, update_url_query, unescapeHTML, + extract_attributes, +) +from ..compat import ( + compat_urlparse, ) -class AENetworksIE(InfoExtractor): +class AENetworksBaseIE(InfoExtractor): + def theplatform_url_result(self, theplatform_url, video_id, query): + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': smuggle_url( + update_url_query(theplatform_url, query), + { + 'sig': { + 'key': 'crazyjava', + 'secret': 's3cr3t' + }, + 'force_smil_url': True + }), + 'ie_key': 'ThePlatform', + } + + +class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P[^/]+)/(?:[^/]+/)+(?P[^/]+?)(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/shows/(?P[^/]+(?:/[^/]+){0,2})' + _TESTS = [{ + 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', + 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', + 'info_dict': { + 'id': '22253814', + 'ext': 'mp4', + 'title': 'Winter Is Coming', + 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', + 'timestamp': 1338306241, + 'upload_date': '20120529', + 'uploader': 'AENE-NEW', + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'info_dict': { + 'id': '71889446852', + }, + 'playlist_mincount': 5, + }, { + 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', + 'info_dict': { + 'id': 'SERIES4317', + 'title': 'Atlanta Plastic', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', + 'only_matching': True + }, { + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', + 
'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }] + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + url_parts = display_id.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + elif url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes['data-videoid'])) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + else: + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + info.update(self.theplatform_url_result( + media_url, video_id, { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + })) + return info + + +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P[^/]+)/videos(?:/(?P[^/?#]+))?' 
_TESTS = [{ 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', 'info_dict': { - 'id': 'g12m5Gyt3fdR', + 'id': '40700995724', 'ext': 'mp4', 'title': "Bet You Didn't Know: Valentine's Day", 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', @@ -31,57 +131,38 @@ class AENetworksIE(InfoExtractor): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'expected_warnings': ['JSON-LD'], }, { - 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', - 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', - 'info_dict': { - 'id': 'eg47EERs_JsZ', - 'ext': 'mp4', - 'title': 'Winter Is Coming', - 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', - 'timestamp': 1338306241, - 'upload_date': '20120529', - 'uploader': 'AENE-NEW', + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', + 'info_dict': + { + 'id': 'world-war-i-history', }, - 'add_ie': ['ThePlatform'], + 'playlist_mincount': 24, }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', - 'only_matching': True - }, { - 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', - 'only_matching': True - }, { - 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', - 'only_matching': True + 'url': 'http://www.history.com/topics/world-war-i-history/videos', + 'only_matching': True, }] def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() + topic_id, display_id = re.match(self._VALID_URL, url).groups() + if display_id: + webpage = self._download_webpage(url, display_id) + release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() + release_url = unescapeHTML(release_url) - webpage = self._download_webpage(url, video_id) - - video_url_re = [ - 
r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, - r"media_url\s*=\s*'([^']+)'" - ] - video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) - query = {'mbr': 'true'} - if page_type == 'shows': - query['assetTypes'] = 'medium_video_s3' - if 'switch=hds' in video_url: - query['switch'] = 'hls' - - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url( - update_url_query(video_url, query), - { - 'sig': { - 'key': 'crazyjava', - 'secret': 's3cr3t'}, - 'force_smil_url': True - }), - }) - return info + return self.theplatform_url_result( + release_url, video_id, { + 'mbr': 'true', + 'switch': 'hls' + }) + else: + webpage = self._download_webpage(url, topic_id) + entries = [] + for episode_item in re.findall(r']*>', webpage): + video_attributes = extract_attributes(episode_item) + entries.append(self.theplatform_url_result( + video_attributes['data-release-url'], video_attributes['data-id'], { + 'mbr': 'true', + 'switch': 'hls' + })) + return self.playlist_result(entries, topic_id) diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py index d548592fe..5766b4fe8 100644 --- a/youtube_dl/extractor/aftonbladet.py +++ b/youtube_dl/extractor/aftonbladet.py @@ -24,10 +24,10 @@ class AftonbladetIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # find internal video meta data - meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' + meta_url = 'http://aftonbladet-play-metadata.cdn.drvideo.aptoma.no/video/%s.json' player_config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'player config'), video_id) - internal_meta_id = player_config['videoId'] + internal_meta_id = player_config['aptomaVideoId'] internal_meta_url = meta_url % internal_meta_id internal_meta_json = self._download_json( internal_meta_url, video_id, 'Downloading video meta data') diff --git 
a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index be40f85b4..a6801f3d4 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( int_or_none, + parse_duration, + unified_strdate, ) @@ -16,7 +18,8 @@ class AppleTrailersIE(InfoExtractor): _TESTS = [{ 'url': 'http://trailers.apple.com/trailers/wb/manofsteel/', 'info_dict': { - 'id': 'manofsteel', + 'id': '5111', + 'title': 'Man of Steel', }, 'playlist': [ { @@ -70,6 +73,15 @@ class AppleTrailersIE(InfoExtractor): 'id': 'blackthorn', }, 'playlist_mincount': 2, + 'expected_warnings': ['Unable to download JSON metadata'], + }, { + # json data only available from http://trailers.apple.com/trailers/feeds/data/15881.json + 'url': 'http://trailers.apple.com/trailers/fox/kungfupanda3/', + 'info_dict': { + 'id': '15881', + 'title': 'Kung Fu Panda 3', + }, + 'playlist_mincount': 4, }, { 'url': 'http://trailers.apple.com/ca/metropole/autrui/', 'only_matching': True, @@ -85,6 +97,45 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') + webpage = self._download_webpage(url, movie) + film_id = self._search_regex(r"FilmId\s*=\s*'(\d+)'", webpage, 'film id') + film_data = self._download_json( + 'http://trailers.apple.com/trailers/feeds/data/%s.json' % film_id, + film_id, fatal=False) + + if film_data: + entries = [] + for clip in film_data.get('clips', []): + clip_title = clip['title'] + + formats = [] + for version, version_data in clip.get('versions', {}).items(): + for size, size_data in version_data.get('sizes', {}).items(): + src = size_data.get('src') + if not src: + continue + formats.append({ + 'format_id': '%s-%s' % (version, size), + 'url': re.sub(r'_(\d+p.mov)', r'_h\1', src), + 'width': int_or_none(size_data.get('width')), + 'height': int_or_none(size_data.get('height')), + 'language': 
version[:2], + }) + self._sort_formats(formats) + + entries.append({ + 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), + 'formats': formats, + 'title': clip_title, + 'thumbnail': clip.get('screen') or clip.get('thumb'), + 'duration': parse_duration(clip.get('runtime') or clip.get('faded')), + 'upload_date': unified_strdate(clip.get('posted')), + 'uploader_id': uploader_id, + }) + + page_data = film_data.get('page', {}) + return self.playlist_result(entries, film_id, page_data.get('movie_title')) + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 26446c2fe..fd45b3e42 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,7 +8,6 @@ from .generic import GenericIE from ..utils import ( determine_ext, ExtractorError, - get_element_by_attribute, qualities, int_or_none, parse_duration, @@ -274,41 +273,3 @@ class ARDIE(InfoExtractor): 'upload_date': upload_date, 'thumbnail': thumbnail, } - - -class SportschauIE(ARDMediathekIE): - IE_NAME = 'Sportschau' - _VALID_URL = r'(?Phttps?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P[^/#?]+))\.html' - _TESTS = [{ - 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html', - 'info_dict': { - 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100', - 'ext': 'mp4', - 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - base_url = mobj.group('baseurl') - - webpage = self._download_webpage(url, video_id) - title = 
get_element_by_attribute('class', 'headline', webpage) - description = self._html_search_meta('description', webpage, 'description') - - info = self._extract_media_info( - base_url + '-mc_defaultQuality-h.json', webpage, video_id) - - info.update({ - 'title': title, - 'description': description, - }) - - return info diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index f40532929..e0c5c1804 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -180,11 +180,14 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVPlus7IE(ArteTVBaseIE): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?Pfr|de|en|es)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', 'only_matching': True, + }, { + 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', + 'only_matching': True, }] @classmethod @@ -240,10 +243,10 @@ class ArteTVPlus7IE(ArteTVBaseIE): return self._extract_from_json_url(json_url, video_id, lang, title=title) # Different kind of embed URL (e.g. 
# http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) - embed_url = self._search_regex( - r']+src=(["\'])(?P.+?)\1', - webpage, 'embed url', group='url') - return self.url_result(embed_url) + entries = [ + self.url_result(url) + for _, url in re.findall(r']+src=(["\'])(?P.+?)\1', webpage)] + return self.playlist_result(entries) # It also uses the arte_vp_url url from the webpage to extract the information @@ -252,22 +255,17 @@ class ArteTVCreativeIE(ArteTVPlus7IE): _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de|en|es)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + 'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1', 'info_dict': { - 'id': '72176', + 'id': '057405-001-A', 'ext': 'mp4', - 'title': 'Folge 2 - Corporate Design', - 'upload_date': '20131004', + 'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)', + 'upload_date': '20150716', }, }, { 'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion', - 'info_dict': { - 'id': '160676', - 'ext': 'mp4', - 'title': 'Monty Python live (mostly)', - 'description': 'Événement ! 
Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', - 'upload_date': '20140805', - } + 'playlist_count': 11, + 'add_ie': ['Youtube'], }, { 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', 'only_matching': True, @@ -349,14 +347,13 @@ class ArteTVCinemaIE(ArteTVPlus7IE): _VALID_URL = r'https?://cinema\.arte\.tv/(?Pfr|de|en|es)/(?P.+)' _TESTS = [{ - 'url': 'http://cinema.arte.tv/de/node/38291', - 'md5': '6b275511a5107c60bacbeeda368c3aa1', + 'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck', + 'md5': 'a5b9dd5575a11d93daf0e3f404f45438', 'info_dict': { - 'id': '055876-000_PWA12025-D', + 'id': '062494-000-A', 'ext': 'mp4', - 'title': 'Tod auf dem Nil', - 'upload_date': '20160122', - 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', + 'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck', + 'upload_date': '20150807', }, }] @@ -422,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'info_dict': { 'id': 'PL-013263', 'title': 'Areva & Uramin', + 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', }, 'playlist_mincount': 6, }, { diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index efa624de1..a813eb429 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -46,6 +46,7 @@ class AzubuIE(InfoExtractor): 'uploader_id': 272749, 'view_count': int, }, + 'skip': 'Channel offline', }, ] @@ -56,22 +57,26 @@ class AzubuIE(InfoExtractor): 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data'] title = data['title'].strip() - description = data['description'] - thumbnail = data['thumbnail'] - view_count = data['view_count'] - uploader = data['user']['username'] - uploader_id = data['user']['id'] + description = data.get('description') + thumbnail = data.get('thumbnail') + view_count = data.get('view_count') + user = data.get('user', {}) + uploader = user.get('username') + uploader_id = 
user.get('id') stream_params = json.loads(data['stream_params']) - timestamp = float_or_none(stream_params['creationDate'], 1000) - duration = float_or_none(stream_params['length'], 1000) + timestamp = float_or_none(stream_params.get('creationDate'), 1000) + duration = float_or_none(stream_params.get('length'), 1000) renditions = stream_params.get('renditions') or [] video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength') if video: renditions.append(video) + if not renditions and not user.get('channel', {}).get('is_live', True): + raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True) + formats = [{ 'url': fmt['url'], 'width': fmt['frameWidth'], diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 74c4510f9..4b3cd8c65 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -31,7 +31,7 @@ class BBCCoUkIE(InfoExtractor): music/clips[/#]| radio/player/ ) - (?P%s) + (?P%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX _MEDIASELECTOR_URLS = [ @@ -192,6 +192,7 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Now it\'s really geo-restricted', }, { # compact player (https://github.com/rg3/youtube-dl/issues/8147) 'url': 'http://www.bbc.co.uk/programmes/p028bfkf/player', @@ -698,7 +699,9 @@ class BBCIE(BBCCoUkIE): @classmethod def suitable(cls, url): - return False if BBCCoUkIE.suitable(url) or BBCCoUkArticleIE.suitable(url) else super(BBCIE, cls).suitable(url) + EXCLUDE_IE = (BBCCoUkIE, BBCCoUkArticleIE, BBCCoUkIPlayerPlaylistIE, BBCCoUkPlaylistIE) + return (False if any(ie.suitable(url) for ie in EXCLUDE_IE) + else super(BBCIE, cls).suitable(url)) def _extract_from_media_meta(self, media_meta, video_id): # Direct links to media in media metadata (e.g. 
@@ -975,3 +978,72 @@ class BBCCoUkArticleIE(InfoExtractor): r']+typeof="Clip"[^>]+resource="([^"]+)"', webpage)] return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] + + title, description = self._extract_title_and_description(webpage) + + return self.playlist_result(entries, playlist_id, title, description) + + +class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:iplayer:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P%s)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' + _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' + _TEST = { + 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 6, + } + + def _extract_title_and_description(self, webpage): + title = self._search_regex(r'

([^<]+)

', webpage, 'title', fatal=False) + description = self._search_regex( + r']+class=(["\'])subtitle\1[^>]*>(?P[^<]+)

', + webpage, 'description', fatal=False, group='value') + return title, description + + +class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): + IE_NAME = 'bbc.co.uk:playlist' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/(?P%s)/(?:episodes|broadcasts|clips)' % BBCCoUkIE._ID_REGEX + _URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s' + _VIDEO_ID_TEMPLATE = r'data-pid=["\'](%s)' + _TESTS = [{ + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'info_dict': { + 'id': 'b05rcz9v', + 'title': 'The Disappearance - Clips - BBC Four', + 'description': 'French thriller serial about a missing teenager.', + }, + 'playlist_mincount': 7, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/clips', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/b055jkys/episodes/player', + 'only_matching': True, + }] + + def _extract_title_and_description(self, webpage): + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return title, description diff --git a/youtube_dl/extractor/bet.py b/youtube_dl/extractor/bet.py index 986245bf0..bd3ee2e2e 100644 --- a/youtube_dl/extractor/bet.py +++ b/youtube_dl/extractor/bet.py @@ -1,31 +1,27 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote -from ..utils import ( - xpath_text, - xpath_with_ns, - int_or_none, - parse_iso8601, -) +from .mtv import MTVServicesInfoExtractor +from ..utils import unified_strdate +from ..compat import compat_urllib_parse_urlencode -class BetIE(InfoExtractor): +class BetIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P.+?)\.html' _TESTS = [ { 'url': 'http://www.bet.com/news/politics/2014/12/08/in-bet-exclusive-obama-talks-race-and-racism.html', 'info_dict': { - 'id': 
'news/national/2014/a-conversation-with-president-obama', + 'id': '07e96bd3-8850-3051-b856-271b457f0ab8', 'display_id': 'in-bet-exclusive-obama-talks-race-and-racism', 'ext': 'flv', 'title': 'A Conversation With President Obama', - 'description': 'md5:699d0652a350cf3e491cd15cc745b5da', + 'description': 'President Obama urges persistence in confronting racism and bias.', 'duration': 1534, - 'timestamp': 1418075340, 'upload_date': '20141208', - 'uploader': 'admin', 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } }, 'params': { # rtmp download @@ -35,16 +31,17 @@ class BetIE(InfoExtractor): { 'url': 'http://www.bet.com/video/news/national/2014/justice-for-ferguson-a-community-reacts.html', 'info_dict': { - 'id': 'news/national/2014/justice-for-ferguson-a-community-reacts', + 'id': '9f516bf1-7543-39c4-8076-dd441b459ba9', 'display_id': 'justice-for-ferguson-a-community-reacts', 'ext': 'flv', 'title': 'Justice for Ferguson: A Community Reacts', 'description': 'A BET News special.', 'duration': 1696, - 'timestamp': 1416942360, 'upload_date': '20141125', - 'uploader': 'admin', 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'subtitles': { + 'en': 'mincount:2', + } }, 'params': { # rtmp download @@ -53,57 +50,32 @@ class BetIE(InfoExtractor): } ] + _FEED_URL = "http://feeds.mtvnservices.com/od/feed/bet-mrss-player" + + def _get_feed_query(self, uri): + return compat_urllib_parse_urlencode({ + 'uuid': uri, + }) + + def _extract_mgid(self, webpage): + return self._search_regex(r'data-uri="([^"]+)', webpage, 'mgid') + def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + mgid = self._extract_mgid(webpage) + videos_info = self._get_videos_info(mgid) - media_url = compat_urllib_parse_unquote(self._search_regex( - [r'mediaURL\s*:\s*"([^"]+)"', r"var\s+mrssMediaUrl\s*=\s*'([^']+)'"], - webpage, 'media URL')) + info_dict = videos_info['entries'][0] - video_id = self._search_regex( - 
r'/video/(.*)/_jcr_content/', media_url, 'video id') + upload_date = unified_strdate(self._html_search_meta('date', webpage)) + description = self._html_search_meta('description', webpage) - mrss = self._download_xml(media_url, display_id) - - item = mrss.find('./channel/item') - - NS_MAP = { - 'dc': 'http://purl.org/dc/elements/1.1/', - 'media': 'http://search.yahoo.com/mrss/', - 'ka': 'http://kickapps.com/karss', - } - - title = xpath_text(item, './title', 'title') - description = xpath_text( - item, './description', 'description', fatal=False) - - timestamp = parse_iso8601(xpath_text( - item, xpath_with_ns('./dc:date', NS_MAP), - 'upload date', fatal=False)) - uploader = xpath_text( - item, xpath_with_ns('./dc:creator', NS_MAP), - 'uploader', fatal=False) - - media_content = item.find( - xpath_with_ns('./media:content', NS_MAP)) - duration = int_or_none(media_content.get('duration')) - smil_url = media_content.get('url') - - thumbnail = media_content.find( - xpath_with_ns('./media:thumbnail', NS_MAP)).get('url') - - formats = self._extract_smil_formats(smil_url, display_id) - self._sort_formats(formats) - - return { - 'id': video_id, + info_dict.update({ 'display_id': display_id, - 'title': title, 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, - 'formats': formats, - } + 'upload_date': upload_date, + }) + + return info_dict diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 11cf49851..ff0aa11b1 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -29,7 +29,8 @@ class BRIE(InfoExtractor): 'duration': 180, 'uploader': 'Reinhard Weber', 'upload_date': '20150422', - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/nachrichten/oberbayern/inhalt/muenchner-polizeipraesident-schreiber-gestorben-100.html', @@ -40,7 +41,8 @@ class BRIE(InfoExtractor): 'title': 'Manfred Schreiber ist tot', 'description': 
'md5:b454d867f2a9fc524ebe88c3f5092d97', 'duration': 26, - } + }, + 'skip': '404 not found', }, { 'url': 'https://www.br-klassik.de/audio/peeping-tom-premierenkritik-dance-festival-muenchen-100.html', @@ -51,7 +53,8 @@ class BRIE(InfoExtractor): 'title': 'Kurzweilig und sehr bewegend', 'description': 'md5:0351996e3283d64adeb38ede91fac54e', 'duration': 296, - } + }, + 'skip': '404 not found', }, { 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', diff --git a/youtube_dl/extractor/carambatv.py b/youtube_dl/extractor/carambatv.py new file mode 100644 index 000000000..5797fb951 --- /dev/null +++ b/youtube_dl/extractor/carambatv.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + try_get, +) + + +class CarambaTVIE(InfoExtractor): + _VALID_URL = r'(?:carambatv:|https?://video1\.carambatv\.ru/v/)(?P\d+)' + _TESTS = [{ + 'url': 'http://video1.carambatv.ru/v/191910501', + 'md5': '2f4a81b7cfd5ab866ee2d7270cb34a2a', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 2678.31, + }, + }, { + 'url': 'carambatv:191910501', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://video1.carambatv.ru/v/%s/videoinfo.js' % video_id, + video_id) + + title = video['title'] + + base_url = video.get('video') or 'http://video1.carambatv.ru/v/%s/' % video_id + + formats = [{ + 'url': base_url + f['fn'], + 'height': int_or_none(f.get('height')), + 'format_id': '%sp' % f['height'] if f.get('height') else None, + } for f in video['qualities'] if f.get('fn')] + self._sort_formats(formats) + + thumbnail = video.get('splash') + duration = float_or_none(try_get( + video, lambda x: 
x['annotations'][0]['end_time'], compat_str)) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } + + +class CarambaTVPageIE(InfoExtractor): + _VALID_URL = r'https?://carambatv\.ru/(?:[^/]+/)+(?P[^/?#&]+)' + _TEST = { + 'url': 'http://carambatv.ru/movie/bad-comedian/razborka-v-manile/', + 'md5': '', + 'info_dict': { + 'id': '191910501', + 'ext': 'mp4', + 'title': '[BadComedian] - Разборка в Маниле (Абсолютный обзор)', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 2678.31, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._og_search_property('video:iframe', webpage, default=None) + + if not video_url: + video_id = self._search_regex( + r'(?:video_id|crmb_vuid)\s*[:=]\s*["\']?(\d+)', + webpage, 'video id') + video_url = 'carambatv:%s' % video_id + + return self.url_result(video_url, CarambaTVIE.ie_key()) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index ac2c7dced..a23173d6f 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -1,17 +1,13 @@ from __future__ import unicode_literals -import re - -from .theplatform import ThePlatformIE +from .theplatform import ThePlatformFeedIE from ..utils import ( - xpath_text, - xpath_element, int_or_none, find_xpath_attr, ) -class CBSBaseIE(ThePlatformIE): +class CBSBaseIE(ThePlatformFeedIE): def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): closed_caption_e = find_xpath_attr(smil, self._xpath_ns('.//param', namespace), 'name', 'ClosedCaptionURL') return { @@ -21,9 +17,22 @@ class CBSBaseIE(ThePlatformIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info( + 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { + 'series': 
entry.get('cbs$SeriesTitle'), + 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), + 'episode': entry.get('cbs$EpisodeTitle'), + 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), + }, { + 'StreamPack': { + 'manifest': 'm3u', + } + }) + class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:(?P\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P[^/]+))' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' _TESTS = [{ 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -38,25 +47,7 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - '_skip': 'Blocked outside the US', - }, { - 'url': 'http://www.cbs.com/shows/liveonletterman/artist/221752/st-vincent/', - 'info_dict': { - 'id': 'WWF_5KqY3PK1', - 'display_id': 'st-vincent', - 'ext': 'flv', - 'title': 'Live on Letterman - St. Vincent', - 'description': 'Live On Letterman: St. 
Vincent in concert from New York\'s Ed Sullivan Theater on Tuesday, July 16, 2014.', - 'duration': 3221, - }, - 'params': { - # rtmp download - 'skip_download': True, - }, + 'expected_warnings': ['Failed to download m3u8 information'], '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -68,44 +59,5 @@ class CBSIE(CBSBaseIE): TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' def _real_extract(self, url): - content_id, display_id = re.match(self._VALID_URL, url).groups() - if not content_id: - webpage = self._download_webpage(url, display_id) - content_id = self._search_regex( - [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], - webpage, 'content id') - items_data = self._download_xml( - 'http://can.cbs.com/thunder/player/videoPlayerService.php', - content_id, query={'partner': 'cbs', 'contentId': content_id}) - video_data = xpath_element(items_data, './/item') - title = xpath_text(video_data, 'videoTitle', 'title', True) - - subtitles = {} - formats = [] - for item in items_data.findall('.//item'): - pid = xpath_text(item, 'pid') - if not pid: - continue - tp_release_url = self.TP_RELEASE_URL_TEMPLATE % pid - if '.m3u8' in xpath_text(item, 'contentUrl', default=''): - tp_release_url += '&manifest=m3u' - tp_formats, tp_subtitles = self._extract_theplatform_smil( - tp_release_url, content_id, 'Downloading %s SMIL data' % pid) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self.get_metadata('dJ5BDC/media/guid/2198311517/%s' % content_id, content_id) - info.update({ - 'id': content_id, - 'display_id': display_id, - 'title': title, - 'series': xpath_text(video_data, 'seriesTitle'), - 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), - 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), - 'duration': 
int_or_none(xpath_text(video_data, 'videoLength'), 1000), - 'thumbnail': xpath_text(video_data, 'previewImageURL'), - 'formats': formats, - 'subtitles': subtitles, - }) - return info + content_id = self._match_id(url) + return self._extract_video_info('byGuid=%s' % content_id, content_id) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 79ddc20a0..387537e76 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -30,9 +30,12 @@ class CBSNewsIE(CBSBaseIE): { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', 'info_dict': { - 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', + 'id': 'SNJBOYzXiWBOvaLsdzwH8fmtP1SCd91Y', 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', + 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', + 'upload_date': '19700101', + 'uploader': 'CBSI-NEW', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, 'subtitles': { @@ -58,30 +61,8 @@ class CBSNewsIE(CBSBaseIE): webpage, 'video JSON info'), video_id) item = video_info['item'] if 'item' in video_info else video_info - title = item.get('articleTitle') or item.get('hed') - duration = item.get('duration') - thumbnail = item.get('mediaImage') or item.get('thumbnail') - - subtitles = {} - formats = [] - for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: - pid = item.get('media' + format_id) - if not pid: - continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid - tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + guid = 
item['mpxRefId'] + return self._extract_video_info('byGuid=%s' % guid, guid) class CBSNewsLiveVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 549ae32f3..78ca44b02 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -1,30 +1,28 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor +from .cbs import CBSBaseIE -class CBSSportsIE(InfoExtractor): - _VALID_URL = r'https?://www\.cbssports\.com/video/player/(?P
[^/]+)/(?P[^/]+)' +class CBSSportsIE(CBSBaseIE): + _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P\d+)' - _TEST = { - 'url': 'http://www.cbssports.com/video/player/tennis/318462531970/0/us-open-flashbacks-1990s', + _TESTS = [{ + 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', 'info_dict': { - 'id': '_d5_GbO8p1sT', - 'ext': 'flv', - 'title': 'US Open flashbacks: 1990s', - 'description': 'Bill Macatee relives the best moments in US Open history from the 1990s.', + 'id': '708337219968', + 'ext': 'mp4', + 'title': 'Ben Simmons the next LeBron? Not so fast', + 'description': 'md5:854294f627921baba1f4b9a990d87197', + 'timestamp': 1466293740, + 'upload_date': '20160618', + 'uploader': 'CBSI-NEW', }, - } + 'params': { + # m3u8 download + 'skip_download': True, + } + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - section = mobj.group('section') - video_id = mobj.group('id') - all_videos = self._download_json( - 'http://www.cbssports.com/data/video/player/getVideos/%s?as=json' % section, - video_id) - # The json file contains the info of all the videos in the section - video_info = next(v for v in all_videos if v['pcid'] == video_id) - return self.url_result('theplatform:%s' % video_info['pid'], 'ThePlatform') + video_id = self._match_id(url) + return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/closertotruth.py b/youtube_dl/extractor/closertotruth.py new file mode 100644 index 000000000..26243d52d --- /dev/null +++ b/youtube_dl/extractor/closertotruth.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class CloserToTruthIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688', + 'info_dict': { + 'id': 
'0_zof1ktre', + 'display_id': 'solutions-the-mind-body-problem', + 'ext': 'mov', + 'title': 'Solutions to the Mind-Body Problem?', + 'upload_date': '20140221', + 'timestamp': 1392956007, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/episodes/how-do-brains-work', + 'info_dict': { + 'id': '0_iuxai6g6', + 'display_id': 'how-do-brains-work', + 'ext': 'mov', + 'title': 'How do Brains Work?', + 'upload_date': '20140221', + 'timestamp': 1392956024, + 'uploader_id': 'CTTXML' + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://closertotruth.com/interviews/1725', + 'info_dict': { + 'id': '1725', + 'title': 'AyaFr-002', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r']+src=["\'].*?\b(?:partner_id|p)/(\d+)', + webpage, 'kaltura partner_id') + + title = self._search_regex( + r'(.+?)\s*\|\s*.+?', webpage, 'video title') + + select = self._search_regex( + r'(?s)]+id="select-version"[^>]*>(.+?)', + webpage, 'select version', default=None) + if select: + entry_ids = set() + entries = [] + for mobj in re.finditer( + r']+value=(["\'])(?P[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P[^<]+)', + webpage): + entry_id = mobj.group('id') + if entry_id in entry_ids: + continue + entry_ids.add(entry_id) + entries.append({ + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': mobj.group('title'), + }) + if entries: + return self.playlist_result(entries, display_id, title) + + entry_id = self._search_regex( + r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2', + webpage, 'kaltura entry_id', group='id') + + return { + '_type': 'url_transparent', + 'display_id': display_id, + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': 'Kaltura', + 'title': title + } diff --git 
a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bfd432160..e6c15de42 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -53,6 +53,7 @@ from ..utils import ( mimetype2ext, update_Request, update_url_query, + parse_m3u8_attributes, ) @@ -748,10 +749,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -875,7 +878,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -1150,23 +1157,11 @@ class InfoExtractor(object): }] last_info = None last_media = None - kv_rex = re.compile( - r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_info[m.group('key')] = v + last_info = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - last_media = {} - for m in kv_rex.finditer(line): - v = m.group('val') - if v.startswith('"'): - v = v[1:-1] - last_media[m.group('key')] = v + last_media = parse_m3u8_attributes(line) elif line.startswith('#') or not line.strip(): continue else: diff --git a/youtube_dl/extractor/dcn.py 
b/youtube_dl/extractor/dcn.py index 5deff5f30..efb8585e8 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -20,7 +20,7 @@ from ..utils import ( class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' def _real_extract(self, url): show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() @@ -55,30 +55,32 @@ class DCNBaseIE(InfoExtractor): 'is_live': is_live, } - def _extract_video_formats(self, webpage, video_id, entry_protocol): + def _extract_video_formats(self, webpage, video_id, m3u8_entry_protocol): formats = [] - m3u8_url = self._html_search_regex( - r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None)) - - rtsp_url = self._search_regex( - r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', - }) - + format_url_base = 'http' + self._html_search_regex( + [ + r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', + r'<a[^>]+href="rtsp(://[^"]+)"' + ], webpage, 'format url') + # TODO: Current DASH formats are broken - $Time$ pattern in + # <SegmentTemplate> not implemented yet + # formats.extend(self._extract_mpd_formats( + # format_url_base + '/manifest.mpd', + # video_id, mpd_id='dash', fatal=False)) + formats.extend(self._extract_m3u8_formats( + format_url_base + '/playlist.m3u8', video_id, 'mp4', + m3u8_entry_protocol, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + format_url_base + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return formats class DCNVideoIE(DCNBaseIE): IE_NAME = 'dcn:video' - 
_VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?(?:video(?:/[^/]+)?|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { @@ -94,7 +96,10 @@ class DCNVideoIE(DCNBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://awaan.ae/video/26723981/%D8%AF%D8%A7%D8%B1-%D8%A7%D9%84%D8%B3%D9%84%D8%A7%D9%85:-%D8%AE%D9%8A%D8%B1-%D8%AF%D9%88%D8%B1-%D8%A7%D9%84%D8%A3%D9%86%D8%B5%D8%A7%D8%B1', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -120,7 +125,7 @@ class DCNVideoIE(DCNBaseIE): class DCNLiveIE(DCNBaseIE): IE_NAME = 'dcn:live' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?live/(?P<id>\d+)' def _real_extract(self, url): channel_id = self._match_id(url) @@ -147,7 +152,7 @@ class DCNLiveIE(DCNBaseIE): class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _VALID_URL = r'https?://(?:www\.)?(?:awaan|dcndigital)\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5fce9f47a..bba88e9eb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -20,7 +20,10 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE +from 
.aenetworks import ( + AENetworksIE, + HistoryTopicIE, +) from .afreecatv import AfreecaTVIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE @@ -44,7 +47,6 @@ from .archiveorg import ArchiveOrgIE from .ard import ( ARDIE, ARDMediathekIE, - SportschauIE, ) from .arte import ( ArteTvIE, @@ -71,6 +73,8 @@ from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, BBCCoUkArticleIE, + BBCCoUkIPlayerPlaylistIE, + BBCCoUkPlaylistIE, BBCIE, ) from .beeg import BeegIE @@ -108,6 +112,10 @@ from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) from .cbc import ( CBCIE, CBCPlayerIE, @@ -135,6 +143,7 @@ from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE from .clubic import ClubicIE from .clyp import ClypIE @@ -279,7 +288,6 @@ from .gameone import ( from .gamersyde import GamersydeIE from .gamespot import GameSpotIE from .gamestar import GameStarIE -from .gametrailers import GametrailersIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .generic import GenericIE @@ -449,6 +457,7 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, @@ -475,7 +484,6 @@ from .nbc import ( NBCNewsIE, NBCSportsIE, NBCSportsVPlayerIE, - MSNBCIE, ) from .ndr import ( NDRIE, @@ -512,7 +520,10 @@ from .nhl import ( NHLVideocenterCategoryIE, NHLIE, ) -from .nick import NickIE +from .nick import ( + NickIE, + NickDeIE, +) from .niconico import NiconicoIE, NiconicoPlaylistIE from .ninegag import NineGagIE from .noco import NocoIE @@ -599,6 +610,7 @@ from .pluralsight import ( 
PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .polskieradio import PolskieRadioIE from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( @@ -622,7 +634,10 @@ from .qqmusic import ( QQMusicToplistIE, QQMusicPlaylistIE, ) -from .r7 import R7IE +from .r7 import ( + R7IE, + R7ArticleIE, +) from .radiocanada import ( RadioCanadaIE, RadioCanadaAudioVideoIE, @@ -738,6 +753,7 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE +from .sportschau import SportschauIE from .srgssr import ( SRGSSRIE, SRGSSRPlayIE, @@ -904,6 +920,7 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f5bbd39d2..9b87b37ae 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -239,6 +239,8 @@ class FacebookIE(InfoExtractor): formats = [] for format_id, f in video_data.items(): + if f and isinstance(f, dict): + f = [f] if not f or not isinstance(f, list): continue for quality in ('sd', 'hd'): diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index df7665176..a3bb98377 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, +) class FoxSportsIE(InfoExtractor): @@ -9,11 +12,15 @@ class FoxSportsIE(InfoExtractor): _TEST = { 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'gA0bHB3Ladz3', - 'ext': 'flv', + 'id': 'i0qKWsk3qJaM', + 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. 
Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', + 'upload_date': '20150423', + 'timestamp': 1429761109, + 'uploader': 'NEWA-FNG-FOXSPORTS', }, 'add_ie': ['ThePlatform'], } @@ -28,5 +35,8 @@ class FoxSportsIE(InfoExtractor): r"data-player-config='([^']+)'", webpage, 'data player config'), video_id) - return self.url_result(smuggle_url( - config['releaseURL'] + '&manifest=f4m', {'force_smil_url': True})) + return self.url_result(smuggle_url(update_url_query( + config['releaseURL'], { + 'mbr': 'true', + 'switch': 'http', + }), {'force_smil_url': True})) diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 4ffdd7515..621257c9f 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -1,19 +1,19 @@ from __future__ import unicode_literals import re -import json -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_urllib_parse_unquote, - compat_urlparse, ) from ..utils import ( unescapeHTML, + url_basename, + dict_get, ) -class GameSpotIE(InfoExtractor): +class GameSpotIE(OnceIE): _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', @@ -39,29 +39,73 @@ class GameSpotIE(InfoExtractor): webpage = self._download_webpage(url, page_id) data_video_json = self._search_regex( r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = json.loads(unescapeHTML(data_video_json)) + data_video = self._parse_json(unescapeHTML(data_video_json), page_id) streams = data_video['videoStreams'] + manifest_url = None formats = [] f4m_url = streams.get('f4m_stream') - if f4m_url is not None: - # Transform the manifest url to a link to the mp4 files - # they are used in mobile devices. 
- f4m_path = compat_urlparse.urlparse(f4m_url).path - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex(QUALITIES_RE, f4m_path, 'qualities').strip(',').split(',') - http_path = f4m_path[1:].split('/', 1)[1] - http_template = re.sub(QUALITIES_RE, r'%s', http_path) - http_template = http_template.replace('.csmil/manifest.f4m', '') - http_template = compat_urlparse.urljoin( - 'http://video.gamespotcdn.com/', http_template) - for q in qualities: - formats.append({ - 'url': http_template % q, - 'ext': 'mp4', - 'format_id': q, - }) - else: + if f4m_url: + manifest_url = f4m_url + formats.extend(self._extract_f4m_formats( + f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) + m3u8_url = streams.get('m3u8_stream') + if m3u8_url: + manifest_url = m3u8_url + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, page_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + progressive_url = dict_get( + streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + if progressive_url and manifest_url: + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if qualities_basename: + QUALITIES_RE = r'((,\d+)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if qualities: + qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) + http_url_basename = url_basename(progressive_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': progressive_url.replace( + http_url_basename, http_template % q), + 'format_id': 
f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': progressive_url.replace( + http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) + + onceux_json = self._search_regex( + r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) + if onceux_json: + onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') + if onceux_url: + formats.extend(self._extract_once_formats(re.sub( + r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url).replace('ads/vmap/', ''))) + + if not formats: for quality in ['sd', 'hd']: # It's actually a link to a flv file flv_url = streams.get('f4m_{0}'.format(quality)) @@ -71,6 +115,7 @@ class GameSpotIE(InfoExtractor): 'ext': 'flv', 'format_id': quality, }) + self._sort_formats(formats) return { 'id': data_video['guid'], diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py deleted file mode 100644 index 1e7948ab8..000000000 --- a/youtube_dl/extractor/gametrailers.py +++ /dev/null @@ -1,62 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_age_limit, - url_basename, -) - - -class GametrailersIE(InfoExtractor): - _VALID_URL = r'https?://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' - - _TEST = { - 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', - 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', - 'info_dict': { - 'id': '2983958', - 'ext': 'mp4', - 'display_id': '116437-Just-Cause-3-Review', - 'title': 'Just Cause 3 - Review', - 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = 
self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.+?)\|', webpage, 'title').strip() - embed_url = self._proto_relative_url( - self._search_regex( - r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, - 'embed url'), - scheme='http:') - video_id = url_basename(embed_url) - embed_page = self._download_webpage(embed_url, video_id) - embed_vars_json = self._search_regex( - r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, - 'embed vars') - info = self._parse_json(embed_vars_json, video_id) - - formats = [] - for media in info['media']: - if media['mediaPurpose'] == 'play': - formats.append({ - 'url': media['uri'], - 'height': media['height'], - 'width:': media['width'], - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('thumbUri'), - 'description': self._og_search_description(webpage), - 'duration': int_or_none(info.get('videoLengthInSeconds')), - 'age_limit': parse_age_limit(info.get('audienceRating')), - } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa24061c..2188f8bb2 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -64,6 +64,7 @@ from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .vessel import VesselIE +from .kaltura import KalturaIE class GenericIE(InfoExtractor): @@ -920,6 +921,24 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 'Could not send HEAD request' + ], + 'params': { + 'skip_download': True, + } + }, # Eagle.Platform 
embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1091,12 +1110,17 @@ class GenericIE(InfoExtractor): # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': '49444254273501a64675a7e68c502681', + 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', 'info_dict': { - 'id': '5585de919473990de4bee11b', + 'id': 'x2uy8t3', 'ext': 'mp4', - 'title': 'Le débat', + 'title': 'Sauvons les abeilles ! - Le débat', + 'description': 'md5:d9082128b1c5277987825d684939ca26', 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1434970506, + 'upload_date': '20150622', + 'uploader': 'Public Sénat', + 'uploader_id': 'xa9gza', } }, # OnionStudios embed @@ -1903,12 +1927,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) - if mobj is not None: - return self.url_result(smuggle_url( - 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), - {'source_url': url}), 'Kaltura') + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds mobj = re.search( diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ddcb3c916..fea26685e 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -1,30 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals +import binascii import hashlib import itertools 
import math -import os -import random import re import time -import uuid from .common import InfoExtractor from ..compat import ( - compat_parse_qs, compat_str, compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, ) from ..utils import ( decode_packed_codes, ExtractorError, + intlist_to_bytes, ohdave_rsa_encrypt, remove_start, - sanitized_Request, - urlencode_postdata, - url_basename, + urshift, ) @@ -171,70 +166,21 @@ class IqiyiIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '2cb594dc2781e6c941a110d8f358118b', + 'md5': '470a6c160618577166db1a7aac5a3606', 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'ext': 'mp4', 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v', } }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'md5': 'f09f0a6a59b2da66a26bf4eda669a4cc', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb', - 'title': '名侦探柯南第752集', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part5', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }], - 'params': { - 'skip_download': True, + 'ext': 'mp4', + 'title': '名侦探柯南 国语版', }, + 'skip': 
'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', 'only_matching': True, @@ -287,13 +233,6 @@ class IqiyiIE(InfoExtractor): ('10', 'h1'), ] - AUTH_API_ERRORS = { - # No preview available (不允许试看鉴权失败) - 'Q00505': 'This video requires a VIP account', - # End of preview time (试看结束鉴权失败) - 'Q00506': 'Needs a VIP account for full video', - } - def _real_initialize(self): self._login() @@ -352,177 +291,101 @@ class IqiyiIE(InfoExtractor): return True - def _authenticate_vip_video(self, api_video_url, video_id, tvid, _uuid, do_report_warning): - auth_params = { - # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as - 'version': '2.0', - 'platform': 'b6c13e26323c537d', - 'aid': tvid, + @staticmethod + def _gen_sc(tvid, timestamp): + M = [1732584193, -271733879] + M.extend([~M[0], ~M[1]]) + I_table = [7, 12, 17, 22, 5, 9, 14, 20, 4, 11, 16, 23, 6, 10, 15, 21] + C_base = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8388608, 432] + + def L(n, t): + if t is None: + t = 0 + return trunc(((n >> 1) + (t >> 1) << 1) + (n & 1) + (t & 1)) + + def trunc(n): + n = n % 0x100000000 + if n > 0x7fffffff: + n -= 0x100000000 + return n + + def transform(string, mod): + num = int(string, 16) + return (num >> 8 * (i % 4) & 255 ^ i % mod) << ((a & 3) << 3) + + C = list(C_base) + o = list(M) + k = str(timestamp - 7) + for i in range(13): + a = i + C[a >> 2] |= ord(k[a]) << 8 * (a % 4) + + for i in range(16): + a = i + 13 + start = (i >> 2) * 8 + r = '03967743b643f66763d623d637e30733' + C[a >> 2] |= transform(''.join(reversed(r[start:start + 8])), 7) + + for i in range(16): + a = i + 29 + start = (i >> 2) * 8 + r = '7038766939776a32776a32706b337139' + C[a >> 2] |= transform(r[start:start + 8], 1) + + for i in range(9): + a = i + 45 + if i < len(tvid): + C[a >> 2] |= ord(tvid[i]) << 8 * (a % 4) + + for a in range(64): + i = a + I = i >> 4 + C_index = [i, 5 * i + 1, 3 * i + 5, 7 * i][I] % 16 + urshift(a, 6) + m = L(L(o[0], [ + 
trunc(o[1] & o[2]) | trunc(~o[1] & o[3]), + trunc(o[3] & o[1]) | trunc(~o[3] & o[2]), + o[1] ^ o[2] ^ o[3], + o[2] ^ trunc(o[1] | ~o[3]) + ][I]), L( + trunc(int(abs(math.sin(i + 1)) * 4294967296)), + C[C_index] if C_index < len(C) else None)) + I = I_table[4 * I + i % 4] + o = [o[3], + L(o[1], trunc(trunc(m << I) | urshift(m, 32 - I))), + o[1], + o[2]] + + new_M = [L(o[0], M[0]), L(o[1], M[1]), L(o[2], M[2]), L(o[3], M[3])] + s = [new_M[a >> 3] >> (1 ^ a & 7) * 4 & 15 for a in range(32)] + return binascii.hexlify(intlist_to_bytes(s))[1::2].decode('ascii') + + def get_raw_data(self, tvid, video_id): + tm = int(time.time() * 1000) + + sc = self._gen_sc(tvid, tm) + params = { + 'platForm': 'h5', + 'rate': 1, 'tvid': tvid, - 'uid': '', - 'deviceId': _uuid, - 'playType': 'main', # XXX: always main? - 'filename': os.path.splitext(url_basename(api_video_url))[0], - } - - qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) - for key, val in qd_items.items(): - auth_params[key] = val[0] - - auth_req = sanitized_Request( - 'http://api.vip.iqiyi.com/services/ckn.action', - urlencode_postdata(auth_params)) - # iQiyi server throws HTTP 405 error without the following header - auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - auth_result = self._download_json( - auth_req, video_id, - note='Downloading video authentication JSON', - errnote='Unable to download video authentication JSON') - - code = auth_result.get('code') - msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code - if code == 'Q00506': - if do_report_warning: - self.report_warning(msg) - return False - if 'data' not in auth_result: - if msg is not None: - raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unexpected error from Iqiyi auth API') - - return auth_result['data'] - - def construct_video_urls(self, data, video_id, _uuid, tvid): - def do_xor(x, y): - a = y % 3 - if a == 1: - return x ^ 121 - if a 
== 2: - return x ^ 72 - return x ^ 103 - - def get_encode_code(l): - a = 0 - b = l.split('-') - c = len(b) - s = '' - for i in range(c - 1, -1, -1): - a = do_xor(int(b[c - i - 1], 16), i) - s += chr(a) - return s[::-1] - - def get_path_key(x, format_id, segment_index): - mg = ')(*&^flash@#$%a' - tm = self._download_json( - 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, - note='Download path key of segment %d for format %s' % (segment_index + 1, format_id) - )['t'] - t = str(int(math.floor(int(tm) / (600.0)))) - return md5_text(t + mg + x) - - video_urls_dict = {} - need_vip_warning_report = True - for format_item in data['vp']['tkl'][0]['vs']: - if 0 < int(format_item['bid']) <= 10: - format_id = self.get_format(format_item['bid']) - else: - continue - - video_urls = [] - - video_urls_info = format_item['fs'] - if not format_item['fs'][0]['l'].startswith('/'): - t = get_encode_code(format_item['fs'][0]['l']) - if t.endswith('mp4'): - video_urls_info = format_item['flvs'] - - for segment_index, segment in enumerate(video_urls_info): - vl = segment['l'] - if not vl.startswith('/'): - vl = get_encode_code(vl) - is_vip_video = '/vip/' in vl - filesize = segment['b'] - base_url = data['vp']['du'].split('/') - if not is_vip_video: - key = get_path_key( - vl.split('/')[-1].split('.')[0], format_id, segment_index) - base_url.insert(-1, key) - base_url = '/'.join(base_url) - param = { - 'su': _uuid, - 'qyid': uuid.uuid4().hex, - 'client': '', - 'z': '', - 'bt': '', - 'ct': '', - 'tn': str(int(time.time())) - } - api_video_url = base_url + vl - if is_vip_video: - api_video_url = api_video_url.replace('.f4v', '.hml') - auth_result = self._authenticate_vip_video( - api_video_url, video_id, tvid, _uuid, need_vip_warning_report) - if auth_result is False: - need_vip_warning_report = False - break - param.update({ - 't': auth_result['t'], - # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as - 'cid': 'afbe8fd3d73448c9', - 'vid': video_id, - 
'QY00001': auth_result['u'], - }) - api_video_url += '?' if '?' not in api_video_url else '&' - api_video_url += compat_urllib_parse_urlencode(param) - js = self._download_json( - api_video_url, video_id, - note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) - video_url = js['l'] - video_urls.append( - (video_url, filesize)) - - video_urls_dict[format_id] = video_urls - return video_urls_dict - - def get_format(self, bid): - matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] - return matched_format_ids[0] if len(matched_format_ids) else None - - def get_bid(self, format_id): - matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] - return matched_bids[0] if len(matched_bids) else None - - def get_raw_data(self, tvid, video_id, enc_key, _uuid): - tm = str(int(time.time())) - tail = tm + tvid - param = { - 'key': 'fvip', - 'src': md5_text('youtube-dl'), - 'tvId': tvid, 'vid': video_id, - 'vinfo': 1, - 'tm': tm, - 'enc': md5_text(enc_key + tail), - 'qyid': _uuid, - 'tn': random.random(), - # In iQiyi's flash player, um is set to 1 if there's a logged user - # Some 1080P formats are only available with a logged user. - # Here force um=1 to trick the iQiyi server - 'um': 1, - 'authkey': md5_text(md5_text('') + tail), - 'k_tag': 1, + 'cupid': 'qc_100001_100186', + 'type': 'mp4', + 'nolimit': 0, + 'agenttype': 13, + 'src': 'd846d0c32d664d32b6b54ea48997a589', + 'sc': sc, + 't': tm - 7, + '__jsT': None, } - api_url = 'http://cache.video.qiyi.com/vms' + '?' 
+ \ - compat_urllib_parse_urlencode(param) - raw_data = self._download_json(api_url, video_id) - return raw_data - - def get_enc_key(self, video_id): - # TODO: automatic key extraction - # last update at 2016-01-22 for Zombie::bite - enc_key = '4a1caba4b4465345366f28da7c117d20' - return enc_key + headers = {} + cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') + if cn_verification_proxy: + headers['Ytdl-request-proxy'] = cn_verification_proxy + return self._download_json( + 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), + query=params, headers=headers) def _extract_playlist(self, webpage): PAGE_SIZE = 50 @@ -571,58 +434,27 @@ class IqiyiIE(InfoExtractor): r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - _uuid = uuid.uuid4().hex - enc_key = self.get_enc_key(video_id) + for _ in range(5): + raw_data = self.get_raw_data(tvid, video_id) - raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) + if raw_data['code'] != 'A00000': + if raw_data['code'] == 'A00111': + self.raise_geo_restricted() + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) - if raw_data['code'] != 'A000000': - raise ExtractorError('Unable to load data. 
Error code: ' + raw_data['code']) + data = raw_data['data'] - data = raw_data['data'] + # iQiYi sometimes returns Ads + if not isinstance(data['playInfo'], dict): + self._sleep(5, video_id) + continue - title = data['vi']['vn'] + title = data['playInfo']['an'] + break - # generate video_urls_dict - video_urls_dict = self.construct_video_urls( - data, video_id, _uuid, tvid) - - # construct info - entries = [] - for format_id in video_urls_dict: - video_urls = video_urls_dict[format_id] - for i, video_url_info in enumerate(video_urls): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': video_url_info[0], - 'filesize': video_url_info[-1], - 'format_id': format_id, - 'preference': int(self.get_bid(format_id)) - } - ) - - for i in range(len(entries)): - self._sort_formats(entries[i]['formats']) - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) - - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - info['title'] = title - - return info + return { + 'id': video_id, + 'title': title, + 'url': data['m3u'], + } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a65697ff5..c75a958ba 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -64,6 +64,32 @@ class KalturaIE(InfoExtractor): } ] + @staticmethod + def _extract_url(webpage): + mobj = ( + re.search( + r"""(?xs) + kWidget\.(?:thumb)?[Ee]mbed\( + \{.*? + (?P<q1>['\"])wid(?P=q1)\s*:\s* + (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*? + (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4), + """, webpage) or + re.search( + r'''(?xs) + (?P<q1>["\']) + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*? + (?P=q1).*? 
+ (?: + entry_?[Ii]d| + (?P<q2>["\'])entry_?[Ii]d(?P=q2) + )\s*:\s* + (?P<q3>["\'])(?P<id>.+?)(?P=q3) + ''', webpage)) + if mobj: + return 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict() + def _kaltura_api_call(self, video_id, actions, *args, **kwargs): params = actions[0] if len(actions) > 1: diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 63f581cd9..959d71617 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,6 +23,7 @@ from ..utils import ( sanitized_Request, str_or_none, url_basename, + urshift, ) @@ -74,15 +75,11 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: - param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) + param1 = urshift(param1, 1) + ((param1 & 1) << 31) _loc3_ += 1 return param1 diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 3589c223d..5a00cd397 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,5 +1,8 @@ +# coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, @@ -8,6 +11,7 @@ from ..compat import ( from ..utils import ( get_element_by_attribute, int_or_none, + remove_start, ) @@ -15,7 +19,7 @@ class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' - _TEST = { + _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', # MD5 is unstable 'info_dict': { @@ -24,10 +28,31 @@ class MiTeleIE(InfoExtractor): 'ext': 'flv', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'series': 'Diario de', + 'season': 'La 
redacción', + 'episode': 'Programa 144', 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - } + }, { + # no explicit title + 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/', + 'info_dict': { + 'id': 'eLZSwoEd1S3pVyUm8lc6F', + 'display_id': 'programa-226', + 'ext': 'flv', + 'title': 'Cuarto Milenio - Temporada 6 - Programa 226', + 'description': 'md5:50daf9fadefa4e62d9fc866d0c015701', + 'series': 'Cuarto Milenio', + 'season': 'Temporada 6', + 'episode': 'Programa 226', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', + 'duration': 7312, + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -70,7 +95,22 @@ class MiTeleIE(InfoExtractor): self._sort_formats(formats) title = self._search_regex( - r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title') + r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', + webpage, 'title', default=None) + + mobj = re.search(r'''(?sx) + class="Destacado-text"[^>]*>.*?<h1>\s* + <span>(?P<series>[^<]+)</span>\s* + <span>(?P<season>[^<]+)</span>\s* + <span>(?P<episode>[^<]+)</span>''', webpage) + series, season, episode = mobj.groups() if mobj else [None] * 3 + + if not title: + if mobj: + title = '%s - %s - %s' % (series, season, episode) + else: + title = remove_start(self._search_regex( + r'<title>([^<]+)', webpage, 'title'), 'Ver online ') video_id = self._search_regex( r'data-media-id\s*=\s*"([^"]+)"', webpage, @@ -83,6 +123,9 @@ class MiTeleIE(InfoExtractor): 'display_id': display_id, 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), + 'series': series, + 'season': season, + 'episode': episode, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 483f6925f..560fe188b 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -102,11 
+102,11 @@ class MixcloudIE(InfoExtractor): description = self._og_search_description(webpage) like_count = parse_count(self._search_regex( r'\bbutton-favorite[^>]+>.*?]+class=["\']toggle-number[^>]+>\s*([^<]+)', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) view_count = str_to_int(self._search_regex( [r'([0-9,.]+)'], - webpage, 'play count', fatal=False)) + webpage, 'play count', default=None)) return { 'id': track_id, diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py new file mode 100644 index 000000000..1ec8e0f50 --- /dev/null +++ b/youtube_dl/extractor/msn.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unescapeHTML, +) + + +class MSNIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE', + 'md5': '8442f66c116cbab1ff7098f986983458', + 'info_dict': { + 'id': 'BBqQYNE', + 'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message', + 'ext': 'mp4', + 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message', + 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25', + 'duration': 104, + 'uploader': 'CBS Entertainment', + 'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v', + }, + }, { + 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', + 'only_matching': True, + }, { + # geo restricted + 'url':
'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', + webpage, 'video data', default='{}', group='data'), + display_id, transform_source=unescapeHTML) + + if not video: + error = unescapeHTML(self._search_regex( + r'data-error=(["\'])(?P<error>.+?)\1', + webpage, 'error', group='error')) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + title = video['title'] + + formats = [] + for file_ in video.get('videoFiles', []): + format_url = file_.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + # .ism is not yet supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + if ext == 'ism': + continue + if 'm3u8' in format_url: + # m3u8_native should not be used here until + # https://github.com/rg3/youtube-dl/issues/9913 is fixed + m3u8_formats = self._extract_m3u8_formats( + format_url, display_id, 'mp4', + m3u8_id='hls', fatal=False) + # Despite metadata in m3u8 all video+audio formats are + # actually video-only (no audio) + for f in m3u8_formats: + if f.get('acodec') != 'none' and f.get('vcodec') != 'none': + f['acodec'] = 'none' + formats.extend(m3u8_formats) + else: + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'format_id': 'http', + 'width': int_or_none(file_.get('width')), + 'height': int_or_none(file_.get('height')), + }) + self._sort_formats(formats) + + subtitles = {} + for file_ in video.get('files', []): + format_url =
file_.get('url') + format_code = file_.get('formatCode') + if not format_url or not format_code: + continue + if compat_str(format_code) == '3100': + subtitles.setdefault(file_.get('culture', 'en'), []).append({ + 'ext': determine_ext(format_url, 'ttml'), + 'url': format_url, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('headlineImage', {}).get('url'), + 'duration': int_or_none(video.get('durationSecs')), + 'uploader': video.get('sourceFriendly'), + 'uploader_id': video.get('providerId'), + 'creator': video.get('creator'), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 640ee3d93..dd0639589 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, compat_str, + compat_xpath, ) from ..utils import ( ExtractorError, @@ -84,9 +85,10 @@ class MTVServicesInfoExtractor(InfoExtractor): rtmp_video_url = rendition.find('./src').text if rtmp_video_url.endswith('siteunavail.png'): continue + new_url = self._transform_rtmp_url(rtmp_video_url) formats.append({ - 'ext': ext, - 'url': self._transform_rtmp_url(rtmp_video_url), + 'ext': 'flv' if new_url.startswith('rtmp') else ext, + 'url': new_url, 'format_id': rendition.get('bitrate'), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), @@ -139,9 +141,9 @@ class MTVServicesInfoExtractor(InfoExtractor): itemdoc, './/{http://search.yahoo.com/mrss/}category', 'scheme', 'urn:mtvn:video_title') if title_el is None: - title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') + title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title')) if title_el is None: - title_el = itemdoc.find('.//title') or itemdoc.find('./title') + title_el = itemdoc.find(compat_xpath('.//title')) if 
title_el.text is None: title_el = None diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 6b7da1149..f694e210b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,10 +9,6 @@ from ..utils import ( lowercase_escape, smuggle_url, unescapeHTML, - update_url_query, - int_or_none, - HEADRequest, - parse_iso8601, ) @@ -192,9 +188,9 @@ class CSNNEIE(InfoExtractor): class NBCNewsIE(ThePlatformIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today)\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/ (?:video/.+?/(?P\d+)| - ([^/]+/)*(?P[^/?]+)) + ([^/]+/)*(?:.*-)?(?P[^/?]+)) ''' _TESTS = [ @@ -216,13 +212,16 @@ class NBCNewsIE(ThePlatformIE): 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', + 'uploader': 'NBCU-NEWS', + 'timestamp': 1401363060, + 'upload_date': '20140529', }, }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', 'md5': 'fdbf39ab73a72df5896b6234ff98518a', 'info_dict': { - 'id': 'Wjf9EDR3A_60', + 'id': '529953347624', 'ext': 'mp4', 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', @@ -237,6 +236,9 @@ class NBCNewsIE(ThePlatformIE): 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', + 'timestamp': 1423104900, + 'uploader': 'NBCU-NEWS', + 'upload_date': '20150205', }, }, { @@ -245,10 +247,12 @@ class NBCNewsIE(ThePlatformIE): 'info_dict': { 'id': '529953347624', 'ext': 'mp4', - 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', - 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + 'title': 'Volkswagen U.S. 
Chief:\xa0 We Have Totally Screwed Up', + 'description': 'md5:c8be487b2d80ff0594c005add88d8351', + 'upload_date': '20150922', + 'timestamp': 1442917800, + 'uploader': 'NBCU-NEWS', }, - 'expected_warnings': ['http-6000 is not available'] }, { 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', @@ -260,6 +264,22 @@ class NBCNewsIE(ThePlatformIE): 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'uploader': 'NBCU-NEWS', + }, + }, + { + 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', + 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', + 'info_dict': { + 'id': '314487875924', + 'ext': 'mp4', + 'title': 'The chaotic GOP immigration vote', + 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1406937606, + 'upload_date': '20140802', + 'uploader': 'NBCU-NEWS', + 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, }, { @@ -290,105 +310,28 @@ class NBCNewsIE(ThePlatformIE): } else: # "feature" and "nightly-news" pages use theplatform.com - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id) - info = None - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], - webpage, 'bootstrap json', default=None) - bootstrap = self._parse_json( - bootstrap_json, display_id, transform_source=unescapeHTML) - if 'results' in bootstrap: - info = bootstrap['results'][0]['video'] - elif 'video' in bootstrap: - info = bootstrap['video'] - else: - info = bootstrap - video_id = info['mpxId'] - title = info['title'] - - subtitles = {} - 
caption_links = info.get('captionLinks') - if caption_links: - for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): - sub_url = caption_links.get(sub_key) - if sub_url: - subtitles.setdefault('en', []).append({ - 'url': sub_url, - 'ext': sub_ext, - }) - - formats = [] - for video_asset in info['videoAssets']: - video_url = video_asset.get('publicUrl') - if not video_url: - continue - container = video_asset.get('format') - asset_type = video_asset.get('assetType') or '' - if container == 'ISM' or asset_type == 'FireTV-Once': - continue - elif asset_type == 'OnceURL': - tp_formats, tp_subtitles = self._extract_theplatform_smil( - video_url, video_id) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) + video_id = mobj.group('mpx_id') + if not video_id.isdigit(): + webpage = self._download_webpage(url, video_id) + info = None + bootstrap_json = self._search_regex( + [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', + r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], + webpage, 'bootstrap json', default=None) + bootstrap = self._parse_json( + bootstrap_json, video_id, transform_source=unescapeHTML) + if 'results' in bootstrap: + info = bootstrap['results'][0]['video'] + elif 'video' in bootstrap: + info = bootstrap['video'] else: - tbr = int_or_none(video_asset.get('bitRate') or video_asset.get('bitrate'), 1000) - format_id = 'http%s' % ('-%d' % tbr if tbr else '') - video_url = update_url_query( - video_url, {'format': 'redirect'}) - # resolve the url so that we can check availability and detect the correct extension - head = self._request_webpage( - HEADRequest(video_url), video_id, - 'Checking %s url' % format_id, - '%s is not available' % format_id, - fatal=False) - if head: - video_url = head.geturl() - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int_or_none(video_asset.get('width')), - 'height': 
int_or_none(video_asset.get('height')), - 'tbr': tbr, - 'container': video_asset.get('format'), - }) - self._sort_formats(formats) + info = bootstrap + video_id = info['mpxId'] return { + '_type': 'url_transparent', 'id': video_id, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnail'), - 'duration': int_or_none(info.get('duration')), - 'timestamp': parse_iso8601(info.get('pubDate') or info.get('pub_date')), - 'formats': formats, - 'subtitles': subtitles, + # http://feed.theplatform.com/f/2E2eJC/nbcnews also works + 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, + 'ie_key': 'ThePlatformFeed', } - - -class MSNBCIE(InfoExtractor): - # https URLs redirect to corresponding http ones - _VALID_URL = r'https?://www\.msnbc\.com/[^/]+/watch/(?P[^/]+)' - _TEST = { - 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', - 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', - 'info_dict': { - 'id': 'n_hayes_Aimm_140801_272214', - 'ext': 'mp4', - 'title': 'The chaotic GOP immigration vote', - 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'timestamp': 1406937606, - 'upload_date': '20140802', - 'uploader': 'NBCU-NEWS', - 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - embed_url = self._html_search_meta('embedURL', webpage) - return self.url_result(embed_url) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index ce065f2b0..e96013791 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .mtv import 
MTVServicesInfoExtractor from ..compat import compat_urllib_parse_urlencode +from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): @@ -61,3 +62,26 @@ class NickIE(MTVServicesInfoExtractor): def _extract_mgid(self, webpage): return self._search_regex(r'data-contenturi="([^"]+)', webpage, 'mgid') + + +class NickDeIE(MTVServicesInfoExtractor): + IE_NAME = 'nick.de' + _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', + 'only_matching': True, + }, { + 'url': 'http://www.nick.de/shows/342-icarly', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + mrss_url = update_url_query(self._search_regex( + r'data-mrss=(["\'])(?Phttp.+?)\1', webpage, 'mrss url', group='url'), + {'siteKey': 'nick.de'}) + + return self._get_videos_info_from_url(mrss_url, video_id) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 81918ac6e..f6f423597 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -516,9 +516,14 @@ class PBSIE(InfoExtractor): # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): continue + f_url = re.sub(r'\d+k|baseline', bitrate, http_url) + # This may produce invalid links sometimes (e.g. 
+ # http://www.pbs.org/wgbh/frontline/film/suicide-plan) + if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate): + continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k|baseline', bitrate, http_url), + 'url': f_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py new file mode 100644 index 000000000..d3bebaea3 --- /dev/null +++ b/youtube_dl/extractor/polskieradio.py @@ -0,0 +1,95 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. 
Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)]+class="audio atarticle"[^>]*>(.+?)