diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 36559dd7b..693d787e3 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.12.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.12.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.12.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.12.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.12.01 +[debug] youtube-dl version 2016.12.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index 4a6f7e13f..9e092cccc 100644 --- a/AUTHORS +++ b/AUTHORS @@ -190,3 +190,4 @@ John Hawkinson Rich Leeper Zhong Jianxin Thor77 +Mattias Wadman diff --git a/ChangeLog b/ChangeLog index bf5f26943..c45441345 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,105 @@ -version +version 
2016.12.22 + +Core +* [extractor/common] Improve detection of video-only formats in m3u8 + manifests (#11507) Extractors ++ [theplatform] Pass geo verification headers to SMIL request (#10146) ++ [viu] Pass geo verification headers to auth request +* [rtl2] Extract more formats and metadata +* [vbox7] Skip malformed JSON-LD (#11501) +* [uplynk] Force downloading using native HLS downloader (#11496) ++ [laola1] Add support for another extraction scenario (#11460) + + +version 2016.12.20 + +Core +* [extractor/common] Improve fragment URL construction for DASH media +* [extractor/common] Fix codec information extraction for mixed audio/video + DASH media (#11490) + +Extractors +* [vbox7] Fix extraction (#11494) ++ [uktvplay] Add support for uktvplay.uktv.co.uk (#11027) ++ [piksel] Add support for player.piksel.com (#11246) ++ [vimeo] Add support for DASH formats +* [vimeo] Fix extraction for HLS formats (#11490) +* [kaltura] Fix wrong widget ID in some cases (#11480) ++ [nrktv:direkte] Add support for live streams (#11488) +* [pbs] Fix extraction for geo restricted videos (#7095) +* [brightcove:new] Skip widevine classic videos ++ [viu] Add support for viu.com (#10607, #11329) + + +version 2016.12.18 + +Core ++ [extractor/common] Recognize DASH formats in html5 media entries + +Extractors ++ [ccma] Add support for ccma.cat (#11359) +* [laola1tv] Improve extraction ++ [laola1tv] Add support for embed URLs (#11460) +* [nbc] Fix extraction for MSNBC videos (#11466) +* [twitch] Adapt to new videos pages URL schema (#11469) ++ [meipai] Add support for meipai.com (#10718) +* [jwplatform] Improve subtitles and duration extraction ++ [ondemandkorea] Add support for ondemandkorea.com (#10772) ++ [vvvvid] Add support for vvvvid.it (#5915) + + +version 2016.12.15 + +Core ++ [utils] Add convenience urljoin + +Extractors ++ [openload] Recognize oload.tv URLs (#10408) ++ [facebook] Recognize .onion URLs (#11443) +* [vlive] Fix extraction (#11375, #11383) ++ [canvas] Extract DASH
formats ++ [melonvod] Add support for vod.melon.com (#11419) + + +version 2016.12.12 + +Core ++ [utils] Add common user agents map ++ [common] Recognize HLS manifests that contain video only formats (#11394) + +Extractors ++ [dplay] Use Safari user agent for HLS (#11418) ++ [facebook] Detect login required error message +* [facebook] Improve video selection (#11390) ++ [canalplus] Add another video id pattern (#11399) +* [mixcloud] Relax URL regular expression (#11406) +* [ctvnews] Relax URL regular expression (#11394) ++ [rte] Capture and output error message (#7746, #10498) ++ [prosiebensat1] Add support for DASH formats +* [srgssr] Improve extraction for geo restricted videos (#11089) +* [rts] Improve extraction for geo restricted videos (#4989) + + +version 2016.12.09 + +Core +* [socks] Fix error reporting (#11355) + +Extractors +* [openload] Fix extraction (#10408) +* [pandoratv] Fix extraction (#11023) ++ [telebruxelles] Add support for emission URLs +* [telebruxelles] Extract all formats ++ [bloomberg] Add another video id regular expression (#11371) +* [fusion] Update ooyala id regular expression (#11364) ++ [1tv] Add support for playlists (#11335) +* [1tv] Improve extraction (#11335) ++ [aenetworks] Extract more formats (#11321) + [thisoldhouse] Recognize /tv-episode/ URLs (#11271) + version 2016.12.01 Extractors diff --git a/README.md b/README.md index 840932298..71d37e8b0 100644 --- a/README.md +++ b/README.md @@ -638,7 +638,7 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use - `container`: Name of the container format - - `protocol`: The protocol that will be used for the actual download, lower-case. 
`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `m3u8`, or `m3u8_native` + - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. @@ -932,7 +932,7 @@ If you want to create a build of youtube-dl yourself, you'll need If you want to add support for a new site, first of all **make sure** this site is **not dedicated to [copyright infringement](README.md#can-you-add-support-for-this-anime-video-site-or-site-which-shows-current-movies-for-free)**. youtube-dl does **not support** such sites thus pull requests adding support for them **will be rejected**. -After you have ensured this site is distributing it's content legally, you can follow this quick list (assuming your service is called `yourextractor`): +After you have ensured this site is distributing its content legally, you can follow this quick list (assuming your service is called `yourextractor`): 1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) 2. 
Check out the source code with: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index edb76d9cc..0b3d794c6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -131,6 +131,7 @@ - **cbsnews**: CBS News - **cbsnews:livevideo**: CBS News Live Videos - **CBSSports** + - **CCMA** - **CCTV** - **CDA** - **CeskaTelevize** @@ -364,7 +365,8 @@ - **kuwo:singer**: 酷我音乐 - 歌手 - **kuwo:song**: 酷我音乐 - **la7.it** - - **Laola1Tv** + - **laola1tv** + - **laola1tv:embed** - **LCI** - **Lcp** - **LcpPlay** @@ -402,6 +404,8 @@ - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** + - **Meipai**: 美拍 + - **MelonVOD** - **META** - **metacafe** - **Metacritic** @@ -513,6 +517,7 @@ - **NRKPlaylist** - **NRKSkole**: NRK Skole - **NRKTV**: NRK TV and NRK Radio + - **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte - **ntv.ru** - **Nuvid** - **NYTimes** @@ -523,6 +528,7 @@ - **Odnoklassniki** - **OktoberfestTV** - **on.aol.com** + - **OnDemandKorea** - **onet.tv** - **onet.tv:channel** - **OnionStudios** @@ -546,6 +552,7 @@ - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** + - **Piksel** - **Pinkbike** - **Pladform** - **play.fm** @@ -784,10 +791,13 @@ - **Tweakers** - **twitch:chapter** - **twitch:clips** - - **twitch:past_broadcasts** - **twitch:profile** - **twitch:stream** - **twitch:video** + - **twitch:videos:all** + - **twitch:videos:highlights** + - **twitch:videos:past-broadcasts** + - **twitch:videos:uploads** - **twitch:vod** - **twitter** - **twitter:amplify** @@ -795,6 +805,7 @@ - **udemy** - **udemy:course** - **UDNEmbed**: 聯合影音 + - **UKTVPlay** - **Unistra** - **uol.com.br** - **uplynk** @@ -859,6 +870,9 @@ - **Vimple**: Vimple - one-click video hosting - **Vine** - **vine:user** + - **Viu** + - **viu:ott** + - **viu:playlist** - **Vivo**: vivo.sx - **vk**: VK - **vk:uservideos**: VK - User's Videos @@ -873,6 +887,7 @@ - **VRT** - **vube**: Vube.com - **VuClip** + - **VVVVID** - **VyboryMos** - **Vzaar** - 
**Walla** diff --git a/test/test_utils.py b/test/test_utils.py index 2e3cd0179..3092db5c1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -70,6 +70,7 @@ from youtube_dl.utils import ( lowercase_escape, url_basename, base_url, + urljoin, urlencode_postdata, urshift, update_url_query, @@ -445,6 +446,23 @@ class TestUtil(unittest.TestCase): self.assertEqual(base_url('http://foo.de/bar/baz'), 'http://foo.de/bar/') self.assertEqual(base_url('http://foo.de/bar/baz?x=z/x/c'), 'http://foo.de/bar/') + def test_urljoin(self): + self.assertEqual(urljoin('http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('//foo.de/', '/a/b/c.txt'), '//foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', 'a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de', 'a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', '//foo.de/a/b/c.txt'), '//foo.de/a/b/c.txt') + self.assertEqual(urljoin(None, 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin(None, '//foo.de/a/b/c.txt'), '//foo.de/a/b/c.txt') + self.assertEqual(urljoin('', 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin(['foobar'], 'http://foo.de/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', None), None) + self.assertEqual(urljoin('http://foo.de/', ''), None) + self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) + self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) self.assertEqual(parse_age_limit(False), None) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 7373ec05f..4989abce1 100644 
--- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -65,6 +65,9 @@ class HlsFD(FragmentFD): s = manifest.decode('utf-8', 'ignore') if not self.can_download(s, info_dict): + if info_dict.get('extra_param_to_segment_url'): + self.report_error('pycrypto not found. Please install it.') + return False self.report_warning( 'hlsnative has detected features it does not support, ' 'extraction will be delegated to ffmpeg') diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py index 94ce88c83..6dace3051 100644 --- a/youtube_dl/extractor/acast.py +++ b/youtube_dl/extractor/acast.py @@ -8,6 +8,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( int_or_none, + parse_iso8601, OnDemandPagedList, ) @@ -15,18 +16,33 @@ from ..utils import ( class ACastIE(InfoExtractor): IE_NAME = 'acast' _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' - _TEST = { + _TESTS = [{ + # test with one bling 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', 'md5': 'ada3de5a1e3a2a381327d749854788bb', 'info_dict': { 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', 'ext': 'mp3', 'title': '"Where Are You?": Taipei 101, Taiwan', - 'timestamp': 1196172000000, + 'timestamp': 1196172000, + 'upload_date': '20071127', 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e', 'duration': 211, } - } + }, { + # test with multiple blings + 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', + 'md5': '55c0097badd7095f494c99a172f86501', + 'info_dict': { + 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', + 'ext': 'mp3', + 'title': '2. 
Raggarmordet - Röster ur det förflutna', + 'timestamp': 1477346700, + 'upload_date': '20161024', + 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'duration': 2797, + } + }] def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() @@ -35,11 +51,11 @@ class ACastIE(InfoExtractor): return { 'id': compat_str(cast_data['id']), 'display_id': display_id, - 'url': cast_data['blings'][0]['audio'], + 'url': [b['audio'] for b in cast_data['blings'] if b['type'] == 'BlingAudio'][0], 'title': cast_data['name'], 'description': cast_data.get('description'), 'thumbnail': cast_data.get('image'), - 'timestamp': int_or_none(cast_data.get('publishingDate')), + 'timestamp': parse_iso8601(cast_data.get('publishingDate')), 'duration': int_or_none(cast_data.get('duration')), } diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 945cf19e8..aa2923ccf 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -232,13 +232,16 @@ class BrightcoveLegacyIE(InfoExtractor): """Return a list of all Brightcove URLs from the webpage """ url_m = re.search( - r']+ + content=([\'"])(?Phttps?://(?:secure|c)\.brightcove.com/(?:(?!\2).)+)\2 + ''', webpage) if url_m: - url = unescapeHTML(url_m.group(1)) + url = unescapeHTML(url_m.group('url')) # Some sites don't add it, we can't download with this url, for example: # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ - if 'playerKey' in url or 'videoId' in url: + if 'playerKey' in url or 'videoId' in url or 'idVideo' in url: return [url] matches = re.findall( @@ -259,7 +262,7 @@ class BrightcoveLegacyIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url, {}) # Change the 'videoId' and others field to '@videoPlayer' - url = re.sub(r'(?<=[?&])(videoI(d|D)|bctid)', '%40videoPlayer', url) + url = re.sub(r'(?<=[?&])(videoI(d|D)|idVideo|bctid)', '%40videoPlayer', url) # Change bckey (used by bcove.me urls) 
to playerKey url = re.sub(r'(?<=[?&])bckey', 'playerKey', url) mobj = re.match(self._VALID_URL, url) @@ -548,7 +551,7 @@ class BrightcoveNewIE(InfoExtractor): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - if ext == 'ism': + if ext == 'ism' or container == 'WVM': continue elif ext == 'm3u8' or container == 'M2TS': if not src: diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 1c3c41d26..10cf165bc 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -105,7 +105,8 @@ class CanalplusIE(InfoExtractor): webpage = self._download_webpage(url, display_id) video_id = self._search_regex( [r']+?videoId=(["\'])(?P\d+)', - r'id=["\']canal_video_player(?P\d+)'], + r'id=["\']canal_video_player(?P\d+)', + r'data-video=["\'](?P\d+)'], webpage, 'video id', group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index d183d5d52..2cc539a6c 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -89,6 +89,9 @@ class CanvasIE(InfoExtractor): elif format_type == 'HDS': formats.extend(self._extract_f4m_formats( format_url, display_id, f4m_id=format_type, fatal=False)) + elif format_type == 'MPEG_DASH': + formats.extend(self._extract_mpd_formats( + format_url, display_id, mpd_id=format_type, fatal=False)) else: formats.append({ 'format_id': format_type, diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index d71fddf58..7c76ceac8 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -283,11 +283,6 @@ class CBCWatchVideoIE(CBCWatchBaseIE): formats = self._extract_m3u8_formats(re.sub(r'/([^/]+)/[^/?]+\.m3u8', r'/\1/\1.m3u8', m3u8_url), video_id, 'mp4', fatal=False) if len(formats) < 2: formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - # Despite metadata in m3u8 all video+audio formats are 
- # actually video-only (no audio) - for f in formats: - if f.get('acodec') != 'none' and f.get('vcodec') != 'none': - f['acodec'] = 'none' self._sort_formats(formats) info = { diff --git a/youtube_dl/extractor/ccma.py b/youtube_dl/extractor/ccma.py new file mode 100644 index 000000000..39938c9ac --- /dev/null +++ b/youtube_dl/extractor/ccma.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + clean_html, +) + + +class CCMAIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?P<type>video|audio)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + 'md5': '7296ca43977c8ea4469e719c609b0871', + 'info_dict': { + 'id': '5630208', + 'ext': 'mp4', + 'title': 'L\'espot de La Marató de TV3', + 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', + 'timestamp': 1470918540, + 'upload_date': '20160811', + } + }, { + 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + 'md5': 'fa3e38f269329a278271276330261425', + 'info_dict': { + 'id': '943685', + 'ext': 'mp3', + 'title': 'El Consell de Savis analitza el derbi', + 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', + 'upload_date': '20171205', + 'timestamp': 1512507300, + } + }] + + def _real_extract(self, url): + media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = {} + formats = [] + profiles = ['pc'] if media_type == 'audio' else ['mobil', 'pc'] + for i, profile in enumerate(profiles): + md = self._download_json('http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'media': media_type, + 'idint': media_id, + 'profile': profile, + }, fatal=False) + if md: + media_data = md + media_url = media_data.get('media', {}).get('url') + if media_url: + formats.append({ + 'format_id':
profile, + 'url': media_url, + 'quality': i, + }) + self._sort_formats(formats) + + informacio = media_data['informacio'] + title = informacio['titol'] + durada = informacio.get('durada', {}) + duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) + timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + + subtitles = {} + subtitols = media_data.get('subtitols', {}) + if subtitols: + sub_url = subtitols.get('url') + if sub_url: + subtitles.setdefault( + subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + 'url': sub_url, + }) + + thumbnails = [] + imatges = media_data.get('imatges', {}) + if imatges: + thumbnail_url = imatges.get('url') + if thumbnail_url: + thumbnails = [{ + 'url': thumbnail_url, + 'width': int_or_none(imatges.get('amplada')), + 'height': int_or_none(imatges.get('alcada')), + }] + + return { + 'id': media_id, + 'title': title, + 'description': clean_html(informacio.get('descripcio')), + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 05c51fac9..6fa7c334e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -59,6 +59,7 @@ from ..utils import ( parse_m3u8_attributes, extract_attributes, parse_codecs, + urljoin, ) @@ -188,9 +189,10 @@ class InfoExtractor(object): uploader_url: Full URL to a personal webpage of the video uploader. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format - {language: subformats}. "subformats" is a list sorted from - lower to higher preference, each element is a dictionary - with the "ext" entry and one of: + {tag: subformats}.
"tag" is usually a language code, and + "subformats" is a list sorted from lower to higher + preference, each element is a dictionary with the "ext" + entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file "ext" will be calculated from URL if missing @@ -1224,6 +1226,7 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] + audio_in_video_stream = {} last_info = {} last_media = {} for line in m3u8_doc.splitlines(): @@ -1233,25 +1236,32 @@ class InfoExtractor(object): media = parse_m3u8_attributes(line) media_type = media.get('TYPE') if media_type in ('VIDEO', 'AUDIO'): + group_id = media.get('GROUP-ID') media_url = media.get('URI') if media_url: format_id = [] - for v in (media.get('GROUP-ID'), media.get('NAME')): + for v in (group_id, media.get('NAME')): if v: format_id.append(v) - formats.append({ + f = { 'format_id': '-'.join(format_id), 'url': format_url(media_url), 'language': media.get('LANGUAGE'), - 'vcodec': 'none' if media_type == 'AUDIO' else None, 'ext': ext, 'protocol': entry_protocol, 'preference': preference, - }) + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + if group_id and not audio_in_video_stream.get(group_id): + audio_in_video_stream[group_id] = False + formats.append(f) else: # When there is no URI in EXT-X-MEDIA let this tag's # data be used by regular URI lines below last_media = media + if media_type == 'AUDIO' and group_id: + audio_in_video_stream[group_id] = True elif line.startswith('#') or not line.strip(): continue else: @@ -1295,6 +1305,9 @@ class InfoExtractor(object): 'abr': abr, }) f.update(parse_codecs(last_info.get('CODECS'))) + if audio_in_video_stream.get(last_info.get('AUDIO')) is False: + # TODO: update acodec for audio only formats with the same GROUP-ID + f['acodec'] = 'none' formats.append(f) last_info = {} last_media = {} @@ -1624,11 +1637,6 @@ class InfoExtractor(object): extract_Initialization(segment_template) return ms_info
- def combine_url(base_url, target_url): - if re.match(r'^https?://', target_url): - return target_url - return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url) - mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats = [] for period in mpd_doc.findall(_add_ns('Period')): @@ -1678,12 +1686,11 @@ class InfoExtractor(object): 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), - 'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), - 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, } + f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: @@ -1767,7 +1774,7 @@ class InfoExtractor(object): f['fragments'].append({'url': initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) for fragment in f['fragments']: - fragment['url'] = combine_url(base_url, fragment['url']) + fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -1881,7 +1888,7 @@ class InfoExtractor(object): }) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1898,11 +1905,16 @@ class InfoExtractor(object): def _media_formats(src, cur_media_type): full_url = absolute_url(src) - if 
determine_ext(full_url) == 'm3u8': + ext = determine_ext(full_url) + if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + elif ext == 'mpd': + is_plain_url = False + formats = self._extract_mpd_formats( + full_url, video_id, mpd_id=mpd_id) else: is_plain_url = True formats = [{ diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index 1023b6130..55a127b76 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -8,7 +8,7 @@ from ..utils import orderedSet class CTVNewsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' + _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', @@ -40,6 +40,9 @@ class CTVNewsIE(InfoExtractor): }, { 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', 'only_matching': True, + }, { + 'url': 'http://vancouverisland.ctvnews.ca/video?clipId=761241', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 5790553f3..32028bc3b 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -8,6 +8,7 @@ import time from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( + USER_AGENTS, int_or_none, update_url_query, ) @@ -102,10 +103,16 @@ class DPlayIE(InfoExtractor): manifest_url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False) # Sometimes final URLs inside m3u8 are unsigned, let's fix this - # ourselves + # ourselves. Also fragments' URLs are only served signed for + # Safari user agent. 
query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query) for m3u8_format in m3u8_formats: - m3u8_format['url'] = update_url_query(m3u8_format['url'], query) + m3u8_format.update({ + 'url': update_url_query(m3u8_format['url'], query), + 'http_headers': { + 'User-Agent': USER_AGENTS['Safari'], + }, + }) formats.extend(m3u8_formats) elif protocol == 'hds': formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 46d007b7d..fcfe87f6f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -150,6 +150,7 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE @@ -446,7 +447,10 @@ from .kuwo import ( KuwoMvIE, ) from .la7 import LA7IE -from .laola1tv import Laola1TvIE +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, +) from .lci import LCIIE from .lcp import ( LcpPlayIE, @@ -498,6 +502,8 @@ from .mangomolo import ( ) from .matchtv import MatchTVIE from .mdr import MDRIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE from .meta import METAIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE @@ -649,6 +655,7 @@ from .nrk import ( NRKPlaylistIE, NRKSkoleIE, NRKTVIE, + NRKTVDirekteIE, ) from .ntvde import NTVDeIE from .ntvru import NTVRuIE @@ -661,6 +668,7 @@ from .nzz import NZZIE from .odatv import OdaTVIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .ondemandkorea import OnDemandKoreaIE from .onet import ( OnetIE, OnetChannelIE, @@ -691,6 +699,7 @@ from .periscope import ( from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE from .playfm 
import PlayFMIE @@ -998,7 +1007,10 @@ from .twitch import ( TwitchChapterIE, TwitchVodIE, TwitchProfileIE, + TwitchAllVideosIE, + TwitchUploadsIE, TwitchPastBroadcastsIE, + TwitchHighlightsIE, TwitchStreamIE, TwitchClipsIE, ) @@ -1012,6 +1024,7 @@ from .udemy import ( UdemyCourseIE ) from .udn import UDNEmbedIE +from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE from .unistra import UnistraIE from .uol import UOLIE @@ -1095,6 +1108,11 @@ from .viki import ( VikiIE, VikiChannelIE, ) +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) from .vk import ( VKIE, VKUserVideosIE, @@ -1109,6 +1127,7 @@ from .vporn import VpornIE from .vrt import VRTIE from .vube import VubeIE from .vuclip import VuClipIE +from .vvvvid import VVVVIDIE from .vyborymos import VyboryMosIE from .vzaar import VzaarIE from .walla import WallaIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index b4d38e5c2..c0a7fc7d8 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:[\w-]+\.)?facebook\.com/ + (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/ (?:[^#]*?\#!/)? 
(?: (?: @@ -150,6 +150,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, + }, { + 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', + 'only_matching': True, }] @staticmethod @@ -244,8 +247,10 @@ class FacebookIE(InfoExtractor): r'handleServerJS\(({.+})(?:\);|,")', webpage, 'server js data', default='{}'), video_id) for item in server_js_data.get('instances', []): if item[1][0] == 'VideoConfig': - video_data = item[2][0]['videoData'] - break + video_item = item[2][0] + if video_item.get('video_id') == video_id: + video_data = video_item['videoData'] + break if not video_data: if not fatal_if_no_video: @@ -255,6 +260,8 @@ class FacebookIE(InfoExtractor): raise ExtractorError( 'The video is not available, Facebook said: "%s"' % m_msg.group(1), expected=True) + elif '>You must log in to continue' in webpage: + self.raise_login_required() else: raise ExtractorError('Cannot parse data') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3949c8bf7..79d10a1d1 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -75,6 +75,7 @@ from .facebook import FacebookIE from .soundcloud import SoundcloudIE from .vbox7 import Vbox7IE from .dbtv import DBTVIE +from .piksel import PikselIE class GenericIE(InfoExtractor): @@ -343,10 +344,10 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' in the - # http requests { + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { @@ -360,6 +361,24 @@ class GenericIE(InfoExtractor): 
'skip_download': True, }, }, + { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'add_id': ['BrightcoveLegacy'], + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + }, { # https://github.com/rg3/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', @@ -972,6 +991,20 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + { + # Kaltura embedded, some fileExt broken (#11480) + 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', + 'info_dict': { + 'id': '1_sgtvehim', + 'ext': 'mp4', + 'title': 'Our "Standard Models" of particle physics and cosmology', + 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', + 'timestamp': 1321158993, + 'upload_date': '20111113', + 'uploader_id': 'kps1', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -2211,6 +2244,11 @@ class GenericIE(InfoExtractor): if arkena_url: return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Piksel embeds + piksel_url = PikselIE._extract_url(webpage) + if piksel_url: + return self.url_result(piksel_url, PikselIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P[a-z0-9]{32})', webpage) if mobj: diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 5d56e0a28..aff7ab49a 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -11,6 +11,7 @@ from ..utils import ( int_or_none, js_to_json, mimetype2ext, + urljoin, ) @@ -110,10 +111,14 @@ class 
JWPlatformBaseIE(InfoExtractor): tracks = video_data.get('tracks') if tracks and isinstance(tracks, list): for track in tracks: - if track.get('file') and track.get('kind') == 'captions': - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track['file']) - }) + if track.get('kind') != 'captions': + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) entries.append({ 'id': this_video_id, @@ -121,7 +126,7 @@ class JWPlatformBaseIE(InfoExtractor): 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, 'formats': formats, }) diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 91bc3a0a7..c0ddad6f9 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -107,7 +107,7 @@ class KalturaIE(InfoExtractor): (?P['\"])wid(?P=q1)\s*:\s* (?P['\"])_?(?P(?:(?!(?P=q2)).)+)(?P=q2),.*? (?P['\"])entry_?[Ii]d(?P=q3)\s*:\s* - (?P['\"])(?P(?:(?!(?P=q4)).)+)(?P=q4), + (?P['\"])(?P(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) """, webpage) or re.search( r'''(?xs) @@ -266,6 +266,9 @@ class KalturaIE(InfoExtractor): # skip for now. if f.get('fileExt') == 'chun': continue + if not f.get('fileExt') and f.get('containerFormat') == 'qt': + # QT indicates QuickTime; some videos have broken fileExt + f['fileExt'] = 'mov' video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) # audio-only has no videoCodecId (e.g. 
kaltura:1926081:0_c03e1b5g diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 2fab38079..3190b187c 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,25 +1,115 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils import ( ExtractorError, - sanitized_Request, unified_strdate, urlencode_postdata, xpath_element, xpath_text, + urljoin, + update_url_query, ) +class Laola1TvEmbedIE(InfoExtractor): + IE_NAME = 'laola1tv:embed' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P\d+)' + _TEST = { + # flashvars.premium = "false"; + 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', + 'info_dict': { + 'id': '708065', + 'ext': 'mp4', + 'title': 'MA Long CHN - FAN Zhendong CHN', + 'uploader': 'ITTF - International Table Tennis Federation', + 'upload_date': '20161211', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + flash_vars = self._search_regex( + r'(?s)flashvars\s*=\s*({.+?});', webpage, 'flash vars') + + def get_flashvar(x, *args, **kwargs): + flash_var = self._search_regex( + r'%s\s*:\s*"([^"]+)"' % x, + flash_vars, x, default=None) + if not flash_var: + flash_var = self._search_regex([ + r'flashvars\.%s\s*=\s*"([^"]+)"' % x, + r'%s\s*=\s*"([^"]+)"' % x], + webpage, x, *args, **kwargs) + return flash_var + + hd_doc = self._download_xml( + 'http://www.laola1.tv/server/hd_video.php', video_id, query={ + 'play': get_flashvar('streamid'), + 'partner': get_flashvar('partnerid'), + 'portal': get_flashvar('portalid'), + 'lang': get_flashvar('sprache'), + 'v5ident': '', + }) + + _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) + title = _v('title', fatal=True) + + token_url = None + premium = 
get_flashvar('premium', default=None) + if premium: + token_url = update_url_query( + _v('url', fatal=True), { + 'timestamp': get_flashvar('timestamp'), + 'auth': get_flashvar('auth'), + }) + else: + data_abo = urlencode_postdata( + dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) + token_url = self._download_json( + 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', + video_id, query={ + 'videoId': _v('id'), + 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), + 'label': _v('label'), + 'area': _v('area'), + }, data=data_abo)['data']['stream-access'][0] + + token_doc = self._download_xml( + token_url, video_id, 'Downloading token', + headers=self.geo_verification_headers()) + + token_attrib = xpath_element(token_doc, './/token').attrib + + if token_attrib['status'] != '0': + raise ExtractorError( + 'Token error: %s' % token_attrib['comment'], expected=True) + + formats = self._extract_akamai_formats( + '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), + video_id) + self._sort_formats(formats) + + categories_str = _v('meta_sports') + categories = categories_str.split(',') if categories_str else [] + is_live = _v('islive') == 'true' + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'upload_date': unified_strdate(_v('time_date')), + 'uploader': _v('meta_organisation'), + 'categories': categories, + 'is_live': is_live, + 'formats': formats, + } + + class Laola1TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?laola1\.tv/(?P[a-z]+)-(?P[a-z]+)/(?P[^/]+)/(?P[^/?#&]+)' + IE_NAME = 'laola1tv' + _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.laola1.tv/de-de/video/straubing-tigers-koelner-haie/227883.html', 'info_dict': { @@ -67,85 +157,20 @@ class Laola1TvIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('slug') 
- kind = mobj.group('kind') - lang = mobj.group('lang') - portal = mobj.group('portal') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) if 'Dieser Livestream ist bereits beendet.' in webpage: raise ExtractorError('This live stream has already finished.', expected=True) - iframe_url = self._search_regex( + iframe_url = urljoin(url, self._search_regex( r']*?id="videoplayer"[^>]*?src="([^"]+)"', - webpage, 'iframe url') - - video_id = self._search_regex( - r'videoid=(\d+)', iframe_url, 'video id') - - iframe = self._download_webpage(compat_urlparse.urljoin( - url, iframe_url), display_id, 'Downloading iframe') - - partner_id = self._search_regex( - r'partnerid\s*:\s*(["\'])(?P.+?)\1', - iframe, 'partner id', group='partner_id') - - hd_doc = self._download_xml( - 'http://www.laola1.tv/server/hd_video.php?%s' - % compat_urllib_parse_urlencode({ - 'play': video_id, - 'partner': partner_id, - 'portal': portal, - 'lang': lang, - 'v5ident': '', - }), display_id) - - _v = lambda x, **k: xpath_text(hd_doc, './/video/' + x, **k) - title = _v('title', fatal=True) - - VS_TARGETS = { - 'video': '2', - 'livestream': '17', - } - - req = sanitized_Request( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access?%s' % - compat_urllib_parse_urlencode({ - 'videoId': video_id, - 'target': VS_TARGETS.get(kind, '2'), - 'label': _v('label'), - 'area': _v('area'), - }), - urlencode_postdata( - dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))) - - token_url = self._download_json(req, display_id)['data']['stream-access'][0] - token_doc = self._download_xml(token_url, display_id, 'Downloading token') - - token_attrib = xpath_element(token_doc, './/token').attrib - token_auth = token_attrib['auth'] - - if token_auth in ('blocked', 'restricted', 'error'): - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) - - formats = self._extract_f4m_formats( - '%s?hdnea=%s&hdcore=3.2.0' % 
(token_attrib['url'], token_auth), - video_id, f4m_id='hds') - self._sort_formats(formats) - - categories_str = _v('meta_sports') - categories = categories_str.split(',') if categories_str else [] + webpage, 'iframe url')) return { - 'id': video_id, + '_type': 'url', 'display_id': display_id, - 'title': title, - 'upload_date': unified_strdate(_v('time_date')), - 'uploader': _v('meta_organisation'), - 'categories': categories, - 'is_live': _v('islive') == 'true', - 'formats': formats, + 'url': iframe_url, + 'ie_key': 'Laola1TvEmbed', } diff --git a/youtube_dl/extractor/meipai.py b/youtube_dl/extractor/meipai.py new file mode 100644 index 000000000..35914fd4b --- /dev/null +++ b/youtube_dl/extractor/meipai.py @@ -0,0 +1,104 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_duration, + unified_timestamp, +) + + +class MeipaiIE(InfoExtractor): + IE_DESC = '美拍' + _VALID_URL = r'https?://(?:www\.)?meipai.com/media/(?P[0-9]+)' + _TESTS = [{ + # regular uploaded video + 'url': 'http://www.meipai.com/media/531697625', + 'md5': 'e3e9600f9e55a302daecc90825854b4f', + 'info_dict': { + 'id': '531697625', + 'ext': 'mp4', + 'title': '#葉子##阿桑##余姿昀##超級女聲#', + 'description': '#葉子##阿桑##余姿昀##超級女聲#', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 152, + 'timestamp': 1465492420, + 'upload_date': '20160609', + 'view_count': 35511, + 'creator': '她她-TATA', + 'tags': ['葉子', '阿桑', '余姿昀', '超級女聲'], + } + }, { + # record of live streaming + 'url': 'http://www.meipai.com/media/585526361', + 'md5': 'ff7d6afdbc6143342408223d4f5fb99a', + 'info_dict': { + 'id': '585526361', + 'ext': 'mp4', + 'title': '姿昀和善願 練歌練琴啦😁😁😁', + 'description': '姿昀和善願 練歌練琴啦😁😁😁', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 5975, + 'timestamp': 1474311799, + 'upload_date': '20160919', + 'view_count': 1215, + 'creator': '她她-TATA', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_regex( + r']*>([^<]+)', webpage, 'title') + + formats = [] + + # recorded playback of live streaming + m3u8_url = self._html_search_regex( + r'file:\s*encodeURIComponent\((["\'])(?P(?:(?!\1).)+)\1\)', + webpage, 'm3u8 url', group='url', default=None) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + if not formats: + # regular uploaded video + video_url = self._search_regex( + r'data-video=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video url', + group='url', default=None) + if video_url: + formats.append({ + 'url': video_url, + 'format_id': 'http', + }) + + timestamp = unified_timestamp(self._og_search_property( + 'video:release_date', webpage, 'release date', fatal=False)) + + tags = self._og_search_property( + 'video:tag', webpage, 'tags', default='').split(',') + + view_count = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + duration = parse_duration(self._html_search_meta( + 'duration', webpage, 'duration')) + creator = self._og_search_property( + 'video:director', webpage, 'creator', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, + 'creator': creator, + 'tags': tags, + 'formats': formats, + } diff --git a/youtube_dl/extractor/melonvod.py b/youtube_dl/extractor/melonvod.py new file mode 100644 index 000000000..2c80b3ba8 --- /dev/null +++ b/youtube_dl/extractor/melonvod.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + urljoin, +) + + +class MelonVODIE(InfoExtractor): + _VALID_URL = 
r'https?://vod\.melon\.com/video/detail2\.html?\?.*?mvId=(?P[0-9]+)' + _TEST = { + 'url': 'http://vod.melon.com/video/detail2.htm?mvId=50158734', + 'info_dict': { + 'id': '50158734', + 'ext': 'mp4', + 'title': "Jessica 'Wonderland' MV Making Film", + 'thumbnail': 're:^https?://.*\.jpg$', + 'artist': 'Jessica (제시카)', + 'upload_date': '20161212', + 'duration': 203, + }, + 'params': { + 'skip_download': 'm3u8 download', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + play_info = self._download_json( + 'http://vod.melon.com/video/playerInfo.json', video_id, + note='Downloading player info JSON', query={'mvId': video_id}) + + title = play_info['mvInfo']['MVTITLE'] + + info = self._download_json( + 'http://vod.melon.com/delivery/streamingInfo.json', video_id, + note='Downloading streaming info JSON', + query={ + 'contsId': video_id, + 'contsType': 'VIDEO', + }) + + stream_info = info['streamingInfo'] + + formats = self._extract_m3u8_formats( + stream_info['encUrl'], video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + artist_list = play_info.get('artistList') + artist = None + if isinstance(artist_list, list): + artist = ', '.join( + [a['ARTISTNAMEWEBLIST'] + for a in artist_list if a.get('ARTISTNAMEWEBLIST')]) + + thumbnail = urljoin(info.get('staticDomain'), stream_info.get('imgPath')) + + duration = int_or_none(stream_info.get('playTime')) + upload_date = stream_info.get('mvSvcOpenDt', '')[:8] or None + + return { + 'id': video_id, + 'title': title, + 'artist': artist, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'duration': duration, + 'formats': formats + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 560fe188b..202c05dcb 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -22,7 +22,7 @@ from ..utils import ( class MixcloudIE(InfoExtractor): - _VALID_URL = 
r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' + _VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ @@ -51,6 +51,9 @@ class MixcloudIE(InfoExtractor): 'view_count': int, 'like_count': int, }, + }, { + 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', + 'only_matching': True, }] # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py index d75ce8b3b..1473bcf48 100644 --- a/youtube_dl/extractor/msn.py +++ b/youtube_dl/extractor/msn.py @@ -78,11 +78,6 @@ class MSNIE(InfoExtractor): m3u8_formats = self._extract_m3u8_formats( format_url, display_id, 'mp4', m3u8_id='hls', fatal=False) - # Despite metadata in m3u8 all video+audio formats are - # actually video-only (no audio) - for f in m3u8_formats: - if f.get('acodec') != 'none' and f.get('vcodec') != 'none': - f['acodec'] = 'none' formats.extend(m3u8_formats) else: formats.append({ diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 7f1bd9229..4e96e78c3 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -9,6 +9,7 @@ from ..utils import ( lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, ) @@ -208,7 +209,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': '269389891880', + 'id': 'p_tweet_snow_140529', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', @@ -232,7 +233,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 
'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': '394064451844', + 'id': 'nn_netcast_150204', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', @@ -245,7 +246,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', 'md5': 'a49e173825e5fcd15c13fc297fced39d', 'info_dict': { - 'id': '529953347624', + 'id': 'x_lon_vwhorn_150922', 'ext': 'mp4', 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up', 'description': 'md5:c8be487b2d80ff0594c005add88d8351', @@ -258,7 +259,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', 'md5': '118d7ca3f0bea6534f119c68ef539f71', 'info_dict': { - 'id': '669831235788', + 'id': 'tdy_al_space_160420', 'ext': 'mp4', 'title': 'See the aurora borealis from space in stunning new NASA video', 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', @@ -271,7 +272,7 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924', 'md5': '6d236bf4f3dddc226633ce6e2c3f814d', 'info_dict': { - 'id': '314487875924', + 'id': 'n_hayes_Aimm_140801_272214', 'ext': 'mp4', 'title': 'The chaotic GOP immigration vote', 'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.', @@ -279,7 +280,6 @@ class NBCNewsIE(ThePlatformIE): 'timestamp': 1406937606, 'upload_date': '20140802', 'uploader': 'NBCU-NEWS', - 'categories': ['MSNBC/Topics/Franchise/Best of last night', 'MSNBC/Topics/General/Congress'], }, }, { @@ -311,28 +311,41 @@ class NBCNewsIE(ThePlatformIE): else: # "feature" and "nightly-news" pages use theplatform.com video_id = 
mobj.group('mpx_id') - if not video_id.isdigit(): - webpage = self._download_webpage(url, video_id) - info = None - bootstrap_json = self._search_regex( - [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', - r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"'], - webpage, 'bootstrap json', default=None) + webpage = self._download_webpage(url, video_id) + + filter_param = 'byId' + bootstrap_json = self._search_regex( + [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$', + r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"', + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'], + webpage, 'bootstrap json', default=None) + if bootstrap_json: bootstrap = self._parse_json( bootstrap_json, video_id, transform_source=unescapeHTML) + + info = None if 'results' in bootstrap: info = bootstrap['results'][0]['video'] elif 'video' in bootstrap: info = bootstrap['video'] + elif 'msnbcVideoInfo' in bootstrap: + info = bootstrap['msnbcVideoInfo']['meta'] + elif 'msnbcThePlatform' in bootstrap: + info = bootstrap['msnbcThePlatform']['videoPlayer']['video'] else: info = bootstrap - video_id = info['mpxId'] + + if 'guid' in info: + video_id = info['guid'] + filter_param = 'byGuid' + elif 'mpxId' in info: + video_id = info['mpxId'] return { '_type': 'url_transparent', 'id': video_id, # http://feed.theplatform.com/f/2E2eJC/nbcnews also works - 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, + 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}), 'ie_key': 'ThePlatformFeed', } diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index c89aac63e..776c40b94 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -48,6 +48,13 @@ class NRKBaseIE(InfoExtractor): entries = [] + conviva = data.get('convivaStatistics') or {} + live = (data.get('mediaElementType') == 'Live' or + data.get('isLive') is True or 
conviva.get('isLive')) + + def make_title(t): + return self._live_title(t) if live else t + media_assets = data.get('mediaAssets') if media_assets and isinstance(media_assets, list): def video_id_and_title(idx): @@ -61,6 +68,13 @@ class NRKBaseIE(InfoExtractor): if not formats: continue self._sort_formats(formats) + + # Some f4m streams may not work with hdcore in fragments' URLs + for f in formats: + extra_param = f.get('extra_param_to_segment_url') + if extra_param and 'hdcore' in extra_param: + del f['extra_param_to_segment_url'] + entry_id, entry_title = video_id_and_title(num) duration = parse_duration(asset.get('duration')) subtitles = {} @@ -72,7 +86,7 @@ class NRKBaseIE(InfoExtractor): }) entries.append({ 'id': asset.get('carrierId') or entry_id, - 'title': entry_title, + 'title': make_title(entry_title), 'duration': duration, 'subtitles': subtitles, 'formats': formats, @@ -87,7 +101,7 @@ class NRKBaseIE(InfoExtractor): duration = parse_duration(data.get('duration')) entries = [{ 'id': video_id, - 'title': title, + 'title': make_title(title), 'duration': duration, 'formats': formats, }] @@ -111,7 +125,6 @@ class NRKBaseIE(InfoExtractor): message_type, message_type)), expected=True) - conviva = data.get('convivaStatistics') or {} series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') @@ -260,6 +273,19 @@ class NRKTVIE(NRKBaseIE): }] +class NRKTVDirekteIE(NRKTVIE): + IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' + _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P[^/?#&]+)' + + _TESTS = [{ + 'url': 'https://tv.nrk.no/direkte/nrk1', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/direkte/p1_oslo_akershus', + 'only_matching': True, + }] + + class NRKPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P[^/]+)' diff --git a/youtube_dl/extractor/ondemandkorea.py b/youtube_dl/extractor/ondemandkorea.py new file mode 
100644 index 000000000..c3e830c23 --- /dev/null +++ b/youtube_dl/extractor/ondemandkorea.py @@ -0,0 +1,60 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + ExtractorError, + js_to_json, +) + + +class OnDemandKoreaIE(JWPlatformBaseIE): + _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P[^/]+)\.html' + _TEST = { + 'url': 'http://www.ondemandkorea.com/ask-us-anything-e43.html', + 'info_dict': { + 'id': 'ask-us-anything-e43', + 'ext': 'mp4', + 'title': 'Ask Us Anything : E43', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': 'm3u8 download' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=False) + + if not webpage: + # Page sometimes returns captcha page with HTTP 403 + raise ExtractorError( + 'Unable to access page. You may have been blocked.', + expected=True) + + if 'msg_block_01.png' in webpage: + self.raise_geo_restricted( + 'This content is not available in your region') + + if 'This video is only available to ODK PLUS members.' 
in webpage: + raise ExtractorError( + 'This video is only available to ODK PLUS members.', + expected=True) + + title = self._og_search_title(webpage) + + jw_config = self._parse_json( + self._search_regex( + r'(?s)jwplayer\(([\'"])(?:(?!\1).)+\1\)\.setup\s*\((?P.+?)\);', + webpage, 'jw config', group='options'), + video_id, transform_source=js_to_json) + info = self._parse_jwplayer_data( + jw_config, video_id, require_title=False, m3u8_id='hls', + base_url=url) + + info.update({ + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + }) + return info diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 7f19b1ba5..8c5ec72d9 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,25 +1,16 @@ # coding: utf-8 -from __future__ import unicode_literals, division - -import re +from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_chr, - compat_ord, -) +from ..compat import compat_chr from ..utils import ( determine_ext, ExtractorError, ) -from ..jsinterp import ( - JSInterpreter, - _NAME_RE -) class OpenloadIE(InfoExtractor): - _VALID_URL = r'https?://openload\.(?:co|io)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://(?:openload\.(?:co|io)|oload\.tv)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -60,46 +51,11 @@ class OpenloadIE(InfoExtractor): # for title and ext 'url': 'https://openload.co/embed/Sxz5sADo82g/', 'only_matching': True, + }, { + 'url': 'https://oload.tv/embed/KnG-kKZdcfY/', + 'only_matching': True, }] - def openload_decode(self, txt): - symbol_dict = { - '(゚Д゚) [゚Θ゚]': '_', - '(゚Д゚) [゚ω゚ノ]': 'a', - '(゚Д゚) [゚Θ゚ノ]': 'b', - '(゚Д゚) [\'c\']': 'c', - '(゚Д゚) [゚ー゚ノ]': 'd', - '(゚Д゚) [゚Д゚ノ]': 'e', - '(゚Д゚) [1]': 'f', - '(゚Д゚) [\'o\']': 'o', - '(o゚ー゚o)': 'u', - '(゚Д゚) [\'c\']': 'c', - '((゚ー゚) + (o^_^o))': '7', - '((o^_^o) +(o^_^o) +(c^_^o))': '6', - '((゚ー゚) + (゚Θ゚))': '5', - 
'(-~3)': '4', - '(-~-~1)': '3', - '(-~1)': '2', - '(-~0)': '1', - '((c^_^o)-(c^_^o))': '0', - } - delim = '(゚Д゚)[゚ε゚]+' - end_token = '(゚Д゚)[゚o゚]' - symbols = '|'.join(map(re.escape, symbol_dict.keys())) - txt = re.sub('(%s)\+\s?' % symbols, lambda m: symbol_dict[m.group(1)], txt) - ret = '' - for aacode in re.findall(r'{0}\+\s?{1}(.*?){0}'.format(re.escape(end_token), re.escape(delim)), txt): - for aachar in aacode.split(delim): - if aachar.isdigit(): - ret += compat_chr(int(aachar, 8)) - else: - m = re.match(r'^u([\da-f]{4})$', aachar) - if m: - ret += compat_chr(int(m.group(1), 16)) - else: - self.report_warning("Cannot decode: %s" % aachar) - return ret - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage('https://openload.co/embed/%s/' % video_id, video_id) @@ -107,36 +63,20 @@ class OpenloadIE(InfoExtractor): if 'File not found' in webpage or 'deleted by the owner' in webpage: raise ExtractorError('File not found', expected=True) - # The following decryption algorithm is written by @yokrysty and - # declared to be freely used in youtube-dl - # See https://github.com/rg3/youtube-dl/issues/10408 - enc_data = self._html_search_regex( - r']*>([^<]+)\s*]*>[^<]+\s*]+id="streamurl"', - webpage, 'encrypted data') + ol_id = self._search_regex( + ']+id="[a-zA-Z0-9]+x"[^>]*>([0-9]+)', + webpage, 'openload ID') - enc_code = self._html_search_regex(r']+>(゚ω゚[^<]+)', - webpage, 'encrypted code') + first_two_chars = int(float(ol_id[0:][:2])) + urlcode = '' + num = 2 - js_code = self.openload_decode(enc_code) - jsi = JSInterpreter(js_code) + while num < len(ol_id): + urlcode += compat_chr(int(float(ol_id[num:][:3])) - + first_two_chars * int(float(ol_id[num + 3:][:2]))) + num += 5 - m_offset_fun = self._search_regex(r'slice\(0\s*-\s*(%s)\(\)' % _NAME_RE, js_code, 'javascript offset function') - m_diff_fun = self._search_regex(r'charCodeAt\(0\)\s*\+\s*(%s)\(\)' % _NAME_RE, js_code, 'javascript diff function') - - offset = 
jsi.call_function(m_offset_fun) - diff = jsi.call_function(m_diff_fun) - - video_url_chars = [] - - for idx, c in enumerate(enc_data): - j = compat_ord(c) - if j >= 33 and j <= 126: - j = ((j + 14) % 94) + 33 - if idx == len(enc_data) - offset: - j += diff - video_url_chars += compat_chr(j) - - video_url = 'https://openload.co/stream/%s?mime=true' % ''.join(video_url_chars) + video_url = 'https://openload.co/stream/' + urlcode title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, @@ -155,5 +95,4 @@ class OpenloadIE(InfoExtractor): 'ext': determine_ext(title), 'subtitles': subtitles, } - return info_dict diff --git a/youtube_dl/extractor/pandoratv.py b/youtube_dl/extractor/pandoratv.py index 2b07958bb..cbb1968d3 100644 --- a/youtube_dl/extractor/pandoratv.py +++ b/youtube_dl/extractor/pandoratv.py @@ -11,6 +11,7 @@ from ..utils import ( float_or_none, parse_duration, str_to_int, + urlencode_postdata, ) @@ -56,6 +57,22 @@ class PandoraTVIE(InfoExtractor): r'^v(\d+)[Uu]rl$', format_id, 'height', default=None) if not height: continue + + play_url = self._download_json( + 'http://m.pandora.tv/?c=api&m=play_url', video_id, + data=urlencode_postdata({ + 'prgid': video_id, + 'runtime': info.get('runtime'), + 'vod_url': format_url, + }), + headers={ + 'Origin': url, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + format_url = play_url.get('url') + if not format_url: + continue + formats.append({ 'format_id': '%sp' % height, 'url': format_url, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index b490ef74c..f1c0cd068 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -350,6 +350,15 @@ class PBSIE(InfoExtractor): 410: 'This video has expired and is no longer available for online streaming.', } + def _real_initialize(self): + cookie = (self._download_json( + 'http://localization.services.pbs.org/localize/auto/cookie/', + None, 
headers=self.geo_verification_headers(), fatal=False) or {}).get('cookie') + if cookie: + station = self._search_regex(r'#?s=\["([^"]+)"', cookie, 'station') + if station: + self._set_cookie('.pbs.org', 'pbsol.station', station) + def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) @@ -476,7 +485,8 @@ class PBSIE(InfoExtractor): redirect_info = self._download_json( '%s?format=json' % redirect['url'], display_id, - 'Downloading %s video url info' % (redirect_id or num)) + 'Downloading %s video url info' % (redirect_id or num), + headers=self.geo_verification_headers()) if redirect_info['status'] == 'error': raise ExtractorError( diff --git a/youtube_dl/extractor/piksel.py b/youtube_dl/extractor/piksel.py new file mode 100644 index 000000000..d44edcdfb --- /dev/null +++ b/youtube_dl/extractor/piksel.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + dict_get, + int_or_none, + unescapeHTML, + parse_iso8601, +) + + +class PikselIE(InfoExtractor): + _VALID_URL = r'https?://player\.piksel\.com/v/(?P[a-z0-9]+)' + _TEST = { + 'url': 'http://player.piksel.com/v/nv60p12f', + 'md5': 'd9c17bbe9c3386344f9cfd32fad8d235', + 'info_dict': { + 'id': 'nv60p12f', + 'ext': 'mp4', + 'title': 'فن الحياة - الحلقة 1', + 'description': 'احدث برامج الداعية الاسلامي " مصطفي حسني " فى رمضان 2016علي النهار نور', + 'timestamp': 1465231790, + 'upload_date': '20160606', + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src=["\'](?P(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + app_token = self._search_regex( + r'clientAPI\s*:\s*"([^"]+)"', webpage, 'app token') + response = self._download_json( + 
'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, + video_id, query={ + 'v': video_id + })['response'] + failure = response.get('failure') + if failure: + raise ExtractorError(response['failure']['reason'], expected=True) + video_data = response['WsProgramResponse']['program']['asset'] + title = video_data['title'] + + formats = [] + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + asset_type = dict_get(video_data, ['assetType', 'asset_type']) + for asset_file in video_data.get('assetFiles', []): + # TODO: extract rtmp formats + http_url = asset_file.get('http_url') + if not http_url: + continue + tbr = None + vbr = int_or_none(asset_file.get('videoBitrate'), 1024) + abr = int_or_none(asset_file.get('audioBitrate'), 1024) + if asset_type == 'video': + tbr = vbr + abr + elif asset_type == 'audio': + tbr = abr + + format_id = ['http'] + if tbr: + format_id.append(compat_str(tbr)) + + formats.append({ + 'format_id': '-'.join(format_id), + 'url': unescapeHTML(http_url), + 'vbr': vbr, + 'abr': abr, + 'width': int_or_none(asset_file.get('videoWidth')), + 'height': int_or_none(asset_file.get('videoHeight')), + 'filesize': int_or_none(asset_file.get('filesize')), + 'tbr': tbr, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailUrl'), + 'timestamp': parse_iso8601(video_data.get('dateadd')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 7cc07a2ad..30478f979 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -85,6 +85,9 @@ class ProSiebenSat1BaseIE(InfoExtractor): 
formats.extend(self._extract_m3u8_formats( source_url, clip_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif mimetype == 'application/dash+xml': + formats.extend(self._extract_mpd_formats( + source_url, clip_id, mpd_id='dash', fatal=False)) else: tbr = fix_bitrate(source['bitrate']) if protocol in ('rtmp', 'rtmpe'): diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index ebe563ebb..e09670da2 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -4,118 +4,31 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( float_or_none, parse_iso8601, unescapeHTML, + ExtractorError, ) -class RteIE(InfoExtractor): - IE_NAME = 'rte' - IE_DESC = 'Raidió Teilifís Éireann TV' - _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' - _TEST = { - 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', - 'info_dict': { - 'id': '10478715', - 'ext': 'flv', - 'title': 'Watch iWitness online', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', - 'duration': 60.046, - }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._og_search_title(webpage) - description = self._html_search_meta('description', webpage, 'description') - duration = float_or_none(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False), 1000) - - thumbnail = None - thumbnail_meta = self._html_search_meta('thumbnail', webpage) - if thumbnail_meta: - thumbnail_id = self._search_regex( - r'uri:irus:(.+)', thumbnail_meta, - 'thumbnail id', fatal=False) - if thumbnail_id: - thumbnail = 'http://img.rasset.ie/%s.jpg' % thumbnail_id - - feeds_url = self._html_search_meta('feeds-prefix', webpage, 
'feeds url') + video_id - json_string = self._download_json(feeds_url, video_id) - - # f4m_url = server + relative_url - f4m_url = json_string['shows'][0]['media:group'][0]['rte:server'] + json_string['shows'][0]['media:group'][0]['url'] - f4m_formats = self._extract_f4m_formats(f4m_url, video_id) - self._sort_formats(f4m_formats) - - return { - 'id': video_id, - 'title': title, - 'formats': f4m_formats, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - } - - -class RteRadioIE(InfoExtractor): - IE_NAME = 'rte:radio' - IE_DESC = 'Raidió Teilifís Éireann radio' - # Radioplayer URLs have two distinct specifier formats, - # the old format #!rii=:::: - # the new format #!rii=b____ - # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. - # An uniquely defines an individual recording, and is the only part we require. - _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P[0-9]+)' - - _TESTS = [{ - # Old-style player URL; HLS and RTMPE formats - 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', - 'info_dict': { - 'id': '10507902', - 'ext': 'mp4', - 'title': 'Gloria', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', - 'timestamp': 1451203200, - 'upload_date': '20151227', - 'duration': 7230.0, - }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } - }, { - # New-style player URL; RTMPE formats only - 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', - 'info_dict': { - 'id': '3250678', - 'ext': 'flv', - 'title': 'The Lyric Concert with Paul Herriott', - 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', - 'timestamp': 1333742400, - 'upload_date': '20120406', - 'duration': 7199.016, - }, - 'params': { - 'skip_download': 'f4m fails with --test atm' - } - }] - +class 
RteBaseIE(InfoExtractor): def _real_extract(self, url): item_id = self._match_id(url) - json_string = self._download_json( - 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, - item_id) + try: + json_string = self._download_json( + 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id, + item_id) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False) + if error_info: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_info['message']), + expected=True) + raise # NB the string values in the JSON are stored using XML escaping(!) show = json_string['shows'][0] @@ -163,3 +76,67 @@ class RteRadioIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class RteIE(RteBaseIE): + IE_NAME = 'rte' + IE_DESC = 'Raidió Teilifís Éireann TV' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', + 'md5': '4a76eb3396d98f697e6e8110563d2604', + 'info_dict': { + 'id': '10478715', + 'ext': 'mp4', + 'title': 'iWitness', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, + 'upload_date': '20151012', + 'timestamp': 1444694160, + }, + } + + +class RteRadioIE(RteBaseIE): + IE_NAME = 'rte:radio' + IE_DESC = 'Raidió Teilifís Éireann radio' + # Radioplayer URLs have two distinct specifier formats, + # the old format #!rii=:::: + # the new format #!rii=b____ + # where the IDs are int/empty, the date is DD-MM-YYYY, and the specifier may be truncated. + # An uniquely defines an individual recording, and is the only part we require. 
+ _VALID_URL = r'https?://(?:www\.)?rte\.ie/radio/utils/radioplayer/rteradioweb\.html#!rii=(?:b?[0-9]*)(?:%3A|:|%5F|_)(?P[0-9]+)' + + _TESTS = [{ + # Old-style player URL; HLS and RTMPE formats + 'url': 'http://www.rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=16:10507902:2414:27-12-2015:', + 'md5': 'c79ccb2c195998440065456b69760411', + 'info_dict': { + 'id': '10507902', + 'ext': 'mp4', + 'title': 'Gloria', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': 'md5:9ce124a7fb41559ec68f06387cabddf0', + 'timestamp': 1451203200, + 'upload_date': '20151227', + 'duration': 7230.0, + }, + }, { + # New-style player URL; RTMPE formats only + 'url': 'http://rte.ie/radio/utils/radioplayer/rteradioweb.html#!rii=b16_3250678_8861_06-04-2012_', + 'info_dict': { + 'id': '3250678', + 'ext': 'flv', + 'title': 'The Lyric Concert with Paul Herriott', + 'thumbnail': 're:^https?://.*\.jpg$', + 'description': '', + 'timestamp': 1333742400, + 'upload_date': '20120406', + 'duration': 7199.016, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }] diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index cb4ee8803..721ee733c 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals import re + from .common import InfoExtractor +from ..utils import int_or_none class RTL2IE(InfoExtractor): @@ -13,7 +15,7 @@ class RTL2IE(InfoExtractor): 'id': 'folge-203-0', 'ext': 'f4v', 'title': 'GRIP sucht den Sommerkönig', - 'description': 'Matthias, Det und Helge treten gegeneinander an.' + 'description': 'md5:e3adbb940fd3c6e76fa341b8748b562f' }, 'params': { # rtmp download @@ -25,7 +27,7 @@ class RTL2IE(InfoExtractor): 'id': '21040-anna-erwischt-alex', 'ext': 'mp4', 'title': 'Anna erwischt Alex!', - 'description': 'Anna ist Alex\' Tochter bei Köln 50667.' + 'description': 'Anna nimmt ihrem Vater nicht ab, dass er nicht spielt. Und tatsächlich erwischt sie ihn auf frischer Tat.' 
}, 'params': { # rtmp download @@ -52,34 +54,47 @@ class RTL2IE(InfoExtractor): r'vico_id\s*:\s*([0-9]+)', webpage, 'vico_id') vivi_id = self._html_search_regex( r'vivi_id\s*:\s*([0-9]+)', webpage, 'vivi_id') - info_url = 'http://www.rtl2.de/video/php/get_video.php?vico_id=' + vico_id + '&vivi_id=' + vivi_id - info = self._download_json(info_url, video_id) + info = self._download_json( + 'http://www.rtl2.de/sites/default/modules/rtl2/mediathek/php/get_video_jw.php', + video_id, query={ + 'vico_id': vico_id, + 'vivi_id': vivi_id, + }) video_info = info['video'] title = video_info['titel'] - description = video_info.get('beschreibung') - thumbnail = video_info.get('image') - download_url = video_info['streamurl'] - download_url = download_url.replace('\\', '') - stream_url = 'mp4:' + self._html_search_regex(r'ondemand/(.*)', download_url, 'stream URL') - rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + formats = [] + + rtmp_url = video_info.get('streamurl') + if rtmp_url: + rtmp_url = rtmp_url.replace('\\', '') + stream_url = 'mp4:' + self._html_search_regex(r'/ondemand/(.+)', rtmp_url, 'stream URL') + rtmp_conn = ['S:connect', 'O:1', 'NS:pageUrl:' + url, 'NB:fpad:0', 'NN:videoFunction:1', 'O:0'] + + formats.append({ + 'format_id': 'rtmp', + 'url': rtmp_url, + 'play_path': stream_url, + 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', + 'page_url': url, + 'flash_version': 'LNX 11,2,202,429', + 'rtmp_conn': rtmp_conn, + 'no_resume': True, + 'preference': 1, + }) + + m3u8_url = video_info.get('streamurl_hls') + if m3u8_url: + formats.extend(self._extract_akamai_formats(m3u8_url, video_id)) - formats = [{ - 'url': download_url, - 'play_path': stream_url, - 'player_url': 'http://www.rtl2.de/flashplayer/vipo_player.swf', - 'page_url': url, - 'flash_version': 'LNX 11,2,202,429', - 'rtmp_conn': rtmp_conn, - 'no_resume': True, - }] self._sort_formats(formats) return { 'id': video_id, 'title': title, - 
'thumbnail': thumbnail, - 'description': description, + 'thumbnail': video_info.get('image'), + 'description': video_info.get('beschreibung'), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/rts.py b/youtube_dl/extractor/rts.py index 3cc32847b..ae012ab98 100644 --- a/youtube_dl/extractor/rts.py +++ b/youtube_dl/extractor/rts.py @@ -4,27 +4,24 @@ from __future__ import unicode_literals import re from .srgssr import SRGSSRIE -from ..compat import ( - compat_str, - compat_urllib_parse_urlparse, -) +from ..compat import compat_str from ..utils import ( int_or_none, parse_duration, parse_iso8601, unescapeHTML, - xpath_text, + determine_ext, ) class RTSIE(SRGSSRIE): IE_DESC = 'RTS.ch' - _VALID_URL = r'rts:(?P\d+)|https?://(?:www\.)?rts\.ch/(?:[^/]+/){2,}(?P[0-9]+)-(?P.+?)\.html' + _VALID_URL = r'rts:(?P\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P[0-9]+)-(?P.+?)\.html' _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'f254c4b26fb1d3c183793d52bc40d3e7', + 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -38,35 +35,17 @@ class RTSIE(SRGSSRIE): 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', - 'md5': 'f1077ac5af686c76528dc8d7c5df29ba', 'info_dict': { - 'id': '5742494', - 'display_id': '5742494', - 'ext': 'mp4', - 'duration': 3720, - 'title': 'Les yeux dans les cieux - Mon homard au Canada', - 'description': 'md5:d22ee46f5cc5bac0912e5a0c6d44a9f7', - 'uploader': 'Passe-moi les jumelles', - 'upload_date': '20140404', - 'timestamp': 1396635300, - 'thumbnail': 're:^https?://.*\.image', - 'view_count': int, + 'id': '5624065', + 'title': 'Passe-moi les jumelles', }, - 'params': { - # m3u8 download - 'skip_download': True, - } 
+ 'playlist_mincount': 4, }, { 'url': 'http://www.rts.ch/video/sport/hockey/5745975-1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski.html', - 'md5': 'b4326fecd3eb64a458ba73c73e91299d', 'info_dict': { 'id': '5745975', 'display_id': '1-2-kloten-fribourg-5-2-second-but-pour-gotteron-par-kwiatowski', @@ -80,11 +59,15 @@ class RTSIE(SRGSSRIE): 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '9f713382f15322181bb366cc8c3a4ff0', + 'md5': '1bae984fe7b1f78e94abc74e802ed99f', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -92,16 +75,12 @@ class RTSIE(SRGSSRIE): 'duration': 33, 'title': 'Londres cachée par un épais smog', 'description': 'Un important voile de smog recouvre Londres depuis mercredi, provoqué par la pollution et du sable du Sahara.', - 'uploader': 'Le Journal en continu', + 'uploader': 'L\'actu en vidéo', 'upload_date': '20140403', 'timestamp': 1396537322, 'thumbnail': 're:^https?://.*\.image', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -125,6 +104,10 @@ class RTSIE(SRGSSRIE): 'title': 'Hockey: Davos décroche son 31e titre de champion de Suisse', }, 'playlist_mincount': 5, + }, + { + 'url': 'http://pages.rts.ch/emissions/passe-moi-les-jumelles/5624065-entre-ciel-et-mer.html', + 'only_matching': True, } ] @@ -142,19 +125,32 @@ class RTSIE(SRGSSRIE): # media_id extracted out of URL is not always a real id if 'video' not in all_info and 'audio' not in all_info: - page = self._download_webpage(url, display_id) + entries = [] - # article with videos on rhs - videos = re.findall( - 
r']+class="content-item"[^>]*>\s*]+data-video-urn="urn:([^"]+)"', - page) - if not videos: + for item in all_info.get('items', []): + item_url = item.get('url') + if not item_url: + continue + entries.append(self.url_result(item_url, 'RTS')) + + if not entries: + page, urlh = self._download_webpage_handle(url, display_id) + if re.match(self._VALID_URL, urlh.geturl()).group('id') != media_id: + return self.url_result(urlh.geturl(), 'RTS') + + # article with videos on rhs videos = re.findall( - r'(?s)]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + r']+class="content-item"[^>]*>\s*]+data-video-urn="urn:([^"]+)"', page) - if videos: - entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] - return self.playlist_result(entries, media_id, self._og_search_title(page)) + if not videos: + videos = re.findall( + r'(?s)]+class="srg-player"[^>]+src="[^"]+urn:([^"]+)"', + page) + if videos: + entries = [self.url_result('srgssr:%s' % video_urn, 'SRGSSR') for video_urn in videos] + + if entries: + return self.playlist_result(entries, media_id, all_info.get('title')) internal_id = self._html_search_regex( r'<(?:video|audio) data-id="([0-9]+)"', page, @@ -168,36 +164,29 @@ class RTSIE(SRGSSRIE): info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] - upload_timestamp = parse_iso8601(info.get('broadcast_date')) - duration = info.get('duration') or info.get('cutout') or info.get('cutduration') - if isinstance(duration, compat_str): - duration = parse_duration(duration) - view_count = info.get('plays') - thumbnail = unescapeHTML(info.get('preview_image_url')) + title = info['title'] def extract_bitrate(url): return int_or_none(self._search_regex( r'-([0-9]+)k\.', url, 'bitrate', default=None)) formats = [] - for format_id, format_url in info['streams'].items(): - if format_id == 'hds_sd' and 'hds' in info['streams']: + streams = info.get('streams', {}) + for format_id, format_url in streams.items(): + if format_id == 
'hds_sd' and 'hds' in streams: continue - if format_id == 'hls_sd' and 'hls' in info['streams']: + if format_id == 'hls_sd' and 'hls' in streams: continue - if format_url.endswith('.f4m'): - token = self._download_xml( - 'http://tp.srgssr.ch/token/akahd.xml?stream=%s/*' % compat_urllib_parse_urlparse(format_url).path, - media_id, 'Downloading %s token' % format_id) - auth_params = xpath_text(token, './/authparams', 'auth params') - if not auth_params: - continue - formats.extend(self._extract_f4m_formats( - '%s?%s&hdcore=3.4.0&plugin=aasp-3.4.0.132.66' % (format_url, auth_params), - media_id, f4m_id=format_id, fatal=False)) - elif format_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + ext = determine_ext(format_url) + if ext in ('m3u8', 'f4m'): + format_url = self._get_tokenized_src(format_url, media_id, format_id) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + format_url + ('?' if '?' 
not in format_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -205,25 +194,37 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) - if 'media' in info: - formats.extend([{ - 'format_id': '%s-%sk' % (media['ext'], media['rate']), - 'url': 'http://download-video.rts.ch/%s' % media['url'], - 'tbr': media['rate'] or extract_bitrate(media['url']), - } for media in info['media'] if media.get('rate')]) + for media in info.get('media', []): + media_url = media.get('url') + if not media_url or re.match(r'https?://', media_url): + continue + rate = media.get('rate') + ext = media.get('ext') or determine_ext(media_url, 'mp4') + format_id = ext + if rate: + format_id += '-%dk' % rate + formats.append({ + 'format_id': format_id, + 'url': 'http://download-video.rts.ch/' + media_url, + 'tbr': rate or extract_bitrate(media_url), + }) self._check_formats(formats, media_id) self._sort_formats(formats) + duration = info.get('duration') or info.get('cutout') or info.get('cutduration') + if isinstance(duration, compat_str): + duration = parse_duration(duration) + return { 'id': media_id, 'display_id': display_id, 'formats': formats, - 'title': info['title'], + 'title': title, 'description': info.get('intro'), 'duration': duration, - 'view_count': view_count, + 'view_count': int_or_none(info.get('plays')), 'uploader': info.get('programName'), - 'timestamp': upload_timestamp, - 'thumbnail': thumbnail, + 'timestamp': parse_iso8601(info.get('broadcast_date')), + 'thumbnail': unescapeHTML(info.get('preview_image_url')), } diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 6a43b036e..746677a24 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -209,7 +209,10 @@ class RTVELiveIE(InfoExtractor): title += ' ' + 
time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( - r'playerId=player([0-9]+)', webpage, 'internal video ID') + (r'playerId=player([0-9]+)', + r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', + r'data-id=["\'](\d+)'), + webpage, 'internal video ID') png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id png = self._download_webpage(png_url, video_id, 'Downloading url information') m3u8_url = _decrypt_url(png) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 246970c4d..847d3c08f 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, parse_iso8601, @@ -23,6 +24,16 @@ class SRGSSRIE(InfoExtractor): 'STARTDATE': 'This video is not yet available. Please try again later.', } + def _get_tokenized_src(self, url, video_id, format_id): + sp = compat_urllib_parse_urlparse(url).path.split('/') + token = self._download_json( + 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + video_id, 'Downloading %s token' % format_id, fatal=False) or {} + auth_params = token.get('token', {}).get('authparams') + if auth_params: + url += '?' 
+ auth_params + return url + def get_media_data(self, bu, media_type, media_id): media_data = self._download_json( 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), @@ -61,14 +72,16 @@ class SRGSSRIE(InfoExtractor): asset_url = asset['text'] quality = asset['@quality'] format_id = '%s-%s' % (protocol, quality) - if protocol == 'HTTP-HDS': - formats.extend(self._extract_f4m_formats( - asset_url + '?hdcore=3.4.0', media_id, - f4m_id=format_id, fatal=False)) - elif protocol == 'HTTP-HLS': - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) + if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): + asset_url = self._get_tokenized_src(asset_url, media_id, format_id) + if protocol.startswith('HTTP-HDS'): + formats.extend(self._extract_f4m_formats( + asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', + media_id, f4m_id=format_id, fatal=False)) + elif protocol.startswith('HTTP-HLS'): + formats.extend(self._extract_m3u8_formats( + asset_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) else: formats.append({ 'format_id': format_id, @@ -94,10 +107,10 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': '4cd93523723beff51bb4bee974ee238d', + 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'ext': 'm4v', + 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', 'timestamp': 1372713995, diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index cfbf7f4e1..0405bd6b0 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -33,7 +33,9 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): 
def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) + meta = self._download_xml( + smil_url, video_id, note=note, query={'format': 'SMIL'}, + headers=self.geo_verification_headers()) error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') if error_element is not None and error_element.attrib['src'].startswith( 'http://link.theplatform.com/s/errorFiles/Unavailable.'): diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 77414a242..bbf071da3 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -22,6 +22,7 @@ from ..utils import ( orderedSet, parse_duration, parse_iso8601, + update_url_query, urlencode_postdata, ) @@ -279,6 +280,18 @@ class TwitchVodIE(TwitchItemBaseIE): if 't' in query: info['start_time'] = parse_duration(query['t'][0]) + if info.get('timestamp') is not None: + info['subtitles'] = { + 'rechat': [{ + 'url': update_url_query( + 'https://rechat.twitch.tv/rechat-messages', { + 'video_id': 'v%s' % item_id, + 'start': info['timestamp'], + }), + 'ext': 'json', + }], + } + return info @@ -300,7 +313,7 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): response = self._call_api( self._PLAYLIST_PATH % (channel_id, offset, limit), channel_id, - 'Downloading %s videos JSON page %s' + 'Downloading %s JSON page %s' % (self._PLAYLIST_TYPE, counter_override or counter)) page_entries = self._extract_playlist_page(response) if not page_entries: @@ -350,19 +363,72 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): } -class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): - IE_NAME = 'twitch:past_broadcasts' - _VALID_URL = r'%s/(?P[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcasts=true' - _PLAYLIST_TYPE = 'past broadcasts' +class TwitchVideosBaseIE(TwitchPlaylistBaseIE): + _VALID_URL_VIDEOS_BASE = 
r'%s/(?P[^/]+)/videos' % TwitchBaseIE._VALID_URL_BASE + _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcast_type=' + + +class TwitchAllVideosIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:all' + _VALID_URL = r'%s/all' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight' + _PLAYLIST_TYPE = 'all videos' _TEST = { - 'url': 'http://www.twitch.tv/spamfish/profile/past_broadcasts', + 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { 'id': 'spamfish', 'title': 'Spamfish', }, - 'playlist_mincount': 54, + 'playlist_mincount': 869, + } + + +class TwitchUploadsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:uploads' + _VALID_URL = r'%s/uploads' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload' + _PLAYLIST_TYPE = 'uploads' + + _TEST = { + 'url': 'https://www.twitch.tv/spamfish/videos/uploads', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 0, + } + + +class TwitchPastBroadcastsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:past-broadcasts' + _VALID_URL = r'%s/past-broadcasts' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive' + _PLAYLIST_TYPE = 'past broadcasts' + + _TEST = { + 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 0, + } + + +class TwitchHighlightsIE(TwitchVideosBaseIE): + IE_NAME = 'twitch:videos:highlights' + _VALID_URL = r'%s/highlights' % TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE + _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight' + _PLAYLIST_TYPE = 'highlights' + + _TEST = { + 'url': 'https://www.twitch.tv/spamfish/videos/highlights', + 'info_dict': { + 'id': 'spamfish', + 'title': 'Spamfish', + }, + 'playlist_mincount': 805, } diff --git a/youtube_dl/extractor/uktvplay.py 
b/youtube_dl/extractor/uktvplay.py new file mode 100644 index 000000000..2137502a1 --- /dev/null +++ b/youtube_dl/extractor/uktvplay.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class UKTVPlayIE(InfoExtractor): + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P\d+)' + _TEST = { + 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', + 'md5': '', + 'info_dict': { + 'id': '2117008346001', + 'ext': 'mp4', + 'title': 'Pincers', + 'description': 'Pincers', + 'uploader_id': '1242911124001', + 'upload_date': '20130124', + 'timestamp': 1359049267, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'expected_warnings': ['Failed to download MPD manifest'] + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'BrightcoveNew', video_id) diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py index 2cd22cf8a..f06bf5b12 100644 --- a/youtube_dl/extractor/uplynk.py +++ b/youtube_dl/extractor/uplynk.py @@ -30,7 +30,9 @@ class UplynkIE(InfoExtractor): def _extract_uplynk_info(self, uplynk_content_url): path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() display_id = video_id or external_id - formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4') + formats = self._extract_m3u8_formats( + 'http://content.uplynk.com/%s.m3u8' % path, + display_id, 'mp4', 'm3u8_native') if session_id: for f in formats: f['extra_param_to_segment_url'] = 'pbs=' + session_id diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index a1e0851b7..429893e38 100644 --- a/youtube_dl/extractor/vbox7.py +++ 
b/youtube_dl/extractor/vbox7.py @@ -4,11 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import urlencode_postdata +from ..utils import ExtractorError class Vbox7IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vbox7\.com/(?:play:|emb/external\.php\?.*?\bvid=)(?P[\da-fA-F]+)' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?vbox7\.com/ + (?: + play:| + (?: + emb/external\.php| + player/ext\.swf + )\?.*?\bvid= + ) + (?P[\da-fA-F]+) + ''' _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', @@ -16,6 +27,14 @@ class Vbox7IE(InfoExtractor): 'id': '0946fff23c', 'ext': 'mp4', 'title': 'Борисов: Притеснен съм за бъдещето на България', + 'description': 'По думите му е опасно страната ни да бъде обявена за "сигурна"', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1470982814, + 'upload_date': '20160812', + 'uploader': 'zdraveibulgaria', + }, + 'params': { + 'proxy': '127.0.0.1:8118', }, }, { 'url': 'http://vbox7.com/play:249bb972c2', @@ -29,6 +48,9 @@ class Vbox7IE(InfoExtractor): }, { 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', 'only_matching': True, + }, { + 'url': 'http://i49.vbox7.com/player/ext.swf?vid=0946fff23c&autoplay=1', + 'only_matching': True, }] @staticmethod @@ -42,33 +64,41 @@ class Vbox7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://vbox7.com/play:%s' % video_id, video_id) + response = self._download_json( + 'https://www.vbox7.com/ajax/video/nextvideo.php?vid=%s' % video_id, + video_id) - title = self._html_search_regex( - r'(.+?)', webpage, 'title').split('/')[0].strip() + if 'error' in response: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, response['error']), expected=True) - video_url = self._search_regex( - r'src\s*:\s*(["\'])(?P.+?.mp4.*?)\1', - webpage, 'video url', default=None, group='url') + video = 
response['options'] - thumbnail_url = self._og_search_thumbnail(webpage) - - if not video_url: - info_response = self._download_webpage( - 'http://vbox7.com/play/magare.do', video_id, - 'Downloading info webpage', - data=urlencode_postdata({'as3': '1', 'vid': video_id}), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - final_url, thumbnail_url = map( - lambda x: x.split('=')[1], info_response.split('&')) + title = video['title'] + video_url = video['src'] if '/na.mp4' in video_url: self.raise_geo_restricted() - return { + uploader = video.get('uploader') + + webpage = self._download_webpage( + 'http://vbox7.com/play:%s' % video_id, video_id, fatal=None) + + info = {} + + if webpage: + info = self._search_json_ld( + webpage.replace('"/*@context"', '"@context"'), video_id, + fatal=False) + + info.update({ 'id': video_id, - 'url': self._proto_relative_url(video_url, 'http:'), 'title': title, - 'thumbnail': thumbnail_url, - } + 'url': video_url, + 'uploader': uploader, + 'thumbnail': self._proto_relative_url( + info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'http:'), + }) + return info diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 51c69a80c..c35cafcc6 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -92,29 +92,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _vimeo_sort_formats(self, formats): # Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps # at the same time without actual units specified. This lead to wrong sorting. 
- self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'format_id')) + self._sort_formats(formats, field_preference=('preference', 'height', 'width', 'fps', 'tbr', 'format_id')) def _parse_config(self, config, video_id): + video_data = config['video'] # Extract title - video_title = config['video']['title'] + video_title = video_data['title'] # Extract uploader, uploader_url and uploader_id - video_uploader = config['video'].get('owner', {}).get('name') - video_uploader_url = config['video'].get('owner', {}).get('url') + video_uploader = video_data.get('owner', {}).get('name') + video_uploader_url = video_data.get('owner', {}).get('url') video_uploader_id = video_uploader_url.split('/')[-1] if video_uploader_url else None # Extract video thumbnail - video_thumbnail = config['video'].get('thumbnail') + video_thumbnail = video_data.get('thumbnail') if video_thumbnail is None: - video_thumbs = config['video'].get('thumbs') + video_thumbs = video_data.get('thumbs') if video_thumbs and isinstance(video_thumbs, dict): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video duration - video_duration = int_or_none(config['video'].get('duration')) + video_duration = int_or_none(video_data.get('duration')) formats = [] - config_files = config['video'].get('files') or config['request'].get('files', {}) + config_files = video_data.get('files') or config['request'].get('files', {}) for f in config_files.get('progressive', []): video_url = f.get('url') if not video_url: @@ -127,10 +128,24 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'fps': int_or_none(f.get('fps')), 'tbr': int_or_none(f.get('bitrate')), }) - m3u8_url = config_files.get('hls', {}).get('url') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + for files_type in ('hls', 'dash'): + for cdn_name, cdn_data in 
config_files.get(files_type, {}).get('cdns', {}).items(): + manifest_url = cdn_data.get('url') + if not manifest_url: + continue + format_id = '%s-%s' % (files_type, cdn_name) + if files_type == 'hls': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + 'm3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % cdn_name, + fatal=False)) + elif files_type == 'dash': + formats.extend(self._extract_mpd_formats( + manifest_url.replace('/master.json', '/master.mpd'), video_id, format_id, + 'Downloading %s MPD information' % cdn_name, + fatal=False)) subtitles = {} text_tracks = config['request'].get('text_tracks') diff --git a/youtube_dl/extractor/viu.py b/youtube_dl/extractor/viu.py new file mode 100644 index 000000000..3fd889c8e --- /dev/null +++ b/youtube_dl/extractor/viu.py @@ -0,0 +1,249 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class ViuBaseIE(InfoExtractor): + def _real_initialize(self): + viu_auth_res = self._request_webpage( + 'https://www.viu.com/api/apps/v2/authenticate', None, + 'Requesting Viu auth', query={ + 'acct': 'test', + 'appid': 'viu_desktop', + 'fmt': 'json', + 'iid': 'guest', + 'languageid': 'default', + 'platform': 'desktop', + 'userid': 'guest', + 'useridtype': 'guest', + 'ver': '1.0' + }, headers=self.geo_verification_headers()) + self._auth_token = viu_auth_res.info()['X-VIU-AUTH'] + + def _call_api(self, path, *args, **kwargs): + headers = self.geo_verification_headers() + headers.update({ + 'X-VIU-AUTH': self._auth_token + }) + headers.update(kwargs.get('headers', {})) + kwargs['headers'] = headers + response = self._download_json( + 'https://www.viu.com/api/' + path, *args, **kwargs)['response'] + if response.get('status') != 'success': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + 
return response + + +class ViuIE(ViuBaseIE): + _VALID_URL = r'(?:viu:|https?://www\.viu\.com/[a-z]{2}/media/)(?P\d+)' + _TESTS = [{ + 'url': 'https://www.viu.com/en/media/1116705532?containerId=playlist-22168059', + 'info_dict': { + 'id': '1116705532', + 'ext': 'mp4', + 'title': 'Citizen Khan - Ep 1', + 'description': 'md5:d7ea1604f49e5ba79c212c551ce2110e', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to India', + }, { + 'url': 'https://www.viu.com/en/media/1130599965', + 'info_dict': { + 'id': '1130599965', + 'ext': 'mp4', + 'title': 'Jealousy Incarnate - Episode 1', + 'description': 'md5:d3d82375cab969415d2720b6894361e9', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to Indonesia', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_data = self._call_api( + 'clip/load', video_id, 'Downloading video data', query={ + 'appid': 'viu_desktop', + 'fmt': 'json', + 'id': video_id + })['item'][0] + + title = video_data['title'] + + m3u8_url = None + url_path = video_data.get('urlpathd') or video_data.get('urlpath') + tdirforwhole = video_data.get('tdirforwhole') + # #EXT-X-BYTERANGE is not supported by native hls downloader + # and ffmpeg (#10955) + # hls_file = video_data.get('hlsfile') + hls_file = video_data.get('jwhlsfile') + if url_path and tdirforwhole and hls_file: + m3u8_url = '%s/%s/%s' % (url_path, tdirforwhole, hls_file) + else: + # m3u8_url = re.sub( + # r'(/hlsc_)[a-z]+(\d+\.m3u8)', + # r'\1whe\2', video_data['href']) + m3u8_url = video_data['href'] + formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + self._sort_formats(formats) + + subtitles = {} + for key, value in video_data.items(): + mobj = re.match(r'^subtitle_(?P[^_]+)_(?P(vtt|srt))', key) + if not mobj: + continue + subtitles.setdefault(mobj.group('lang'), []).append({ + 'url': value, + 'ext': mobj.group('ext') + }) + + return { + 'id': video_id, + 'title': title, + 
'description': video_data.get('description'), + 'series': video_data.get('moviealbumshowname'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episodeno')), + 'duration': int_or_none(video_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class ViuPlaylistIE(ViuBaseIE): + IE_NAME = 'viu:playlist' + _VALID_URL = r'https?://www\.viu\.com/[^/]+/listing/playlist-(?P\d+)' + _TEST = { + 'url': 'https://www.viu.com/en/listing/playlist-22461380', + 'info_dict': { + 'id': '22461380', + 'title': 'The Good Wife', + }, + 'playlist_count': 16, + 'skip': 'Geo-restricted to Indonesia', + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_data = self._call_api( + 'container/load', playlist_id, + 'Downloading playlist info', query={ + 'appid': 'viu_desktop', + 'fmt': 'json', + 'id': 'playlist-' + playlist_id + })['container'] + + entries = [] + for item in playlist_data.get('item', []): + item_id = item.get('id') + if not item_id: + continue + item_id = compat_str(item_id) + entries.append(self.url_result( + 'viu:' + item_id, 'Viu', item_id)) + + return self.playlist_result( + entries, playlist_id, playlist_data.get('title')) + + +class ViuOTTIE(InfoExtractor): + IE_NAME = 'viu:ott' + _VALID_URL = r'https?://(?:www\.)?viu\.com/ott/(?P[a-z]{2})/[a-z]{2}-[a-z]{2}/vod/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.viu.com/ott/sg/en-us/vod/3421/The%20Prime%20Minister%20and%20I', + 'info_dict': { + 'id': '3421', + 'ext': 'mp4', + 'title': 'A New Beginning', + 'description': 'md5:1e7486a619b6399b25ba6a41c0fe5b2c', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 'Geo-restricted to Singapore', + }, { + 'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90', + 'info_dict': { + 'id': '7123', + 'ext': 'mp4', + 'title': '這就是我的生活之道', + 'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + 'skip': 
'Geo-restricted to Hong Kong', + }] + + def _real_extract(self, url): + country_code, video_id = re.match(self._VALID_URL, url).groups() + + product_data = self._download_json( + 'http://www.viu.com/ott/%s/index.php' % country_code, video_id, + 'Downloading video info', query={ + 'r': 'vod/ajax-detail', + 'platform_flag_label': 'web', + 'product_id': video_id, + })['data'] + + video_data = product_data.get('current_product') + if not video_data: + raise ExtractorError('This video is not available in your region.', expected=True) + + stream_data = self._download_json( + 'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code, + video_id, 'Downloading stream info', query={ + 'ccs_product_id': video_data['ccs_product_id'], + })['data']['stream'] + + stream_sizes = stream_data.get('size', {}) + formats = [] + for vid_format, stream_url in stream_data.get('url', {}).items(): + height = int_or_none(self._search_regex( + r's(\d+)p', vid_format, 'height', default=None)) + formats.append({ + 'format_id': vid_format, + 'url': stream_url, + 'height': height, + 'ext': 'mp4', + 'filesize': int_or_none(stream_sizes.get(vid_format)) + }) + self._sort_formats(formats) + + subtitles = {} + for sub in video_data.get('subtitle', []): + sub_url = sub.get('url') + if not sub_url: + continue + subtitles.setdefault(sub.get('name'), []).append({ + 'url': sub_url, + 'ext': 'srt', + }) + + title = video_data['synopsis'].strip() + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'series': product_data.get('series', {}).get('name'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('number')), + 'duration': int_or_none(stream_data.get('duration')), + 'thumbnail': video_data.get('cover_image_url'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index acf9fda48..540246c79 100644 --- a/youtube_dl/extractor/vlive.py +++ 
b/youtube_dl/extractor/vlive.py @@ -10,6 +10,7 @@ from ..utils import ( float_or_none, int_or_none, remove_start, + urlencode_postdata, ) from ..compat import compat_urllib_parse_urlencode @@ -48,17 +49,23 @@ class VLiveIE(InfoExtractor): webpage = self._download_webpage( 'http://www.vlive.tv/video/%s' % video_id, video_id) - video_params = self._search_regex( - r'\bvlive\.video\.init\(([^)]+)\)', - webpage, 'video params') - status, _, _, live_params, long_video_id, key = re.split( - r'"\s*,\s*"', video_params)[2:8] + VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' + VIDEO_PARAMS_FIELD = 'video params' + + params = self._parse_json(self._search_regex( + VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, + transform_source=lambda s: '[' + s + ']', fatal=False) + + if not params or len(params) < 7: + params = self._search_regex( + VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) + params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] + + status, long_video_id, key = params[2], params[5], params[6] status = remove_start(status, 'PRODUCT_') if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': - live_params = self._parse_json('"%s"' % live_params, video_id) - live_params = self._parse_json(live_params, video_id) - return self._live(video_id, webpage, live_params) + return self._live(video_id, webpage) elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': if long_video_id and key: return self._replay(video_id, webpage, long_video_id, key) @@ -89,7 +96,22 @@ class VLiveIE(InfoExtractor): 'thumbnail': thumbnail, } - def _live(self, video_id, webpage, live_params): + def _live(self, video_id, webpage): + init_page = self._download_webpage( + 'http://www.vlive.tv/video/init/view', + video_id, note='Downloading live webpage', + data=urlencode_postdata({'videoSeq': video_id}), + headers={ + 'Referer': 'http://www.vlive.tv/video/%s' % video_id, + 'Content-Type': 'application/x-www-form-urlencoded' + }) + + live_params = 
self._search_regex( + r'"liveStreamInfo"\s*:\s*(".*"),', + init_page, 'live stream info') + live_params = self._parse_json(live_params, video_id) + live_params = self._parse_json(live_params, video_id) + formats = [] for vid in live_params.get('resolutions', []): formats.extend(self._extract_m3u8_formats( @@ -98,10 +120,14 @@ class VLiveIE(InfoExtractor): fatal=False, live=True)) self._sort_formats(formats) - return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - is_live=True) + info = self._get_common_fields(webpage) + info.update({ + 'title': self._live_title(info['title']), + 'id': video_id, + 'formats': formats, + 'is_live': True, + }) + return info def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( @@ -135,8 +161,11 @@ class VLiveIE(InfoExtractor): 'ext': 'vtt', 'url': caption['source']}] - return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - view_count=view_count, - subtitles=subtitles) + info = self._get_common_fields(webpage) + info.update({ + 'id': video_id, + 'formats': formats, + 'view_count': view_count, + 'subtitles': subtitles, + }) + return info diff --git a/youtube_dl/extractor/vporn.py b/youtube_dl/extractor/vporn.py index 1557a0e04..e22900f8d 100644 --- a/youtube_dl/extractor/vporn.py +++ b/youtube_dl/extractor/vporn.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, parse_duration, str_to_int, + urljoin, ) @@ -66,10 +67,9 @@ class VpornIE(InfoExtractor): description = self._html_search_regex( r'class="(?:descr|description_txt)">(.*?)', webpage, 'description', fatal=False) - thumbnail = self._html_search_regex( - r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', fatal=False, default=None) - if thumbnail: - thumbnail = 'http://www.vporn.com' + thumbnail + thumbnail = urljoin('http://www.vporn.com', self._html_search_regex( + r'flashvars\.imageUrl\s*=\s*"([^"]+)"', webpage, 'description', + default=None)) uploader = 
self._html_search_regex( r'(?s)Uploaded by:.*?]*>(.+?)', diff --git a/youtube_dl/extractor/vvvvid.py b/youtube_dl/extractor/vvvvid.py new file mode 100644 index 000000000..d44ec85fd --- /dev/null +++ b/youtube_dl/extractor/vvvvid.py @@ -0,0 +1,140 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, +) + + +class VVVVIDIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/#!(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _TESTS = [{ + # video_type == 'video/vvvvid' + 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', + 'md5': 'b8d3cecc2e981adc3835adf07f6df91b', + 'info_dict': { + 'id': '489048', + 'ext': 'mp4', + 'title': 'Ping Pong', + }, + }, { + # video_type == 'video/rcs' + 'url': 'https://www.vvvvid.it/#!show/376/death-note-live-action/377/482493/episodio-01', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': '482493', + 'ext': 'mp4', + 'title': 'Episodio 01', + }, + }] + _conn_id = None + + def _real_initialize(self): + self._conn_id = self._download_json( + 'https://www.vvvvid.it/user/login', + None, headers=self.geo_verification_headers())['data']['conn_id'] + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + response = self._download_json( + 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + video_id, headers=self.geo_verification_headers(), query={ + 'conn_id': self._conn_id, + }) + if response['result'] == 'error': + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, response['message']), expected=True) + + vid = int(video_id) + video_data = list(filter( + lambda episode: episode.get('video_id') == vid, response['data']))[0] + formats = [] + + # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js + 
def ds(h): + g = "MNOPIJKL89+/4567UVWXQRSTEFGHABCDcdefYZabstuvopqr0123wxyzklmnghij" + + def f(m): + l = [] + o = 0 + b = False + m_len = len(m) + while ((not b) and o < m_len): + n = m[o] << 2 + o += 1 + k = -1 + j = -1 + if o < m_len: + n += m[o] >> 4 + o += 1 + if o < m_len: + k = (m[o - 1] << 4) & 255 + k += m[o] >> 2 + o += 1 + if o < m_len: + j = (m[o - 1] << 6) & 255 + j += m[o] + o += 1 + else: + b = True + else: + b = True + else: + b = True + l.append(n) + if k != -1: + l.append(k) + if j != -1: + l.append(j) + return l + + c = [] + for e in h: + c.append(g.index(e)) + + c_len = len(c) + for e in range(c_len * 2 - 1, -1, -1): + a = c[e % c_len] ^ c[(e + 1) % c_len] + c[e % c_len] = a + + c = f(c) + d = '' + for e in c: + d += chr(e) + + return d + + for quality in ('_sd', ''): + embed_code = video_data.get('embed_info' + quality) + if not embed_code: + continue + embed_code = ds(embed_code) + video_type = video_data.get('video_type') + if video_type in ('video/rcs', 'video/kenc'): + formats.extend(self._extract_akamai_formats( + embed_code, video_id)) + else: + formats.extend(self._extract_wowza_formats( + 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'], + 'formats': formats, + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('length')), + 'series': video_data.get('show_title'), + 'season_id': season_id, + 'season_number': video_data.get('season_number'), + 'episode_id': str_or_none(video_data.get('id')), + 'epidode_number': int_or_none(video_data.get('number')), + 'episode_title': video_data['title'], + 'view_count': int_or_none(video_data.get('views')), + 'like_count': int_or_none(video_data.get('video_likes')), + } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9595bcf9f..528d87bb9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -86,6 +86,11 @@ std_headers = { } 
+USER_AGENTS = { + 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', +} + + NO_DEFAULT = object() ENGLISH_MONTH_NAMES = [ @@ -1695,6 +1700,16 @@ def base_url(url): return re.match(r'https?://[^?#&]+/', url).group() +def urljoin(base, path): + if not isinstance(path, compat_str) or not path: + return None + if re.match(r'^(?:https?:)?//', path): + return path + if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): + return None + return compat_urlparse.urljoin(base, path) + + class HEADRequest(compat_urllib_request.Request): def get_method(self): return 'HEAD' diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1acb630af..3082ebf66 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.12.01' +__version__ = '2016.12.22'