From f1f879098a38c786d78927df8915b547f7ac3569 Mon Sep 17 00:00:00 2001 From: remitamine Date: Tue, 26 Apr 2016 13:39:53 +0100 Subject: [PATCH 01/30] [viewster] extract more metadata for http formats --- youtube_dl/extractor/viewster.py | 37 ++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 7839225d4..6edc2c44e 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -118,6 +118,7 @@ class ViewsterIE(InfoExtractor): formats = [] manifest_url = None + m3u8_formats = [] for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): media = self._download_json( 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' @@ -154,18 +155,32 @@ class ViewsterIE(InfoExtractor): 'qualities', default=None) if not qualities: continue - qualities = qualities.strip(',').split(',') - http_template = re.sub(QUALITIES_RE, r'%s', qualities_basename) + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) http_url_basename = url_basename(video_url) - for q in qualities: - tbr = int_or_none(self._search_regex( - r'(\d+)k', q, 'bitrate', default=None)) - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http' + ('-%d' % tbr if tbr else ''), - 'tbr': tbr, - }) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q 
in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): self.raise_geo_restricted() From 175c2e9ec326f9ef820413837608eb4f5c8c5961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 26 Apr 2016 22:29:29 +0600 Subject: [PATCH 02/30] [youtube:search_url] Reimplement in terms of youtube:playlistbase --- youtube_dl/extractor/youtube.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 44f98d294..b7c3cb63f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2139,10 +2139,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P[^&]+)(?:[&]|$)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' 
_TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2157,32 +2158,8 @@ class YoutubeSearchURLIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': part_title, - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } + return self.playlist_result(self._process_page(webpage), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From 7464360379a1a3fc6ba3228f54dd4853df349142 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Wed, 27 Apr 2016 00:16:48 +0600 Subject: [PATCH 03/30] [README.md] Add FAQ entry on output template conflicts --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e062444b3..fb57b0323 100644 --- a/README.md +++ b/README.md @@ -697,6 +697,10 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. Since we are not affiliated with Ubuntu, there is little we can do. 
Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. +### I'm getting an error when trying to use output template: `error: using output template conflicts with using title, video ID or auto number` + +Make sure you are not using `-o` with any of these options `-t`, `--title`, `--id`, `-A` or `--auto-number` set in command line or in a configuration file. Remove the latter if any. + ### Do I always have to pass `-citw`? By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. From 046ea04a7d8601a85007430a7a3da3ce236549f7 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Wed, 27 Apr 2016 00:22:08 +0600 Subject: [PATCH 04/30] [README.md] Mention mpv --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb57b0323..ecf737047 100644 --- a/README.md +++ b/README.md @@ -721,7 +721,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). ### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. 
From a7e03861e8d0ce18ad698e0e38ffac40a09cef8b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 13:52:04 +0800 Subject: [PATCH 05/30] [scivee] Skip the test Not accessible from either Travis CI or my machine. Closes #9315 --- youtube_dl/extractor/scivee.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py index 3bf93c870..b1ca12fde 100644 --- a/youtube_dl/extractor/scivee.py +++ b/youtube_dl/extractor/scivee.py @@ -18,6 +18,7 @@ class SciVeeIE(InfoExtractor): 'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting', 'description': 'md5:81f1710638e11a481358fab1b11059d7', }, + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): From 2ac2cbc0a351785e0c6d034bd1bab77973ec7a41 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 13:55:32 +0800 Subject: [PATCH 06/30] [malemotion] Remove the extractor Announcement from their homepage: ``` MaleMotion is closed After another system crash, I'm forced to close the site This week all content will be erased Don't forget to cancel your subscription if any ! ``` Closes #9311. 
--- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/malemotion.py | 46 ------------------------------ 2 files changed, 47 deletions(-) delete mode 100644 youtube_dl/extractor/malemotion.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b215c5ab..00f8a7a85 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -400,7 +400,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makerschannel import MakersChannelIE from .makertv import MakerTVIE -from .malemotion import MalemotionIE from .matchtv import MatchTVIE from .mdr import MDRIE from .metacafe import MetacafeIE diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py deleted file mode 100644 index 92511a671..000000000 --- a/youtube_dl/extractor/malemotion.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class MalemotionIE(InfoExtractor): - _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)' - _TEST = { - 'url': 'http://malemotion.com/video/bete-de-concours.ltc', - 'md5': '3013e53a0afbde2878bc39998c33e8a5', - 'info_dict': { - 'id': 'ltc', - 'ext': 'mp4', - 'title': 'Bête de Concours', - 'age_limit': 18, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = compat_urllib_parse_unquote(self._search_regex( - r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL')) - video_title = self._html_search_regex( - r'<title>(.*?)</title', webpage, 'title') - video_thumbnail = self._search_regex( - r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': 'mp4', - 'preference': 1, - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': 
formats, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'age_limit': 18, - } From 5b5d7cc11e3037408aeedf8d6dc57ac228b02496 Mon Sep 17 00:00:00 2001 From: Peter Rowlands <peter@pmrowla.com> Date: Wed, 27 Apr 2016 15:57:17 +0900 Subject: [PATCH 07/30] [mwave] Add Mwave Meet & Greet extractor --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/mwave.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b215c5ab..9d1992721 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,7 +439,7 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .mwave import MwaveIE +from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import MyviIE diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 5c3c8d464..6485c6928 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -56,3 +56,26 @@ class MwaveIE(InfoExtractor): 'view_count': int_or_none(vod_info.get('hit')), 'formats': formats, } + + +class MwaveMeetGreetIE(InfoExtractor): + _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://mwave.interest.me/meetgreet/view/256', + 'info_dict': { + 'id': '173294', + 'ext': 'flv', + 'title': '[MEET&GREET] Park BoRam', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Mwave', + 'duration': 3634, + 'view_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + clip_id = self._html_search_regex(r'<iframe src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)', webpage, 'clip ID') + clip_url = 
'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id={0}'.format(clip_id) + return self.url_result(clip_url, 'Mwave', clip_id) From dcf094d62699f8ad06ceaf3fba55e453980fac91 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 18:08:23 +0800 Subject: [PATCH 08/30] [theplatform] Fix for Python 3.2 test_AENetworks{,_1} fails as in Python < 3.3, binascii.a2b_* functions accepts only bytes-like objects --- youtube_dl/extractor/theplatform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 8272dd969..a25417f94 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -159,11 +159,11 @@ class ThePlatformIE(ThePlatformBaseIE): def str_to_hex(str): return binascii.b2a_hex(str.encode('ascii')).decode('ascii') - def hex_to_str(hex): - return binascii.a2b_hex(hex) + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1) - clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) + clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) From 3cc8649c9d42bab8c7b665115ebdc569bf44a762 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 02:58:11 +0800 Subject: [PATCH 09/30] [20min] Detect embedded YouTube videos Fixes #9331 --- youtube_dl/extractor/twentymin.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index ca7d953b8..b721ecb0a 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ 
-32,7 +32,22 @@ class TwentyMinutenIE(InfoExtractor): 'title': '«Wir müssen mutig nach vorne schauen»', 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.', 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' - } + }, + 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.', + }, { + # YouTube embed + 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', + 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f', + 'info_dict': { + 'id': 'ivM7A7SpDOs', + 'ext': 'mp4', + 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016', + 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a', + 'upload_date': '20160424', + 'uploader': 'RTVCM Castilla-La Mancha', + 'uploader_id': 'RTVCM', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, @@ -48,6 +63,12 @@ class TwentyMinutenIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + youtube_url = self._html_search_regex( + r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', + webpage, 'YouTube embed URL', default=None) + if youtube_url is not None: + return self.url_result(youtube_url, 'Youtube') + title = self._html_search_regex( r'<h1>.*?<span>(.+?)</span></h1>', webpage, 'title', default=None) From 52af8f222bc4f067b4c5e7a977a64345d35ae4fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 04:01:21 +0600 Subject: [PATCH 10/30] [cwtv] Relax _VALID_URL (Closes #9327) --- youtube_dl/extractor/cwtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f5cefd966..ebd14cb16 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ 
-9,7 +9,7 @@ from ..utils import ( class CWTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' _TESTS = [{ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', 'info_dict': { @@ -48,6 +48,9 @@ class CWTVIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', + 'only_matching': True, }] def _real_extract(self, url): From 618c71dc64086f751b6ae87d5f32687e02a54e58 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 15:00:02 +0800 Subject: [PATCH 11/30] [cloudy] New domain name for the test_cloudy_1 I'm sure whether videoraj.ch still works or not, so keep it. 
--- youtube_dl/extractor/cloudy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9e267e6c0..9a28ef354 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -19,7 +19,7 @@ from ..utils import ( class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec and videoraj.ch' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/ + https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' @@ -37,7 +37,7 @@ class CloudyIE(InfoExtractor): } }, { - 'url': 'http://www.videoraj.ch/v/47f399fd8bb60', + 'url': 'http://www.videoraj.to/v/47f399fd8bb60', 'md5': '7d0f8799d91efd4eda26587421c3c3b0', 'info_dict': { 'id': '47f399fd8bb60', From a5941305b6ba0921ea4f34641dd9095372dd1c1d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 16:03:08 +0800 Subject: [PATCH 12/30] [mwave] Coding style --- youtube_dl/extractor/mwave.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 6485c6928..a103e0323 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -10,6 +10,7 @@ from ..utils import ( class MwaveIE(InfoExtractor): _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' + _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s' _TEST = { 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', # md5 is unstable @@ -59,7 +60,7 @@ class MwaveIE(InfoExtractor): class MwaveMeetGreetIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>[0-9]+)' + _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>\d+)' _TEST = { 'url': 'http://mwave.interest.me/meetgreet/view/256', 'info_dict': 
{ @@ -76,6 +77,8 @@ class MwaveMeetGreetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - clip_id = self._html_search_regex(r'<iframe src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)', webpage, 'clip ID') - clip_url = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id={0}'.format(clip_id) + clip_id = self._html_search_regex( + r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)', + webpage, 'clip ID') + clip_url = MwaveIE._URL_TEMPLATE % clip_id return self.url_result(clip_url, 'Mwave', clip_id) From 7f776fa4b510b7973e08f06de556fa39cb5946e5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 17:08:41 +0800 Subject: [PATCH 13/30] [yandexmusic] Skip tests as Travis CI blocked --- youtube_dl/extractor/yandexmusic.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 7a90cc60c..0d32a612f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -22,6 +22,12 @@ class YandexMusicBaseIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) + def _download_webpage(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) + if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' 
in webpage: + raise ExtractorError('Blocked by YandexMusic', expected=True) + return webpage + def _download_json(self, *args, **kwargs): response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) self._handle_error(response) @@ -47,7 +53,8 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'album_artist': 'Carlo Ambrosio', 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', 'release_year': '2009', - } + }, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _get_track_url(self, storage_dir, track_id): @@ -139,6 +146,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', }, 'playlist_count': 50, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _real_extract(self, url): @@ -171,6 +179,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, + 'skip': 'Travis CI servers blocked by YandexMusic', }, { # playlist exceeding the limit of 150 tracks shipped with webpage (see # https://github.com/rg3/youtube-dl/issues/6666) @@ -180,6 +189,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'title': 'Музыка 90-х', }, 'playlist_count': 310, + 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): From 0cbcbdd89dbc3573ecfcf68496c54bd84804967d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 17:51:20 +0800 Subject: [PATCH 14/30] [nuvid] Fix extraction Closes #7620 --- youtube_dl/extractor/nuvid.py | 44 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index 9fa7cefad..ab6bfcd7f 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -5,8 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( parse_duration, - sanitized_Request, - unified_strdate, ) @@ -20,7 +18,6 @@ class 
NuvidIE(InfoExtractor): 'ext': 'mp4', 'title': 'Horny babes show their awesome bodeis and', 'duration': 129, - 'upload_date': '20140508', 'age_limit': 18, } } @@ -28,28 +25,31 @@ class NuvidIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - formats = [] + page_url = 'http://m.nuvid.com/video/%s' % video_id + webpage = self._download_webpage( + page_url, video_id, 'Downloading video page') + # When dwnld_speed exists and has a value larger than the MP4 file's + # bitrate, Nuvid returns the MP4 URL + # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm + self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') + mp4_webpage = self._download_webpage( + page_url, video_id, 'Downloading video page for MP4 format') - for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]: - request = sanitized_Request( - 'http://m.nuvid.com/play/%s' % video_id) - request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed) - webpage = self._download_webpage( - request, video_id, 'Downloading %s page' % format_id) - video_url = self._html_search_regex( - r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False) - if not video_url: - continue + html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', + video_url = self._html_search_regex(html5_video_re, webpage, video_id) + mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) + formats = [{ + 'url': video_url, + }] + if mp4_video_url != video_url: formats.append({ - 'url': video_url, - 'format_id': format_id, + 'url': mp4_video_url, }) - webpage = self._download_webpage( - 'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page') title = self._html_search_regex( [r'<span title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip() + r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', + 
r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() thumbnails = [ { 'url': thumb_url, @@ -57,9 +57,8 @@ class NuvidIE(InfoExtractor): ] thumbnail = thumbnails[0]['url'] if thumbnails else None duration = parse_duration(self._html_search_regex( - r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False)) - upload_date = unified_strdate(self._html_search_regex( - r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False)) + [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', + r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) return { 'id': video_id, @@ -67,7 +66,6 @@ class NuvidIE(InfoExtractor): 'thumbnails': thumbnails, 'thumbnail': thumbnail, 'duration': duration, - 'upload_date': upload_date, 'age_limit': 18, 'formats': formats, } From eebe6b382eb6bd9e8118b616f3dde48c294e3b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 21:37:34 +0600 Subject: [PATCH 15/30] [yandexmusic] Improve error handling --- youtube_dl/extractor/yandexmusic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 0d32a612f..b0e68a087 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -18,9 +18,10 @@ from ..utils import ( class YandexMusicBaseIE(InfoExtractor): @staticmethod def _handle_error(response): - error = response.get('error') - if error: - raise ExtractorError(error, expected=True) + if isinstance(response, dict): + error = response.get('error') + if error: + raise ExtractorError(error, expected=True) def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) From 4b537629143c8f51c5814c650227971c438b12e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 
21:45:33 +0600 Subject: [PATCH 16/30] [yandexmusic] Clarify blockage --- youtube_dl/extractor/yandexmusic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index b0e68a087..a33fe3d83 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -26,7 +26,11 @@ class YandexMusicBaseIE(InfoExtractor): def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: - raise ExtractorError('Blocked by YandexMusic', expected=True) + raise ExtractorError( + 'YandexMusic asks you to solve a CAPTCHA: go to ' + 'https://music.yandex.ru/ and solve it, then export ' + 'cookies and pass cookie file to youtube-dl with --cookies', + expected=True) return webpage def _download_json(self, *args, **kwargs): From 0ba9e3ca2233d018d695bac4eebe0e34043a7ec9 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 28 Apr 2016 17:44:33 +0100 Subject: [PATCH 17/30] [viewster] extract formats for videos with multiple audios/subtitles --- youtube_dl/extractor/viewster.py | 147 +++++++++++++++++-------------- 1 file changed, 80 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 6edc2c44e..1813b81d6 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -78,11 +78,11 @@ class ViewsterIE(InfoExtractor): _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): + def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}): request = sanitized_Request(url) request.add_header('Accept', self._ACCEPT_HEADER) request.add_header('Auth-token', self._AUTH_TOKEN) - return 
super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) + return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query) def _real_extract(self, url): video_id = self._match_id(url) @@ -117,72 +117,85 @@ class ViewsterIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) formats = [] - manifest_url = None - m3u8_formats = [] - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' - % (entry_id, compat_urllib_parse.quote(media_type)), - video_id, 'Downloading %s JSON' % media_type, fatal=False) - if not media: - continue - video_url = media.get('Uri') - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'f4m': - manifest_url = video_url - video_url += '&' if '?' in video_url else '?' - video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds')) - elif ext == 'm3u8': - manifest_url = video_url - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False) # m3u8 sometimes fail - if m3u8_formats: - formats.extend(m3u8_formats) - else: - qualities_basename = self._search_regex( - '/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if not qualities_basename: - continue - QUALITIES_RE = r'((,\d+k)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if not qualities: - continue - qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) - http_url_basename = url_basename(video_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - 
m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for language_set in info.get('LanguageSets', []): + manifest_url = None + m3u8_formats = [] + audio = language_set.get('Audio') or '' + subtitle = language_set.get('Subtitle') or '' + base_format_id = audio + if subtitle: + base_format_id += '-%s' % subtitle - if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): + def concat(suffix, sep='-'): + return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix + + for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): + media = self._download_json( + 'https://public-api.viewster.com/movies/%s/video' % entry_id, + video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={ + 'mediaType': media_type, + 'language': audio, + 'subtitle': subtitle, + }) + if not media: + continue + video_url = media.get('Uri') + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + manifest_url = video_url + video_url += '&' if '?' in video_url else '?' 
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=concat('hds'))) + elif ext == 'm3u8': + manifest_url = video_url + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=concat('hls'), + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) + else: + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if not qualities_basename: + continue + QUALITIES_RE = r'((,\d+k)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if not qualities: + continue + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) + http_url_basename = url_basename(video_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) + + if not formats and not info.get('VODSettings'): self.raise_geo_restricted() self._sort_formats(formats) From e757fb3d053a195da4084c08a59a7b17b08ba598 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 28 Apr 2016 18:42:20 +0100 Subject: [PATCH 18/30] [crunchyroll] improve extraction - extract more metadata(series, episode, episode_number) - reduce duplicate requests for extracting formats - 
remove duplicate formats --- youtube_dl/extractor/crunchyroll.py | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8ae3f2890..dd753c7c3 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,7 +11,6 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -306,28 +305,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage, 'video_uploader', fatal=False) - playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) - playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) - playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') - - stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') - video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) - formats = [] - for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): + video_encode_ids = [] + for fmt in re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' - % (stream_id, stream_format, stream_quality), + % (video_id, stream_format, stream_quality), compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) 
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) stream_info = streamdata.find('./{default}preload/stream_info') + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) video_url = xpath_text(stream_info, './host') video_play_path = xpath_text(stream_info, './file') if not video_url or not video_play_path: @@ -360,15 +355,25 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text }) formats.append(format_info) + metadata = self._download_xml( + 'http://www.crunchyroll.com/xml', video_id, + note='Downloading media info', query={ + 'req': 'RpcApiVideoPlayer_GetMediaMetadata', + 'media_id': video_id, + }) + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': video_thumbnail, + 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, + 'series': xpath_text(metadata, 'series_title'), + 'episode': xpath_text(metadata, 'episode_title'), + 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), 'subtitles': subtitles, 'formats': formats, } From 497971cd4a8407651debfb2fd4b10fc4009b0f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 01:28:07 +0600 Subject: [PATCH 19/30] [yandexmusic] Clarify blockage even more --- youtube_dl/extractor/yandexmusic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index a33fe3d83..ce3723b55 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -27,9 +27,12 @@ class YandexMusicBaseIE(InfoExtractor): webpage = 
super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: raise ExtractorError( - 'YandexMusic asks you to solve a CAPTCHA: go to ' - 'https://music.yandex.ru/ and solve it, then export ' - 'cookies and pass cookie file to youtube-dl with --cookies', + 'YandexMusic has considered youtube-dl requests automated and ' + 'asks you to solve a CAPTCHA. You can either wait for some ' + 'time until unblocked and optionally use --sleep-interval ' + 'in future or alternatively you can go to https://music.yandex.ru/ ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies', expected=True) return webpage From 683d892bf9332df1a255c673bca56a8f5487292a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 01:30:53 +0600 Subject: [PATCH 20/30] [viewster] Remove unused import --- youtube_dl/extractor/viewster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 1813b81d6..a93196a07 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( From 72670c39decc296a3ee757301dc70389674d19c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 04:46:23 +0600 Subject: [PATCH 21/30] [arte:+7] Fix typo in _VALID_URL --- youtube_dl/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index a9e3266dc..881cacfab 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 
'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' @classmethod def _extract_url_info(cls, url): From 31ff3c074eddf4078b6eb49281830875eb4e65a1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 13:36:52 +0800 Subject: [PATCH 22/30] [sexykarma] Remove the extractor Its domain name is on sale. Closes #9317 --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/sexykarma.py | 121 ----------------------------- 2 files changed, 122 deletions(-) delete mode 100644 youtube_dl/extractor/sexykarma.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88405f070..41ff1e7a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,7 +657,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py deleted file mode 100644 index e33483674..000000000 --- a/youtube_dl/extractor/sexykarma.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - parse_duration, - int_or_none, -) - - -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 
'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', - 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', - 'ext': 'mp4', - 'title': 'Taking a quick pee.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - video_url = self._html_search_regex( - r"url: escape\('([^']+)'\)", webpage, 'url') - - title = self._html_search_regex( - r'<h2 class="he2"><span>(.*?)</span>', - webpage, 'title') - thumbnail = self._html_search_regex( - r'<span 
id="container"><img\s+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'class="aupa">\s*(.*?)</a>', - webpage, 'uploader') - upload_date = unified_strdate(self._html_search_regex( - r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) - - duration = parse_duration(self._search_regex( - r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', - webpage, 'duration', fatal=False)) - - view_count = int_or_none(self._search_regex( - r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'comment count', fatal=False)) - - categories = re.findall( - r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', - webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - 'age_limit': 18, - } From f5535ed0e3537acee90820c98d6ca474d437d7d0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 14:24:07 +0800 Subject: [PATCH 23/30] [orf] Skip the expired test --- youtube_dl/extractor/orf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 66c75f8b3..4e3864f0d 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -185,6 +185,7 @@ class ORFFM4IE(InfoExtractor): 'timestamp': 1452456073, 'upload_date': '20160110', }, + 'skip': 'Live streams on FM4 got deleted soon', } def _real_extract(self, url): From 5819edef034819b76b8eec6a0cdf7b29cc9ddff3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 14:27:15 +0800 Subject: [PATCH 
24/30] [ooyala] Skip an invalid test Ooyala is used by lots of extractors and its correctness can be verified by these websites. --- youtube_dl/extractor/ooyala.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 16f040191..95e982897 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -96,6 +96,8 @@ class OoyalaIE(OoyalaBaseIE): 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', 'duration': 853.386, }, + # The video in the original webpage now uses PlayWire + 'skip': 'Ooyala said: movie expired', }, { # Only available for ipad 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', From 1910077ed77a270fea8e368c3815b23cee254f85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 17:59:23 +0800 Subject: [PATCH 25/30] Revert "[sexykarma] Remove the extractor" This reverts commit 31ff3c074eddf4078b6eb49281830875eb4e65a1. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sexykarma.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 youtube_dl/extractor/sexykarma.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 41ff1e7a5..88405f070 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,6 +657,7 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE +from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py new file mode 100644 index 000000000..e33483674 --- /dev/null +++ b/youtube_dl/extractor/sexykarma.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + parse_duration, + int_or_none, +) + + +class SexyKarmaIE(InfoExtractor): + IE_DESC = 'Sexy Karma and Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', + 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', + 'info_dict': { + 'id': 'yHI70cOyIHt', + 'display_id': 'taking-a-quick-pee', + 'ext': 'mp4', + 'title': 'Taking a quick pee.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'wildginger7', + 'upload_date': '20141008', + 'duration': 22, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', + 'md5': 'dd216c68d29b49b12842b9babe762a5d', + 'info_dict': { + 'id': '8Id6EZPbuHf', + 
'display_id': 'pot-pixie-tribute', + 'ext': 'mp4', + 'title': 'pot_pixie tribute', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'banffite', + 'upload_date': '20141013', + 'duration': 16, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', + 'md5': '9afb80675550406ed9a63ac2819ef69d', + 'info_dict': { + 'id': 'dW2mtctxJfs', + 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', + 'ext': 'mp4', + 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Don', + 'upload_date': '20140213', + 'duration': 83, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._html_search_regex( + r"url: escape\('([^']+)'\)", webpage, 'url') + + title = self._html_search_regex( + r'<h2 class="he2"><span>(.*?)</span>', + webpage, 'title') + thumbnail = self._html_search_regex( + r'<span id="container"><img\s+src="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + uploader = self._html_search_regex( + r'class="aupa">\s*(.*?)</a>', + webpage, 'uploader') + upload_date = unified_strdate(self._html_search_regex( + r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) + + duration = parse_duration(self._search_regex( + r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'view count', fatal=False)) + comment_count = 
int_or_none(self._search_regex( + r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'comment count', fatal=False)) + + categories = re.findall( + r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', + webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': 18, + } From 14638e291511c3305b70dce64e9bd97686e9da93 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 18:17:08 +0800 Subject: [PATCH 26/30] [sexykarma] Rename to WatchIndianPornIE and fix extraction --- youtube_dl/extractor/extractors.py | 2 +- .../{sexykarma.py => watchindianporn.py} | 63 +++++-------------- 2 files changed, 17 insertions(+), 48 deletions(-) rename youtube_dl/extractor/{sexykarma.py => watchindianporn.py} (54%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88405f070..3adcd41c4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,7 +657,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE @@ -918,6 +917,7 @@ from .vulture import VultureIE from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE +from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, WDRMobileIE, diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/watchindianporn.py similarity index 54% rename from youtube_dl/extractor/sexykarma.py rename to youtube_dl/extractor/watchindianporn.py index 
e33483674..5d3b5bdb4 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/watchindianporn.py @@ -11,61 +11,27 @@ from ..utils import ( ) -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', +class WatchIndianPornIE(InfoExtractor): + IE_DESC = 'Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TEST = { + 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html', + 'md5': '249589a164dde236ec65832bfce17440', 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', + 'id': 'RZa2avywNPa', + 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', 'ext': 'mp4', - 'title': 'Taking a quick pee.', + 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, + 'uploader': 'LoveJay', + 'upload_date': '20160428', + 'duration': 226, 'view_count': int, 'comment_count': int, 'categories': list, 'age_limit': 18, } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 
'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -109,6 +75,9 @@ class SexyKarmaIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'url': video_url, + 'http_headers': { + 'Referer': url, + }, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, From 67167920db50e818c9fca20579c8a05eb2218f86 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 29 Apr 2016 11:14:42 +0100 Subject: [PATCH 27/30] [viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting --- youtube_dl/extractor/extractors.py | 8 ++--- youtube_dl/extractor/generic.py | 10 +++--- .../extractor/{snagfilms.py => viewlift.py} | 35 +++++++++++++------ 3 files changed, 34 insertions(+), 19 deletions(-) rename youtube_dl/extractor/{snagfilms.py => viewlift.py} (81%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3adcd41c4..b1b7f9b42 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -673,10 +673,6 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) -from .snagfilms import ( - SnagFilmsIE, - SnagFilmsEmbedIE, -) from .snotr import SnotrIE from .sohu import SohuIE from .soundcloud import ( @@ -879,6 +875,10 @@ from .vidme import ( ) from .vidzi import VidziIE from .vier import 
VierIE, VierVideosIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) from .viewster import ViewsterIE from .viidea import ViideaIE from .vimeo import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a95501d86..0f1eb7fa6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -51,7 +51,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE -from .snagfilms import SnagFilmsEmbedIE +from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE @@ -1924,10 +1924,10 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) - # Look for SnagFilms embeds - snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) - if snagfilms_url: - return self.url_result(snagfilms_url) + # Look for ViewLift embeds + viewlift_url = ViewLiftEmbedIE._extract_url(webpage) + if viewlift_url: + return self.url_result(viewlift_url) # Look for JWPlatform embeds jwplatform_url = JWPlatformIE._extract_url(webpage) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/viewlift.py similarity index 81% rename from youtube_dl/extractor/snagfilms.py rename to youtube_dl/extractor/viewlift.py index 6977afb27..dd4a13a4a 100644 --- a/youtube_dl/extractor/snagfilms.py +++ b/youtube_dl/extractor/viewlift.py @@ -13,8 +13,12 @@ from ..utils import ( ) -class SnagFilmsEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' +class ViewLiftBaseIE(InfoExtractor): + _DOMAINS_REGEX = '(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + + +class ViewLiftEmbedIE(ViewLiftBaseIE): + _VALID_URL = 
r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -40,7 +44,7 @@ class SnagFilmsEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, webpage) if mobj: return mobj.group('url') @@ -55,6 +59,7 @@ class SnagFilmsEmbedIE(InfoExtractor): 'Film %s is not playable in your area.' % video_id, expected=True) formats = [] + has_bitrate = False for source in self._parse_json(js_to_json(self._search_regex( r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): file_ = source.get('file') @@ -63,22 +68,25 @@ class SnagFilmsEmbedIE(InfoExtractor): type_ = source.get('type') ext = determine_ext(file_) format_id = source.get('label') or ext - if all(v == 'm3u8' for v in (type_, ext)): + if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], file_, 'bitrate', default=None)) + if not has_bitrate and bitrate: + has_bitrate = True height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ 'url': file_, - 'format_id': format_id, + 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), 'tbr': bitrate, 'height': height, }) - self._sort_formats(formats) + field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') + self._sort_formats(formats, field_preference) title = self._search_regex( [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)'], @@ -91,8 
+99,8 @@ class SnagFilmsEmbedIE(InfoExtractor): } -class SnagFilmsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P[^?#]+)' +class ViewLiftIE(ViewLiftBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P%s)/(?:films/title|show|(?:news/)?videos?)/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -127,10 +135,16 @@ class SnagFilmsIE(InfoExtractor): # Film is not available. 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', 'only_matching': True, + }, { + 'url': 'http://www.winnersview.com/videos/the-good-son', + 'only_matching': True, + }, { + 'url': 'http://www.kesari.tv/news/video/1461919076414', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + domain, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -170,7 +184,7 @@ class SnagFilmsIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), 'id': film_id, 'display_id': display_id, 'title': title, @@ -178,4 +192,5 @@ class SnagFilmsIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'categories': categories, + 'ie_key': 'ViewLiftEmbed', } From 065216d94f59953a228d2683d3bafe4241fd1e29 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 11:46:42 +0100 Subject: [PATCH 28/30] [crunchyroll] reduce requests for formats extraction --- youtube_dl/extractor/crunchyroll.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index dd753c7c3..184ba6896 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -26,6 +26,7 @@ from ..utils import ( unified_strdate, 
urlencode_postdata, xpath_text, + extract_attributes, ) from ..aes import ( aes_cbc_decrypt, @@ -305,9 +306,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, 'video_uploader', fatal=False) - formats = [] + available_fmts = [] + for a, fmt in re.findall(r'(]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage) video_encode_ids = [] - for fmt in re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage): + formats = [] + for fmt in available_fmts: stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( From b24d6336a797b99339c12a0aa1b431755e22e8cf Mon Sep 17 00:00:00 2001 From: Kagami Hiiragi Date: Tue, 26 Apr 2016 17:30:24 +0300 Subject: [PATCH 29/30] [vlive] Add support for live videos --- youtube_dl/extractor/common.py | 8 ++- youtube_dl/extractor/vlive.py | 98 ++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a285ee7d8..2763d2ffe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1061,7 +1061,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True): + fatal=True, live=False): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -1139,7 +1139,11 @@ class InfoExtractor(object): if m3u8_id: format_id.append(m3u8_id) last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None - format_id.append(last_media_name if last_media_name else '%d' % (tbr 
if tbr else len(formats))) + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. + if not live: + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index baf39bb2c..2151696ea 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,8 +1,11 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import division, unicode_literals +import re +import time from .common import InfoExtractor from ..utils import ( + ExtractorError, dict_get, float_or_none, int_or_none, @@ -31,16 +34,77 @@ class VLiveIE(InfoExtractor): webpage = self._download_webpage( 'http://www.vlive.tv/video/%s' % video_id, video_id) - long_video_id = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"([^"]+)"', - webpage, 'long video id') + # UTC+x - UTC+9 (KST) + tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone + tz_offset = -tz // 60 - 9 * 60 + self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset) - key = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"[^"]+"\s*,\s*"([^"]+)"', - webpage, 'key') + status_params = self._download_json( + 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, + video_id, 'Downloading JSON status', + headers={'Referer': url}) + status = status_params.get('status') + air_start = status_params.get('onAirStartAt', '') + is_live = status_params.get('isLive') + video_params = self._search_regex( + r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)', + webpage, 'video params') + live_params, long_video_id, key = re.split( + r'"\s*,\s*"', video_params)[1:4] + + if status == 
'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': + live_params = self._parse_json('"%s"' % live_params, video_id) + live_params = self._parse_json(live_params, video_id) + return self._live(video_id, webpage, live_params) + elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': + if long_video_id and key: + return self._replay(video_id, webpage, long_video_id, key) + elif is_live: + status = 'LIVE_END' + else: + status = 'COMING_SOON' + + if status == 'LIVE_END': + raise ExtractorError('Uploading for replay. Please wait...', + expected=True) + elif status == 'COMING_SOON': + raise ExtractorError('Coming soon! %s' % air_start, expected=True) + elif status == 'CANCELED': + raise ExtractorError('We are sorry, ' + 'but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status %s' % status) + + def _get_common_fields(self, webpage): title = self._og_search_title(webpage) + creator = self._html_search_regex( + r']+class="info_area"[^>]*>\s*]*>([^<]+)', + webpage, 'creator', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + return { + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + } + def _live(self, video_id, webpage, live_params): + formats = [] + for vid in live_params.get('resolutions', []): + formats.extend(self._extract_m3u8_formats( + vid['cdnUrl'], video_id, 'mp4', + m3u8_id=vid.get('name'), + fatal=False, live=True)) + self._sort_formats(formats) + + return dict(self._get_common_fields(webpage), + id=video_id, + formats=formats, + is_live=True, + ) + + def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' % compat_urllib_parse_urlencode({ @@ -62,11 +126,6 @@ class VLiveIE(InfoExtractor): } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - creator = self._html_search_regex( 
- r']+class="info_area"[^>]*>\s*]*>([^<]+)', - webpage, 'creator', fatal=False) - view_count = int_or_none(playinfo.get('meta', {}).get('count')) subtitles = {} @@ -77,12 +136,9 @@ class VLiveIE(InfoExtractor): 'ext': 'vtt', 'url': caption['source']}] - return { - 'id': video_id, - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'formats': formats, - 'subtitles': subtitles, - } + return dict(self._get_common_fields(webpage), + id=video_id, + formats=formats, + view_count=view_count, + subtitles=subtitles, + ) From 9d186afac818645490122aa7457f247c31c601bf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 29 Apr 2016 19:29:00 +0800 Subject: [PATCH 30/30] [vlive] Coding style and PEP8 --- youtube_dl/extractor/vlive.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 2151696ea..7f9e99ec2 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -3,10 +3,11 @@ from __future__ import division, unicode_literals import re import time + from .common import InfoExtractor from ..utils import ( - ExtractorError, dict_get, + ExtractorError, float_or_none, int_or_none, ) @@ -99,10 +100,9 @@ class VLiveIE(InfoExtractor): self._sort_formats(formats) return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - is_live=True, - ) + id=video_id, + formats=formats, + is_live=True) def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( @@ -137,8 +137,7 @@ class VLiveIE(InfoExtractor): 'url': caption['source']}] return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - view_count=view_count, - subtitles=subtitles, - ) + id=video_id, + formats=formats, + view_count=view_count, + subtitles=subtitles)