From fb72ec58ae2612590d661c9943fe6b2fa0864401 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 30 Jul 2015 17:34:38 +0100 Subject: [PATCH 01/68] [extractor/common] do not process f4m manifest that contain akamai playerVerificationChallenge --- youtube_dl/extractor/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dc5080504..e3d1dd076 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -856,6 +856,13 @@ class InfoExtractor(object): # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source) + # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') From abc1723edd03d38b256e012d465e3343064f5682 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 25 Apr 2016 22:24:40 +0800 Subject: [PATCH 02/68] [unistra] Sort formats Originally URLs are passed to set() and not sorted, so the result is not deterministic, causing occasional FAILs on Travis CI. 
--- youtube_dl/extractor/unistra.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 66d9f1bf3..a724cdbef 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -49,6 +49,7 @@ class UnistraIE(InfoExtractor): 'format_id': format_id, 'quality': quality(format_id) }) + self._sort_formats(formats) title = self._html_search_regex( r'UTV - (.*?)</', webpage, 'title') From 2beff95da5fb28440d26a3dee5de575c792d133c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 22:26:19 +0800 Subject: [PATCH 03/68] [nrk] Comment out unstable MD5 checksums Both are Akamai f4f fragments. --- youtube_dl/extractor/nrk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 9df200822..51dfc27ac 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -23,7 +23,7 @@ class NRKIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'bccd850baebefe23b56d708a113229c2', + # MD5 is unstable 'info_dict': { 'id': '150533', 'ext': 'flv', @@ -34,7 +34,7 @@ class NRKIE(InfoExtractor): }, { 'url': 'http://www.nrk.no/video/PS*154915', - 'md5': '0b1493ba1aae7d9579a5ad5531bc395a', + # MD5 is unstable 'info_dict': { 'id': '154915', 'ext': 'flv', From 6bdc2d5358c2843e3be4d073b2005e5196519664 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 22:27:25 +0800 Subject: [PATCH 04/68] [mitele] Comment out unstable MD5 Also Akamai f4f fragments --- youtube_dl/extractor/mitele.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 7b4581dc5..3589c223d 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -15,9 +15,9 @@ class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = 
r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' - _TESTS = [{ + _TEST = { 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '0ff1a13aebb35d9bc14081ff633dd324', + # MD5 is unstable 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', @@ -27,7 +27,7 @@ class MiTeleIE(InfoExtractor): 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - }] + } def _real_extract(self, url): display_id = self._match_id(url) From 4645432d7a92bfb950571dde5dd690110e0f2284 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 22:48:17 +0800 Subject: [PATCH 05/68] [eagleplatform] Checking direct HTTP links Sometimes they fail with 404 --- youtube_dl/extractor/eagleplatform.py | 7 +++++-- youtube_dl/extractor/generic.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 0f8c73fd7..113a4966f 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -23,7 +23,7 @@ class EaglePlatformIE(InfoExtractor): _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', - 'md5': '881ee8460e1b7735a8be938e2ffb362b', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -109,8 +109,11 @@ class EaglePlatformIE(InfoExtractor): mobj = re.search('/([^/]+)/index\.m3u8', m3u8_format['url']) if mobj: http_format = m3u8_format.copy() + video_url = mp4_url.replace(mp4_url_basename, mobj.group(1)) + if not self._is_valid_url(video_url, video_id): + continue http_format.update({ - 'url': mp4_url.replace(mp4_url_basename, mobj.group(1)), + 'url': video_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) diff --git 
a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c63bdbd08..a95501d86 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -887,6 +887,7 @@ class GenericIE(InfoExtractor): # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -901,6 +902,7 @@ class GenericIE(InfoExtractor): # ClipYou (Eagle.Platform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '12820', 'ext': 'mp4', From ad58942d57996f7f43601f22c85b8c6a9afe1b09 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 23:35:05 +0800 Subject: [PATCH 06/68] [muzu] Remove extractor MUZU is shutting down in October 2015. [1] [1] http://www.musicbusinessworldwide.com/youtube-rival-muzu-is-heading-into-liquidation/ --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/muzu.py | 63 ------------------------------ 2 files changed, 64 deletions(-) delete mode 100644 youtube_dl/extractor/muzu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6de3438fc..8b215c5ab 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,7 +439,6 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .muzu import MuzuTVIE from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py deleted file mode 100644 index cbc800481..000000000 --- a/youtube_dl/extractor/muzu.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import 
compat_urllib_parse_urlencode - - -class MuzuTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)' - IE_NAME = 'muzu.tv' - - _TEST = { - 'url': 'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', - 'md5': '98f8b2c7bc50578d6a0364fff2bfb000', - 'info_dict': { - 'id': '1981454', - 'ext': 'mp4', - 'title': 'Cat Walk (Original Mix)', - 'description': 'md5:90e868994de201b2570e4e5854e19420', - 'uploader': 'MarcAshken featuring SOS', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - info_data = compat_urllib_parse_urlencode({ - 'format': 'json', - 'url': url, - }) - info = self._download_json( - 'http://www.muzu.tv/api/oembed/?%s' % info_data, - video_id, 'Downloading video info') - - player_info = self._download_json( - 'http://player.muzu.tv/player/playerInit?ai=%s' % video_id, - video_id, 'Downloading player info') - video_info = player_info['videos'][0] - for quality in ['1080', '720', '480', '360']: - if video_info.get('v%s' % quality): - break - - data = compat_urllib_parse_urlencode({ - 'ai': video_id, - # Even if each time you watch a video the hash changes, - # it seems to work for different videos, and it will work - # even if you use any non empty string as a hash - 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', - 'device': 'web', - 'qv': quality, - }) - video_url_info = self._download_json( - 'http://player.muzu.tv/player/requestVideo?%s' % data, - video_id, 'Downloading video url') - video_url = video_url_info['url'] - - return { - 'id': video_id, - 'title': info['title'], - 'url': video_url, - 'thumbnail': info['thumbnail_url'], - 'description': info['description'], - 'uploader': info['author_name'], - } From e3de3d6f2f9c82683e76b6bc12697aa7264372ca Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 23:49:12 +0800 Subject: [PATCH 07/68] [normalboots] Fix extraction Now it's using ScreenwaveMedia --- 
youtube_dl/extractor/normalboots.py | 18 +++++++++--------- youtube_dl/extractor/screenwavemedia.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 77e091072..af44c3bb5 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .screenwavemedia import ScreenwaveMediaIE from ..utils import ( unified_strdate, @@ -12,7 +13,6 @@ class NormalbootsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', - 'md5': '8bf6de238915dd501105b44ef5f1e0f6', 'info_dict': { 'id': 'home-alone-games-jontron', 'ext': 'mp4', @@ -22,9 +22,10 @@ class NormalbootsIE(InfoExtractor): 'upload_date': '20140125', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, + 'add_ie': ['ScreenwaveMedia'], } def _real_extract(self, url): @@ -38,16 +39,15 @@ class NormalbootsIE(InfoExtractor): r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', webpage, 'date', fatal=False)) - player_url = self._html_search_regex( - r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', - webpage, 'player url') - player_page = self._download_webpage(player_url, video_id) - video_url = self._html_search_regex( - r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file') + screenwavemedia_url = self._html_search_regex( + ScreenwaveMediaIE.EMBED_PATTERN, webpage, 'screenwave URL', + group='url') return { + '_type': 'url_transparent', 'id': video_id, - 'url': video_url, + 'url': screenwavemedia_url, + 'ie_key': ScreenwaveMediaIE.ie_key(), 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), diff --git 
a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 44b0bbee6..40333c825 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -12,7 +12,7 @@ from ..utils import ( class ScreenwaveMediaIE(InfoExtractor): - _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = r'(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' _TESTS = [{ 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', From 749b0046a8664d023ff622dd38844f5c8632f3f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Apr 2016 22:05:47 +0600 Subject: [PATCH 08/68] [ok] Allow embeds without title (Closes #9303) --- youtube_dl/extractor/odnoklassniki.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index f9e064a60..cd614f427 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -60,6 +60,22 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader': 'Алина П', 'age_limit': 0, }, + }, { + # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field) + 'url': 'http://ok.ru/video/62036049272859-0', + 'info_dict': { + 'id': '62036049272859-0', + 'ext': 'mp4', + 'title': 'МУЗЫКА ДОЖДЯ .', + 'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0', + 'upload_date': '20120106', + 'uploader_id': '473534735899', + 'uploader': 'МARINA D', + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -106,7 
+122,14 @@ class OdnoklassnikiIE(InfoExtractor): video_id, 'Downloading metadata JSON') movie = metadata['movie'] - title = movie['title'] + + # Some embedded videos may not contain title in movie dict (e.g. + # http://ok.ru/video/62036049272859-0) thus we allow missing title + # here and it's going to be extracted later by an extractor that + # will process the actual embed. + provider = metadata.get('provider') + title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title') + thumbnail = movie.get('poster') duration = int_or_none(movie.get('duration')) @@ -137,7 +160,7 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': age_limit, } - if metadata.get('provider') == 'USER_YOUTUBE': + if provider == 'USER_YOUTUBE': info.update({ '_type': 'url_transparent', 'url': movie['contentId'], From c9fd5306709d0c03487a3b0163b7a33cab6774aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Apr 2016 22:15:15 +0600 Subject: [PATCH 09/68] [ok] Extract start time --- youtube_dl/extractor/odnoklassniki.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index cd614f427..986708e75 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, unified_strdate, @@ -32,7 +36,7 @@ class OdnoklassnikiIE(InfoExtractor): 'skip': 'Video has been blocked', }, { # metadataUrl - 'url': 'http://ok.ru/video/63567059965189-0', + 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', 'md5': '9676cf86eff5391d35dea675d224e131', 'info_dict': { 'id': '63567059965189-0', @@ -44,6 +48,7 @@ class 
OdnoklassnikiIE(InfoExtractor): 'uploader': '☭ Андрей Мещанинов ☭', 'like_count': int, 'age_limit': 0, + 'start_time': 5, }, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) @@ -94,6 +99,9 @@ class OdnoklassnikiIE(InfoExtractor): }] def _real_extract(self, url): + start_time = int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) + video_id = self._match_id(url) webpage = self._download_webpage( @@ -158,6 +166,7 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader_id': uploader_id, 'like_count': like_count, 'age_limit': age_limit, + 'start_time': start_time, } if provider == 'USER_YOUTUBE': From f1f879098a38c786d78927df8915b547f7ac3569 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Tue, 26 Apr 2016 13:39:53 +0100 Subject: [PATCH 10/68] [viewster] extract more metadata for http formats --- youtube_dl/extractor/viewster.py | 37 ++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 7839225d4..6edc2c44e 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -118,6 +118,7 @@ class ViewsterIE(InfoExtractor): formats = [] manifest_url = None + m3u8_formats = [] for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): media = self._download_json( 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' @@ -154,18 +155,32 @@ class ViewsterIE(InfoExtractor): 'qualities', default=None) if not qualities: continue - qualities = qualities.strip(',').split(',') - http_template = re.sub(QUALITIES_RE, r'%s', qualities_basename) + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) http_url_basename = url_basename(video_url) - for q in qualities: - tbr = int_or_none(self._search_regex( - r'(\d+)k', q, 'bitrate', 
default=None)) - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http' + ('-%d' % tbr if tbr else ''), - 'tbr': tbr, - }) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): self.raise_geo_restricted() From 175c2e9ec326f9ef820413837608eb4f5c8c5961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Apr 2016 22:29:29 +0600 Subject: [PATCH 11/68] [youtube:search_url] Reimplement in terms of youtube:playlistbase --- youtube_dl/extractor/youtube.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 44f98d294..b7c3cb63f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2139,10 +2139,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _VIDEO_RE = 
r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2157,32 +2158,8 @@ class YoutubeSearchURLIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': part_title, - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } + return self.playlist_result(self._process_page(webpage), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From 7464360379a1a3fc6ba3228f54dd4853df349142 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Wed, 27 Apr 2016 00:16:48 +0600 Subject: [PATCH 12/68] [README.md] Add FAQ entry on output template conflicts --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e062444b3..fb57b0323 100644 --- a/README.md +++ b/README.md @@ -697,6 +697,10 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. 
Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. +### I'm getting an error when trying to use output template: `error: using output template conflicts with using title, video ID or auto number` + +Make sure you are not using `-o` with any of these options `-t`, `--title`, `--id`, `-A` or `--auto-number` set in command line or in a configuration file. Remove the latter if any. + ### Do I always have to pass `-citw`? By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. From 046ea04a7d8601a85007430a7a3da3ce236549f7 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Wed, 27 Apr 2016 00:22:08 +0600 Subject: [PATCH 13/68] [README.md] Mention mpv --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb57b0323..ecf737047 100644 --- a/README.md +++ b/README.md @@ -721,7 +721,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). 
### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. From a7e03861e8d0ce18ad698e0e38ffac40a09cef8b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 13:52:04 +0800 Subject: [PATCH 14/68] [scivee] Skip the test Not accessible from either Travis CI or my machine. Closes #9315 --- youtube_dl/extractor/scivee.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py index 3bf93c870..b1ca12fde 100644 --- a/youtube_dl/extractor/scivee.py +++ b/youtube_dl/extractor/scivee.py @@ -18,6 +18,7 @@ class SciVeeIE(InfoExtractor): 'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting', 'description': 'md5:81f1710638e11a481358fab1b11059d7', }, + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): From 2ac2cbc0a351785e0c6d034bd1bab77973ec7a41 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 13:55:32 +0800 Subject: [PATCH 15/68] [malemotion] Remove the extractor Announcement from their homepage: ``` MaleMotion is closed After another system crash, I'm forced to close the site This week all content will be erased Don't forget to cancel your subscription if any ! ``` Closes #9311. 
--- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/malemotion.py | 46 ------------------------------ 2 files changed, 47 deletions(-) delete mode 100644 youtube_dl/extractor/malemotion.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b215c5ab..00f8a7a85 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -400,7 +400,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makerschannel import MakersChannelIE from .makertv import MakerTVIE -from .malemotion import MalemotionIE from .matchtv import MatchTVIE from .mdr import MDRIE from .metacafe import MetacafeIE diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py deleted file mode 100644 index 92511a671..000000000 --- a/youtube_dl/extractor/malemotion.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class MalemotionIE(InfoExtractor): - _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)' - _TEST = { - 'url': 'http://malemotion.com/video/bete-de-concours.ltc', - 'md5': '3013e53a0afbde2878bc39998c33e8a5', - 'info_dict': { - 'id': 'ltc', - 'ext': 'mp4', - 'title': 'Bête de Concours', - 'age_limit': 18, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = compat_urllib_parse_unquote(self._search_regex( - r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL')) - video_title = self._html_search_regex( - r'<title>(.*?)</title', webpage, 'title') - video_thumbnail = self._search_regex( - r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': 'mp4', - 'preference': 1, - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': 
formats, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'age_limit': 18, - } From 5b5d7cc11e3037408aeedf8d6dc57ac228b02496 Mon Sep 17 00:00:00 2001 From: Peter Rowlands <peter@pmrowla.com> Date: Wed, 27 Apr 2016 15:57:17 +0900 Subject: [PATCH 16/68] [mwave] Add Mwave Meet & Greet extractor --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/mwave.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b215c5ab..9d1992721 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,7 +439,7 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .mwave import MwaveIE +from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import MyviIE diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 5c3c8d464..6485c6928 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -56,3 +56,26 @@ class MwaveIE(InfoExtractor): 'view_count': int_or_none(vod_info.get('hit')), 'formats': formats, } + + +class MwaveMeetGreetIE(InfoExtractor): + _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://mwave.interest.me/meetgreet/view/256', + 'info_dict': { + 'id': '173294', + 'ext': 'flv', + 'title': '[MEET&GREET] Park BoRam', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Mwave', + 'duration': 3634, + 'view_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + clip_id = self._html_search_regex(r'<iframe src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)', webpage, 'clip ID') + clip_url = 
'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id={0}'.format(clip_id) + return self.url_result(clip_url, 'Mwave', clip_id) From dcf094d62699f8ad06ceaf3fba55e453980fac91 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 18:08:23 +0800 Subject: [PATCH 17/68] [theplatform] Fix for Python 3.2 test_AENetworks{,_1} fails as in Python < 3.3, binascii.a2b_* functions accepts only bytes-like objects --- youtube_dl/extractor/theplatform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 8272dd969..a25417f94 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -159,11 +159,11 @@ class ThePlatformIE(ThePlatformBaseIE): def str_to_hex(str): return binascii.b2a_hex(str.encode('ascii')).decode('ascii') - def hex_to_str(hex): - return binascii.a2b_hex(hex) + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1) - clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) + clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) From 3cc8649c9d42bab8c7b665115ebdc569bf44a762 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 02:58:11 +0800 Subject: [PATCH 18/68] [20min] Detect embedded YouTube videos Fixes #9331 --- youtube_dl/extractor/twentymin.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index ca7d953b8..b721ecb0a 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ 
-32,7 +32,22 @@ class TwentyMinutenIE(InfoExtractor): 'title': '«Wir müssen mutig nach vorne schauen»', 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.', 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' - } + }, + 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.', + }, { + # YouTube embed + 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', + 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f', + 'info_dict': { + 'id': 'ivM7A7SpDOs', + 'ext': 'mp4', + 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016', + 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a', + 'upload_date': '20160424', + 'uploader': 'RTVCM Castilla-La Mancha', + 'uploader_id': 'RTVCM', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, @@ -48,6 +63,12 @@ class TwentyMinutenIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + youtube_url = self._html_search_regex( + r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', + webpage, 'YouTube embed URL', default=None) + if youtube_url is not None: + return self.url_result(youtube_url, 'Youtube') + title = self._html_search_regex( r'<h1>.*?<span>(.+?)</span></h1>', webpage, 'title', default=None) From 52af8f222bc4f067b4c5e7a977a64345d35ae4fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 04:01:21 +0600 Subject: [PATCH 19/68] [cwtv] Relax _VALID_URL (Closes #9327) --- youtube_dl/extractor/cwtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f5cefd966..ebd14cb16 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ 
-9,7 +9,7 @@ from ..utils import ( class CWTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' _TESTS = [{ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', 'info_dict': { @@ -48,6 +48,9 @@ class CWTVIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', + 'only_matching': True, }] def _real_extract(self, url): From 618c71dc64086f751b6ae87d5f32687e02a54e58 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 15:00:02 +0800 Subject: [PATCH 20/68] [cloudy] New domain name for the test_cloudy_1 I'm not sure whether videoraj.ch still works or not, so keep it. 
--- youtube_dl/extractor/cloudy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9e267e6c0..9a28ef354 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -19,7 +19,7 @@ from ..utils import ( class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec and videoraj.ch' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/ + https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' @@ -37,7 +37,7 @@ class CloudyIE(InfoExtractor): } }, { - 'url': 'http://www.videoraj.ch/v/47f399fd8bb60', + 'url': 'http://www.videoraj.to/v/47f399fd8bb60', 'md5': '7d0f8799d91efd4eda26587421c3c3b0', 'info_dict': { 'id': '47f399fd8bb60', From a5941305b6ba0921ea4f34641dd9095372dd1c1d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 16:03:08 +0800 Subject: [PATCH 21/68] [mwave] Coding style --- youtube_dl/extractor/mwave.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 6485c6928..a103e0323 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -10,6 +10,7 @@ from ..utils import ( class MwaveIE(InfoExtractor): _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' + _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s' _TEST = { 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', # md5 is unstable @@ -59,7 +60,7 @@ class MwaveIE(InfoExtractor): class MwaveMeetGreetIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>[0-9]+)' + _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>\d+)' _TEST = { 'url': 'http://mwave.interest.me/meetgreet/view/256', 'info_dict': 
{ @@ -76,6 +77,8 @@ class MwaveMeetGreetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - clip_id = self._html_search_regex(r'<iframe src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)', webpage, 'clip ID') - clip_url = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id={0}'.format(clip_id) + clip_id = self._html_search_regex( + r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)', + webpage, 'clip ID') + clip_url = MwaveIE._URL_TEMPLATE % clip_id return self.url_result(clip_url, 'Mwave', clip_id) From 7f776fa4b510b7973e08f06de556fa39cb5946e5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 17:08:41 +0800 Subject: [PATCH 22/68] [yandexmusic] Skip tests as Travis CI blocked --- youtube_dl/extractor/yandexmusic.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 7a90cc60c..0d32a612f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -22,6 +22,12 @@ class YandexMusicBaseIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) + def _download_webpage(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) + if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' 
in webpage: + raise ExtractorError('Blocked by YandexMusic', expected=True) + return webpage + def _download_json(self, *args, **kwargs): response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) self._handle_error(response) @@ -47,7 +53,8 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'album_artist': 'Carlo Ambrosio', 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', 'release_year': '2009', - } + }, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _get_track_url(self, storage_dir, track_id): @@ -139,6 +146,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', }, 'playlist_count': 50, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _real_extract(self, url): @@ -171,6 +179,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, + 'skip': 'Travis CI servers blocked by YandexMusic', }, { # playlist exceeding the limit of 150 tracks shipped with webpage (see # https://github.com/rg3/youtube-dl/issues/6666) @@ -180,6 +189,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'title': 'Музыка 90-х', }, 'playlist_count': 310, + 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): From 0cbcbdd89dbc3573ecfcf68496c54bd84804967d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 17:51:20 +0800 Subject: [PATCH 23/68] [nuvid] Fix extraction Closes #7620 --- youtube_dl/extractor/nuvid.py | 44 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index 9fa7cefad..ab6bfcd7f 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -5,8 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( parse_duration, - sanitized_Request, - unified_strdate, ) @@ -20,7 +18,6 @@ class 
NuvidIE(InfoExtractor): 'ext': 'mp4', 'title': 'Horny babes show their awesome bodeis and', 'duration': 129, - 'upload_date': '20140508', 'age_limit': 18, } } @@ -28,28 +25,31 @@ class NuvidIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - formats = [] + page_url = 'http://m.nuvid.com/video/%s' % video_id + webpage = self._download_webpage( + page_url, video_id, 'Downloading video page') + # When dwnld_speed exists and has a value larger than the MP4 file's + # bitrate, Nuvid returns the MP4 URL + # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm + self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') + mp4_webpage = self._download_webpage( + page_url, video_id, 'Downloading video page for MP4 format') - for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]: - request = sanitized_Request( - 'http://m.nuvid.com/play/%s' % video_id) - request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed) - webpage = self._download_webpage( - request, video_id, 'Downloading %s page' % format_id) - video_url = self._html_search_regex( - r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False) - if not video_url: - continue + html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', + video_url = self._html_search_regex(html5_video_re, webpage, video_id) + mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) + formats = [{ + 'url': video_url, + }] + if mp4_video_url != video_url: formats.append({ - 'url': video_url, - 'format_id': format_id, + 'url': mp4_video_url, }) - webpage = self._download_webpage( - 'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page') title = self._html_search_regex( [r'<span title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip() + r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', + 
r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() thumbnails = [ { 'url': thumb_url, @@ -57,9 +57,8 @@ class NuvidIE(InfoExtractor): ] thumbnail = thumbnails[0]['url'] if thumbnails else None duration = parse_duration(self._html_search_regex( - r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False)) - upload_date = unified_strdate(self._html_search_regex( - r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False)) + [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', + r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) return { 'id': video_id, @@ -67,7 +66,6 @@ class NuvidIE(InfoExtractor): 'thumbnails': thumbnails, 'thumbnail': thumbnail, 'duration': duration, - 'upload_date': upload_date, 'age_limit': 18, 'formats': formats, } From eebe6b382eb6bd9e8118b616f3dde48c294e3b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 21:37:34 +0600 Subject: [PATCH 24/68] [yandexmusic] Improve error handling --- youtube_dl/extractor/yandexmusic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 0d32a612f..b0e68a087 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -18,9 +18,10 @@ from ..utils import ( class YandexMusicBaseIE(InfoExtractor): @staticmethod def _handle_error(response): - error = response.get('error') - if error: - raise ExtractorError(error, expected=True) + if isinstance(response, dict): + error = response.get('error') + if error: + raise ExtractorError(error, expected=True) def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) From 4b537629143c8f51c5814c650227971c438b12e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 
21:45:33 +0600 Subject: [PATCH 25/68] [yandexmusic] Clarify blockage --- youtube_dl/extractor/yandexmusic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index b0e68a087..a33fe3d83 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -26,7 +26,11 @@ class YandexMusicBaseIE(InfoExtractor): def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: - raise ExtractorError('Blocked by YandexMusic', expected=True) + raise ExtractorError( + 'YandexMusic asks you to solve a CAPTCHA: go to ' + 'https://music.yandex.ru/ and solve it, then export ' + 'cookies and pass cookie file to youtube-dl with --cookies', + expected=True) return webpage def _download_json(self, *args, **kwargs): From 0ba9e3ca2233d018d695bac4eebe0e34043a7ec9 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 28 Apr 2016 17:44:33 +0100 Subject: [PATCH 26/68] [viewster] extract formats for videos with multiple audios/subtitles --- youtube_dl/extractor/viewster.py | 147 +++++++++++++++++-------------- 1 file changed, 80 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 6edc2c44e..1813b81d6 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -78,11 +78,11 @@ class ViewsterIE(InfoExtractor): _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): + def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}): request = sanitized_Request(url) request.add_header('Accept', self._ACCEPT_HEADER) request.add_header('Auth-token', self._AUTH_TOKEN) - return 
super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) + return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query) def _real_extract(self, url): video_id = self._match_id(url) @@ -117,72 +117,85 @@ class ViewsterIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) formats = [] - manifest_url = None - m3u8_formats = [] - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' - % (entry_id, compat_urllib_parse.quote(media_type)), - video_id, 'Downloading %s JSON' % media_type, fatal=False) - if not media: - continue - video_url = media.get('Uri') - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'f4m': - manifest_url = video_url - video_url += '&' if '?' in video_url else '?' - video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds')) - elif ext == 'm3u8': - manifest_url = video_url - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False) # m3u8 sometimes fail - if m3u8_formats: - formats.extend(m3u8_formats) - else: - qualities_basename = self._search_regex( - '/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if not qualities_basename: - continue - QUALITIES_RE = r'((,\d+k)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if not qualities: - continue - qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) - http_url_basename = url_basename(video_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - 
m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for language_set in info.get('LanguageSets', []): + manifest_url = None + m3u8_formats = [] + audio = language_set.get('Audio') or '' + subtitle = language_set.get('Subtitle') or '' + base_format_id = audio + if subtitle: + base_format_id += '-%s' % subtitle - if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): + def concat(suffix, sep='-'): + return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix + + for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): + media = self._download_json( + 'https://public-api.viewster.com/movies/%s/video' % entry_id, + video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={ + 'mediaType': media_type, + 'language': audio, + 'subtitle': subtitle, + }) + if not media: + continue + video_url = media.get('Uri') + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + manifest_url = video_url + video_url += '&' if '?' in video_url else '?' 
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=concat('hds'))) + elif ext == 'm3u8': + manifest_url = video_url + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=concat('hls'), + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) + else: + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if not qualities_basename: + continue + QUALITIES_RE = r'((,\d+k)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if not qualities: + continue + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) + http_url_basename = url_basename(video_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) + + if not formats and not info.get('VODSettings'): self.raise_geo_restricted() self._sort_formats(formats) From e757fb3d053a195da4084c08a59a7b17b08ba598 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 28 Apr 2016 18:42:20 +0100 Subject: [PATCH 27/68] [crunchyroll] improve extraction - extract more metadata(series, episode, episode_number) - reduce duplicate requests for extracting formats - 
remove duplicate formats --- youtube_dl/extractor/crunchyroll.py | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8ae3f2890..dd753c7c3 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,7 +11,6 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -306,28 +305,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage, 'video_uploader', fatal=False) - playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) - playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) - playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') - - stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') - video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) - formats = [] - for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): + video_encode_ids = [] + for fmt in re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' - % (stream_id, stream_format, stream_quality), + % (video_id, stream_format, stream_quality), compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) 
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) stream_info = streamdata.find('./{default}preload/stream_info') + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) video_url = xpath_text(stream_info, './host') video_play_path = xpath_text(stream_info, './file') if not video_url or not video_play_path: @@ -360,15 +355,25 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text }) formats.append(format_info) + metadata = self._download_xml( + 'http://www.crunchyroll.com/xml', video_id, + note='Downloading media info', query={ + 'req': 'RpcApiVideoPlayer_GetMediaMetadata', + 'media_id': video_id, + }) + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': video_thumbnail, + 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, + 'series': xpath_text(metadata, 'series_title'), + 'episode': xpath_text(metadata, 'episode_title'), + 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), 'subtitles': subtitles, 'formats': formats, } From 497971cd4a8407651debfb2fd4b10fc4009b0f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 01:28:07 +0600 Subject: [PATCH 28/68] [yandexmusic] Clarify blockage even more --- youtube_dl/extractor/yandexmusic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index a33fe3d83..ce3723b55 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -27,9 +27,12 @@ class YandexMusicBaseIE(InfoExtractor): webpage = 
super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: raise ExtractorError( - 'YandexMusic asks you to solve a CAPTCHA: go to ' - 'https://music.yandex.ru/ and solve it, then export ' - 'cookies and pass cookie file to youtube-dl with --cookies', + 'YandexMusic has considered youtube-dl requests automated and ' + 'asks you to solve a CAPTCHA. You can either wait for some ' + 'time until unblocked and optionally use --sleep-interval ' + 'in future or alternatively you can go to https://music.yandex.ru/ ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies', expected=True) return webpage From 683d892bf9332df1a255c673bca56a8f5487292a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 01:30:53 +0600 Subject: [PATCH 29/68] [viewster] Remove unused import --- youtube_dl/extractor/viewster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 1813b81d6..a93196a07 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( From 72670c39decc296a3ee757301dc70389674d19c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 04:46:23 +0600 Subject: [PATCH 30/68] [arte:+7] Fix typo in _VALID_URL --- youtube_dl/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index a9e3266dc..881cacfab 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 
'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' @classmethod def _extract_url_info(cls, url): From 31ff3c074eddf4078b6eb49281830875eb4e65a1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 13:36:52 +0800 Subject: [PATCH 31/68] [sexykarma] Remove the extractor Its domain name is on sale. Closes #9317 --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/sexykarma.py | 121 ----------------------------- 2 files changed, 122 deletions(-) delete mode 100644 youtube_dl/extractor/sexykarma.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88405f070..41ff1e7a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,7 +657,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py deleted file mode 100644 index e33483674..000000000 --- a/youtube_dl/extractor/sexykarma.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - parse_duration, - int_or_none, -) - - -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 
'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', - 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', - 'ext': 'mp4', - 'title': 'Taking a quick pee.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - video_url = self._html_search_regex( - r"url: escape\('([^']+)'\)", webpage, 'url') - - title = self._html_search_regex( - r'<h2 class="he2"><span>(.*?)</span>', - webpage, 'title') - thumbnail = self._html_search_regex( - r'<span 
id="container"><img\s+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'class="aupa">\s*(.*?)</a>', - webpage, 'uploader') - upload_date = unified_strdate(self._html_search_regex( - r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) - - duration = parse_duration(self._search_regex( - r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', - webpage, 'duration', fatal=False)) - - view_count = int_or_none(self._search_regex( - r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'comment count', fatal=False)) - - categories = re.findall( - r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', - webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - 'age_limit': 18, - } From f5535ed0e3537acee90820c98d6ca474d437d7d0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 14:24:07 +0800 Subject: [PATCH 32/68] [orf] Skip the expired test --- youtube_dl/extractor/orf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 66c75f8b3..4e3864f0d 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -185,6 +185,7 @@ class ORFFM4IE(InfoExtractor): 'timestamp': 1452456073, 'upload_date': '20160110', }, + 'skip': 'Live streams on FM4 got deleted soon', } def _real_extract(self, url): From 5819edef034819b76b8eec6a0cdf7b29cc9ddff3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 14:27:15 +0800 Subject: [PATCH 
33/68] [ooyala] Skip an invalid test Ooyala is used by lots of extractors and its correctness can be verified by these websites. --- youtube_dl/extractor/ooyala.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 16f040191..95e982897 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -96,6 +96,8 @@ class OoyalaIE(OoyalaBaseIE): 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', 'duration': 853.386, }, + # The video in the original webpage now uses PlayWire + 'skip': 'Ooyala said: movie expired', }, { # Only available for ipad 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', From 1910077ed77a270fea8e368c3815b23cee254f85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 17:59:23 +0800 Subject: [PATCH 34/68] Revert "[sexykarma] Remove the extractor" This reverts commit 31ff3c074eddf4078b6eb49281830875eb4e65a1. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sexykarma.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 youtube_dl/extractor/sexykarma.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 41ff1e7a5..88405f070 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,6 +657,7 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE +from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py new file mode 100644 index 000000000..e33483674 --- /dev/null +++ b/youtube_dl/extractor/sexykarma.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + parse_duration, + int_or_none, +) + + +class SexyKarmaIE(InfoExtractor): + IE_DESC = 'Sexy Karma and Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', + 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', + 'info_dict': { + 'id': 'yHI70cOyIHt', + 'display_id': 'taking-a-quick-pee', + 'ext': 'mp4', + 'title': 'Taking a quick pee.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'wildginger7', + 'upload_date': '20141008', + 'duration': 22, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', + 'md5': 'dd216c68d29b49b12842b9babe762a5d', + 'info_dict': { + 'id': '8Id6EZPbuHf', + 
'display_id': 'pot-pixie-tribute', + 'ext': 'mp4', + 'title': 'pot_pixie tribute', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'banffite', + 'upload_date': '20141013', + 'duration': 16, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', + 'md5': '9afb80675550406ed9a63ac2819ef69d', + 'info_dict': { + 'id': 'dW2mtctxJfs', + 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', + 'ext': 'mp4', + 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Don', + 'upload_date': '20140213', + 'duration': 83, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._html_search_regex( + r"url: escape\('([^']+)'\)", webpage, 'url') + + title = self._html_search_regex( + r'<h2 class="he2"><span>(.*?)</span>', + webpage, 'title') + thumbnail = self._html_search_regex( + r'<span id="container"><img\s+src="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + uploader = self._html_search_regex( + r'class="aupa">\s*(.*?)</a>', + webpage, 'uploader') + upload_date = unified_strdate(self._html_search_regex( + r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) + + duration = parse_duration(self._search_regex( + r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'view count', fatal=False)) + comment_count = 
int_or_none(self._search_regex( + r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'comment count', fatal=False)) + + categories = re.findall( + r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', + webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': 18, + } From 14638e291511c3305b70dce64e9bd97686e9da93 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 18:17:08 +0800 Subject: [PATCH 35/68] [sexykarma] Rename to WatchIndianPornIE and fix extraction --- youtube_dl/extractor/extractors.py | 2 +- .../{sexykarma.py => watchindianporn.py} | 63 +++++-------------- 2 files changed, 17 insertions(+), 48 deletions(-) rename youtube_dl/extractor/{sexykarma.py => watchindianporn.py} (54%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88405f070..3adcd41c4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,7 +657,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE @@ -918,6 +917,7 @@ from .vulture import VultureIE from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE +from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, WDRMobileIE, diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/watchindianporn.py similarity index 54% rename from youtube_dl/extractor/sexykarma.py rename to youtube_dl/extractor/watchindianporn.py index 
e33483674..5d3b5bdb4 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/watchindianporn.py @@ -11,61 +11,27 @@ from ..utils import ( ) -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', +class WatchIndianPornIE(InfoExtractor): + IE_DESC = 'Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TEST = { + 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html', + 'md5': '249589a164dde236ec65832bfce17440', 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', + 'id': 'RZa2avywNPa', + 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', 'ext': 'mp4', - 'title': 'Taking a quick pee.', + 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, + 'uploader': 'LoveJay', + 'upload_date': '20160428', + 'duration': 226, 'view_count': int, 'comment_count': int, 'categories': list, 'age_limit': 18, } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 
'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -109,6 +75,9 @@ class SexyKarmaIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'url': video_url, + 'http_headers': { + 'Referer': url, + }, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, From 67167920db50e818c9fca20579c8a05eb2218f86 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 29 Apr 2016 11:14:42 +0100 Subject: [PATCH 36/68] [viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting --- youtube_dl/extractor/extractors.py | 8 ++--- youtube_dl/extractor/generic.py | 10 +++--- .../extractor/{snagfilms.py => viewlift.py} | 35 +++++++++++++------ 3 files changed, 34 insertions(+), 19 deletions(-) rename youtube_dl/extractor/{snagfilms.py => viewlift.py} (81%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3adcd41c4..b1b7f9b42 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -673,10 +673,6 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) -from .snagfilms import ( - SnagFilmsIE, - SnagFilmsEmbedIE, -) from .snotr import SnotrIE from .sohu import SohuIE from .soundcloud import ( @@ -879,6 +875,10 @@ from .vidme import ( ) from .vidzi import VidziIE from .vier import 
VierIE, VierVideosIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) from .viewster import ViewsterIE from .viidea import ViideaIE from .vimeo import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a95501d86..0f1eb7fa6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -51,7 +51,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE -from .snagfilms import SnagFilmsEmbedIE +from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE @@ -1924,10 +1924,10 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) - # Look for SnagFilms embeds - snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) - if snagfilms_url: - return self.url_result(snagfilms_url) + # Look for ViewLift embeds + viewlift_url = ViewLiftEmbedIE._extract_url(webpage) + if viewlift_url: + return self.url_result(viewlift_url) # Look for JWPlatform embeds jwplatform_url = JWPlatformIE._extract_url(webpage) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/viewlift.py similarity index 81% rename from youtube_dl/extractor/snagfilms.py rename to youtube_dl/extractor/viewlift.py index 6977afb27..dd4a13a4a 100644 --- a/youtube_dl/extractor/snagfilms.py +++ b/youtube_dl/extractor/viewlift.py @@ -13,8 +13,12 @@ from ..utils import ( ) -class SnagFilmsEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' +class ViewLiftBaseIE(InfoExtractor): + _DOMAINS_REGEX = '(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + + +class ViewLiftEmbedIE(ViewLiftBaseIE): + _VALID_URL = 
r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -40,7 +44,7 @@ class SnagFilmsEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, webpage) if mobj: return mobj.group('url') @@ -55,6 +59,7 @@ class SnagFilmsEmbedIE(InfoExtractor): 'Film %s is not playable in your area.' % video_id, expected=True) formats = [] + has_bitrate = False for source in self._parse_json(js_to_json(self._search_regex( r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): file_ = source.get('file') @@ -63,22 +68,25 @@ class SnagFilmsEmbedIE(InfoExtractor): type_ = source.get('type') ext = determine_ext(file_) format_id = source.get('label') or ext - if all(v == 'm3u8' for v in (type_, ext)): + if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], file_, 'bitrate', default=None)) + if not has_bitrate and bitrate: + has_bitrate = True height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ 'url': file_, - 'format_id': format_id, + 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), 'tbr': bitrate, 'height': height, }) - self._sort_formats(formats) + field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') + self._sort_formats(formats, field_preference) title = self._search_regex( [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)'], @@ -91,8 
+99,8 @@ class SnagFilmsEmbedIE(InfoExtractor): } -class SnagFilmsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P[^?#]+)' +class ViewLiftIE(ViewLiftBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P%s)/(?:films/title|show|(?:news/)?videos?)/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -127,10 +135,16 @@ class SnagFilmsIE(InfoExtractor): # Film is not available. 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', 'only_matching': True, + }, { + 'url': 'http://www.winnersview.com/videos/the-good-son', + 'only_matching': True, + }, { + 'url': 'http://www.kesari.tv/news/video/1461919076414', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + domain, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -170,7 +184,7 @@ class SnagFilmsIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), 'id': film_id, 'display_id': display_id, 'title': title, @@ -178,4 +192,5 @@ class SnagFilmsIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'categories': categories, + 'ie_key': 'ViewLiftEmbed', } From 065216d94f59953a228d2683d3bafe4241fd1e29 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 11:46:42 +0100 Subject: [PATCH 37/68] [crunchyroll] reduce requests for formats extraction --- youtube_dl/extractor/crunchyroll.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index dd753c7c3..184ba6896 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -26,6 +26,7 @@ from ..utils import ( unified_strdate, 
urlencode_postdata, xpath_text, + extract_attributes, ) from ..aes import ( aes_cbc_decrypt, @@ -305,9 +306,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, 'video_uploader', fatal=False) - formats = [] + available_fmts = [] + for a, fmt in re.findall(r'(]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage) video_encode_ids = [] - for fmt in re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage): + formats = [] + for fmt in available_fmts: stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( From b24d6336a797b99339c12a0aa1b431755e22e8cf Mon Sep 17 00:00:00 2001 From: Kagami Hiiragi Date: Tue, 26 Apr 2016 17:30:24 +0300 Subject: [PATCH 38/68] [vlive] Add support for live videos --- youtube_dl/extractor/common.py | 8 ++- youtube_dl/extractor/vlive.py | 98 ++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a285ee7d8..2763d2ffe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1061,7 +1061,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True): + fatal=True, live=False): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -1139,7 +1139,11 @@ class InfoExtractor(object): if m3u8_id: format_id.append(m3u8_id) last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None - format_id.append(last_media_name if last_media_name else '%d' % (tbr 
if tbr else len(formats))) + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. + if last_media_name and not live: + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index baf39bb2c..2151696ea 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,8 +1,11 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import division, unicode_literals +import re +import time from .common import InfoExtractor from ..utils import ( + ExtractorError, dict_get, float_or_none, int_or_none, @@ -31,16 +34,77 @@ class VLiveIE(InfoExtractor): webpage = self._download_webpage( 'http://www.vlive.tv/video/%s' % video_id, video_id) - long_video_id = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"([^"]+)"', - webpage, 'long video id') + # UTC+x - UTC+9 (KST) + tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone + tz_offset = -tz // 60 - 9 * 60 + self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset) - key = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"[^"]+"\s*,\s*"([^"]+)"', - webpage, 'key') + status_params = self._download_json( + 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, + video_id, 'Downloading JSON status', + headers={'Referer': url}) + status = status_params.get('status') + air_start = status_params.get('onAirStartAt', '') + is_live = status_params.get('isLive') + video_params = self._search_regex( + r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)', + webpage, 'video params') + live_params, long_video_id, key = re.split( + r'"\s*,\s*"', video_params)[1:4] + + if status == 
'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': + live_params = self._parse_json('"%s"' % live_params, video_id) + live_params = self._parse_json(live_params, video_id) + return self._live(video_id, webpage, live_params) + elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': + if long_video_id and key: + return self._replay(video_id, webpage, long_video_id, key) + elif is_live: + status = 'LIVE_END' + else: + status = 'COMING_SOON' + + if status == 'LIVE_END': + raise ExtractorError('Uploading for replay. Please wait...', + expected=True) + elif status == 'COMING_SOON': + raise ExtractorError('Coming soon! %s' % air_start, expected=True) + elif status == 'CANCELED': + raise ExtractorError('We are sorry, ' + 'but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status %s' % status) + + def _get_common_fields(self, webpage): title = self._og_search_title(webpage) + creator = self._html_search_regex( + r']+class="info_area"[^>]*>\s*]*>([^<]+)', + webpage, 'creator', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + return { + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + } + def _live(self, video_id, webpage, live_params): + formats = [] + for vid in live_params.get('resolutions', []): + formats.extend(self._extract_m3u8_formats( + vid['cdnUrl'], video_id, 'mp4', + m3u8_id=vid.get('name'), + fatal=False, live=True)) + self._sort_formats(formats) + + return dict(self._get_common_fields(webpage), + id=video_id, + formats=formats, + is_live=True, + ) + + def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' % compat_urllib_parse_urlencode({ @@ -62,11 +126,6 @@ class VLiveIE(InfoExtractor): } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - creator = self._html_search_regex( 
- r']+class="info_area"[^>]*>\s*]*>([^<]+)', - webpage, 'creator', fatal=False) - view_count = int_or_none(playinfo.get('meta', {}).get('count')) subtitles = {} @@ -77,12 +136,9 @@ class VLiveIE(InfoExtractor): 'ext': 'vtt', 'url': caption['source']}] - return { - 'id': video_id, - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'formats': formats, - 'subtitles': subtitles, - } + return dict(self._get_common_fields(webpage), + id=video_id, + formats=formats, + view_count=view_count, + subtitles=subtitles, + ) From 9d186afac818645490122aa7457f247c31c601bf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 29 Apr 2016 19:29:00 +0800 Subject: [PATCH 39/68] [vlive] Coding style and PEP8 --- youtube_dl/extractor/vlive.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 2151696ea..7f9e99ec2 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -3,10 +3,11 @@ from __future__ import division, unicode_literals import re import time + from .common import InfoExtractor from ..utils import ( - ExtractorError, dict_get, + ExtractorError, float_or_none, int_or_none, ) @@ -99,10 +100,9 @@ class VLiveIE(InfoExtractor): self._sort_formats(formats) return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - is_live=True, - ) + id=video_id, + formats=formats, + is_live=True) def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( @@ -137,8 +137,7 @@ class VLiveIE(InfoExtractor): 'url': caption['source']}] return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - view_count=view_count, - subtitles=subtitles, - ) + id=video_id, + formats=formats, + view_count=view_count, + subtitles=subtitles) From 6ff4469528d642bd678df9b1fa83545a0942e333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 
19:39:27 +0600 Subject: [PATCH 40/68] [crunchyroll] Relax fmt regex --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 184ba6896..4a7664296 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -307,7 +307,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_uploader', fatal=False) available_fmts = [] - for a, fmt in re.findall(r'(]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?)', webpage): + for a, fmt in re.findall(r'(]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): attrs = extract_attributes(a) href = attrs.get('href') if href and '/freetrial' in href: From 8312b1a3d1dc07d80d33e31f9b2b6facf13fa744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:43:53 +0600 Subject: [PATCH 41/68] [crunchyroll] Add even more relaxed fmt fallback --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4a7664296..58960b2f8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -314,7 +314,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text continue available_fmts.append(fmt) if not available_fmts: - available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage) + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break video_encode_ids = [] formats = [] for fmt in available_fmts: From 00a17a9e1234ecc868a15b5759472a0f9215f797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:44:10 +0600 Subject: [PATCH 42/68] [crunchyroll] Sort formats --- youtube_dl/extractor/crunchyroll.py | 1 
+ 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 58960b2f8..90a64303d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -367,6 +367,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) + self._sort_formats(formats) metadata = self._download_xml( 'http://www.crunchyroll.com/xml', video_id, From e9c6cdf4a103d1ebdb6927bdab429c370cbe66b2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 29 Apr 2016 22:49:04 +0800 Subject: [PATCH 43/68] [common] Fix format_id construction for HLS --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2763d2ffe..61a5d124c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1142,7 +1142,7 @@ class InfoExtractor(object): # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. 
- if last_media_name and not live: + if not live: format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), From cef3f3011f9d3a67de3ff064a5185a1a4bcf40e7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 30 Apr 2016 00:17:09 +0800 Subject: [PATCH 44/68] [funimation] Detect blocking and support CloudFlare cookies --- youtube_dl/extractor/funimation.py | 48 ++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 1eb528f31..0ad0d9b6a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse_unquote_plus, +) from ..utils import ( clean_html, determine_ext, @@ -27,6 +31,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:1769f43cd5fc130ace8fd87232207892', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', }, { 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', 'info_dict': { @@ -37,6 +42,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }, { 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', 'info_dict': { @@ -47,8 +53,36 @@ class FunimationIE(InfoExtractor): 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': 're:https?://.*\.(?:jpg|png)', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }] + _LOGIN_URL = 'http://www.funimation.com/login' + + def _download_webpage(self, *args, **kwargs): + try: + return super(FunimationIE, 
self)._download_webpage(*args, **kwargs) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + response = ee.cause.read() + if b'>Please complete the security check to access<' in response: + raise ExtractorError( + 'Access to funimation.com is blocked by CloudFlare. ' + 'Please browse to http://www.funimation.com/, solve ' + 'the reCAPTCHA, export browser cookies to a text file,' + ' and then try again with --cookies YOUR_COOKIE_FILE.', + expected=True) + raise + + def _extract_cloudflare_session_ua(self, url): + ci_session_cookie = self._get_cookies(url).get('ci_session') + if ci_session_cookie: + ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) + # ci_session is a string serialized by PHP function serialize() + # This case is simple enough to use regular expressions only + return self._search_regex( + r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', + default=None) + def _login(self): (username, password) = self._get_login_info() if username is None: @@ -57,8 +91,11 @@ class FunimationIE(InfoExtractor): 'email_field': username, 'password_field': password, }) - login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', + user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) + if not user_agent: + user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' + login_request = sanitized_Request(self._LOGIN_URL, data, headers={ + 'User-Agent': user_agent, 'Content-Type': 'application/x-www-form-urlencoded' }) login_page = self._download_webpage( @@ -103,11 +140,16 @@ class FunimationIE(InfoExtractor): ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), ) + user_agent = self._extract_cloudflare_session_ua(url) + if user_agent: + 
USER_AGENTS = ((None, user_agent),) + for kind, user_agent in USER_AGENTS: request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage( - request, display_id, 'Downloading %s webpage' % kind) + request, display_id, + 'Downloading %s webpage' % kind if kind else 'Downloading webpage') playlist = self._parse_json( self._search_regex( From 65a3bfb379c9d5e53cac874af097d2071ee4ac4d Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 19:21:17 +0100 Subject: [PATCH 45/68] [dfb] extract m3u8 formats --- youtube_dl/extractor/dfb.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cdfeccacb..a4d0448c2 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -12,39 +12,46 @@ class DFBIE(InfoExtractor): _TEST = { 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', - # The md5 is different each time + 'md5': 'ac0f98a52a330f700b4b3034ad240649', 'info_dict': { 'id': '11633', 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', 'upload_date': '20150714', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) player_info = self._download_xml( 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, display_id) video_info = player_info.find('video') + stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) - f4m_info = self._download_xml( - self._proto_relative_url(video_info.find('url').text.strip()), display_id) - token_el = f4m_info.find('token') - manifest_url = token_el.attrib['url'] + '?' 
+ 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' - formats = self._extract_f4m_formats(manifest_url, display_id) + formats = [] + # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats + for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'): + stream_access_info = self._download_xml(sa_url, display_id) + token_el = stream_access_info.find('token') + manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + if '.f4m' in manifest_url: + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.2.0', + display_id, f4m_id='hds', fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + manifest_url, display_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': video_info.find('title').text, - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, 'upload_date': unified_strdate(video_info.find('time_date').text), 'formats': formats, } From 5556047465e0601d2bdee0e5a436cee64b745851 Mon Sep 17 00:00:00 2001 From: Reino17 Date: Wed, 27 Apr 2016 13:11:38 +0200 Subject: [PATCH 46/68] [rtlnl] Update 720p PG_URL_TEMPLATE - Fixed the format_id for the 720p progressive videostream and added the video's resolution. - The adaptive videostreams have the m3u8-extension, so I removed the confusing mp4-extension in order to make a better distinction between the these and the progressive videostreams. 
--- youtube_dl/extractor/rtlnl.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 543d94417..e8b55ea25 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -94,19 +94,30 @@ class RtlNlIE(InfoExtractor): videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') + formats = self._extract_m3u8_formats(m3u8_url, uuid) video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ { - 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart), - 'format_id': 'pg-sd', + 'url': PG_URL_TEMPLATE % ('a2t', video_urlpart), + 'format_id': 'a2t', + 'width': 512, + 'height': 288, }, { - 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart), - 'format_id': 'pg-hd', + 'url': PG_URL_TEMPLATE % ('a3t', video_urlpart), + 'format_id': 'a3t', + 'width': 704, + 'height': 400, + 'quality': 0, + }, + { + 'url': PG_URL_TEMPLATE % ('nettv', video_urlpart), + 'format_id': 'nettv', + 'width': 1280, + 'height': 720, 'quality': 0, } ]) From 0571ffda7dd12fc1067c0344f3ce4ce47b39edb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:43:39 +0600 Subject: [PATCH 47/68] [rtlnl] Improve extraction (Closes #9329) * Make hls extraction non fatal and revert ext * Extract progressive formats' metadata from corresponding hls formats --- youtube_dl/extractor/rtlnl.py | 55 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e8b55ea25..c95bcf035 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -94,33 +94,44 @@ class RtlNlIE(InfoExtractor): videopath = material['videopath'] m3u8_url = meta.get('videohost', 
'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid) + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - formats.extend([ - { - 'url': PG_URL_TEMPLATE % ('a2t', video_urlpart), - 'format_id': 'a2t', - 'width': 512, - 'height': 288, - }, - { - 'url': PG_URL_TEMPLATE % ('a3t', video_urlpart), - 'format_id': 'a3t', - 'width': 704, - 'height': 400, - 'quality': 0, - }, - { - 'url': PG_URL_TEMPLATE % ('nettv', video_urlpart), - 'format_id': 'nettv', - 'width': 1280, - 'height': 720, - 'quality': 0, + PG_FORMATS = ( + ('a2t', 512, 288), + ('a3t', 704, 400), + ('nettv', 1280, 720), + ) + + def pg_format(format_id, width, height): + return { + 'url': PG_URL_TEMPLATE % (format_id, video_urlpart), + 'format_id': 'pg-%s' % format_id, + 'protocol': 'http', + 'width': width, + 'height': height, } - ]) + + if not formats: + formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS] + else: + pg_formats = [] + for format_id, width, height in PG_FORMATS: + try: + # Find hls format with the same width and height corresponding + # to progressive format and copy metadata from it. + f = next(f for f in formats + if f.get('width') == width and f.get('height') == height).copy() + f.update(pg_format(format_id, width, height)) + pg_formats.append(f) + except StopIteration: + # Missing hls format does mean that no progressive format with + # such width and height exists either. 
+ pass + formats.extend(pg_formats) self._sort_formats(formats) thumbnails = [] From cd63d091cecd8a85a2080035051205b00f3454d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:48:14 +0600 Subject: [PATCH 48/68] [rtlnl] Fix tests --- youtube_dl/extractor/rtlnl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index c95bcf035..e4411054a 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -39,7 +39,7 @@ class RtlNlIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. 
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } @@ -50,7 +50,7 @@ class RtlNlIE(InfoExtractor): 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, From 373e1230e4a3b934ddc59c212773d36a7e998dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:50:26 +0600 Subject: [PATCH 49/68] [rtlnl] Clarify tests --- youtube_dl/extractor/rtlnl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e4411054a..5e916c4ab 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -32,6 +32,7 @@ class RtlNlIE(InfoExtractor): 'duration': 576.880, }, }, { + # best format avaialble a3t 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'md5': 'dea7474214af1271d91ef332fb8be7ea', 'info_dict': { @@ -45,6 +46,7 @@ class RtlNlIE(InfoExtractor): } }, { # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + # best format available nettv 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', 'info_dict': { 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', From ca278a182b9331201e058f9f4d46b3b6114a1518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 02:07:29 +0600 Subject: [PATCH 50/68] [rtlnl] Replace test --- youtube_dl/extractor/rtlnl.py | 16 ++++++++-------- 1 
file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 5e916c4ab..8598b5840 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -20,16 +20,16 @@ class RtlNlIE(InfoExtractor): (?P[0-9a-f-]+)''' _TESTS = [{ - 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', - 'md5': 'cc16baa36a6c169391f0764fa6b16654', + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', 'info_dict': { - 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', 'ext': 'mp4', - 'title': 'RTL Nieuws - Laat', - 'description': 'md5:6b61f66510c8889923b11f2778c72dc5', - 'timestamp': 1408051800, - 'upload_date': '20140814', - 'duration': 576.880, + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, }, }, { # best format avaialble a3t From 69c4cde4ba6a4c7dfb8a46d1713cbb46d6f1d623 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 21:35:09 +0100 Subject: [PATCH 51/68] [wsj] improve extraction --- youtube_dl/extractor/wsj.py | 95 +++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 5a897371d..a83e68b17 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -4,16 +4,22 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + float_or_none, unified_strdate, ) class WSJIE(InfoExtractor): - _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P[a-zA-Z0-9-]+)' + _VALID_URL = r'''(?x)https?:// + (?: + video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| + (?:www\.)?wsj\.com/video/[^/]+/ + ) + (?P[a-zA-Z0-9-]+)''' IE_DESC 
= 'Wall Street Journal' - _TEST = { + _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', - 'md5': '9747d7a6ebc2f4df64b981e1dde9efa9', + 'md5': 'e230a5bb249075e40793b655a54a02e4', 'info_dict': { 'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'ext': 'mp4', @@ -24,65 +30,60 @@ class WSJIE(InfoExtractor): 'duration': 90, 'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo', }, - } + }, { + 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - bitrates = [128, 174, 264, 320, 464, 664, 1264] api_url = ( 'http://video-api.wsj.com/api-video/find_all_videos.asp?' - 'type=guid&count=1&query=%s&' - 'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,' - 'author,description,name,linkURL,videoStillURL,duration,videoURL,' - 'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,' - 'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,' - 'allthingsd-subsection,sm-section,sm-subsection,provider,' - 'formattedCreationDate,keywords,keywordsOmniture,column,editor,' - 'emailURL,emailPartnerID,showName,omnitureProgramName,' - 'omnitureVideoFormat,linkRelativeURL,touchCastID,' - 'omniturePublishDate,%s') % ( - video_id, ','.join('video%dkMP4Url' % br for br in bitrates)) + 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' + 'thumbnailList,author,description,name,duration,videoURL,' + 'titletag,formattedCreationDate,keywords,editor' % video_id) info = self._download_json(api_url, video_id)['items'][0] - - # Thumbnails are conveniently in the correct format already - thumbnails = info.get('thumbnailList') - creator = info.get('author') - uploader_id = info.get('editor') - categories = info.get('keywords') - duration = int_or_none(info.get('duration')) - upload_date = unified_strdate( - info.get('formattedCreationDate'), 
day_first=False) title = info.get('name', info.get('titletag')) - formats = [{ - 'format_id': 'f4m', - 'format_note': 'f4m (meta URL)', - 'url': info['videoURL'], - }] - if info.get('hls'): + formats = [] + + f4m_url = info.get('videoURL') + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + m3u8_url = info.get('hls') + if m3u8_url: formats.extend(self._extract_m3u8_formats( info['hls'], video_id, ext='mp4', - preference=0, entry_protocol='m3u8_native')) - for br in bitrates: - field = 'video%dkMP4Url' % br - if info.get(field): - formats.append({ - 'format_id': 'mp4-%d' % br, - 'container': 'mp4', - 'tbr': br, - 'url': info[field], - }) + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + for v in info.get('videoMP4List', []): + mp4_url = v.get('url') + if not mp4_url: + continue + tbr = int_or_none(v.get('bitrate')) + formats.append({ + 'url': mp4_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + 'width': int_or_none(v.get('width')), + 'height': int_or_none(v.get('height')), + 'fps': float_or_none(v.get('fps')), + }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, - 'thumbnails': thumbnails, - 'creator': creator, - 'uploader_id': uploader_id, - 'duration': duration, - 'upload_date': upload_date, + # Thumbnails are conveniently in the correct format already + 'thumbnails': info.get('thumbnailList'), + 'creator': info.get('author'), + 'uploader_id': info.get('editor'), + 'duration': int_or_none(info.get('duration')), + 'upload_date': unified_strdate(info.get( + 'formattedCreationDate'), day_first=False), 'title': title, - 'categories': categories, + 'categories': info.get('keywords'), } From cbc032c8b70a038a69259378c92b4ba97b42d491 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 01:24:36 +0100 Subject: [PATCH 52/68] [pbs] extract all http formats --- youtube_dl/extractor/pbs.py | 48 ++++++++++++++++++------------------- 1 file 
changed, 24 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f43e3a146..38cdb9975 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -196,7 +196,7 @@ class PBSIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'md5': '173dc391afd361fa72eab5d3d918968d', 'info_dict': { 'id': '2365006249', 'ext': 'mp4', @@ -204,13 +204,10 @@ class PBSIE(InfoExtractor): 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', 'duration': 3190, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', - 'md5': '143c98aa54a346738a3d78f54c925321', + 'md5': '6f722cb3c3982186d34b0f13374499c7', 'info_dict': { 'id': '2365297690', 'ext': 'mp4', @@ -218,9 +215,6 @@ class PBSIE(InfoExtractor): 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', 'duration': 5050, }, - 'params': { - 'skip_download': True, # requires ffmpeg - } }, { 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', @@ -244,9 +238,6 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', @@ -262,9 +253,6 @@ class PBSIE(InfoExtractor): 'upload_date': '20140122', 'age_limit': 10, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', @@ -290,6 +278,7 @@ class PBSIE(InfoExtractor): }, { 'url': 'http://www.pbs.org/video/2365245528/', + 'md5': '115223d41bd55cda8ae5cd5ed4e11497', 'info_dict': { 'id': '2365245528', 'display_id': '2365245528', @@ -299,15 +288,13 @@ class PBSIE(InfoExtractor): 'duration': 6851, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': 
True, # requires ffmpeg - }, }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. # "