From 17961ddf92304f4cf86da8ad1a2598b6cd314565 Mon Sep 17 00:00:00 2001 From: user706 <39215612+user706@users.noreply.github.com> Date: Tue, 1 Jan 2019 16:46:03 +0100 Subject: [PATCH 1/4] [gramofononline] new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/gramofononline.py | 83 ++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 youtube_dl/extractor/gramofononline.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d72f52e36..637c56d67 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,6 +439,7 @@ from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE +from .gramofononline import GramofonOnlineIE from .groupon import GrouponIE from .hark import HarkIE from .hbo import ( diff --git a/youtube_dl/extractor/gramofononline.py b/youtube_dl/extractor/gramofononline.py new file mode 100644 index 000000000..2b49fbb49 --- /dev/null +++ b/youtube_dl/extractor/gramofononline.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +import re +import json + + +class GramofonOnlineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu(/(listen.php\?.*track=)?(?P<id>[0-9]+))?' 
+ + _TESTS = [{ + 'url': 'https://gramofononline.hu/1401835664/papageno-duett', + 'md5': '1b4bcabde313f09cdd48c463b54d8125', + 'info_dict': { + 'id': '1401835664', + 'title': 'Papageno-Duett ', + 'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar', + 'ext': 'mp3' + } + }, { + 'url': 'https://gramofononline.hu/listen.php?autoplay=true&track=1401835664', + 'md5': '1b4bcabde313f09cdd48c463b54d8125', + 'info_dict': { + 'id': '1401835664', + 'title': 'Papageno-Duett ', + 'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar', + 'ext': 'mp3' + }, + 'params': { + 'skip_download': True, + } + }] + + def _get_entry(self, obj): + id1 = obj.get("id") + source = obj.get("source") + name = obj.get("name") + artist = obj.get("artist") + # subname = obj.get("subname") + # paralelname = obj.get("paralelname") + # record = obj.get("record") + # long1 = obj.get("long") + # genre = obj.get("genre") + # author = obj.get("author") + # state = obj.get("state") + # matrica = obj.get("matrica") + # publisher = obj.get("publisher") + # img = obj.get("img") + return { + 'id': id1, + 'title': name, + 'http_headers': {'Referer': 'https://gramofononline.hu/' + id1}, + 'artist': artist, + 'thumbnail': 'https://gramofononline.hu/getImage.php?id=' + source, + 'formats': [{ + 'url': 'https://gramofononline.hu/go/master/' + source + '.mp3', + 'ext': 'mp3' + }, { + 'url': 'https://gramofononline.hu/go/noise_reduction/' + source + '.mp3', + 'ext': 'mp3' + }] + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + for line in webpage.split("\n"): + m = re.search(r'var\s+trackList\s*=\s*(\[.*\]);?\s*', line) + if m: + break + lineobjs = json.loads(m.group(1)) + + if len(lineobjs) > 1: + result = { + '_type': 'playlist', + 'entries': [self._get_entry(obj) for obj in lineobjs] + } + else: + result = self._get_entry(lineobjs[0]) + + return result From e0c8ed0696752d5278a1d2906675c15c5c4b9f62 Mon Sep 17 00:00:00 2001 From: 
user706 <39215612+user706@users.noreply.github.com> Date: Tue, 1 Jan 2019 17:10:51 +0100 Subject: [PATCH 2/4] [gramofononline] fixes --- youtube_dl/extractor/gramofononline.py | 38 +++++++------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/gramofononline.py b/youtube_dl/extractor/gramofononline.py index 2b49fbb49..409710baa 100644 --- a/youtube_dl/extractor/gramofononline.py +++ b/youtube_dl/extractor/gramofononline.py @@ -33,23 +33,13 @@ class GramofonOnlineIE(InfoExtractor): }] def _get_entry(self, obj): - id1 = obj.get("id") - source = obj.get("source") - name = obj.get("name") - artist = obj.get("artist") - # subname = obj.get("subname") - # paralelname = obj.get("paralelname") - # record = obj.get("record") - # long1 = obj.get("long") - # genre = obj.get("genre") - # author = obj.get("author") - # state = obj.get("state") - # matrica = obj.get("matrica") - # publisher = obj.get("publisher") - # img = obj.get("img") + id1 = obj.get('id') + source = obj.get('source') + title = obj.get('name') + artist = obj.get('artist') return { 'id': id1, - 'title': name, + 'title': title, 'http_headers': {'Referer': 'https://gramofononline.hu/' + id1}, 'artist': artist, 'thumbnail': 'https://gramofononline.hu/getImage.php?id=' + source, @@ -66,18 +56,10 @@ class GramofonOnlineIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - for line in webpage.split("\n"): - m = re.search(r'var\s+trackList\s*=\s*(\[.*\]);?\s*', line) - if m: - break + m = re.search(r'var\s+trackList\s*=\s*(\[.*\]);', webpage) lineobjs = json.loads(m.group(1)) - if len(lineobjs) > 1: - result = { - '_type': 'playlist', - 'entries': [self._get_entry(obj) for obj in lineobjs] - } - else: - result = self._get_entry(lineobjs[0]) - - return result + return { + '_type': 'playlist', + 'entries': [self._get_entry(obj) for obj in lineobjs] + } From 7253770f0de6f48795f717369e343485d40e6c6e Mon Sep 17 00:00:00 2001 
From: user706 <39215612+user706@users.noreply.github.com> Date: Sun, 6 Jan 2019 22:24:22 +0100 Subject: [PATCH 3/4] [gramofononline] improve --- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/gramofononline.py | 117 ++++++++++++++++++++----- 2 files changed, 98 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 637c56d67..d40bbde3e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,7 +439,10 @@ from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE -from .gramofononline import GramofonOnlineIE +from .gramofononline import ( + GramofonOnlineIE, + GramofonOnlinePlaylistIE, +) from .groupon import GrouponIE from .hark import HarkIE from .hbo import ( diff --git a/youtube_dl/extractor/gramofononline.py b/youtube_dl/extractor/gramofononline.py index 409710baa..9814afc3d 100644 --- a/youtube_dl/extractor/gramofononline.py +++ b/youtube_dl/extractor/gramofononline.py @@ -2,12 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -import re -import json +from ..utils import try_get class GramofonOnlineIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu(/(listen.php\?.*track=)?(?P<id>[0-9]+))?' 
+ _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu/(?:hu/|en/|de/)?(?:listen.php\?.*track=)?(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://gramofononline.hu/1401835664/papageno-duett', @@ -18,6 +17,19 @@ class GramofonOnlineIE(InfoExtractor): 'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar', 'ext': 'mp3' } + }, { + # same as above but with /en/ + 'url': 'https://gramofononline.hu/en/1401835664/papageno-duett', + 'md5': '1b4bcabde313f09cdd48c463b54d8125', + 'info_dict': { + 'id': '1401835664', + 'title': 'Papageno-Duett ', + 'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar', + 'ext': 'mp3' + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://gramofononline.hu/listen.php?autoplay=true&track=1401835664', 'md5': '1b4bcabde313f09cdd48c463b54d8125', @@ -30,36 +42,95 @@ class GramofonOnlineIE(InfoExtractor): 'params': { 'skip_download': True, } + }, { + # same as above but with /en/ + 'url': 'https://gramofononline.hu/en/listen.php?autoplay=true&track=1401835664', + 'md5': '1b4bcabde313f09cdd48c463b54d8125', + 'info_dict': { + 'id': '1401835664', + 'title': 'Papageno-Duett ', + 'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar', + 'ext': 'mp3' + }, + 'params': { + 'skip_download': True, + } }] - def _get_entry(self, obj): - id1 = obj.get('id') - source = obj.get('source') - title = obj.get('name') + def _get_entry(self, obj, webpage): + id1 = (obj.get('id') + or self._search_regex(r'var\s*track=([^;]+);', webpage, 'id', default=None) + or self._search_regex(r'http://gramofononline\.hu/flash/loader\.swf\?id=(\w+)', webpage, 'id')) + url_suffix = (obj.get('source') + or self._search_regex(r'/data\.php\?n=600&fname=(\w+)', webpage, 'url_suffix', default=None) + or self._search_regex(r'http://gramofononline\.hu/keyframe/go/midres/midres_(\w+)', webpage, 'url_suffix')) + title = (obj.get('name') + or self._html_search_regex(r'Gramofon Online / (.*)', webpage, 'title') + or self._og_search_title(webpage)) artist = obj.get('artist') - 
return { - 'id': id1, - 'title': title, - 'http_headers': {'Referer': 'https://gramofononline.hu/' + id1}, - 'artist': artist, - 'thumbnail': 'https://gramofononline.hu/getImage.php?id=' + source, - 'formats': [{ - 'url': 'https://gramofononline.hu/go/master/' + source + '.mp3', - 'ext': 'mp3' - }, { - 'url': 'https://gramofononline.hu/go/noise_reduction/' + source + '.mp3', - 'ext': 'mp3' - }] - } + return get_gramofon_online_info_dict(id1, title, url_suffix, artist) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m = re.search(r'var\s+trackList\s*=\s*(\[.*\]);', webpage) - lineobjs = json.loads(m.group(1)) + json_tracklist = self._search_regex(r'var\s+trackList\s*=\s*(\[.*\]);', webpage, 'json_tracklist') + lineobjs = self._parse_json(json_tracklist, video_id, transform_source=None, fatal=False) or {} + obj = try_get(lineobjs, lambda x: x[0]) or {} + return self._get_entry(obj, webpage) + + +class GramofonOnlinePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu(?:/hu|/en|/de)?(?:/index.php?.*playradio)?' 
+ + _TESTS = [{ + 'url': 'https://gramofononline.hu', + 'only_matching': True + }, { + 'url': 'https://gramofononline.hu/', + 'only_matching': True + }, { + 'url': 'https://gramofononline.hu/en/', + 'only_matching': True + }, { + 'url': 'https://gramofononline.hu/index.php?playradio=ord%3D7%26w%3D2&autoplay=1', + 'only_matching': True + }, { + 'url': 'https://gramofononline.hu/en/index.php?playradio=ord%3D7%26w%3D2&autoplay=1', + 'only_matching': True + }] + + def _get_entry(self, obj): + id1 = obj['id'] + url_suffix = obj['source'] + title = obj['name'] + artist = obj.get('artist') + return get_gramofon_online_info_dict(id1, title, url_suffix, artist) + + def _real_extract(self, url): + webpage = self._download_webpage(url, url) + + json_tracklist = self._search_regex(r'var\s+trackList\s*=\s*(\[.*\]);', webpage, 'json_tracklist') + lineobjs = self._parse_json(json_tracklist, url) return { '_type': 'playlist', 'entries': [self._get_entry(obj) for obj in lineobjs] } + + +def get_gramofon_online_info_dict(id1, title, url_suffix, artist): + return { + 'id': id1, + 'title': title, + 'http_headers': {'Referer': 'https://gramofononline.hu/' + id1}, + 'artist': artist, + 'thumbnail': 'https://gramofononline.hu/getImage.php?id=' + url_suffix, + 'formats': [{ + 'url': 'https://gramofononline.hu/go/master/' + url_suffix + '.mp3', + 'ext': 'mp3' + }, { + 'url': 'https://gramofononline.hu/go/noise_reduction/' + url_suffix + '.mp3', + 'ext': 'mp3' + }] + } From 1e06f0b170c2f382688ebe796cd7d61ad5150208 Mon Sep 17 00:00:00 2001 From: user706 <39215612+user706@users.noreply.github.com> Date: Sun, 6 Jan 2019 22:44:59 +0100 Subject: [PATCH 4/4] [gramofononline] fix url matching (playlist must be unique, relative to single file) --- youtube_dl/extractor/gramofononline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/gramofononline.py b/youtube_dl/extractor/gramofononline.py index 9814afc3d..43b657ea7 100644 --- 
a/youtube_dl/extractor/gramofononline.py +++ b/youtube_dl/extractor/gramofononline.py @@ -81,7 +81,7 @@ class GramofonOnlineIE(InfoExtractor): class GramofonOnlinePlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu(?:/hu|/en|/de)?(?:/index.php?.*playradio)?' + _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu(?:/(?:hu/|en/|de/)?(?:index.php?.*playradio.*)?)?$' _TESTS = [{ 'url': 'https://gramofononline.hu',