# coding: utf-8
"""Extractors for gramofononline.hu (single tracks and track-list pages)."""
from __future__ import unicode_literals

from .common import InfoExtractor
from ..utils import try_get

# NOTE: both extractors defined here have to be registered in
# youtube_dl/extractor/extractors.py:
#
#     from .gramofononline import (
#         GramofonOnlineIE,
#         GramofonOnlinePlaylistIE,
#     )

# Captures the JSON array assigned to the inline ``trackList`` JavaScript
# variable.  ``.`` does not match newlines here, so the assignment is expected
# to sit on a single line of the page.
_TRACKLIST_RE = r'var\s+trackList\s*=\s*(\[.*\]);'


class GramofonOnlineIE(InfoExtractor):
    """Extract one recording from a gramofononline.hu track page."""

    # Accepts both the "pretty" form (/<id>/<slug>) and the listen.php?track=
    # form, each optionally prefixed with a language directory.
    _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu/(?:(?:hu|en|de)/)?(?:listen\.php\?.*track=)?(?P<id>[0-9]+)'

    _TESTS = [{
        'url': 'https://gramofononline.hu/1401835664/papageno-duett',
        'md5': '1b4bcabde313f09cdd48c463b54d8125',
        'info_dict': {
            'id': '1401835664',
            'title': 'Papageno-Duett ',
            'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar',
            'ext': 'mp3'
        }
    }, {
        # same as above but with /en/
        'url': 'https://gramofononline.hu/en/1401835664/papageno-duett',
        'md5': '1b4bcabde313f09cdd48c463b54d8125',
        'info_dict': {
            'id': '1401835664',
            'title': 'Papageno-Duett ',
            'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar',
            'ext': 'mp3'
        },
        'params': {
            'skip_download': True,
        }
    }, {
        'url': 'https://gramofononline.hu/listen.php?autoplay=true&track=1401835664',
        'md5': '1b4bcabde313f09cdd48c463b54d8125',
        'info_dict': {
            'id': '1401835664',
            'title': 'Papageno-Duett ',
            'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar',
            'ext': 'mp3'
        },
        'params': {
            'skip_download': True,
        }
    }, {
        # same as above but with /en/
        'url': 'https://gramofononline.hu/en/listen.php?autoplay=true&track=1401835664',
        'md5': '1b4bcabde313f09cdd48c463b54d8125',
        'info_dict': {
            'id': '1401835664',
            'title': 'Papageno-Duett ',
            'artist': 'Johanna Gadski, Otto Goritz, ismeretlen zenekar',
            'ext': 'mp3'
        },
        'params': {
            'skip_download': True,
        }
    }]

    def _get_entry(self, obj, webpage):
        """Build the info dict for one track.

        ``obj`` is the track's entry from the page's ``trackList`` JSON and
        may be empty; any field it lacks is scraped from ``webpage`` instead.
        Extraction fails (via the last, fatal regex of each chain) when
        neither source yields the track id or the media file name.
        """
        track_id = (
            obj.get('id')
            or self._search_regex(
                r'var\s*track=([^;]+);', webpage, 'id', default=None)
            or self._search_regex(
                r'http://gramofononline\.hu/flash/loader\.swf\?id=(\w+)',
                webpage, 'id'))
        url_suffix = (
            obj.get('source')
            or self._search_regex(
                r'/data\.php\?n=600&fname=(\w+)',
                webpage, 'url_suffix', default=None)
            or self._search_regex(
                r'http://gramofononline\.hu/keyframe/go/midres/midres_(\w+)',
                webpage, 'url_suffix'))
        title = (
            obj.get('name')
            # default=None keeps this lookup non-fatal; without it a miss
            # raises immediately and the og:title fallback is unreachable.
            or self._html_search_regex(
                r'Gramofon Online / (.*)', webpage, 'title', default=None)
            or self._og_search_title(webpage))
        artist = obj.get('artist')
        return get_gramofon_online_info_dict(
            track_id, title, url_suffix, artist)

    def _real_extract(self, url):
        """Download the track page and return the info dict of its track."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Both steps are non-fatal on purpose: when the inline track list is
        # absent or unparsable, _get_entry() scrapes the page itself.
        json_tracklist = self._search_regex(
            _TRACKLIST_RE, webpage, 'json_tracklist', default=None)
        tracks = None
        if json_tracklist:
            tracks = self._parse_json(json_tracklist, video_id, fatal=False)
        # The first list entry is taken as the requested track; anything that
        # is not a JSON object is treated as "no metadata".
        obj = try_get(tracks, lambda x: x[0], dict) or {}
        return self._get_entry(obj, webpage)


class GramofonOnlinePlaylistIE(InfoExtractor):
    """Extract every track listed on a gramofononline.hu index/radio page."""

    # Anchored with ``$`` so that it matches only the site root and index.php
    # (optionally under a language directory and with a query string) and
    # never the track URLs handled by GramofonOnlineIE.
    _VALID_URL = r'https?://(?:www\.)?gramofononline\.hu(?:/(?:hu|en|de))?(?:/(?:index\.php)?)?(?:[?#].*)?$'

    _TESTS = [{
        'url': 'https://gramofononline.hu',
        'only_matching': True
    }, {
        'url': 'https://gramofononline.hu/',
        'only_matching': True
    }, {
        'url': 'https://gramofononline.hu/en/',
        'only_matching': True
    }, {
        'url': 'https://gramofononline.hu/index.php?playradio=ord%3D7%26w%3D2&autoplay=1',
        'only_matching': True
    }, {
        'url': 'https://gramofononline.hu/en/index.php?playradio=ord%3D7%26w%3D2&autoplay=1',
        'only_matching': True
    }]

    # Keys a trackList entry must carry to be turned into a playlist entry.
    _REQUIRED_TRACK_KEYS = ('id', 'source', 'name')

    def _get_entry(self, obj):
        """Build the info dict for one ``trackList`` entry.

        Raises KeyError when ``obj`` lacks 'id', 'source' or 'name'.
        """
        id1 = obj['id']
        url_suffix = obj['source']
        title = obj['name']
        artist = obj.get('artist')
        return get_gramofon_online_info_dict(id1, title, url_suffix, artist)

    def _real_extract(self, url):
        """Download the page and return a playlist of all usable tracks."""
        # These pages carry no identifier of their own, so the URL doubles as
        # the id shown in log messages.
        webpage = self._download_webpage(url, url)

        json_tracklist = self._search_regex(
            _TRACKLIST_RE, webpage, 'json_tracklist')
        tracks = self._parse_json(json_tracklist, url)

        # Skip malformed entries instead of letting a single bad one abort
        # the whole playlist with a KeyError.
        entries = [
            self._get_entry(track)
            for track in tracks
            if isinstance(track, dict)
            and all(track.get(key) for key in self._REQUIRED_TRACK_KEYS)
        ]

        return {
            '_type': 'playlist',
            'entries': entries,
        }


def get_gramofon_online_info_dict(id1, title, url_suffix, artist):
    """Return the youtube-dl info dict for one gramofononline.hu recording.

    id1        -- track id; also used to build the Referer header
    title      -- track title
    url_suffix -- media file base name used in the thumbnail and MP3 URLs
    artist     -- performer string, or None when unknown

    Two MP3 URLs are offered per recording, in this order: the 'master'
    path and the 'noise_reduction' path.
    """
    base_url = 'https://gramofononline.hu'
    return {
        'id': id1,
        'title': title,
        # %-formatting (rather than ``+``) also copes with a numeric id
        # coming straight out of the JSON track list.
        'http_headers': {'Referer': '%s/%s' % (base_url, id1)},
        'artist': artist,
        'thumbnail': '%s/getImage.php?id=%s' % (base_url, url_suffix),
        'formats': [{
            'format_id': variant,
            'url': '%s/go/%s/%s.mp3' % (base_url, variant, url_suffix),
            'ext': 'mp3',
            # audio-only download
            'vcodec': 'none',
        } for variant in ('master', 'noise_reduction')],
    }