From 2c7edaf8f60a193439c2b3c0363045da3fe53c6b Mon Sep 17 00:00:00 2001 From: Allan Daemon Date: Mon, 15 May 2017 02:37:11 -0300 Subject: [PATCH] PalcoMP3: Getting song fine --- youtube_dl/extractor/palcomp3.py | 84 +++++++++++++++++++------------- 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/palcomp3.py b/youtube_dl/extractor/palcomp3.py index 6ff5b0fce..16c238f43 100644 --- a/youtube_dl/extractor/palcomp3.py +++ b/youtube_dl/extractor/palcomp3.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from ..compat import compat_etree_fromstring from ..utils import get_element_by_id, get_element_by_attribute from .common import InfoExtractor @@ -10,60 +12,72 @@ from pprint import pprint as pp class PalcoMP3IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?palcomp3\.com/(?P[^/]+)/(?P[^/]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/nossas-composicoes-cuida-bem-dela/', - 'md5': '0effca14d6640568df0c1daa1e5609e2', + 'md5': '99fd6405b2d8fd589670f6db1ba3b358', 'info_dict': { - 'id': 'nossas-composicoes-cuida-bem-dela', + 'id': '3162927', 'ext': 'mp3', + 'display_id': 'nossas-composicoes-cuida-bem-dela', 'title': 'Nossas Composições - CUIDA BEM DELA', - 'thumbnail': r'https://studiosol-a.akamaihd.net/tb/468x351/palcomp3-logo/9/d/f/c/356447_20170324175145.jpg', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) - } - } + 'thumbnail': r'https://studiosol-a.akamaihd.net/tb/80x60/palcomp3-logo/9/d/f/c/356447_20170324175145.jpg', + }}, + { + 'url': 'https://www.palcomp3.com/maiaraemaraisaoficial/niveis-da-bebida/', + 'md5': '4c4d1e45b5ae49396cfff017eb41cdd9', + 'info_dict': { + 'id': '2303899', + 'ext': 'mp3', + 'display_id': 'niveis-da-bebida', + 'title': 'NIVEIS DA BEBIDA', + 'thumbnail': r'https://studiosol-a.akamaihd.net/tb/80x60/palcomp3-logo/9/d/f/c/356447_20170324175145.jpg', + }}, + ] - def _json_ld(self, json_ld, video_id, fatal=True, expected_type="MusicGroup"): + def _json_ld(self, json_ld, display_id, fatal=True, expected_type="MusicGroup"): """ override `common.py:_json_ld` as we just need the `_search_json_ld` function to get the JSON, but the original `_json_ld` function does not fit us.""" - return self._parse_json(json_ld, video_id, fatal=fatal) + return self._parse_json(json_ld, display_id, fatal=fatal) def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) print ("Webpage", type(webpage), len(webpage)) + ld = self._get_ld_info(webpage, display_id) + tracks = [ self._ld_track_process(track, ld) for track in ld['track'] ] + + # from IPython import embed + # embed() + for track in tracks: + if track['display_id'] == display_id: + return track - player = get_element_by_id('player', webpage) + + def _get_ld_info(self, webpage, display_id): # player = get_element_by_attribute('id', 'player', webpage, escape_value=False) + player = get_element_by_id('player', webpage) pp(player) - - - ld = self._search_json_ld(player, video_id, expected_type="MusicGroup") + ld = self._search_json_ld(player, display_id, expected_type="MusicGroup") print("LD:") pp(ld) - # from IPython import embed - # embed() - info = self._ld_track_process(ld['track'][0]) + return ld + + def _ld_track_process(self, track, ld={'genre':None}): + tmin, tsec = re.findall("PT(\d+)M(\d+)S", track['duration'], re.IGNORECASE)[0] return { - 'id': video_id, - 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - # 'description': self._og_search_description(webpage), - # 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - # TODO more properties (see youtube_dl/extractor/common.py) - 'url': 'https:' + ld['track'][0]['audio'], - } - - - def _ld_track_process(self, track): - return { - 'url': 'https:' + track['audio'] + 'id': track['@id'], + 'title': track['name'], + 'track': track['name'], + 'url': 'https:' + track['audio'], + 'webpage_url': 'https://www.palcomp3.com' + track['url'], + 'artist': track['byArtist']['name'], + 'thumbnail': track['byArtist']['image'], + 'display_id': track['url'].split('/')[-2], + 'duration': int(tmin)*60 + int(tsec), + 'genre': ld['genre'], } \ No newline at end of file