From 749010659d0edd1f893ec3cb73b5487c0bddb66c Mon Sep 17 00:00:00 2001 From: Jalakas Date: Tue, 10 May 2016 20:11:21 +0300 Subject: [PATCH 1/2] [err] Add new extractor --- youtube_dl/extractor/err.py | 228 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 229 insertions(+) create mode 100644 youtube_dl/extractor/err.py diff --git a/youtube_dl/extractor/err.py b/youtube_dl/extractor/err.py new file mode 100644 index 000000000..dbe6f4569 --- /dev/null +++ b/youtube_dl/extractor/err.py @@ -0,0 +1,228 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +print_debug = 1 # 1 to turn on debug notification printing + + +class ErrIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:[a-z0-9]*)\.?err\.ee\/(?:[a-z0-9_-]*)\/(?:[a-z0-9_-]*)' + + # For testing run: "/test/test_download.py TestDownload.test_Err" and Err_1 and Err_2 ... + _TESTS = [{ + # ETV recent + 'url': 'http://etv.err.ee/v/9f77d07c-9ed7-4b29-bac0-03ddee4f4675', + 'info_dict': { + 'id': '2016-000934-0611_Pealtnagija.mp4', + 'ext': 'mp4', + 'title': '2016-000934-0611_Pealtnagija.mp4' + }, + }, { + # ETV recent (folder: /gb/, pattern: sources) + 'url': 'http://etv.err.ee/v/dokumentaalfilmid/valisilma_dokk/saated/c4e742ef-262f-4e8b-9eb7-90415630eff8/valisilma-dokk-suur-plaan-eestilatileedu-2016', + 'info_dict': { + 'id': '2015-023708-0001_Suur_plaan.mp4', + 'ext': 'mp4', + 'title': '2015-023708-0001_Suur_plaan.mp4' + }, + }, { + # ETV archive (video) + 'url': 'http://arhiiv.err.ee/vaata/vurst-volkonski', + 'info_dict': { + 'id': '1990-082743-0001_0001_D10_VURST-VOLKONSKI.mp4', + 'ext': 'mp4', + 'title': '1990-082743-0001_0001_D10_VURST-VOLKONSKI.mp4' + }, + }, { + # ETV archive (audio) + 'url': 'https://arhiiv.err.ee/vaata/estraadikava-naisevott-raali-abil', + 'info_dict': { + 'id': 'a_108175_RMARHIIV.m4a', + 'ext': 'm4a', + 'title': 'a_108175_RMARHIIV.m4a' + }, + }, { + # Radios: Vikerraadio, R2 + 'url': 'http://r2.err.ee/v/2tartutudengipaevad/saated/8c0e0116-2f67-43a4-8b74-8bf974510d6a/tudengi-45', + 'info_dict': { + 'id': 'RR2049iu7382.m4a', + 'ext': 'mp4', + 'title': 'RR2049iu7382.m4a' + }, + }] + + def _real_extract(self, url): + + webpage = self._download_webpage(url, "1") + webpage_folder = "" + source_pattern = "" + clean_pattern = "" + + # remember to change the definitions in source_pattern cleaning part too + search_pattern1 = r'clip = {sources: ":[\',"]?([^\&" >]+)' + search_pattern2 = r'file=[\',"]?([^\&" >]+)' + search_pattern3 = r'AUDIO/[\',"]?([^\&" >]+)' + search_pattern4 = r'arhiiv/[\',"]?([^\&" >]+)' + search_pattern5 = r'"Source":":[\',"]?([^\&" >]+)' + + # here we search for data folder, after that we know what patterns to search for + if webpage.find("/gb/") != -1: + webpage_folder = "gb" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif len(re.findall(search_pattern2, webpage)) > 0: + source_pattern = search_pattern2 + elif webpage.find("/etvsaated/") != -1: + webpage_folder = "etvsaated" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif len(re.findall(search_pattern2, webpage)) > 0: + source_pattern = search_pattern2 + elif webpage.find("/etv2saated/") != -1: + webpage_folder = "etv2saated" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif len(re.findall(search_pattern2, webpage)) > 0: + source_pattern = search_pattern2 + elif webpage.find("/etvvideod/") != -1: + webpage_folder = "etvvideod" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif webpage.find("/etv2videod/") != -1: + webpage_folder = "etv2videod" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif webpage.find("/etvplussvideod/") != -1: + webpage_folder = "etvplussvideod" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif webpage.find("/uudised/") != -1: + webpage_folder = "uudised" + if len(re.findall(search_pattern1, webpage)) > 0: + source_pattern = search_pattern1 + elif webpage.find("/AUDIO/") != -1: + webpage_folder = "AUDIO" + if len(re.findall(search_pattern2, webpage)) > 0: + source_pattern = search_pattern2 + elif len(re.findall(search_pattern3, webpage)) > 0: + source_pattern = search_pattern3 + elif webpage.find("/arhiiv/") != -1: + webpage_folder = "arhiiv" + if len(re.findall(search_pattern4, webpage)) > 0: + source_pattern = search_pattern4 + elif webpage.find("/viker/") != -1: + webpage_folder = "viker" + if len(re.findall(search_pattern5, webpage)) > 0: + source_pattern = search_pattern5 + elif webpage.find("/r2/") != -1: + webpage_folder = "r2" + if len(re.findall(search_pattern5, webpage)) > 0: + source_pattern = search_pattern5 + elif webpage.find("/r4/") != -1: + webpage_folder = "r4" + if len(re.findall(search_pattern5, webpage)) > 0: + source_pattern = search_pattern5 + + # folder was found checks + if webpage_folder == "": + print("[Err] [ERROR] No webpage_folder was found from webpage: webpage_folder = " + webpage_folder) + elif webpage_folder != "": + if print_debug == 1: + print("[Err] [DEBUG] Found from webpage: webpage_folder = " + webpage_folder) + + # source pattern found checks + if source_pattern == "": + print("[Err] [ERROR] Found webpage: webpage_folder: " + webpage_folder + ", no data for *source_pattern* found!") + elif source_pattern != "": + if print_debug == 1: + print("[Err] [DEBUG] Found pattern from webpage: source_pattern = " + source_pattern) + + # common BEFORE cleanup for all source_patterns + clean_pattern_v = re.findall(source_pattern, webpage) + clean_pattern = clean_pattern_v[0] + clean_pattern = clean_pattern.strip("[]") + + # search_pattern1 = r'clip = {sources: ":[\',"]?([^\&" >]+)' + if source_pattern == search_pattern1: + # ETV ERR clips, 3.05.2016 + # http://etv.err.ee/v/meelelahutus/4x4_magadan/videod/6e716cab-ba91-40b0-b27b-262e369f3d5a + # clip = {sources: "://media.err.ee/etvvideod/@2013-08-06-magadan-kyla.mp4" } + clean_pattern_v = clean_pattern.split("/") + clean_pattern = clean_pattern_v[-1] + + # search_pattern2 = r'file=[\',"]?([^\&" >]+)' + # elif source_pattern == search_pattern2: + # ETV new archive, 9. september 2014 + # + + # search_pattern3 = r'AUDIO/[\',"]?([^\&" >]+)' + # elif source_pattern == search_pattern3: + # ERR audio archive: + # http://arhiiv.err.ee/guid/86328 + # https://static.err.ee/media?stream=media.err.ee:80/arhiiv/&file=/AUDIO/a_86328_RMARHIIV.m4a + # stream=media.err.ee:80/arhiiv/ # file=/AUDIO/a_86328_RMARHIIV.m4a + + # search_pattern4 = r'arhiiv/[\',"]?([^\&" >]+)' + # elif source_pattern == search_pattern4: + # ETV video archive, 2.05.2016 + # http://arhiiv.err.ee/vaata/vurst-volkonski + # setPlayer("://media.err.ee:80/arhiiv/@1990-082743-0001_0001_D10_VURST-VOLKONSKI.mp4", '# fPlayer', player, false) + + # search_pattern5 = r'"Source":":[\',"]?([^\&" >]+)' + elif source_pattern == search_pattern5: + # Vikerraadio, 5.05.2016 + # http://vikerraadio.err.ee/v/ooylikool/saated/8e51ea8b-e3f5-4c80-856b-35348e35d47e/ooulikool-linda-madalik-akustikast + # localList[am] = {"Source":"://media.err.ee:80/viker/@2728431.m4a","MediaType" + clean_pattern_v = clean_pattern.split("/") + clean_pattern = clean_pattern_v[-1] + + # common AFTER cleanup for all source_patterns + clean_pattern = clean_pattern.replace("@", " ").strip(" ").replace("%28", "(").replace("%29", ")") + + # source_pattern was cleaned to clean_pattern + if clean_pattern == "": + print("[Err] [ERROR] found webpage_folder = " + webpage_folder + ", source_pattern " + source_pattern + ", but NO clean_pattern = " + clean_pattern) + elif clean_pattern != "": + print("[Err] Found data pattern: " + clean_pattern) + + # if clean_pattern.find("m4a") > 0: don't use this control, because this hack works only for webpage_folder /AUDIO/ + if webpage_folder == "AUDIO": + # command =("rtmpdump -v -r "rtmp://media.err.ee:80/arhiiv/" -y "mp4:/' + webpage_folder + '/' + clean_pattern + '" -o "'+clean_pattern+'"") + audio_id = clean_pattern + audio_ext = "m4a" + audio_play_path = "mp4:/" + webpage_folder + "/" + clean_pattern + audio_title = clean_pattern + audio_url = "rtmp://media.err.ee:80/arhiiv/" # don't change it to webpage_folder, it doesn't work so + if print_debug == 1: + print("[Err] [DEBUG] Starting to download audio data from " + audio_url) + return { # id, title, url are mandatory, ext is for beauty, ERR audio download is not working without play_path, last two ones help to download + 'id': audio_id, + 'title': audio_title, + 'url': audio_url, + 'ext': audio_ext, + 'play_path': audio_play_path, + 'no_resume': True, # rtmpdump -e means --resume is set + 'rtmp_live': True, # rtmpdump -v means --live is set + # TODO more properties(see youtube_dl/extractor/common.py) + } + else: + # command =("rtmpdump -v -e -r "rtmp://media.err.ee/' + webpage_folder + '/mp4:%s" -o "%s"' %(clean_pattern, clean_pattern) ) + video_id = clean_pattern + video_ext = "mp4" + video_title = clean_pattern + video_url = "rtmp://media.err.ee/" + webpage_folder + "/mp4:" + clean_pattern + if print_debug == 1: + print("[Err] [DEBUG] Starting to download video data from " + video_url) + return { # id, title, url are mandatory, ext is for beauty, last two ones help to download + 'id': video_id, # Video identifier + 'title': video_title, # Video title, unescaped. + 'url': video_url, # Final video URL. + 'ext': video_ext, # Video filename extension. + 'no_resume': False, # rtmpdump -e means --resume is set + 'rtmp_live': True, # rtmpdump -v means --live is set + # TODO more properties(see youtube_dl/extractor/common.py) + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ef4431364..99d4f83b9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -218,6 +218,7 @@ from .embedly import EmbedlyIE from .engadget import EngadgetIE from .eporner import EpornerIE from .eroprofile import EroProfileIE +from .err import ErrIE from .escapist import EscapistIE from .espn import ESPNIE from .esri import EsriVideoIE From 545fc3e3c2f12dc614c09ae86823707d4060472d Mon Sep 17 00:00:00 2001 From: Jalakas Date: Tue, 10 May 2016 22:45:34 +0300 Subject: [PATCH 2/2] [err] Fixes for Travis tests --- youtube_dl/extractor/err.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/err.py b/youtube_dl/extractor/err.py index dbe6f4569..66735b699 100644 --- a/youtube_dl/extractor/err.py +++ b/youtube_dl/extractor/err.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError print_debug = 1 # 1 to turn on debug notification printing @@ -20,6 +21,9 @@ class ErrIE(InfoExtractor): 'ext': 'mp4', 'title': '2016-000934-0611_Pealtnagija.mp4' }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, }, { # ETV recent (folder: /gb/, pattern: sources) 'url': 'http://etv.err.ee/v/dokumentaalfilmid/valisilma_dokk/saated/c4e742ef-262f-4e8b-9eb7-90415630eff8/valisilma-dokk-suur-plaan-eestilatileedu-2016', @@ -28,6 +32,9 @@ class ErrIE(InfoExtractor): 'ext': 'mp4', 'title': '2015-023708-0001_Suur_plaan.mp4' }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, }, { # ETV archive (video) 'url': 'http://arhiiv.err.ee/vaata/vurst-volkonski', @@ -36,6 +43,9 @@ class ErrIE(InfoExtractor): 'ext': 'mp4', 'title': '1990-082743-0001_0001_D10_VURST-VOLKONSKI.mp4' }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, }, { # ETV archive (audio) 'url': 'https://arhiiv.err.ee/vaata/estraadikava-naisevott-raali-abil', @@ -44,6 +54,9 @@ class ErrIE(InfoExtractor): 'ext': 'm4a', 'title': 'a_108175_RMARHIIV.m4a' }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, }, { # Radios: Vikerraadio, R2 'url': 'http://r2.err.ee/v/2tartutudengipaevad/saated/8c0e0116-2f67-43a4-8b74-8bf974510d6a/tudengi-45', @@ -52,6 +65,9 @@ class ErrIE(InfoExtractor): 'ext': 'mp4', 'title': 'RR2049iu7382.m4a' }, + 'params': { + 'skip_download': 'requires rtmpdump' + }, }] def _real_extract(self, url): @@ -128,17 +144,17 @@ class ErrIE(InfoExtractor): # folder was found checks if webpage_folder == "": - print("[Err] [ERROR] No webpage_folder was found from webpage: webpage_folder = " + webpage_folder) + raise ExtractorError('[Err] No *webpage_folder* was found from webpage: webpage_folder = ' + webpage_folder) elif webpage_folder != "": if print_debug == 1: - print("[Err] [DEBUG] Found from webpage: webpage_folder = " + webpage_folder) + self.to_screen('[DEBUG] Found from webpage: webpage_folder = ' + webpage_folder) # source pattern found checks if source_pattern == "": - print("[Err] [ERROR] Found webpage: webpage_folder: " + webpage_folder + ", no data for *source_pattern* found!") + raise ExtractorError('[Err] Found webpage: webpage_folder: ' + webpage_folder + ', no data for *source_pattern* found!') elif source_pattern != "": if print_debug == 1: - print("[Err] [DEBUG] Found pattern from webpage: source_pattern = " + source_pattern) + self.to_screen('[DEBUG] Found pattern from webpage: source_pattern = ' + source_pattern) # common BEFORE cleanup for all source_patterns clean_pattern_v = re.findall(source_pattern, webpage) @@ -185,9 +201,9 @@ class ErrIE(InfoExtractor): # source_pattern was cleaned to clean_pattern if clean_pattern == "": - print("[Err] [ERROR] found webpage_folder = " + webpage_folder + ", source_pattern " + source_pattern + ", but NO clean_pattern = " + clean_pattern) + raise ExtractorError('[Err] found webpage_folder = ' + webpage_folder + ', source_pattern ' + source_pattern + ', but NO *clean_pattern* = ' + clean_pattern) elif clean_pattern != "": - print("[Err] Found data pattern: " + clean_pattern) + self.to_screen('Found data pattern: ' + clean_pattern) # if clean_pattern.find("m4a") > 0: don't use this control, because this hack works only for webpage_folder /AUDIO/ if webpage_folder == "AUDIO": @@ -198,7 +214,7 @@ class ErrIE(InfoExtractor): audio_title = clean_pattern audio_url = "rtmp://media.err.ee:80/arhiiv/" # don't change it to webpage_folder, it doesn't work so if print_debug == 1: - print("[Err] [DEBUG] Starting to download audio data from " + audio_url) + self.to_screen('[DEBUG] Starting to download audio data from ' + audio_url) return { # id, title, url are mandatory, ext is for beauty, ERR audio download is not working without play_path, last two ones help to download 'id': audio_id, 'title': audio_title, @@ -216,7 +232,7 @@ class ErrIE(InfoExtractor): video_title = clean_pattern video_url = "rtmp://media.err.ee/" + webpage_folder + "/mp4:" + clean_pattern if print_debug == 1: - print("[Err] [DEBUG] Starting to download video data from " + video_url) + self.to_screen('[DEBUG] Starting to download video data from ' + video_url) return { # id, title, url are mandatory, ext is for beauty, last two ones help to download 'id': video_id, # Video identifier 'title': video_title, # Video title, unescaped.