From b8874d4d4ea3becfde813d451d884bce558fe213 Mon Sep 17 00:00:00 2001 From: megustamucho Date: Tue, 9 Sep 2014 12:46:58 +1000 Subject: [PATCH 001/652] [tube8] Improved regex --- youtube_dl/extractor/tube8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 08a48c05a..39f20c546 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -14,7 +14,7 @@ from ..aes import aes_decrypt_text class Tube8IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:gay/|shemale/)?(?:[^/]+/){2}(?P\d+)' _TEST = { 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', 'md5': '44bf12b98313827dd52d35b8706a4ea0', From 94b539d15505daf5213d5c4de7c2fde08b5d2f40 Mon Sep 17 00:00:00 2001 From: megustamucho Date: Tue, 9 Sep 2014 12:46:58 +1000 Subject: [PATCH 002/652] [tube8] Improved regex --- youtube_dl/extractor/tube8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 08a48c05a..39f20c546 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -14,7 +14,7 @@ from ..aes import aes_decrypt_text class Tube8IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:gay/|shemale/)?(?:[^/]+/){2}(?P\d+)' _TEST = { 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', 'md5': '44bf12b98313827dd52d35b8706a4ea0', From 64892c0b79b401ab487c8facb5a646011873194c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 16 Sep 2014 23:47:59 +0700 Subject: [PATCH 003/652] [francetv] Adapt to new API (Closes #3751, closes #3769) --- youtube_dl/extractor/francetv.py | 200 ++++++++++++++++--------------- 1 file changed, 106 insertions(+), 94 deletions(-) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 1b0e8e5d5..0b3374d97 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -8,45 +8,68 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, + ExtractorError, + clean_html, + parse_duration, + compat_urllib_parse_urlparse, + int_or_none, ) class FranceTVBaseInfoExtractor(InfoExtractor): - def _extract_video(self, video_id): - info = self._download_xml( - 'http://www.francetvinfo.fr/appftv/webservices/video/' - 'getInfosOeuvre.php?id-diffusion=' - + video_id, video_id, 'Downloading XML config') + def _extract_video(self, video_id, catalogue): + info = self._download_json( + 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s' + % (video_id, catalogue), + video_id, 'Downloading video JSON') - manifest_url = info.find('videos/video/url').text - manifest_url = manifest_url.replace('/z/', '/i/') - - if manifest_url.startswith('rtmp'): - formats = [{'url': manifest_url, 'ext': 'flv'}] - else: - formats = [] - available_formats = self._search_regex(r'/[^,]*,(.*?),k\.mp4', manifest_url, 'available formats') - for index, format_descr in enumerate(available_formats.split(',')): - format_info = { - 'url': manifest_url.replace('manifest.f4m', 'index_%d_av.m3u8' % index), - 'ext': 'mp4', - } - m_resolution = re.search(r'(?P\d+)x(?P\d+)', format_descr) - if m_resolution is not None: - format_info.update({ - 'width': int(m_resolution.group('width')), - 'height': int(m_resolution.group('height')), - 
}) - formats.append(format_info) + if info.get('status') == 'NOK': + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, info['message']), expected=True) - thumbnail_path = info.find('image').text + formats = [] + for video in info['videos']: + if video['statut'] != 'ONLINE': + continue + video_url = video['url'] + if not video_url: + continue + format_id = video['format'] + if video_url.endswith('.f4m'): + video_url_parsed = compat_urllib_parse_urlparse(video_url) + f4m_url = self._download_webpage( + 'http://hdfauth.francetv.fr/esi/urltokengen2.html?url=%s' % video_url_parsed.path, + video_id, 'Downloading f4m manifest token', fatal=False) + if f4m_url: + f4m_formats = self._extract_f4m_formats(f4m_url, video_id) + for f4m_format in f4m_formats: + f4m_format['preference'] = 1 + formats.extend(f4m_formats) + elif video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id)) + elif video_url.startswith('rtmp'): + formats.append({ + 'url': video_url, + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'preference': 1, + }) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'preference': 2, + }) + self._sort_formats(formats) return { 'id': video_id, - 'title': info.find('titre').text, + 'title': info['titre'], + 'description': clean_html(info['synopsis']), + 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), + 'duration': parse_duration(info['duree']), + 'timestamp': int_or_none(info['diffusion']['timestamp']), 'formats': formats, - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', thumbnail_path), - 'description': info.find('synopsis').text, } @@ -61,7 +84,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, title) video_id = self._search_regex( r'data-diffusion="(\d+)"', webpage, 'ID') - return self._extract_video(video_id) + return self._extract_video(video_id, 'Pluzz') class FranceTvInfoIE(FranceTVBaseInfoExtractor): @@ -70,13 +93,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', + 'md5': '9cecf35f99c4079c199e9817882a9a1c', 'info_dict': { 'id': '84981923', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Soir 3', - }, - 'params': { - 'skip_download': True, + 'upload_date': '20130826', + 'timestamp': 1377548400, }, }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', @@ -88,15 +111,17 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): }, 'params': { 'skip_download': 'HLS (reqires ffmpeg)' - } + }, + 'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) - video_id = self._search_regex(r'id-video=((?:[^0-9]*?_)?[0-9]+)[@"]', webpage, 'video id') - return self._extract_video(video_id) + video_id, catalogue = self._search_regex( + r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') + return self._extract_video(video_id, catalogue) class FranceTVIE(FranceTVBaseInfoExtractor): @@ -112,91 +137,77 @@ class FranceTVIE(FranceTVBaseInfoExtractor): # france2 { 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - 'file': '75540104.mp4', + 'md5': 'c03fc87cb85429ffd55df32b9fc05523', 'info_dict': { - 
'title': '13h15, le samedi...', - 'description': 'md5:2e5b58ba7a2d3692b35c792be081a03d', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'id': '109169362', + 'ext': 'flv', + 'title': '13h15, le dimanche...', + 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7', + 'upload_date': '20140914', + 'timestamp': 1410693600, }, }, # france3 { 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', + 'md5': '679bb8f8921f8623bd658fa2f8364da0', 'info_dict': { 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Le scandale du prix des médicaments', 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'upload_date': '20131113', + 'timestamp': 1384380000, }, }, # france4 { 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', + 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c', 'info_dict': { 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hero Corp Making of - Extrait 1', 'description': 'md5:c87d54871b1790679aec1197e73d650a', - }, - 'params': { - # rtmp download - 'skip_download': True, + 'upload_date': '20131106', + 'timestamp': 1383766500, }, }, # france5 { 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/92837968', + 'md5': '78f0f4064f9074438e660785bbf2c5d9', 'info_dict': { - 'id': '92837968', - 'ext': 'mp4', + 'id': '108961659', + 'ext': 'flv', 'title': 'C à dire ?!', - 'description': 'md5:fb1db1cbad784dcce7c7a7bd177c8e2f', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'description': 'md5:1a4aeab476eb657bf57c4ff122129f81', + 'upload_date': '20140915', + 'timestamp': 1410795000, }, }, # franceo { 'url': 'http://www.franceo.fr/jt/info-afrique/04-12-2013', + 'md5': '52f0bfe202848b15915a2f39aaa8981b', 'info_dict': { - 'id': '92327925', - 'ext': 'mp4', - 'title': 'Infô-Afrique', + 'id': '108634970', + 'ext': 'flv', + 'title': 'Infô Afrique', 'description': 'md5:ebf346da789428841bee0fd2a935ea55', + 'upload_date': '20140915', + 'timestamp': 1410822000, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'The id changes frequently', }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj.group('key'): - webpage = self._download_webpage(url, mobj.group('key')) - id_res = [ - (r'''(?x)\s* - '''), - (r'.*?)(\?|$)' _TEST = { - 'url': 'http://culturebox.francetvinfo.fr/einstein-on-the-beach-au-theatre-du-chatelet-146813', + 'url': 'http://culturebox.francetvinfo.fr/festivals/dans-les-jardins-de-william-christie/dans-les-jardins-de-william-christie-le-camus-162553', + 'md5': '5ad6dec1ffb2a3fbcb20cc4b744be8d6', 'info_dict': { - 'id': 'EV_6785', - 'ext': 'mp4', - 'title': 'Einstein on the beach au Théâtre du Châtelet', - 'description': 'md5:9ce2888b1efefc617b5e58b3f6200eeb', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'id': 'EV_22853', + 'ext': 'flv', + 'title': 'Dans les jardins de William Christie - Le Camus', + 'description': 'md5:4710c82315c40f0c865ca8b9a68b5299', + 'upload_date': '20140829', + 'timestamp': 1409317200, }, } @@ -249,5 +259,7 @@ class CultureboxIE(FranceTVBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'"http://videos\.francetv\.fr/video/(.*?)"', webpage, 'video id') - return 
self._extract_video(video_id) + video_id, catalogue = self._search_regex( + r'"http://videos\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video id').split('@') + + return self._extract_video(video_id, catalogue) From 38349518f1292b665905b0c2dc30d33021aaa8cb Mon Sep 17 00:00:00 2001 From: Carlos Ramos Date: Tue, 16 Sep 2014 20:48:53 +0200 Subject: [PATCH 004/652] Added new host: allmyvideos.net --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/allmyvideos.py | 51 +++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 youtube_dl/extractor/allmyvideos.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e9fceae4c..f715c3310 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,7 @@ from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE from .anysex import AnySexIE from .aol import AolIE +from .allmyvideos import AllmyvideosIE from .allocine import AllocineIE from .aparat import AparatIE from .appletrailers import AppleTrailersIE diff --git a/youtube_dl/extractor/allmyvideos.py b/youtube_dl/extractor/allmyvideos.py new file mode 100644 index 000000000..4cb559483 --- /dev/null +++ b/youtube_dl/extractor/allmyvideos.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, +) + + +class AllmyvideosIE(InfoExtractor): + IE_NAME = 'allmyvideos.net' + _VALID_URL = r'https?://allmyvideos\.net/(?P[a-zA-Z0-9_-]+)' + + _TEST = { + 'url': 'http://allmyvideos.net/jih3nce3x6wn', + 'md5': '8f26c1e7102556a0d7f24306d32c2092', + 'info_dict': { + 'id': 'jih3nce3x6wn', + 'ext': 'mp4', + 'title': 'youtube-dl test video', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + orig_webpage = self._download_webpage(url, video_id) + fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) + data = {} + for name, value in fields: + data[name] = value + + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage(req, video_id, note='Downloading video page ...') + + #Could be several links with different quality + links = re.findall(r'"file" : "?(.+?)",', webpage) + + return { + 'id': video_id, + 'title': data['fname'][:len(data['fname'])-4], #Remove .mp4 extension + 'url': links[len(links)-1] #Choose the higher quality link + } \ No newline at end of file From 7cdd5339b3a02b7429c945ae4cd1e70c7112d555 Mon Sep 17 00:00:00 2001 From: Carlos Ramos Date: Tue, 16 Sep 2014 21:05:50 +0200 Subject: [PATCH 005/652] Change tabs to spaces --- youtube_dl/extractor/allmyvideos.py | 40 ++++++++++++++--------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/allmyvideos.py b/youtube_dl/extractor/allmyvideos.py index 4cb559483..7764d4a14 100644 --- a/youtube_dl/extractor/allmyvideos.py +++ b/youtube_dl/extractor/allmyvideos.py @@ -25,27 +25,27 @@ class AllmyvideosIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') - orig_webpage = self._download_webpage(url, video_id) - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', 
orig_webpage) - data = {} - for name, value in fields: - data[name] = value + orig_webpage = self._download_webpage(url, video_id) + fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) + data = {} + for name, value in fields: + data[name] = value - post = compat_urllib_parse.urlencode(data) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - } - req = compat_urllib_request.Request(url, post, headers) - webpage = self._download_webpage(req, video_id, note='Downloading video page ...') + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage(req, video_id, note='Downloading video page ...') - #Could be several links with different quality - links = re.findall(r'"file" : "?(.+?)",', webpage) + #Could be several links with different quality + links = re.findall(r'"file" : "?(.+?)",', webpage) - return { - 'id': video_id, - 'title': data['fname'][:len(data['fname'])-4], #Remove .mp4 extension - 'url': links[len(links)-1] #Choose the higher quality link - } \ No newline at end of file + return { + 'id': video_id, + 'title': data['fname'][:len(data['fname'])-4], #Remove .mp4 extension + 'url': links[len(links)-1] #Choose the higher quality link + } \ No newline at end of file From a04aa7a9e692b174f4b03ddf6918c2d0a20ff7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 16 Sep 2014 22:53:48 +0200 Subject: [PATCH 006/652] [generic] The url in the doesn't need to be enclosed in single quotes (fixes #3770) See the examples in https://en.wikipedia.org/wiki/Meta_refresh or the shortened urls from https://t.co/. 
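A minimal sketch of the idea (the pattern below is a simplified stand-in
for the extractor's real regex, and the sample tags and URLs are invented
for illustration): the URL inside the refresh "content" attribute may be
bare or wrapped in single quotes, so the quote has to be optional.

    import re

    # Both forms occur in the wild (compare the Meta refresh examples and
    # the pages served by t.co).
    samples = [
        '<meta http-equiv="refresh" content="0;url=\'http://example.com/video\'">',
        '<meta http-equiv="refresh" content="0;url=http://example.com/video">',
    ]

    # Simplified pattern: the quote after "url=" is optional and the capture
    # stops at either kind of quote.
    pattern = r'(?i)content="[0-9]{0,2};url=\'?([^\'"]+)'

    for tag in samples:
        match = re.search(pattern, tag)
        print(match.group(1) if match else None)
        # prints http://example.com/video for both samples

Both samples yield the same URL, which is the behaviour the regex change
in the diff below is after.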
--- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2bfa20606..40eeaad16 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -877,7 +877,7 @@ class GenericIE(InfoExtractor): if not found: found = re.search( r'(?i) Date: Tue, 16 Sep 2014 22:56:31 +0200 Subject: [PATCH 007/652] [allmyvideos] Support multiple formats --- youtube_dl/extractor/allmyvideos.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/allmyvideos.py b/youtube_dl/extractor/allmyvideos.py index 7764d4a14..e6c60e7e4 100644 --- a/youtube_dl/extractor/allmyvideos.py +++ b/youtube_dl/extractor/allmyvideos.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import os.path import re from .common import InfoExtractor @@ -16,7 +17,7 @@ class AllmyvideosIE(InfoExtractor): _TEST = { 'url': 'http://allmyvideos.net/jih3nce3x6wn', - 'md5': '8f26c1e7102556a0d7f24306d32c2092', + 'md5': '710883dee1bfc370ecf9fa6a89307c88', 'info_dict': { 'id': 'jih3nce3x6wn', 'ext': 'mp4', @@ -30,22 +31,29 @@ class AllmyvideosIE(InfoExtractor): orig_webpage = self._download_webpage(url, video_id) fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) - data = {} - for name, value in fields: - data[name] = value + data = dict(fields) post = compat_urllib_parse.urlencode(data) headers = { b'Content-Type': b'application/x-www-form-urlencoded', } req = compat_urllib_request.Request(url, post, headers) - webpage = self._download_webpage(req, video_id, note='Downloading video page ...') + webpage = self._download_webpage( + req, video_id, note='Downloading video page ...') + + title = os.path.splitext(data['fname'])[0] #Could be several links with different quality links = re.findall(r'"file" : "?(.+?)",', webpage) + # Assume the links are ordered in quality + formats = [{ + 'url': l, + 'quality': i, + } for i, l in enumerate(links)] + self._sort_formats(formats) return { 'id': video_id, - 'title': data['fname'][:len(data['fname'])-4], #Remove .mp4 extension - 'url': links[len(links)-1] #Choose the higher quality link - } \ No newline at end of file + 'title': title, + 'formats': formats, + } From c001f939e45271b63fd4182de647142834b8d367 Mon Sep 17 00:00:00 2001 From: Carlos Ramos Date: Tue, 16 Sep 2014 23:23:54 +0200 Subject: [PATCH 008/652] [Allmyvideos] Fixed md5. 
Only 10KiB of the video file --- youtube_dl/extractor/allmyvideos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/allmyvideos.py b/youtube_dl/extractor/allmyvideos.py index 7764d4a14..a5ebdfdf4 100644 --- a/youtube_dl/extractor/allmyvideos.py +++ b/youtube_dl/extractor/allmyvideos.py @@ -16,7 +16,7 @@ class AllmyvideosIE(InfoExtractor): _TEST = { 'url': 'http://allmyvideos.net/jih3nce3x6wn', - 'md5': '8f26c1e7102556a0d7f24306d32c2092', + 'md5': '710883dee1bfc370ecf9fa6a89307c88', 'info_dict': { 'id': 'jih3nce3x6wn', 'ext': 'mp4', From 997987d568b49cb3720083d85f120ef634989ba9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 16 Sep 2014 23:33:13 +0200 Subject: [PATCH 009/652] Credit @r4mos for allmyvideos --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 871ddead9..a8d5095ae 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -76,6 +76,7 @@ __authors__ = ( 'Aaron McDaniel (mcd1992)', 'Magnus Kolstad', 'Hari Padmanaban', + 'Carlos Ramos', ) __license__ = 'Public Domain' From 6b6096d0b7c7e98ae2aefb306793ef58ee13c9f2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 16 Sep 2014 23:35:15 +0200 Subject: [PATCH 010/652] release 2014.09.16.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a59cbf879..23892a8bd 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.16' +__version__ = '2014.09.16.1' From 5a13fe9ed2abcd67b4e8469805267b1afa0fb2d8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 17 Sep 2014 12:50:36 +0200 Subject: [PATCH 011/652] Credit @m5moufl for behindkink (#3740) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a8d5095ae..20d7a57ce 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -77,6 +77,7 @@ __authors__ = ( 'Magnus Kolstad', 'Hari Padmanaban', 'Carlos Ramos', + '5moufl', ) __license__ = 'Public Domain' From 944a3de2781658c94d71d4bc4b12bac9b8b382c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 17 Sep 2014 19:02:57 +0700 Subject: [PATCH 012/652] [npo] Improve formats extraction (Closes #3782) --- youtube_dl/extractor/npo.py | 77 +++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 902d62944..7a154e94a 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( unified_strdate, + parse_duration, qualities, ) @@ -13,17 +14,43 @@ class NPOIE(InfoExtractor): IE_NAME = 'npo.nl' _VALID_URL = r'https?://www\.npo\.nl/[^/]+/[^/]+/(?P[^/?]+)' - _TEST = { - 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', - 'md5': '4b3f9c429157ec4775f2c9cb7b911016', - 'info_dict': { - 'id': 'VPWON_1220719', - 'ext': 'm4v', - 'title': 'Nieuwsuur', - 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', - 'upload_date': '20140622', + _TESTS = [ + { + 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', + 'md5': '4b3f9c429157ec4775f2c9cb7b911016', + 'info_dict': { + 'id': 'VPWON_1220719', + 'ext': 'm4v', + 'title': 'Nieuwsuur', + 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', + 
'upload_date': '20140622', + }, }, - } + { + 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', + 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', + 'info_dict': { + 'id': 'VARA_101191800', + 'ext': 'm4v', + 'title': 'De Mega Mike & Mega Thomas show', + 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', + 'upload_date': '20090227', + 'duration': 2400, + }, + }, + { + 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'Tegenlicht', + 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'upload_date': '20130225', + 'duration': 3000, + }, + } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -43,19 +70,28 @@ class NPOIE(InfoExtractor): token = self._search_regex(r'npoplayer\.token = "(.+?)"', token_page, 'token') formats = [] - quality = qualities(['adaptive', 'h264_sb', 'h264_bb', 'h264_std']) + quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) for format_id in metadata['pubopties']: - streams_info = self._download_json( + format_info = self._download_json( 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' % (video_id, format_id, token), - video_id, 'Downloading %s streams info' % format_id) - stream_info = self._download_json( - streams_info['streams'][0] + '&type=json', - video_id, 'Downloading %s stream info' % format_id) + video_id, 'Downloading %s JSON' % format_id) + if format_info.get('error_code', 0) or format_info.get('errorcode', 0): + continue + streams = format_info.get('streams') + if streams: + video_info = self._download_json( + streams[0] + '&type=json', + video_id, 'Downloading %s stream JSON' % format_id) + else: + video_info = format_info + video_url = video_info.get('url') + if not video_url: + continue if format_id == 'adaptive': - formats.extend(self._extract_m3u8_formats(stream_info['url'], video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id)) else: formats.append({ - 'url': stream_info['url'], + 'url': video_url, 'format_id': format_id, 'quality': quality(format_id), }) @@ -65,7 +101,8 @@ class NPOIE(InfoExtractor): 'id': video_id, 'title': metadata['titel'], 'description': metadata['info'], - 'thumbnail': metadata['images'][-1]['url'], - 'upload_date': unified_strdate(metadata['gidsdatum']), + 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], + 'upload_date': unified_strdate(metadata.get('gidsdatum')), + 'duration': parse_duration(metadata.get('tijdsduur')), 'formats': formats, } From 67abbe95273f59f4a04486172e6d422a10b6afb3 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis Date: Wed, 17 Sep 2014 22:57:01 +0300 Subject: [PATCH 013/652] [videomega] Add new extractor. 
Closes #3775 --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/videomega.py | 59 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 youtube_dl/extractor/videomega.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f715c3310..75831b40a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -393,6 +393,7 @@ from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE +from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE from .videott import VideoTtIE from .videoweed import VideoWeedIE diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py new file mode 100644 index 000000000..1b6b65839 --- /dev/null +++ b/youtube_dl/extractor/videomega.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + remove_start, +) + + +class VideoMegaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?:// + (?:www\.)?videomega\.tv/ + (?:iframe\.php)?\?ref=(?P[A-Za-z0-9]+) + ''' + _TEST = { + 'url': 'http://videomega.tv/?ref=GKeGPVedBe', + 'md5': '240fb5bcf9199961f48eb17839b084d6', + 'info_dict': { + 'id': 'GKeGPVedBe', + 'ext': 'mp4', + 'title': 'XXL - All Sports United', + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://videomega.tv/iframe.php?ref={0:}'.format(video_id) + webpage = self._download_webpage(url, video_id) + + escaped_data = self._search_regex( + 'unescape\("([^"]+)"\)', webpage, 'escaped data') + playlist = compat_urllib_parse.unquote(escaped_data) + + thumbnail = self._search_regex( + r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False) + url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL') + title = self._html_search_regex( + r'(.*?)', webpage, 'title') + if title: + title = remove_start(title, 'VideoMega.tv - ') + + formats = [] + formats.append({ + 'format_id': 'sd', + 'url': url, + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } From 0e59b9fffb12255a16577dca7710b7738feca75c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 18 Sep 2014 00:18:27 +0200 Subject: [PATCH 014/652] [videomega] Simplify (#3786) * Use raw strings (r'foo') for regular expressions (enables highlighting and avoids some errors). 
* title is always true-ish --- youtube_dl/extractor/videomega.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/videomega.py b/youtube_dl/extractor/videomega.py index 1b6b65839..29c4e0101 100644 --- a/youtube_dl/extractor/videomega.py +++ b/youtube_dl/extractor/videomega.py @@ -34,22 +34,20 @@ class VideoMegaIE(InfoExtractor): webpage = self._download_webpage(url, video_id) escaped_data = self._search_regex( - 'unescape\("([^"]+)"\)', webpage, 'escaped data') + r'unescape\("([^"]+)"\)', webpage, 'escaped data') playlist = compat_urllib_parse.unquote(escaped_data) thumbnail = self._search_regex( r'image:\s*"([^"]+)"', playlist, 'thumbnail', fatal=False) url = self._search_regex(r'file:\s*"([^"]+)"', playlist, 'URL') - title = self._html_search_regex( - r'(.*?)', webpage, 'title') - if title: - title = remove_start(title, 'VideoMega.tv - ') + title = remove_start(self._html_search_regex( + r'(.*?)', webpage, 'title'), 'VideoMega.tv - ') - formats = [] - formats.append({ + formats = [{ 'format_id': 'sd', 'url': url, - }) + }] + self._sort_formats(formats) return { 'id': video_id, From 9296738f20c1335498a78c99a86767e9bae4f6d2 Mon Sep 17 00:00:00 2001 From: dequis Date: Thu, 18 Sep 2014 03:02:03 -0300 Subject: [PATCH 015/652] [soundcloud] Support api urls with secret_token, Closes #3707 --- youtube_dl/extractor/soundcloud.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index b78aed7f0..129f587ec 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -31,7 +31,8 @@ class SoundcloudIE(InfoExtractor): (?!sets/|likes/?(?:$|[?#])) (?P[\w\d-]+)/? (?P<token>[^?]+?)?(?:[?].*)?$) - |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)) + |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+) + (?:/?\?secret_token=(?P<secret_token>[^&]+?))?$) |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*) ) ''' @@ -80,6 +81,20 @@ class SoundcloudIE(InfoExtractor): 'duration': 9, }, }, + # private link (alt format) + { + 'url': 'https://api.soundcloud.com/tracks/123998367?secret_token=s-8Pjrp', + 'md5': 'aa0dd32bfea9b0c5ef4f02aacd080604', + 'info_dict': { + 'id': '123998367', + 'ext': 'mp3', + 'title': 'Youtube - Dl Test Video \'\' Ä↭', + 'uploader': 'jaimeMF', + 'description': 'test chars: \"\'/\\ä↭', + 'upload_date': '20131209', + 'duration': 9, + }, + }, # downloadable song { 'url': 'https://soundcloud.com/oddsamples/bus-brakes', @@ -197,6 +212,9 @@ class SoundcloudIE(InfoExtractor): if track_id is not None: info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID full_title = track_id + token = mobj.group('secret_token') + if token: + info_json_url += "&secret_token=" + token elif mobj.group('player'): query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) return self.url_result(query['url'][0]) From 2f834e938192a61fd4a32fa98bffb5e1b614bc29 Mon Sep 17 00:00:00 2001 From: dequis <dx@dxzone.com.ar> Date: Thu, 18 Sep 2014 06:35:11 -0300 Subject: [PATCH 016/652] [soundcloud] Secret playlists and sets Closes #3707 again. 
No test cases because I don't know what urls to use that won't be turned into public eventually (as it happened with the first one in that ticket) --- youtube_dl/extractor/soundcloud.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 129f587ec..2bed3c350 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -238,7 +238,7 @@ class SoundcloudIE(InfoExtractor): class SoundcloudSetIE(SoundcloudIE): - _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)' + _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?' IE_NAME = 'soundcloud:set' _TESTS = [{ 'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep', @@ -252,14 +252,19 @@ class SoundcloudSetIE(SoundcloudIE): mobj = re.match(self._VALID_URL, url) # extract uploader (which is in the url) - uploader = mobj.group(1) + uploader = mobj.group('uploader') # extract simple title (uploader + slug of song title) - slug_title = mobj.group(2) + slug_title = mobj.group('slug_title') full_title = '%s/sets/%s' % (uploader, slug_title) + url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) + + token = mobj.group('token') + if token: + full_title += '/' + token + url += '/' + token self.report_resolve(full_title) - url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) resolv_url = self._resolv_url(url) info = self._download_json(resolv_url, full_title) @@ -270,7 +275,7 @@ class SoundcloudSetIE(SoundcloudIE): return { '_type': 'playlist', - 'entries': [self._extract_info_dict(track) for track in info['tracks']], + 'entries': [self._extract_info_dict(track, secret_token=token) for track in info['tracks']], 'id': info['id'], 'title': info['title'], } @@ -333,7 +338,7 @@ class SoundcloudUserIE(SoundcloudIE): class SoundcloudPlaylistIE(SoundcloudIE): - _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)' + _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))$' IE_NAME = 'soundcloud:playlist' _TESTS = [ @@ -353,14 +358,21 @@ class SoundcloudPlaylistIE(SoundcloudIE): playlist_id = mobj.group('id') base_url = '%s//api.soundcloud.com/playlists/%s.json?' 
% (self.http_scheme(), playlist_id) - data = compat_urllib_parse.urlencode({ + data_dict = { 'client_id': self._CLIENT_ID, - }) + } + token = mobj.group('token') + + if token: + data_dict['secret_token'] = token + + data = compat_urllib_parse.urlencode(data_dict) data = self._download_json( base_url + data, playlist_id, 'Downloading playlist') entries = [ - self._extract_info_dict(t, quiet=True) for t in data['tracks']] + self._extract_info_dict(t, quiet=True, secret_token=token) + for t in data['tracks']] return { '_type': 'playlist', From 2914e5f00f6ebcc59712b7091a87988408ff3c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 18 Sep 2014 20:56:54 +0700 Subject: [PATCH 017/652] [drtuber] Fix categories --- youtube_dl/extractor/drtuber.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index d5bfd7f22..ca274dff6 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -19,7 +19,7 @@ class DrTuberIE(InfoExtractor): 'like_count': int, 'dislike_count': int, 'comment_count': int, - 'categories': list, # NSFW + 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], 'thumbnail': 're:https?://.*\.jpg$', 'age_limit': 18, } @@ -52,9 +52,9 @@ class DrTuberIE(InfoExtractor): r'<span class="comments_count">([\d,\.]+)</span>', webpage, 'comment count', fatal=False)) - cats_str = self._html_search_regex( - r'<meta name="keywords" content="([^"]+)"', webpage, 'categories', fatal=False) - categories = None if cats_str is None else cats_str.split(' ') + cats_str = self._search_regex( + r'<span>Categories:</span><div>(.+?)</div>', webpage, 'categories', fatal=False) + categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) return { 'id': video_id, From 109a540e7a4c5741fa77b68b4f346f42dc1cda97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 18 Sep 2014 16:57:34 +0200 Subject: [PATCH 018/652] [ign] Fix extraction --- youtube_dl/extractor/ign.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 9e8b69f57..ac7804ad9 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -71,6 +71,7 @@ class IGNIE(InfoExtractor): def _find_video_id(self, webpage): res_id = [ + r'"video_id"\s*:\s*"(.*?)"', r'data-video-id="(.+?)"', r'<object id="vid_(.+?)"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', @@ -85,7 +86,7 @@ class IGNIE(InfoExtractor): webpage = self._download_webpage(url, name_or_id) if page_type != 'video': multiple_urls = re.findall( - '<param name="flashvars" value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', + '<param name="flashvars"[^>]*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: return [self.url_result(u, ie='IGN') for u in multiple_urls] From 09b23c902b5ab4a4ca9607128128d110a3c41875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 18 Sep 2014 17:02:53 +0200 Subject: [PATCH 019/652] [1up.com] Urls end now with '.html' --- youtube_dl/extractor/ign.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index ac7804ad9..12e9e61c4 100644 --- a/youtube_dl/extractor/ign.py +++ 
b/youtube_dl/extractor/ign.py @@ -112,13 +112,13 @@ class IGNIE(InfoExtractor): class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)' + _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' IE_NAME = '1up.com' _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976', + 'url': 'http://gamevideos.1up.com/video/id/34976.html', 'md5': '68a54ce4ebc772e4b71e3123d413163d', 'info_dict': { 'id': '34976', From e2e5dae64da60c37af65c7cffd18475a30fcbad3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 18:40:19 +0200 Subject: [PATCH 020/652] Add -f m4a --- youtube_dl/YoutubeDL.py | 2 +- youtube_dl/options.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 9519594c9..eaba40bf2 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -708,7 +708,7 @@ class YoutubeDL(object): if video_formats: return video_formats[0] else: - extensions = ['mp4', 'flv', 'webm', '3gp'] + extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a'] if format_spec in extensions: filter_f = lambda f: f['ext'] == format_spec else: diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 31baab469..7df20ae61 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -218,7 +218,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default=None, - help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.') + help='video format code, specify the order of preference using slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". 
By default, youtube-dl will pick the best quality.') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From 1de33fafd94c7e0d4ccede711ef7f13bd3e2301b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 18:43:49 +0200 Subject: [PATCH 021/652] [YoutubeDL] Allow downloading multiple formats with , --- youtube_dl/YoutubeDL.py | 43 +++++++++++++++++++++-------------------- youtube_dl/options.py | 2 +- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index eaba40bf2..a1713dc5a 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -809,28 +809,29 @@ class YoutubeDL(object): if req_format in ('-1', 'all'): formats_to_download = formats else: - # We can accept formats requested in the format: 34/5/best, we pick - # the first that is available, starting from left - req_formats = req_format.split('/') - for rf in req_formats: - if re.match(r'.+?\+.+?', rf) is not None: - # Two formats have been requested like '137+139' - format_1, format_2 = rf.split('+') - formats_info = (self.select_format(format_1, formats), - self.select_format(format_2, formats)) - if all(formats_info): - selected_format = { - 'requested_formats': formats_info, - 'format': rf, - 'ext': formats_info[0]['ext'], - } + for rfstr in req_format.split(','): + # We can accept formats requested in the format: 34/5/best, we pick + # the first that is available, starting from left + req_formats = rfstr.split('/') + for rf in req_formats: + if re.match(r'.+?\+.+?', rf) is not None: + # Two formats have been requested like '137+139' + format_1, format_2 = rf.split('+') + formats_info = (self.select_format(format_1, formats), + self.select_format(format_2, formats)) + if all(formats_info): + selected_format = { + 'requested_formats': formats_info, + 'format': rf, + 'ext': formats_info[0]['ext'], + } + else: + selected_format = None else: - selected_format = None - else: - selected_format = self.select_format(rf, formats) - if selected_format is not None: - formats_to_download = [selected_format] - break + selected_format = self.select_format(rf, formats) + if selected_format is not None: + formats_to_download.append(selected_format) + break if not formats_to_download: raise ExtractorError('requested format not available', expected=True) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 7df20ae61..44dcb1e34 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -218,7 +218,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default=None, - help='video format code, specify the order of preference using slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.') + help='video format code, specify the order of preference using slashes: -f 22/17/18 . -f mp4 , -f m4a and -f flv are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality. 
Use commas to download multiple audio formats, such as -f 136/137/mp4/bestvideo,140/m4a/bestaudio') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', From fd78a4d3e63f191e0774584d9b71bf25a2d8dbcf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 18:43:59 +0200 Subject: [PATCH 022/652] release 2014.09.18 --- README.md | 15 +++++++++------ youtube_dl/version.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5cc959ac5..5d15decb5 100644 --- a/README.md +++ b/README.md @@ -227,12 +227,15 @@ which means you can modify it, redistribute it or use it however you like. ## Video Format Options: -f, --format FORMAT video format code, specify the order of - preference using slashes: "-f 22/17/18". - "-f mp4" and "-f flv" are also supported. - You can also use the special names "best", - "bestvideo", "bestaudio", "worst", - "worstvideo" and "worstaudio". By default, - youtube-dl will pick the best quality. + preference using slashes: -f 22/17/18 . -f + mp4 , -f m4a and -f flv are also + supported. You can also use the special + names "best", "bestvideo", "bestaudio", + "worst", "worstvideo" and "worstaudio". By + default, youtube-dl will pick the best + quality. Use commas to download multiple + audio formats, such as -f + 136/137/mp4/bestvideo,140/m4a/bestaudio --all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 23892a8bd..430509ba3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.16.1' +__version__ = '2014.09.18' From 0529eef5a4513d8f3c042f09fe5485e1c41e2f08 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 18:54:03 +0200 Subject: [PATCH 023/652] [hypestat] Unify allmyvideos and vidspot (Closes #3788) --- youtube_dl/extractor/__init__.py | 2 +- .../extractor/{allmyvideos.py => hypestat.py} | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) rename youtube_dl/extractor/{allmyvideos.py => hypestat.py} (77%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 75831b40a..97693018f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,7 +6,6 @@ from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE from .anysex import AnySexIE from .aol import AolIE -from .allmyvideos import AllmyvideosIE from .allocine import AllocineIE from .aparat import AparatIE from .appletrailers import AppleTrailersIE @@ -151,6 +150,7 @@ from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE +from .hypestat import HypestatIE from .iconosquare import IconosquareIE from .ign import IGNIE, OneUPIE from .imdb import ( diff --git a/youtube_dl/extractor/allmyvideos.py b/youtube_dl/extractor/hypestat.py similarity index 77% rename from youtube_dl/extractor/allmyvideos.py rename to youtube_dl/extractor/hypestat.py index e6c60e7e4..8b8db30ae 100644 --- a/youtube_dl/extractor/allmyvideos.py +++ b/youtube_dl/extractor/hypestat.py @@ -11,11 +11,11 @@ from ..utils import ( ) -class AllmyvideosIE(InfoExtractor): - IE_NAME = 'allmyvideos.net' - _VALID_URL = 
r'https?://allmyvideos\.net/(?P<id>[a-zA-Z0-9_-]+)' +class HypestatIE(InfoExtractor): + IE_DESC = 'allmyvideos.net and vidspot.net' + _VALID_URL = r'https?://(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)' - _TEST = { + _TESTS = [{ 'url': 'http://allmyvideos.net/jih3nce3x6wn', 'md5': '710883dee1bfc370ecf9fa6a89307c88', 'info_dict': { @@ -23,7 +23,15 @@ class AllmyvideosIE(InfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl test video', }, - } + }, { + 'url': 'http://vidspot.net/l2ngsmhs8ci5', + 'md5': '710883dee1bfc370ecf9fa6a89307c88', + 'info_dict': { + 'id': 'l2ngsmhs8ci5', + 'ext': 'mp4', + 'title': 'youtube-dl test video', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 37bfe8ace4dcd1b476a54aedb7f39b88e7bb527e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 18:56:02 +0200 Subject: [PATCH 024/652] [hypestat] Match URLs with www. and https:// --- youtube_dl/extractor/hypestat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hypestat.py b/youtube_dl/extractor/hypestat.py index 8b8db30ae..e1a142268 100644 --- a/youtube_dl/extractor/hypestat.py +++ b/youtube_dl/extractor/hypestat.py @@ -13,7 +13,7 @@ from ..utils import ( class HypestatIE(InfoExtractor): IE_DESC = 'allmyvideos.net and vidspot.net' - _VALID_URL = r'https?://(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)' _TESTS = [{ 'url': 'http://allmyvideos.net/jih3nce3x6wn', @@ -31,6 +31,9 @@ class HypestatIE(InfoExtractor): 'ext': 'mp4', 'title': 'youtube-dl test video', }, + }, { + 'url': 'https://www.vidspot.net/l2ngsmhs8ci5', + 'only_matching': True, }] def _real_extract(self, url): From 46f74bcf5c5fc876e3a966408cb8bde6d6ef15e0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 18:57:04 +0200 Subject: [PATCH 025/652] [soundcloud] Fix non-secret playlists --- youtube_dl/extractor/soundcloud.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 2bed3c350..4719ba45c 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -338,20 +338,17 @@ class SoundcloudUserIE(SoundcloudIE): class SoundcloudPlaylistIE(SoundcloudIE): - _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))$' + _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$' IE_NAME = 'soundcloud:playlist' - _TESTS = [ - - { - 'url': 'http://api.soundcloud.com/playlists/4110309', - 'info_dict': { - 'id': '4110309', - 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', - 'description': 're:.*?TILT Brass - Bowery Poetry Club', - }, - 'playlist_count': 6, - } - ] + _TESTS = [{ + 'url': 'http://api.soundcloud.com/playlists/4110309', + 'info_dict': { + 'id': '4110309', + 'title': 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]', + 'description': 're:.*?TILT Brass - Bowery Poetry Club', + }, + 'playlist_count': 6, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 589d3d7c7ae18875060caa15f5547c0194932e55 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 18 Sep 2014 21:37:09 +0200 Subject: [PATCH 026/652] [moniker] rename from hypestat (#3788) --- 
youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/{hypestat.py => moniker.py} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename youtube_dl/extractor/{hypestat.py => moniker.py} (98%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 97693018f..a9a33c40f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -150,7 +150,6 @@ from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE -from .hypestat import HypestatIE from .iconosquare import IconosquareIE from .ign import IGNIE, OneUPIE from .imdb import ( @@ -209,6 +208,7 @@ from .mpora import MporaIE from .moevideo import MoeVideoIE from .mofosex import MofosexIE from .mojvideo import MojvideoIE +from .moniker import MonikerIE from .mooshare import MooshareIE from .morningstar import MorningstarIE from .motherless import MotherlessIE diff --git a/youtube_dl/extractor/hypestat.py b/youtube_dl/extractor/moniker.py similarity index 98% rename from youtube_dl/extractor/hypestat.py rename to youtube_dl/extractor/moniker.py index e1a142268..79bb2ca59 100644 --- a/youtube_dl/extractor/hypestat.py +++ b/youtube_dl/extractor/moniker.py @@ -11,7 +11,7 @@ from ..utils import ( ) -class HypestatIE(InfoExtractor): +class MonikerIE(InfoExtractor): IE_DESC = 'allmyvideos.net and vidspot.net' _VALID_URL = r'https?://(?:www\.)?(?:allmyvideos|vidspot)\.net/(?P<id>[a-zA-Z0-9_-]+)' From 7267bd536fb81cb1bdcc6554219a0b66a75b31a6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 19 Sep 2014 09:57:53 +0200 Subject: [PATCH 027/652] [muenchentv] Add support (Fixes #3507) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/common.py | 2 + youtube_dl/extractor/muenchentv.py | 77 ++++++++++++++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100644 youtube_dl/extractor/muenchentv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a9a33c40f..625666acb 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -221,6 +221,7 @@ from .mtv import ( MTVServicesEmbeddedIE, MTVIggyIE, ) +from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE from .musicvault import MusicVaultIE from .muzu import MuzuTVIE diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 929dd1e97..9c30a1d33 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -130,6 +130,8 @@ class InfoExtractor(object): by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + is_live: True, False, or None (=unknown). Whether this video is a + live stream that goes on instead of a fixed-length video. Unless mentioned otherwise, the fields should be Unicode strings. 
diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py new file mode 100644 index 000000000..3a938861b --- /dev/null +++ b/youtube_dl/extractor/muenchentv.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import datetime +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) + + +class MuenchenTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?muenchen\.tv/livestream' + IE_DESC = 'münchen.tv' + _TEST = { + 'url': 'http://www.muenchen.tv/livestream/', + 'info_dict': { + 'id': '5334', + 'display_id': 'live', + 'ext': 'mp4', + 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + display_id = 'live' + webpage = self._download_webpage(url, display_id) + + now = datetime.datetime.now() + now_str = now.strftime("%Y-%m-%d %H:%M") + title = self._og_search_title(webpage) + ' ' + now_str + + data_js = self._search_regex( + r'(?s)\nplaylist:\s*(\[.*?}\]),related:', + webpage, 'playlist configuration') + data_json = js_to_json(data_js) + data = json.loads(data_json)[0] + + video_id = data['mediaid'] + thumbnail = data.get('image') + + formats = [] + for format_num, s in enumerate(data['sources']): + ext = determine_ext(s['file'], None) + label_str = s.get('label') + if label_str is None: + label_str = '_%d' % format_num + + if ext is None: + format_id = label_str + else: + format_id = '%s-%s' % (ext, label_str) + + formats.append({ + 'url': s['file'], + 'tbr': int_or_none(s.get('label')), + 'ext': 'mp4', + 'format_id': format_id, + 'preference': -100 if '.smil' in s['file'] else 0, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } + From f566d9f1d54a61497a17c5ed62a32ee1387483bd Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 19 Sep 2014 09:58:01 +0200 Subject: [PATCH 028/652] release 2014.09.19 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 430509ba3..940e9c8cf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.18' +__version__ = '2014.09.19' From 532f5bff70cc32f54f38fbce9233a88faf4423b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 19 Sep 2014 20:58:50 +0700 Subject: [PATCH 029/652] [franceinter] Fix extraction and modernize --- youtube_dl/extractor/franceinter.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index deb1b0b9d..6613ee17a 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -4,16 +4,21 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import int_or_none class FranceInterIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P<id>[0-9]{6})' + _VALID_URL = r'http://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', - 'file': '793962.mp3', 'md5': '4764932e466e6f6c79c317d2e74f6884', "info_dict": { - "title": "L’Histoire dans les jeux vidéo", + 'id': 
'793962', + 'ext': 'mp3', + 'title': 'L’Histoire dans les jeux vidéo', + 'description': 'md5:7e93ddb4451e7530022792240a3049c7', + 'timestamp': 1387369800, + 'upload_date': '20131218', }, } @@ -22,17 +27,26 @@ class FranceInterIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<span class="roll_overflow">(.*?)</span></h1>', webpage, 'title') + path = self._search_regex( - r'&urlAOD=(.*?)&startTime', webpage, 'video url') + r'<a id="player".+?href="([^"]+)"', webpage, 'video url') video_url = 'http://www.franceinter.fr/' + path + title = self._html_search_regex( + r'<span class="title">(.+?)</span>', webpage, 'title') + description = self._html_search_regex( + r'<span class="description">(.*?)</span>', + webpage, 'description', fatal=False) + timestamp = int_or_none(self._search_regex( + r'data-date="(\d+)"', webpage, 'upload date', fatal=False)) + return { 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, 'formats': [{ 'url': video_url, 'vcodec': 'none', }], - 'title': title, } From 5aa38e75b27b428b67f9f7083c44051881c98fd8 Mon Sep 17 00:00:00 2001 From: Carlos Ramos <carlos.ramos1@alu.uclm.es> Date: Fri, 19 Sep 2014 22:46:57 +0200 Subject: [PATCH 030/652] [played] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/played.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 youtube_dl/extractor/played.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 625666acb..9ee3f9190 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -266,6 +266,7 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .played import PlayedIE from .playfm import PlayFMIE from .playvid import PlayvidIE from .podomatic import PodomaticIE diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py new file mode 100644 index 000000000..a396e62e5 --- /dev/null +++ b/youtube_dl/extractor/played.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import time +import os.path + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, +) + + +class PlayedIE(InfoExtractor): + IE_NAME = 'played.to' + _VALID_URL = r'https?://played\.to/(?P<id>[a-zA-Z0-9_-]+)' + + _TEST = { + 'url': 'http://played.to/j2f2sfiiukgt', + 'md5': 'c2bd75a368e82980e7257bf500c00637', + 'info_dict': { + 'id': 'j2f2sfiiukgt', + 'ext': 'flv', + 'title': 'youtube-dl_test_video.mp4', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + orig_webpage = self._download_webpage(url, video_id) + fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) + data = dict(fields) + + self.to_screen('%s: Waiting for timeout' % video_id) + time.sleep(2) + + post = compat_urllib_parse.urlencode(data) + headers = { + b'Content-Type': b'application/x-www-form-urlencoded', + } + req = compat_urllib_request.Request(url, post, headers) + webpage = self._download_webpage( + req, video_id, note='Downloading video page ...') + + title = os.path.splitext(data['fname'])[0] + + video_url = self._search_regex( + r'file: "?(.+?)",', webpage, 'video URL') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } \ No newline 
at end of file From 746c67d72f760f2805dbc125e5a3863aa0d569e3 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sat, 20 Sep 2014 03:02:11 +0300 Subject: [PATCH 031/652] [wistia] Use API and make more generic --- youtube_dl/extractor/generic.py | 23 +++++++++++++++++++++++ youtube_dl/extractor/wistia.py | 15 +++++++++------ 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40eeaad16..2d77f604a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -382,6 +382,19 @@ class GenericIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', }, }, + # Wistia embed + { + 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'info_dict': { + 'id': '1cfaf6b7ea', + 'ext': 'mov', + 'title': 'md5:51364a8d3d009997ba99656004b5e20d', + 'duration': 643.0, + 'filesize': 182808282, + 'uploader': 'education-portal.com', + }, + }, ] def report_download_webpage(self, video_id): @@ -654,6 +667,16 @@ class GenericIE(InfoExtractor): 'title': video_title, 'id': video_id, } + match = re.search(r'(?:id=["\']wistia_|data-wistiaid=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + if match: + return { + '_type': 'url_transparent', + 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), + 'ie_key': 'Wistia', + 'uploader': video_uploader, + 'title': video_title, + 'id': match.group('id') + } # Look for embedded blip.tv player mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index e6bfa9e14..748443f81 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,13 +1,14 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor +from ..utils import ExtractorError, compat_urllib_request class WistiaIE(InfoExtractor): _VALID_URL = r'https?://(?:fast\.)?wistia\.net/embed/iframe/(?P<id>[a-z0-9]+)' + _API_URL = 'http://fast.wistia.com/embed/medias/{0:}.json' _TEST = { 'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt', @@ -24,11 +25,13 @@ class WistiaIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - data_json = self._html_search_regex( - r'Wistia\.iframeInit\((.*?), {}\);', webpage, 'video data') - - data = json.loads(data_json) + request = compat_urllib_request.Request(self._API_URL.format(video_id)) + request.add_header('Referer', url) # Some videos require this. 
+ data_json = self._download_json(request, video_id) + if data_json.get('error'): + raise ExtractorError('Error while getting the playlist', + expected=True) + data = data_json['media'] formats = [] thumbnails = [] From 3e8fcd9fa1ae23ee3f0370dd948411a5f74c03dc Mon Sep 17 00:00:00 2001 From: Marco Schuster <marco+github@m-s-d.eu> Date: Sat, 20 Sep 2014 02:32:41 +0200 Subject: [PATCH 032/652] [divxstage] added .to TLD Example video "http://www.divxstage.eu/video/930c52709d2" which gets redirected to .to TLD --- youtube_dl/extractor/divxstage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/divxstage.py b/youtube_dl/extractor/divxstage.py index 4ca3f37a2..b88379e06 100644 --- a/youtube_dl/extractor/divxstage.py +++ b/youtube_dl/extractor/divxstage.py @@ -7,7 +7,7 @@ class DivxStageIE(NovaMovIE): IE_NAME = 'divxstage' IE_DESC = 'DivxStage' - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag)'} + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'divxstage\.(?:eu|net|ch|co|at|ag|to)'} _HOST = 'www.divxstage.eu' @@ -24,4 +24,4 @@ class DivxStageIE(NovaMovIE): 'title': 'youtubedl test video', 'description': 'This is a test video for youtubedl.', } - } \ No newline at end of file + } From 752297631ffd9a51535e650f4444a36f820f01f4 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Sun, 21 Sep 2014 06:20:42 +0700 Subject: [PATCH 033/652] [noco] Adapt to API v1.1 (Closes #3797) --- youtube_dl/extractor/noco.py | 73 ++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index 959fdf590..e3ec9ed15 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import re +import time +import hashlib from .common import InfoExtractor from ..utils import ( @@ -17,6 +19,7 @@ from ..utils import ( class NocoIE(InfoExtractor): _VALID_URL = r'http://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' _LOGIN_URL = 'http://noco.tv/do.php' + _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' _NETRC_MACHINE = 'noco' _TEST = { @@ -55,33 +58,52 @@ class NocoIE(InfoExtractor): login = self._download_json(request, None, 'Logging in as %s' % username) if 'erreur' in login: - raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) + + def _call_api(self, path, video_id, note): + ts = compat_str(int(time.time() * 1000)) + tk = hashlib.md5(hashlib.md5(ts).hexdigest() + '#8S?uCraTedap6a').hexdigest() + url = self._API_URL_TEMPLATE % (path, ts, tk) + + resp = self._download_json(url, video_id, note) + + if isinstance(resp, dict) and resp.get('error'): + self._raise_error(resp['error'], resp['description']) + + return resp + + def _raise_error(self, error, description): + raise ExtractorError( + '%s returned error: %s - %s' % (self.IE_NAME, error, description), + expected=True) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - medias = self._download_json( - 'https://api.noco.tv/1.0/video/medias/%s' % video_id, video_id, 'Downloading video JSON') + medias = self._call_api( + 'shows/%s/medias' % video_id, + video_id, 'Downloading video JSON') + + qualities = self._call_api( + 'qualities', + video_id, 'Downloading qualities JSON') formats = [] - for fmt in 
medias['fr']['video_list']['default']['quality_list']: - format_id = fmt['quality_key'] + for format_id, fmt in medias['fr']['video_list']['none']['quality_list'].items(): - file = self._download_json( - 'https://api.noco.tv/1.0/video/file/%s/fr/%s' % (format_id.lower(), video_id), + video = self._call_api( + 'shows/%s/video/%s/fr' % (video_id, format_id.lower()), video_id, 'Downloading %s video JSON' % format_id) - file_url = file['file'] + file_url = video['file'] if not file_url: continue - if file_url == 'forbidden': - raise ExtractorError( - '%s returned error: %s - %s' % ( - self.IE_NAME, file['popmessage']['title'], file['popmessage']['message']), - expected=True) + if file_url in ['forbidden', 'not found']: + popmessage = video['popmessage'] + self._raise_error(popmessage['title'], popmessage['message']) formats.append({ 'url': file_url, @@ -91,20 +113,31 @@ class NocoIE(InfoExtractor): 'abr': fmt['audiobitrate'], 'vbr': fmt['videobitrate'], 'filesize': fmt['filesize'], - 'format_note': fmt['quality_name'], - 'preference': fmt['priority'], + 'format_note': qualities[format_id]['quality_name'], + 'preference': qualities[format_id]['priority'], }) self._sort_formats(formats) - show = self._download_json( - 'https://api.noco.tv/1.0/shows/show/%s' % video_id, video_id, 'Downloading show JSON')[0] + show = self._call_api( + 'shows/by_id/%s' % video_id, + video_id, 'Downloading show JSON')[0] - upload_date = unified_strdate(show['indexed']) + upload_date = unified_strdate(show['online_date_start_utc']) uploader = show['partner_name'] uploader_id = show['partner_key'] duration = show['duration_ms'] / 1000.0 - thumbnail = show['screenshot'] + + thumbnails = [] + for thumbnail_key, thumbnail_url in show.items(): + m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key) + if not m: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int(m.group('width')), + 'height': int(m.group('height')), + }) episode = show.get('show_TT') or show.get('show_OT') family = show.get('family_TT') or show.get('family_OT') @@ -124,7 +157,7 @@ class NocoIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, 'upload_date': upload_date, 'uploader': uploader, 'uploader_id': uploader_id, From 58e7071a2ced491a6ecd8a8bcb1b4533a2b0fd8f Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Sun, 21 Sep 2014 06:37:11 +0700 Subject: [PATCH 034/652] [tube8] Improve _VALID_URL and add display_id --- youtube_dl/extractor/tube8.py | 37 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 39f20c546..64a1e9030 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -14,27 +14,35 @@ from ..aes import aes_decrypt_text class Tube8IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:gay/|shemale/)?(?:[^/]+/){2}(?P<id>\d+)' - _TEST = { - 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', - 'md5': '44bf12b98313827dd52d35b8706a4ea0', - 'info_dict': { - 'id': '229795', - 'ext': 'mp4', - 'description': 'hot teen Kasia grinding', - 'uploader': 'unknown', - 'title': 'Kasia music video', - 'age_limit': 18, - } - } + _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', + 'md5': '44bf12b98313827dd52d35b8706a4ea0', + 'info_dict': { + 'id': 
'229795', + 'display_id': 'kasia-music-video', + 'ext': 'mp4', + 'description': 'hot teen Kasia grinding', + 'uploader': 'unknown', + 'title': 'Kasia music video', + 'age_limit': 18, + } + }, + { + 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + display_id = mobj.group('display_id') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + webpage = self._download_webpage(req, display_id) flashvars = json.loads(self._html_search_regex( r'var flashvars\s*=\s*({.+?})', webpage, 'flashvars')) @@ -70,6 +78,7 @@ class Tube8IE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'url': video_url, 'title': title, 'description': description, From 522c55b7f2622b6138a2781db362d822b4fed32d Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sun, 21 Sep 2014 03:26:12 +0300 Subject: [PATCH 035/652] [mgoon] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/mgoon.py | 87 ++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 youtube_dl/extractor/mgoon.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 625666acb..fb546eeae 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -199,6 +199,7 @@ from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE +from .mgoon import MgoonIE from .ministrygrid import MinistryGridIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE diff --git a/youtube_dl/extractor/mgoon.py b/youtube_dl/extractor/mgoon.py new file mode 100644 index 000000000..94bc87b00 --- /dev/null +++ b/youtube_dl/extractor/mgoon.py @@ -0,0 +1,87 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + qualities, + unified_strdate, +) + + +class MgoonIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? 
+ (?:(:?m\.)?mgoon\.com/(?:ch/(?:.+)/v|play/view)| + video\.mgoon\.com)/(?P<id>[0-9]+)''' + _API_URL = 'http://mpos.mgoon.com/player/video?id={0:}' + _TESTS = [ + { + 'url': 'http://m.mgoon.com/ch/hi6618/v/5582148', + 'md5': 'dd46bb66ab35cf6d51cc812fd82da79d', + 'info_dict': { + 'id': '5582148', + 'uploader_id': 'hi6618', + 'duration': 240.419, + 'upload_date': '20131220', + 'ext': 'mp4', + 'title': 'md5:543aa4c27a4931d371c3f433e8cebebc', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://www.mgoon.com/play/view/5582148', + 'only_matching': True, + }, + { + 'url': 'http://video.mgoon.com/5582148', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + data = self._download_json(self._API_URL.format(video_id), video_id) + + if data.get('errorInfo', {}).get('code') != 'NONE': + raise ExtractorError('%s encountered an error: %s' % ( + self.IE_NAME, data['errorInfo']['message']), expected=True) + + v_info = data['videoInfo'] + title = v_info.get('v_title') + thumbnail = v_info.get('v_thumbnail') + duration = v_info.get('v_duration') + upload_date = unified_strdate(v_info.get('v_reg_date')) + uploader_id = data.get('userInfo', {}).get('u_alias') + if duration: + duration /= 1000.0 + + age_limit = None + if data.get('accessInfo', {}).get('code') == 'VIDEO_STATUS_ADULT': + age_limit = 18 + + formats = [] + get_quality = qualities(['360p', '480p', '720p', '1080p']) + for fmt in data['videoFiles']: + formats.append({ + 'format_id': fmt['label'], + 'quality': get_quality(fmt['label']), + 'url': fmt['url'], + 'ext': fmt['format'], + + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'duration': duration, + 'upload_date': upload_date, + 'uploader_id': uploader_id, + 'age_limit': age_limit, + } From 72e450c5550ae26b5b36216be1c001f64479c773 Mon Sep 17 00:00:00 2001 From: Anton Larionov <diffident.cat@gmail.com> Date: Sun, 21 Sep 2014 13:21:29 +0400 Subject: [PATCH 036/652] [thvideo] Add support for THVideo --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/thvideo.py | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 youtube_dl/extractor/thvideo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb546eeae..ae5b4e9e6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -362,6 +362,7 @@ from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE from .tnaflix import TNAFlixIE +from .thvideo import THVideoIE from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py new file mode 100644 index 000000000..9fa14d3c4 --- /dev/null +++ b/youtube_dl/extractor/thvideo.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate +) + + +class THVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/v/th(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://thvideo.tv/v/th1987/', + 'md5': 'fa107b1f73817e325e9433505a70db50', + 'info_dict': { + 'id': '1987', + 'ext': 'mp4', + 'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览', + 'display_id': 'th1987', + 'thumbnail': 
'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg', + 'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...', + 'upload_date': '20140722' + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # extract download link from mobile player page + webpage_player = self._download_webpage('http://thvideo.tv/mobile.php?cid=%s-0' % video_id, video_id) + video_url = self._html_search_regex(r'<source src="(.*?)" type', webpage_player, 'video url') + + # extract video info from main page + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + display_id = 'th%s' % video_id + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) + upload_date_raw = self._html_search_regex(r'span itemprop="datePublished" content="(.*?)">', webpage, + 'upload date', fatal=False) + upload_date = unified_strdate(upload_date_raw) + + return { + 'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'description': description, + 'upload_date': upload_date + } \ No newline at end of file From 7bd4b4229a126a9f47035beec8a13eff08804850 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 21 Sep 2014 13:40:22 +0200 Subject: [PATCH 037/652] [dropbox] Recognize 'https://www.dropbox.com/sh/*' urls (fixes #3795) And extract the title from the url last path component. --- youtube_dl/extractor/dropbox.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/dropbox.py b/youtube_dl/extractor/dropbox.py index 1e1763abf..817a9bd61 100644 --- a/youtube_dl/extractor/dropbox.py +++ b/youtube_dl/extractor/dropbox.py @@ -5,24 +5,29 @@ import os.path import re from .common import InfoExtractor -from ..utils import compat_urllib_parse_unquote +from ..utils import compat_urllib_parse_unquote, url_basename class DropboxIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/s/(?P<id>[a-zA-Z0-9]{15})/(?P<title>[^?#]*)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dropbox[.]com/sh?/(?P<id>[a-zA-Z0-9]{15})/.*' + _TESTS = [{ 'url': 'https://www.dropbox.com/s/nelirfsxnmcfbfh/youtube-dl%20test%20video%20%27%C3%A4%22BaW_jenozKc.mp4?dl=0', 'info_dict': { 'id': 'nelirfsxnmcfbfh', 'ext': 'mp4', 'title': 'youtube-dl test video \'ä"BaW_jenozKc' } - } + }, + { + 'url': 'https://www.dropbox.com/sh/662glsejgzoj9sr/AAByil3FGH9KFNZ13e08eSa1a/Pregame%20Ceremony%20Program%20PA%2020140518.m4v', + 'only_matching': True, + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - fn = compat_urllib_parse_unquote(mobj.group('title')) + fn = compat_urllib_parse_unquote(url_basename(url)) title = os.path.splitext(fn)[0] video_url = ( re.sub(r'[?&]dl=0', '', url) + From b28c8403b2c1ef51f04520e8116176b1fee12dcb Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sun, 21 Sep 2014 15:13:35 +0300 Subject: [PATCH 038/652] [yourupload] Add new extractor. 
Fixes #3085 --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/yourupload.py | 58 ++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 youtube_dl/extractor/yourupload.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index fb546eeae..1a6033320 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -448,6 +448,7 @@ from .yahoo import ( from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE +from .yourupload import YourUploadIE from .youtube import ( YoutubeIE, YoutubeChannelIE, diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py new file mode 100644 index 000000000..40fc4165f --- /dev/null +++ b/youtube_dl/extractor/yourupload.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class YourUploadIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:yourupload\.com/watch| + embed\.yourupload\.com| + embed\.yucache\.net + )/(?P<id>[A-Za-z0-9]+) + ''' + _TESTS = [ + { + 'url': 'http://yourupload.com/watch/14i14h', + 'md5': 'bf5c2f95c4c917536e80936af7bc51e1', + 'info_dict': { + 'id': '14i14h', + 'ext': 'mp4', + 'title': 'BigBuckBunny_320x180.mp4', + 'thumbnail': 're:^https?://.*\.jpe?g', + } + }, + { + 'url': 'http://embed.yourupload.com/14i14h', + 'only_matching': True, + }, + { + 'url': 'http://embed.yucache.net/14i14h?client_file_id=803349', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://embed.yucache.net/{0:}'.format(video_id) + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) + url = self._og_search_video_url(webpage) + + formats = [{ + 'format_id': 'sd', + 'url': url, + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } From b509a4b17643422b750e5258f538894105c58d42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 21 Sep 2014 15:43:09 +0200 Subject: [PATCH 039/652] [downloader/f4m] If <pv-2.0> is in the manifest, add it to the fragments urls query (fixes #3176) It's used in some akamai videos (for example for theplatform.com). --- youtube_dl/downloader/f4m.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 71353f607..b3be16ff1 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -16,6 +16,7 @@ from ..utils import ( format_bytes, encodeFilename, sanitize_open, + xpath_text, ) @@ -251,6 +252,8 @@ class F4mFD(FileDownloader): # We only download the first fragment fragments_list = fragments_list[:1] total_frags = len(fragments_list) + # For some akamai manifests we'll need to add a query to the fragment url + akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) tmpfilename = self.temp_name(filename) (dest_stream, tmpfilename) = sanitize_open(tmpfilename, 'wb') @@ -290,6 +293,8 @@ class F4mFD(FileDownloader): for (seg_i, frag_i) in fragments_list: name = 'Seg%d-Frag%d' % (seg_i, frag_i) url = base_url + name + if akamai_pv: + url += '?' 
+ akamai_pv.strip(';') frag_filename = '%s-%s' % (tmpfilename, name) success = http_dl.download(frag_filename, {'url': url}) if not success: From dd41e8c82bb14bda0c407f9f0865cfb112e8fc30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 21 Sep 2014 15:47:58 +0200 Subject: [PATCH 040/652] [theplatform] Extract all formats for f4m videos --- youtube_dl/extractor/theplatform.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index b6b2dba9c..031a958fa 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -62,10 +62,7 @@ class ThePlatformIE(InfoExtractor): # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3' - formats = [{ - 'ext': 'flv', - 'url': f4m_url, - }] + formats = self._extract_f4m_formats(f4m_url, video_id) else: base_url = head.find(_x('smil:meta')).attrib['base'] switch = body.find(_x('smil:switch')) From 224ce0d87299cf54469baccb9922e78f9594d029 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 21 Sep 2014 15:49:04 +0200 Subject: [PATCH 041/652] [nbc] Update test --- youtube_dl/extractor/nbc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2e4acbad..e75ab7c39 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -16,9 +16,9 @@ class NBCIE(InfoExtractor): _TEST = { 'url': 'http://www.nbc.com/chicago-fire/video/i-am-a-firefighter/2734188', - 'md5': '54d0fbc33e0b853a65d7b4de5c06d64e', + # md5 checksum is not stable 'info_dict': { - 'id': 'u1RInQZRN7QJ', + 'id': 'bTmnLCvIbaaH', 'ext': 'flv', 'title': 'I Am a Firefighter', 'description': 'An emergency puts Dawson\'sf irefighter skills to the ultimate test in this four-part digital series.', From e35cb78c4099263c26f717669463a3c025c30d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 21 Sep 2014 16:08:38 +0200 Subject: [PATCH 042/652] [theplatform] Correctly extract videos that don't use f4m or rtmp (reported in #3176) --- youtube_dl/extractor/sbs.py | 2 +- youtube_dl/extractor/theplatform.py | 48 ++++++++++++++++++----------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 34058fd4b..214990e7a 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -21,7 +21,7 @@ class SBSIE(InfoExtractor): 'md5': '3150cf278965eeabb5b4cea1c963fe0a', 'info_dict': { 'id': '320403011771', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Dingo Conservation', 'description': 'Dingoes are on the brink of extinction; most of the animals we think are dingoes are in fact crossbred with wild dogs. 
This family run a dingo conservation park to prevent their extinction', 'thumbnail': 're:http://.*\.jpg', diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 031a958fa..0be793b1c 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -5,6 +5,7 @@ import json from .common import InfoExtractor from ..utils import ( + compat_str, ExtractorError, xpath_with_ns, ) @@ -55,7 +56,7 @@ class ThePlatformIE(InfoExtractor): body = meta.find(_x('smil:body')) f4m_node = body.find(_x('smil:seq//smil:video')) - if f4m_node is not None: + if f4m_node is not None and '.f4m' in f4m_node.attrib['src']: f4m_url = f4m_node.attrib['src'] if 'manifest.f4m?' not in f4m_url: f4m_url += '?' @@ -64,24 +65,35 @@ class ThePlatformIE(InfoExtractor): f4m_url += '&g=UXWGVKRWHFSP&hdcore=3.0.3' formats = self._extract_f4m_formats(f4m_url, video_id) else: - base_url = head.find(_x('smil:meta')).attrib['base'] - switch = body.find(_x('smil:switch')) formats = [] - for f in switch.findall(_x('smil:video')): - attr = f.attrib - width = int(attr['width']) - height = int(attr['height']) - vbr = int(attr['system-bitrate']) // 1000 - format_id = '%dx%d_%dk' % (width, height, vbr) - formats.append({ - 'format_id': format_id, - 'url': base_url, - 'play_path': 'mp4:' + attr['src'], - 'ext': 'flv', - 'width': width, - 'height': height, - 'vbr': vbr, - }) + switch = body.find(_x('smil:switch')) + if switch is not None: + base_url = head.find(_x('smil:meta')).attrib['base'] + for f in switch.findall(_x('smil:video')): + attr = f.attrib + width = int(attr['width']) + height = int(attr['height']) + vbr = int(attr['system-bitrate']) // 1000 + format_id = '%dx%d_%dk' % (width, height, vbr) + formats.append({ + 'format_id': format_id, + 'url': base_url, + 'play_path': 'mp4:' + attr['src'], + 'ext': 'flv', + 'width': width, + 'height': height, + 'vbr': vbr, + }) + else: + switch = body.find(_x('smil:seq//smil:switch')) + for f in switch.findall(_x('smil:video')): + attr = f.attrib + vbr = int(attr['system-bitrate']) // 1000 + formats.append({ + 'format_id': compat_str(vbr), + 'url': attr['src'], + 'vbr': vbr, + }) self._sort_formats(formats) return { From df8f53f752c0f01577dcc5d63c6d9a81d924770b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 21 Sep 2014 16:32:38 +0200 Subject: [PATCH 043/652] [thvideo] Support mobile URLs as well --- youtube_dl/extractor/thvideo.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 9fa14d3c4..607e947bb 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -10,7 +10,7 @@ from ..utils import ( class THVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/v/th(?P<id>[0-9]+)' + _VALID_URL = r'http://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P<id>[0-9]+)' _TEST = { 'url': 'http://thvideo.tv/v/th1987/', 'md5': 'fa107b1f73817e325e9433505a70db50', @@ -30,18 +30,22 @@ class THVideoIE(InfoExtractor): video_id = mobj.group('id') # extract download link from mobile player page - webpage_player = self._download_webpage('http://thvideo.tv/mobile.php?cid=%s-0' % video_id, video_id) - video_url = self._html_search_regex(r'<source src="(.*?)" type', webpage_player, 'video url') + webpage_player = self._download_webpage( + 'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id), + video_id, note='Downloading video source page') + video_url = 
self._html_search_regex( + r'<source src="(.*?)" type', webpage_player, 'video url') # extract video info from main page - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://thvideo.tv/v/th%s' % (video_id), video_id) title = self._og_search_title(webpage) display_id = 'th%s' % video_id thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) - upload_date_raw = self._html_search_regex(r'span itemprop="datePublished" content="(.*?)">', webpage, - 'upload date', fatal=False) - upload_date = unified_strdate(upload_date_raw) + upload_date = unified_strdate(self._html_search_regex( + r'span itemprop="datePublished" content="(.*?)">', webpage, + 'upload date', fatal=False)) return { 'id': video_id, @@ -52,4 +56,4 @@ class THVideoIE(InfoExtractor): 'thumbnail': thumbnail, 'description': description, 'upload_date': upload_date - } \ No newline at end of file + } From d0df92928bc099775e18f6413e387713839012ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 21 Sep 2014 16:53:00 +0200 Subject: [PATCH 044/652] [npo] Add extractor for tegenlicht.vpro.nl (closes #3778) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/npo.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1a6033320..bca34ae73 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -249,7 +249,10 @@ from .nosvideo import NosVideoIE from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE -from .npo import NPOIE +from .npo import ( + NPOIE, + TegenlichtVproIE, +) from .nrk import ( NRKIE, NRKTVIE, diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 7a154e94a..f36d446d2 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -7,6 +7,7 @@ from ..utils import ( unified_strdate, parse_duration, qualities, + url_basename, ) @@ -55,7 +56,9 @@ class NPOIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + return self._get_info(video_id) + def _get_info(self, video_id): metadata = self._download_json( 'http://e.omroep.nl/metadata/aflevering/%s' % video_id, video_id, @@ -106,3 +109,30 @@ class NPOIE(InfoExtractor): 'duration': parse_duration(metadata.get('tijdsduur')), 'formats': formats, } + + +class TegenlichtVproIE(NPOIE): + IE_NAME = 'tegenlicht.vpro.nl' + _VALID_URL = r'https?://tegenlicht\.vpro\.nl/afleveringen/.*?' 
+ + _TESTS = [ + { + 'url': 'http://tegenlicht.vpro.nl/afleveringen/2012-2013/de-toekomst-komt-uit-afrika.html', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'Tegenlicht', + 'description': 'md5:d6476bceb17a8c103c76c3b708f05dd1', + 'upload_date': '20130225', + }, + }, + ] + + def _real_extract(self, url): + name = url_basename(url) + webpage = self._download_webpage(url, name) + urn = self._html_search_meta('mediaurn', webpage) + info_page = self._download_json( + 'http://rs.vpro.nl/v2/api/media/%s.json' % urn, name) + return self._get_info(info_page['mid']) From f90d95edeb981481834f4b092b4c2ac793f225f9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 22 Sep 2014 13:07:23 +0200 Subject: [PATCH 045/652] release 2014.09.22 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 940e9c8cf..4bf208b67 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.19' +__version__ = '2014.09.22' From 273dea42487461884926b4d810ebf74e541dc8b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 22 Sep 2014 18:58:22 +0700 Subject: [PATCH 046/652] [playfm] Fix view count and add comment count --- youtube_dl/extractor/playfm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/playfm.py b/youtube_dl/extractor/playfm.py index 72df4d842..ebc046804 100644 --- a/youtube_dl/extractor/playfm.py +++ b/youtube_dl/extractor/playfm.py @@ -10,6 +10,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + str_to_int, ) @@ -29,6 +30,7 @@ class PlayFMIE(InfoExtractor): 'duration': 5627.428, 'upload_date': '20140712', 'view_count': int, + 'comment_count': int, 'thumbnail': 're:^https?://.*\.jpg$', }, } @@ -51,7 +53,8 @@ class PlayFMIE(InfoExtractor): recording = rec_doc.find('./recording') title = recording.find('./title').text - view_count = int_or_none(recording.find('./stats/playcount').text) + view_count = str_to_int(recording.find('./stats/playcount').text) + comment_count = str_to_int(recording.find('./stats/comments').text) duration = float_or_none(recording.find('./duration').text, scale=1000) thumbnail = recording.find('./image').text @@ -75,6 +78,7 @@ class PlayFMIE(InfoExtractor): 'title': title, 'upload_date': upload_date, 'view_count': view_count, + 'comment_count': comment_count, 'duration': duration, 'thumbnail': thumbnail, 'uploader': uploader, From 63cddb6477e785ca2bfb6e3bb1ac2af20aa9842c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 22 Sep 2014 14:11:08 +0200 Subject: [PATCH 047/652] [sbs] Recognize urls with format 'http://www.sbs.com.au/ondemand/video/<id>' (#3811) --- youtube_dl/extractor/sbs.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index 214990e7a..409f8540a 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -12,7 +12,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/single/(?P<id>[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/ondemand/video/(?:single/)?(?P<id>[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -27,6 +27,10 @@ 
class SBSIE(InfoExtractor): 'thumbnail': 're:http://.*\.jpg', }, 'add_ies': ['generic'], + }, + { + 'url': 'http://www.sbs.com.au/ondemand/video/320403011771/Dingo-Conservation-The-Feed', + 'only_matching': True, }] def _real_extract(self, url): From 094d42fe443c8f7ad5bd9049d63317195ab8fd3a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 22 Sep 2014 18:15:07 +0200 Subject: [PATCH 048/652] release 2014.09.22.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 4bf208b67..2853c79c9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.22' +__version__ = '2014.09.22.1' From 632e5684ce797eb8a7372eb25dd4ce299f2e66de Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Tue, 23 Sep 2014 00:28:19 +0300 Subject: [PATCH 049/652] [nfl] Add new extractor. (Closes #3815) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/nfl.py | 103 +++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 youtube_dl/extractor/nfl.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 244d22297..1f1fc0eb2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -240,6 +240,7 @@ from .ndtv import NDTVIE from .newgrounds import NewgroundsIE from .newstube import NewstubeIE from .nfb import NFBIE +from .nfl import NFLIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py new file mode 100644 index 000000000..f53596f5e --- /dev/null +++ b/youtube_dl/extractor/nfl.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + remove_end, +) + + +class NFLIE(InfoExtractor): + IE_NAME = 'nfl.com' + _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' + _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' + _TEST = { + 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', + 'skip_download': True, # md5 sum fluctuates + 'info_dict': { + 'id': '0ap3000000398478', + 'ext': 'mp4', + 'title': 'Week 3: Washington Redskins vs. 
Philadelphia Eagles highlights', + 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', + 'upload_date': '20140921', + 'timestamp': 1411337580, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + config = self._download_json(self._PLAYER_CONFIG_URL, video_id, + note='Downloading player config') + url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) + video_data = self._download_json(url_template.format(id=video_id), video_id) + + cdns = config.get('cdns') + if not cdns: + raise ExtractorError('Failed to get CDN data', expected=True) + + formats = [] + streams = video_data.get('cdnData', {}).get('bitrateInfo', []) + for name, cdn in cdns.items(): + # LimeLight streams don't seem to work + if cdn.get('name') == 'LIMELIGHT': + continue + + protocol = cdn.get('protocol') + host = remove_end(cdn.get('host', ''), '/') + if not (protocol and host): + continue + + path_prefix = cdn.get('pathprefix', '') + if path_prefix and not path_prefix.endswith('/'): + path_prefix = '%s/' % path_prefix + + get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( + protocol=protocol, + host=host, + prefix=path_prefix, + path=p, + ) + + if protocol == 'rtmp': + preference = -2 + elif 'prog' in name.lower(): + preference = -1 + else: + preference = 0 + + for stream in streams: + path = stream.get('path') + if not path: + continue + + formats.append({ + 'url': get_url(path), + 'vbr': int_or_none(stream.get('rate', 0), 1000), + 'preference': preference, + 'format_note': name, + }) + + self._sort_formats(formats) + + thumbnail = None + for q in ('xl', 'l', 'm', 's', 'xs'): + thumbnail = video_data.get('imagePaths', {}).get(q) + if thumbnail: + break + + return { + 'id': video_id, + 'title': video_data.get('storyHeadline'), + 'formats': formats, + 'description': video_data.get('caption'), + 'duration': video_data.get('duration'), + 'thumbnail': thumbnail, + 'timestamp': int_or_none(video_data.get('posted'), 1000), + } From f7d159cf953bd1884ca45f535327f3016998270c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 23 Sep 2014 19:13:11 +0700 Subject: [PATCH 050/652] [noco] Encode before passing to hashlib.md5 (Closes #3816) --- youtube_dl/extractor/noco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index e3ec9ed15..7f1bc6377 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -62,7 +62,7 @@ class NocoIE(InfoExtractor): def _call_api(self, path, video_id, note): ts = compat_str(int(time.time() * 1000)) - tk = hashlib.md5(hashlib.md5(ts).hexdigest() + '#8S?uCraTedap6a').hexdigest() + tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() url = self._API_URL_TEMPLATE % (path, ts, tk) resp = self._download_json(url, video_id, note) From 86916dae4b8604431205d11ccfa5f9796c0798dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 23 Sep 2014 19:58:35 +0700 Subject: [PATCH 051/652] [wat] Capture and output error message --- youtube_dl/extractor/wat.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 46b4d9133..268e2f618 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -5,7 +5,10 @@ import re import hashlib from .common import 
InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + ExtractorError, + unified_strdate, +) class WatIE(InfoExtractor): @@ -57,6 +60,11 @@ class WatIE(InfoExtractor): video_info = self.download_video_info(real_id) + error_desc = video_info.get('error_desc') + if error_desc: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + geo_list = video_info.get('geoList') country = geo_list[0] if geo_list else '' From bd5650ac64fedd1c1ad7b90c4ec4ff5d4c053bc0 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Tue, 23 Sep 2014 20:42:28 +0300 Subject: [PATCH 052/652] [nfl] Fix test case - download, but don't check md5 --- youtube_dl/extractor/nfl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index f53596f5e..963c4587c 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -17,7 +17,7 @@ class NFLIE(InfoExtractor): _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' _TEST = { 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - 'skip_download': True, # md5 sum fluctuates + # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates 'info_dict': { 'id': '0ap3000000398478', 'ext': 'mp4', From 4bc3a23ec5d2c1bbcdc5289393881606604922c7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 09:49:53 +0200 Subject: [PATCH 053/652] [youtube] Modernize --- youtube_dl/extractor/youtube.py | 103 +++++++++++++++++--------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b54c69122..602be9859 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -307,69 +307,74 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_NAME = 'youtube' _TESTS = [ { - u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc", - u"file": u"BaW_jenozKc.mp4", - u"info_dict": { - u"title": u"youtube-dl test video \"'/\\ä↭𝕐", - u"uploader": u"Philipp Hagemeister", - u"uploader_id": u"phihag", - u"upload_date": u"20121002", - u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .", - u"categories": [u'Science & Technology'], + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc', + 'info_dict': { + 'id': 'BaW_jenozKc', + 'ext': 'mp4', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐', + 'uploader': 'Philipp Hagemeister', + 'uploader_id': 'phihag', + 'upload_date': '20121002', + 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', + 'categories': ['Science & Technology'], 'like_count': int, 'dislike_count': int, } }, { - u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY", - u"file": u"UxxajLWwzqY.mp4", - u"note": u"Test generic use_cipher_signature video (#897)", - u"info_dict": { - u"upload_date": u"20120506", - u"title": u"Icona Pop - I Love It (feat. 
Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f", - u"uploader": u"Icona Pop", - u"uploader_id": u"IconaPop" + 'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY', + 'note': 'Test generic use_cipher_signature video (#897)', + 'info_dict': { + 'id': 'UxxajLWwzqY', + 'ext': 'mp4', + 'upload_date': '20120506', + 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', + 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f', + 'uploader': 'Icona Pop', + 'uploader_id': 'IconaPop', } }, { - u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ", - u"file": u"07FYdnEawAQ.mp4", - u"note": u"Test VEVO video with age protection (#956)", - u"info_dict": { - u"upload_date": u"20130703", - u"title": u"Justin Timberlake - Tunnel Vision (Explicit)", - u"description": u"md5:64249768eec3bc4276236606ea996373", - u"uploader": u"justintimberlakeVEVO", - u"uploader_id": u"justintimberlakeVEVO" + 'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ', + 'note': 'Test VEVO video with age protection (#956)', + 'info_dict': { + 'id': '07FYdnEawAQ', + 'ext': 'mp4', + 'upload_date': '20130703', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'description': 'md5:64249768eec3bc4276236606ea996373', + 'uploader': 'justintimberlakeVEVO', + 'uploader_id': 'justintimberlakeVEVO', } }, { - u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ", - u"file": u"yZIXLfi8CZQ.mp4", - u"note": u"Embed-only video (#1746)", - u"info_dict": { - u"upload_date": u"20120608", - u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012", - u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7", - u"uploader": u"SET India", - u"uploader_id": u"setindia" + 'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ', + 'note': 'Embed-only video (#1746)', + 'info_dict': { + 'id': 'yZIXLfi8CZQ', + 'ext': 'mp4', + 'upload_date': '20120608', + 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012', + 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', + 'uploader': 'SET India', + 'uploader_id': 'setindia' } }, { - u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I", - u"file": u"a9LDPn-MO4I.m4a", - u"note": u"256k DASH audio (format 141) via DASH manifest", - u"info_dict": { - u"upload_date": "20121002", - u"uploader_id": "8KVIDEO", - u"description": '', - u"uploader": "8KVIDEO", - u"title": "UHDTV TEST 8K VIDEO.mp4" + 'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I', + 'note': '256k DASH audio (format 141) via DASH manifest', + 'info_dict': { + 'id': 'a9LDPn-MO4I', + 'ext': 'm4a', + 'upload_date': '20121002', + 'uploader_id': '8KVIDEO', + 'description': '', + 'uploader': '8KVIDEO', + 'title': 'UHDTV TEST 8K VIDEO.mp4' }, - u"params": { - u"youtube_include_dash_manifest": True, - u"format": "141", + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141', }, }, # DASH manifest with encrypted signature @@ -384,7 +389,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', }, - u"params": { + 'params': { 'youtube_include_dash_manifest': True, 'format': '141', }, From 69ea8ca42cd4fc62fdd4e7f18defb3b23da618d2 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 09:51:45 +0200 Subject: [PATCH 054/652] [youtube] Remove superfluous unicode specifiers --- youtube_dl/extractor/youtube.py | 136 ++++++++++++++++---------------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/youtube_dl/extractor/youtube.py 
b/youtube_dl/extractor/youtube.py index 602be9859..0257ee2f9 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -46,7 +46,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _set_language(self): return bool(self._download_webpage( self._LANG_URL, None, - note=u'Setting language', errnote='unable to set language', + note='Setting language', errnote='unable to set language', fatal=False)) def _login(self): @@ -61,13 +61,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # No authentication to be performed if username is None: if self._LOGIN_REQUIRED: - raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return True login_page = self._download_webpage( self._LOGIN_URL, None, - note=u'Downloading login page', - errnote=u'unable to fetch login page', fatal=False) + note='Downloading login page', + errnote='unable to fetch login page', fatal=False) if login_page is False: return @@ -105,12 +105,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): req = compat_urllib_request.Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( req, None, - note=u'Logging in', errnote=u'unable to log in', fatal=False) + note='Logging in', errnote='unable to log in', fatal=False) if login_results is False: return False if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) # Two-Factor # TODO add SMS and phone call support - these require making a request and then prompting the user @@ -119,19 +119,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor): tfa_code = self._get_tfa_info() if tfa_code is None: - self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>') - self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)') + self._downloader.report_warning('Two-factor authentication required. 
Provide it with --twofactor <code>') + self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)') return False # Unlike the first login form, secTok and timeStmp are both required for the TFA form match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) if match is None: - self._downloader.report_warning(u'Failed to get secTok - did the page structure change?') + self._downloader.report_warning('Failed to get secTok - did the page structure change?') secTok = match.group(1) match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U) if match is None: - self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?') + self._downloader.report_warning('Failed to get timeStmp - did the page structure change?') timeStmp = match.group(1) tfa_form_strs = { @@ -155,23 +155,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data) tfa_results = self._download_webpage( tfa_req, None, - note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False) + note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) if tfa_results is False: return False if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None: - self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.') + self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.') return False if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning(u'unable to log in - did the page structure change?') + self._downloader.report_warning('unable to log in - did the page structure change?') return False if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') + self._downloader.report_warning('Your Google account has a security notice. 
Please log in on your web browser, resolve the notice, and try again.') return False if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') + self._downloader.report_warning('unable to log in: bad username or password') return False return True @@ -185,7 +185,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self._download_webpage( req, None, - note=u'Confirming age', errnote=u'Unable to confirm age') + note='Confirming age', errnote='Unable to confirm age') return True def _real_initialize(self): @@ -402,19 +402,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def report_video_info_webpage_download(self, video_id): """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video info webpage' % video_id) + self.to_screen('%s: Downloading video info webpage' % video_id) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" - self.to_screen(u'%s: Extracting video information' % video_id) + self.to_screen('%s: Extracting video information' % video_id) def report_unavailable_format(self, video_id, format): """Report extracted video URL.""" - self.to_screen(u'%s: Format %s not available' % (video_id, format)) + self.to_screen('%s: Format %s not available' % (video_id, format)) def report_rtmp_download(self): """Indicate the download will use the RTMP protocol.""" - self.to_screen(u'RTMP download detected') + self.to_screen('RTMP download detected') def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ @@ -434,21 +434,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_type, player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id - cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id) + cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) if player_type == 'js': code = self._download_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, player_id), - errnote=u'Download of %s failed' % player_url) + note='Downloading %s player %s' % (player_type, player_id), + errnote='Download of %s failed' % player_url) res = self._parse_sig_js(code) elif player_type == 'swf': urlh = self._request_webpage( player_url, video_id, - note=u'Downloading %s player %s' % (player_type, player_id), - errnote=u'Download of %s failed' % player_url) + note='Downloading %s player %s' % (player_type, player_id), + errnote='Download of %s failed' % player_url) code = urlh.read() res = self._parse_sig_swf(code) else: @@ -459,15 +459,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): cache_res = res(test_string) cache_spec = [ord(c) for c in cache_res] - self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec) + self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) return res def _print_sig_code(self, func, example_sig): def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) - ends = (u':%d' % (end+step)) if end + step >= 0 else ':' - steps = '' if step == 1 else (u':%d' % step) + ends = (':%d' % (end+step)) if end + step >= 0 else ':' + steps = '' if step == 1 else (':%d' % step) return 's[%s%s%s]' % (starts, ends, steps) step = None @@ -497,9 +497,9 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) - code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n' + code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) - self.to_screen(u'Extracted signature function:\n' + code) + self.to_screen('Extracted signature function:\n' + code) def _parse_sig_js(self, jscode): funcname = self._search_regex( @@ -521,9 +521,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Turn the encrypted s field into a working signature""" if player_url is None: - raise ExtractorError(u'Cannot decrypt signature without player_url') + raise ExtractorError('Cannot decrypt signature without player_url') - if player_url.startswith(u'//'): + if player_url.startswith('//'): player_url = 'https:' + player_url try: player_id = (player_url, self._signature_cache_id(s)) @@ -547,7 +547,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: - self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) return {} lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) @@ -565,7 +565,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url = 'https://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url if not sub_lang_list: - self._downloader.report_warning(u'video doesn\'t have subtitles') + self._downloader.report_warning('video doesn\'t have subtitles') return {} return sub_lang_list @@ -573,7 +573,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" sub_format = self._downloader.params.get('subtitlesformat', 'srt') - self.to_screen(u'%s: Looking for automatic captions' % video_id) + self.to_screen('%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = 'Couldn\'t find automatic captions for %s' % video_id if mobj is None: @@ -594,7 +594,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : - self._downloader.report_warning(u'Video doesn\'t have automatic captions') + self._downloader.report_warning('Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] @@ -620,7 +620,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) video_id = mobj.group(2) return video_id @@ -640,7 +640,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _extract_annotations(self, video_id): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id - return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download 
video annotations.') + return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') def _real_extract(self, url): proto = ( @@ -710,14 +710,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError(u'"rental" videos not supported') + raise ExtractorError('"rental" videos not supported') # Start extracting information self.report_information_extraction(video_id) # uploader if 'author' not in video_info: - raise ExtractorError(u'Unable to extract uploader name') + raise ExtractorError('Unable to extract uploader name') video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) # uploader_id @@ -726,13 +726,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if mobj is not None: video_uploader_id = mobj.group(1) else: - self._downloader.report_warning(u'unable to extract uploader nickname') + self._downloader.report_warning('unable to extract uploader nickname') # title if 'title' in video_info: video_title = video_info['title'][0] else: - self._downloader.report_warning(u'Unable to extract video title') + self._downloader.report_warning('Unable to extract video title') video_title = '_' # thumbnail image @@ -742,7 +742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if m_thumb is not None: video_thumbnail = m_thumb.group(1) elif 'thumbnail_url' not in video_info: - self._downloader.report_warning(u'unable to extract video thumbnail') + self._downloader.report_warning('unable to extract video thumbnail') video_thumbnail = None else: # don't panic if we can't find it video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) @@ -796,8 +796,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): if count is not None: return int(count.replace(',', '')) return None - like_count = _extract_count(u'like') - dislike_count = _extract_count(u'dislike') + like_count = _extract_count('like') + dislike_count = _extract_count('dislike') # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -807,7 +807,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return if 'length_seconds' not in video_info: - self._downloader.report_warning(u'unable to extract video duration') + self._downloader.report_warning('unable to extract video duration') video_duration = None else: video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])) @@ -828,11 +828,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Easy way to know if the 's' value is in url_encoded_fmt_stream_map # this signatures are encrypted if 'url_encoded_fmt_stream_map' not in args: - raise ValueError(u'No stream_map present') # caught below + raise ValueError('No stream_map present') # caught below re_signature = re.compile(r'[&,]s=') m_s = re_signature.search(args['url_encoded_fmt_stream_map']) if m_s is not None: - self.to_screen(u'%s: Encrypted signatures detected.' % video_id) + self.to_screen('%s: Encrypted signatures detected.' 
% video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] m_s = re_signature.search(args.get('adaptive_fmts', '')) if m_s is not None: @@ -910,7 +910,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_desc = 'html5 player %s' % player_version parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen(u'{%s} signature length %s, %s' % + self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc)) signature = self._decrypt_signature( @@ -925,7 +925,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url_map = self._extract_from_m3u8(manifest_url, video_id) formats = _map_to_format_list(url_map) else: - raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest if (self._downloader.params.get('youtube_include_dash_manifest', False)): @@ -946,9 +946,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) dash_doc = self._download_xml( dash_manifest_url, video_id, - note=u'Downloading DASH manifest', - errnote=u'Could not download DASH manifest') - for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): + note='Downloading DASH manifest', + errnote='Could not download DASH manifest') + for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL') if url_el is None: continue @@ -974,7 +974,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): existing_format.update(f) except (ExtractorError, KeyError) as e: - self.report_warning(u'Skipping DASH manifest: %s' % e, video_id) + self.report_warning('Skipping DASH manifest: %s' % e, video_id) self._sort_formats(formats) @@ -1095,7 +1095,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # Extract playlist id mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) # Check if it's a video-specific URL @@ -1103,16 +1103,16 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if 'v' in query_dict: video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): - self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, 'Youtube', video_id=video_id) else: - self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) if playlist_id.startswith('RD'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) if playlist_id.startswith('TL'): - raise ExtractorError(u'For downloading YouTube.com top lists, use ' + raise ExtractorError('For downloading YouTube.com top lists, use ' 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) url = self._TEMPLATE_URL % playlist_id @@ -1157,7 +1157,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): class YoutubeTopListIE(YoutubePlaylistIE): IE_NAME = 'youtube:toplist' - IE_DESC = (u'YouTube.com 
top lists, "yttoplist:{channel}:{list title}"' + IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' ' (Example: "yttoplist:music:Top Tracks")') _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' _TESTS = [] @@ -1207,7 +1207,7 @@ class YoutubeChannelIE(InfoExtractor): # Extract channel id mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) # Download channel page channel_id = mobj.group(1) @@ -1229,7 +1229,7 @@ class YoutubeChannelIE(InfoExtractor): for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) page = self._download_json( - url, channel_id, note=u'Downloading page #%s' % pagenum, + url, channel_id, note='Downloading page #%s' % pagenum, transform_source=uppercase_escape) ids_in_page = self.extract_videos_from_page(page['content_html']) @@ -1238,7 +1238,7 @@ class YoutubeChannelIE(InfoExtractor): if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: break - self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) + self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) for video_id in video_ids] @@ -1265,7 +1265,7 @@ class YoutubeUserIE(InfoExtractor): # Extract username mobj = re.match(self._VALID_URL, url) if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + raise ExtractorError('Invalid URL: %s' % url) username = mobj.group(1) @@ -1286,7 +1286,7 @@ class YoutubeUserIE(InfoExtractor): try: response = json.loads(page) except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + raise ExtractorError('Invalid JSON in API response: ' + compat_str(err)) if 'entry' not in response['feed']: return @@ -1327,9 +1327,9 @@ class YoutubeSearchIE(SearchInfoExtractor): compat_urllib_parse.quote_plus(query.encode('utf-8')), (PAGE_SIZE * pagenum) + 1) data_json = self._download_webpage( - result_url, video_id=u'query "%s"' % query, - note=u'Downloading page %s' % (pagenum + 1), - errnote=u'Unable to download API page') + result_url, video_id='query "%s"' % query, + note='Downloading page %s' % (pagenum + 1), + errnote='Unable to download API page') data = json.loads(data_json) api_response = data['data'] @@ -1404,7 +1404,7 @@ class YoutubeShowIE(InfoExtractor): webpage = self._download_webpage(url, show_name, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) - self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons))) + self.to_screen('%s: Found %s seasons' % (show_name, len(m_seasons))) return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] From cdc628a498b8f2198d057ba1ba78e86d8915e3aa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 10:25:47 +0200 Subject: [PATCH 055/652] [youtube] Move more tests to extractors --- test/test_youtube_lists.py | 39 ------------------ youtube_dl/extractor/youtube.py | 71 ++++++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 48 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 1fa99f88b..410f9edc2 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -10,7 +10,6 @@ from test.helper import FakeYDL from 
youtube_dl.extractor import ( - YoutubeUserIE, YoutubePlaylistIE, YoutubeIE, YoutubeChannelIE, @@ -43,28 +42,6 @@ class TestYoutubeLists(unittest.TestCase): self.assertEqual(len(entries), 25) self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') - def test_youtube_channel(self): - dl = FakeYDL() - ie = YoutubeChannelIE(dl) - #test paginated channel - result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w') - self.assertTrue(len(result['entries']) > 90) - #test autogenerated channel - result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') - self.assertTrue(len(result['entries']) >= 18) - - def test_youtube_user(self): - dl = FakeYDL() - ie = YoutubeUserIE(dl) - result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation') - self.assertTrue(len(result['entries']) >= 320) - - def test_youtube_show(self): - dl = FakeYDL() - ie = YoutubeShowIE(dl) - result = ie.extract('http://www.youtube.com/show/airdisasters') - self.assertTrue(len(result) >= 3) - def test_youtube_mix(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) @@ -83,21 +60,5 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) - def test_youtube_toplist(self): - dl = FakeYDL() - ie = YoutubeTopListIE(dl) - result = ie.extract('yttoplist:music:Trending') - entries = result['entries'] - self.assertTrue(len(entries) >= 5) - - def test_youtube_search_url(self): - dl = FakeYDL() - ie = YoutubeSearchURLIE(dl) - result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video') - entries = result['entries'] - self.assertIsPlaylist(result) - self.assertEqual(result['title'], 'youtube-dl test video') - self.assertTrue(len(entries) >= 5) - if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0257ee2f9..2ef76b69b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1160,16 +1160,25 @@ class YoutubeTopListIE(YoutubePlaylistIE): IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' ' (Example: "yttoplist:music:Top Tracks")') _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$' - _TESTS = [] + _TESTS = [{ + 'url': 'yttoplist:music:Trending', + 'playlist_mincount': 5, + 'skip': 'Only works for logged-in users', + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) channel = mobj.group('chann') title = mobj.group('title') query = compat_urllib_parse.urlencode({'title': title}) - playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(query) - channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title) - link = self._html_search_regex(playlist_re, channel_page, 'list') + channel_page = self._download_webpage( + 'https://www.youtube.com/%s' % channel, title) + link = self._html_search_regex( + r'''(?x) + <a\s+href="([^"]+)".*?>\s* + <span\s+class="branded-page-module-title-text">\s* + <span[^>]*>.*?%s.*?</span>''' % re.escape(query), + channel_page, 'list') url = compat_urlparse.urljoin('https://www.youtube.com/', link) video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' @@ -1195,6 +1204,11 @@ class YoutubeChannelIE(InfoExtractor): _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = 'youtube:channel' + _TESTS = [{ + 'note': 'paginated channel', + 'url': 
'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', + 'playlist_mincount': 91, + }] def extract_videos_from_page(self, page): ids_in_page = [] @@ -1253,6 +1267,17 @@ class YoutubeUserIE(InfoExtractor): _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = 'youtube:user' + _TESTS = [{ + 'url': 'https://www.youtube.com/user/TheLinuxFoundation', + 'playlist_mincount': 320, + 'info_dict': { + 'title': 'TheLinuxFoundation', + } + }, { + 'url': 'ytuser:phihag', + 'only_matching': True, + }] + @classmethod def suitable(cls, url): # Don't return True if the url can be extracted with other youtube @@ -1361,6 +1386,13 @@ class YoutubeSearchURLIE(InfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' + _TESTS = [{ + 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', + 'playlist_mincount': 5, + 'info_dict': { + 'title': 'youtube-dl test video', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -1395,17 +1427,38 @@ class YoutubeSearchURLIE(InfoExtractor): class YoutubeShowIE(InfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(.*)' + _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)' IE_NAME = 'youtube:show' + _TESTS = [{ + 'url': 'http://www.youtube.com/show/airdisasters', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'airdisasters', + 'title': 'Air Disasters', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - show_name = mobj.group(1) - webpage = self._download_webpage(url, show_name, 'Downloading show webpage') + playlist_id = mobj.group('id') + webpage = self._download_webpage( + url, playlist_id, 'Downloading show webpage') # There's one playlist for each season of the show m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) - self.to_screen('%s: Found %s seasons' % (show_name, len(m_seasons))) - return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] + self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons))) + entries = [ + self.url_result( + 'https://www.youtube.com' + season.group(1), 'YoutubePlaylist') + for season in m_seasons + ] + title = self._og_search_title(webpage, fatal=False) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'entries': entries, + } class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): From ac7553d031ffa6cdcdb109330467eb7c423ffd13 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 10:34:29 +0200 Subject: [PATCH 056/652] [youtube] Support embed/videoseries URLs (#3821) --- youtube_dl/extractor/youtube.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2ef76b69b..ae9564862 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -211,7 +211,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? 
# handle anchor (#/) redirect urls (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ + (?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/ |(?: # or the v= param in all its forms (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! @@ -1005,7 +1005,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): (?:\w+\.)? youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch) + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) \? (?:.*?&)*? (?:p|a|list)= | p/ ) @@ -1061,6 +1061,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'title': 'YDL_safe_search', }, 'playlist_count': 2, + }, { + 'note': 'embedded', + 'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA15', + } }] def _real_initialize(self): From cc746841e76a0ab6a1bb65400ca496a105f65821 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 10:46:33 +0200 Subject: [PATCH 057/652] [flickr] Modernize --- youtube_dl/extractor/flickr.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 21ea5ec2b..e09982e88 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -10,13 +10,13 @@ from ..utils import ( class FlickrIE(InfoExtractor): - """Information Extractor for Flickr videos""" - _VALID_URL = r'(?:https?://)?(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' _TEST = { 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'file': '5645318632.mp4', 'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', 'info_dict': { + 'id': '5645318632', + 'ext': 'mp4', "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. 
They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", "uploader_id": "forestwander-nature-pictures", "title": "Dark Hollow Waterfalls" @@ -49,12 +49,12 @@ class FlickrIE(InfoExtractor): raise ExtractorError('Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': self._og_search_title(webpage), + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader_id': video_uploader_id, - }] + } From 3b2f933b01c30a8b3a6bd7fb8418b44167ca30c5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 11:05:14 +0200 Subject: [PATCH 058/652] [generic] Allow embedded YoutubePlaylists (Fixes #3821) --- youtube_dl/extractor/generic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 40eeaad16..a3bfeb174 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -584,7 +584,9 @@ class GenericIE(InfoExtractor): # Helper method def _playlist_from_matches(matches, getter, ie=None): - urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches) + urlrs = orderedSet( + self.url_result(self._proto_relative_url(getter(m)), ie) + for m in matches) return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) @@ -633,7 +635,7 @@ class GenericIE(InfoExtractor): \1''', webpage) if matches: return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1]), ie='Youtube') + matches, lambda m: unescapeHTML(m[1])) # Look for embedded Dailymotion player matches = re.findall( From 2f771f6c99480684522f2ccdfac25d69c1470ea5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 11:06:46 +0200 Subject: [PATCH 059/652] release 2014.09.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2853c79c9..960fd59a3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.22.1' +__version__ = '2014.09.24' From f0b5d6af74469d8216aebfe8079dbe1516188b89 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 14:16:56 +0200 Subject: [PATCH 060/652] [vevo] Support 1080p videos (Fixes #3656) --- youtube_dl/downloader/__init__.py | 3 ++ youtube_dl/downloader/hls.py | 47 +++++++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 16 +++++++++-- youtube_dl/extractor/vevo.py | 40 +++++++++++++++++++++++++- 4 files changed, 102 insertions(+), 4 deletions(-) diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py index 4ea5811a5..3f941596e 100644 --- a/youtube_dl/downloader/__init__.py +++ b/youtube_dl/downloader/__init__.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import FileDownloader from .hls import HlsFD +from .hls import NativeHlsFD from .http import HttpFD from .mplayer import MplayerFD from .rtmp import RtmpFD @@ -19,6 +20,8 @@ def get_suitable_downloader(info_dict): if url.startswith('rtmp'): return RtmpFD + if protocol == 'm3u8_native': + return NativeHlsFD if (protocol == 'm3u8') or (protocol is None and determine_ext(url) 
== 'm3u8'): return HlsFD if url.startswith('mms') or url.startswith('rtsp'): diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 32852f333..8040bdf08 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -1,8 +1,12 @@ +from __future__ import unicode_literals + import os +import re import subprocess from .common import FileDownloader from ..utils import ( + compat_urlparse, check_executable, encodeFilename, ) @@ -43,3 +47,46 @@ class HlsFD(FileDownloader): self.to_stderr(u"\n") self.report_error(u'%s exited with code %d' % (program, retval)) return False + + +class NativeHlsFD(FileDownloader): + """ A more limited implementation that does not require ffmpeg """ + + def real_download(self, filename, info_dict): + url = info_dict['url'] + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + self.to_screen( + '[hlsnative] %s: Downloading m3u8 manifest' % info_dict['id']) + data = self.ydl.urlopen(url).read() + s = data.decode('utf-8', 'ignore') + segment_urls = [] + for line in s.splitlines(): + line = line.strip() + if line and not line.startswith('#'): + segment_url = ( + line + if re.match(r'^https?://', line) + else compat_urlparse.urljoin(url, line)) + segment_urls.append(segment_url) + + byte_counter = 0 + with open(tmpfilename, 'wb') as outf: + for i, segurl in enumerate(segment_urls): + segment = self.ydl.urlopen(segurl).read() + outf.write(segment) + byte_counter += len(segment) + self.to_screen( + '[hlsnative] %s: Downloading segment %d / %d' % + (info_dict['id'], i + 1, len(segment_urls))) + + self._hook_progress({ + 'downloaded_bytes': byte_counter, + 'total_bytes': byte_counter, + 'filename': filename, + 'status': 'finished', + }) + self.try_rename(tmpfilename, filename) + return True + diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9c30a1d33..60cab6f4e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,6 +15,7 @@ from ..utils import ( compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urlparse, compat_str, clean_html, @@ -640,7 +641,9 @@ class InfoExtractor(object): return formats - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None): + def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, + entry_protocol='m3u8', preference=None): + formats = [{ 'format_id': 'm3u8-meta', 'url': m3u8_url, @@ -651,6 +654,11 @@ class InfoExtractor(object): 'format_note': 'Quality selection URL', }] + format_url = lambda u: ( + u + if re.match(r'^https?://', u) + else compat_urlparse.urljoin(m3u8_url, u)) + m3u8_doc = self._download_webpage(m3u8_url, video_id) last_info = None kv_rex = re.compile( @@ -667,15 +675,17 @@ class InfoExtractor(object): continue else: if last_info is None: - formats.append({'url': line}) + formats.append({'url': format_url(line)}) continue tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) f = { 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)), - 'url': line.strip(), + 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, } codecs = last_info.get('CODECS') if codecs: diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index d2ffd1b6b..5e54a35d4 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -6,6 +6,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( compat_HTTPError, + compat_urllib_request, ExtractorError, ) @@ 
-69,6 +70,21 @@ class VevoIE(InfoExtractor): }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' + def _real_initialize(self): + req = compat_urllib_request.Request( + 'http://www.vevo.com/auth', data=b'') + webpage = self._download_webpage( + req, None, + note='Retrieving oauth token', + errnote='Unable to retrieve oauth token', + fatal=False) + if webpage is False: + self._oauth_token = None + else: + self._oauth_token = self._search_regex( + r'access_token":\s*"([^"]+)"', + webpage, 'access token', fatal=False) + def _formats_from_json(self, video_info): last_version = {'version': -1} for version in video_info['videoVersions']: @@ -129,6 +145,26 @@ class VevoIE(InfoExtractor): }) return formats + def _download_api_formats(self, video_id): + if not self._oauth_token: + self._downloader.report_warning( + 'No oauth token available, skipping API HLS download') + return [] + + api_url = 'https://apiv2.vevo.com/video/%s/streams/hls?token=%s' % ( + video_id, self._oauth_token) + api_data = self._download_json( + api_url, video_id, + note='Downloading HLS formats', + errnote='Failed to download HLS format list', fatal=False) + if api_data is None: + return [] + + m3u8_url = api_data[0]['url'] + return self._extract_m3u8_formats( + m3u8_url, video_id, entry_protocol='m3u8_native', ext='mp4', + preference=0) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -152,6 +188,9 @@ class VevoIE(InfoExtractor): else: age_limit = None + # Download via HLS API + formats.extend(self._download_api_formats(video_id)) + # Download SMIL smil_blocks = sorted(( f for f in video_info['videoVersions'] @@ -166,7 +205,6 @@ class VevoIE(InfoExtractor): fatal=False) if smil_url_m is not None: smil_url = smil_url_m - try: smil_xml = self._download_webpage(smil_url, video_id, 'Downloading SMIL info') From eb73f2649f41e80063d6f2f3e4b6345eb90f9777 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 14:17:33 +0200 Subject: [PATCH 061/652] [vevo] Skip SMIL download --- youtube_dl/extractor/vevo.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 5e54a35d4..1edeece3f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -191,30 +191,6 @@ class VevoIE(InfoExtractor): # Download via HLS API formats.extend(self._download_api_formats(video_id)) - # Download SMIL - smil_blocks = sorted(( - f for f in video_info['videoVersions'] - if f['sourceType'] == 13), - key=lambda f: f['version']) - - smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( - self._SMIL_BASE_URL, video_id, video_id.lower()) - if smil_blocks: - smil_url_m = self._search_regex( - r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL', - fatal=False) - if smil_url_m is not None: - smil_url = smil_url_m - try: - smil_xml = self._download_webpage(smil_url, video_id, - 'Downloading SMIL info') - formats.extend(self._formats_from_smil(smil_xml)) - except ExtractorError as ee: - if not isinstance(ee.cause, compat_HTTPError): - raise - self._downloader.report_warning( - 'Cannot download SMIL information, falling back to JSON ..') - self._sort_formats(formats) timestamp_ms = int(self._search_regex( r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date')) From 0b97f3a93690ea5449790acc1274df8900d141aa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 14:17:42 +0200 Subject: [PATCH 062/652] release 2014.09.24.1 --- 
youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 960fd59a3..ecbd578db 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.24' +__version__ = '2014.09.24.1' From b686fc18dacaa6994c646c171368b99e168b619a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 24 Sep 2014 14:38:40 +0200 Subject: [PATCH 063/652] [hlsnative] Support test parameter --- youtube_dl/downloader/common.py | 1 + youtube_dl/downloader/hls.py | 18 +++++++++++++++--- youtube_dl/downloader/http.py | 2 -- youtube_dl/extractor/vevo.py | 4 ++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 9ce97f5fe..f85f0c94e 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -42,6 +42,7 @@ class FileDownloader(object): Subclasses of this one must re-define the real_download method. """ + _TEST_FILE_SIZE = 10241 params = None def __init__(self, ydl, params): diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 8040bdf08..56cce2811 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -7,6 +7,7 @@ import subprocess from .common import FileDownloader from ..utils import ( compat_urlparse, + compat_urllib_request, check_executable, encodeFilename, ) @@ -71,15 +72,26 @@ class NativeHlsFD(FileDownloader): else compat_urlparse.urljoin(url, line)) segment_urls.append(segment_url) + is_test = self.params.get('test', False) + remaining_bytes = self._TEST_FILE_SIZE if is_test else None byte_counter = 0 with open(tmpfilename, 'wb') as outf: for i, segurl in enumerate(segment_urls): - segment = self.ydl.urlopen(segurl).read() - outf.write(segment) - byte_counter += len(segment) self.to_screen( '[hlsnative] %s: Downloading segment %d / %d' % (info_dict['id'], i + 1, len(segment_urls))) + seg_req = compat_urllib_request.Request(segurl) + if remaining_bytes: + seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) + + segment = self.ydl.urlopen(seg_req).read() + if remaining_bytes: + segment = segment[:remaining_bytes] + remaining_bytes -= len(segment) + outf.write(segment) + byte_counter += len(segment) + if remaining_bytes <= 0: + break self._hook_progress({ 'downloaded_bytes': byte_counter, diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 6caf7451e..f62555ce0 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -14,8 +14,6 @@ from ..utils import ( class HttpFD(FileDownloader): - _TEST_FILE_SIZE = 10241 - def real_download(self, filename, info_dict): url = info_dict['url'] tmpfilename = self.temp_name(filename) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 1edeece3f..ebab8b86c 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -25,7 +25,7 @@ class VevoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - "md5": "06bea460acb744eab74a9d7dcb4bfd61", + "md5": "95ee28ee45e70130e3ab02b0f579ae23", 'info_dict': { 'id': 'GB1101300280', 'ext': 'mp4', @@ -41,7 +41,7 @@ class VevoIE(InfoExtractor): }, { 'note': 'v3 SMIL format', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', - 'md5': '893ec0e0d4426a1d96c01de8f2bdff58', + 'md5': 'f6ab09b034f8c22969020b042e5ac7fc', 'info_dict': { 'id': 
'USUV71302923', 'ext': 'mp4', From 6b08cdf626afc71740e539a83ef570999df2c50b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 25 Sep 2014 01:58:49 +0200 Subject: [PATCH 064/652] [youtube] Support for embedded /p players (Fixes #3821) --- youtube_dl/extractor/generic.py | 2 +- youtube_dl/extractor/youtube.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a3bfeb174..0dcbb39db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -631,7 +631,7 @@ class GenericIE(InfoExtractor): ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v)/.+?) + (?:embed|v|p)/.+?) \1''', webpage) if matches: return _playlist_from_matches( diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ae9564862..99198e380 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1068,6 +1068,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'JODA15', } + }, { + 'note': 'Embedded SWF player', + 'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', + 'playlist_count': 4, + 'info_dict': { + 'title': 'JODA7', + } }] def _real_initialize(self): From 4bbf157794084e1ca076b63c402bc5aab4a5ad0a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 25 Sep 2014 01:59:45 +0200 Subject: [PATCH 065/652] release 2014.09.25 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ecbd578db..c17701d6a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.24.1' +__version__ = '2014.09.25' From fec02bcc90ad26ac5bbd11173fa83db91b3858bb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 25 Sep 2014 09:21:45 +0200 Subject: [PATCH 066/652] [hlsnative] Correct handling when remaining_bytes is None --- youtube_dl/downloader/hls.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 56cce2811..68eafa403 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -81,16 +81,16 @@ class NativeHlsFD(FileDownloader): '[hlsnative] %s: Downloading segment %d / %d' % (info_dict['id'], i + 1, len(segment_urls))) seg_req = compat_urllib_request.Request(segurl) - if remaining_bytes: + if remaining_bytes is not None: seg_req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) segment = self.ydl.urlopen(seg_req).read() - if remaining_bytes: + if remaining_bytes is not None: segment = segment[:remaining_bytes] remaining_bytes -= len(segment) outf.write(segment) byte_counter += len(segment) - if remaining_bytes <= 0: + if remaining_bytes is not None and remaining_bytes <= 0: break self._hook_progress({ From 8a32b82e46b73680a9287336e455e6e38894bff3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Thu, 25 Sep 2014 09:58:09 +0200 Subject: [PATCH 067/652] [youku] Modernize somewhat --- youtube_dl/extractor/youku.py | 89 ++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index a8fd40c83..07ed7cbd1 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -1,5 +1,7 @@ # coding: utf-8 +from __future__ 
import unicode_literals + import json import math import random @@ -13,18 +15,25 @@ from ..utils import ( class YoukuIE(InfoExtractor): - _VALID_URL = r'(?:(?:http://)?(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|youku:)(?P<ID>[A-Za-z0-9]+)(?:\.html|/v\.swf|)' - _TEST = { - u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", - u"file": u"XNDgyMDQ2NTQw_part00.flv", - u"md5": u"ffe3f2e435663dc2d1eea34faeff5b5b", - u"params": {u"test": False}, - u"info_dict": { - u"title": u"youtube-dl test video \"'/\\ä↭𝕐" + _VALID_URL = r'''(?x) + (?: + http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + youku:) + (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) + ''' + _TEST = { + 'url': 'http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html', + 'md5': 'ffe3f2e435663dc2d1eea34faeff5b5b', + 'params': { + 'test': False + }, + 'info_dict': { + 'id': 'XNDgyMDQ2NTQw_part00', + 'ext': 'flv', + 'title': 'youtube-dl test video "\'/\\ä↭𝕐' } } - def _gen_sid(self): nowTime = int(time.time() * 1000) random1 = random.randint(1000,1998) @@ -55,49 +64,42 @@ class YoukuIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('ID') + video_id = mobj.group('id') info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id - jsondata = self._download_webpage(info_url, video_id) + config = self._download_json(info_url, video_id) - self.report_extraction(video_id) - try: - config = json.loads(jsondata) - error_code = config['data'][0].get('error_code') - if error_code: - # -8 means blocked outside China. - error = config['data'][0].get('error') # Chinese and English, separated by newline. - raise ExtractorError(error or u'Server reported error %i' % error_code, - expected=True) + error_code = config['data'][0].get('error_code') + if error_code: + # -8 means blocked outside China. + error = config['data'][0].get('error') # Chinese and English, separated by newline. + raise ExtractorError(error or 'Server reported error %i' % error_code, + expected=True) - video_title = config['data'][0]['title'] - seed = config['data'][0]['seed'] + video_title = config['data'][0]['title'] + seed = config['data'][0]['seed'] - format = self._downloader.params.get('format', None) - supported_format = list(config['data'][0]['streamfileids'].keys()) + format = self._downloader.params.get('format', None) + supported_format = list(config['data'][0]['streamfileids'].keys()) - if format is None or format == 'best': - if 'hd2' in supported_format: - format = 'hd2' - else: - format = 'flv' - ext = u'flv' - elif format == 'worst': - format = 'mp4' - ext = u'mp4' + # TODO proper format selection + if format is None or format == 'best': + if 'hd2' in supported_format: + format = 'hd2' else: format = 'flv' - ext = u'flv' + ext = 'flv' + elif format == 'worst': + format = 'mp4' + ext = 'mp4' + else: + format = 'flv' + ext = 'flv' - - fileid = config['data'][0]['streamfileids'][format] - keys = [s['k'] for s in config['data'][0]['segs'][format]] - # segs is usually a dictionary, but an empty *list* if an error occured. - except (UnicodeDecodeError, ValueError, KeyError): - raise ExtractorError(u'Unable to extract info section') + fileid = config['data'][0]['streamfileids'][format] + keys = [s['k'] for s in config['data'][0]['segs'][format]] + # segs is usually a dictionary, but an empty *list* if an error occured. 
files_info=[] sid = self._gen_sid() @@ -106,9 +108,8 @@ class YoukuIE(InfoExtractor): #column 8,9 of fileid represent the segment number #fileid[7:9] should be changed for index, key in enumerate(keys): - temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:]) - download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) + download_url = 'http://k.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key) info = { 'id': '%s_part%02d' % (video_id, index), From 54e9a4af951f26edd7719f1a1b56e0a92d2791ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 25 Sep 2014 20:33:11 +0700 Subject: [PATCH 068/652] [wat] Skip test --- youtube_dl/extractor/wat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 268e2f618..bf9e40bad 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -40,6 +40,7 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', 'duration': 2910, }, + 'skip': "Ce contenu n'est pas disponible pour l'instant.", }, ] From fbd3162e4918a2e1321ebdcec47ac84a8b121fbe Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Thu, 25 Sep 2014 20:48:54 +0700 Subject: [PATCH 069/652] [vube] Add DMCA notice --- youtube_dl/extractor/vube.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index 2544c24bd..bcca4897a 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, compat_str, + ExtractorError, ) @@ -102,6 +103,11 @@ class VubeIE(InfoExtractor): self._sort_formats(formats) + if not formats and video.get('vst') == 'dmca': + raise ExtractorError( + 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.', + expected=True) + title = video['title'] description = video.get('description') thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') From 9a0d98bb401a809eaed68623a8534b3874d079e8 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Thu, 25 Sep 2014 20:57:18 +0700 Subject: [PATCH 070/652] [vube] Update tests --- youtube_dl/extractor/vube.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index bcca4897a..1b2f731e9 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -17,6 +17,24 @@ class VubeIE(InfoExtractor): _TESTS = [ { + 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s', + 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42', + 'info_dict': { + 'id': 'Y8NUZ69Tf7', + 'ext': 'mp4', + 'title': 'Best Drummer Ever [HD]', + 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'William', + 'timestamp': 1406876915, + 'upload_date': '20140801', + 'duration': 258.051, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], + }, + }, { 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', 'md5': 'db7aba89d4603dadd627e9d1973946fe', 'info_dict': { @@ -33,7 +51,8 @@ class VubeIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'categories': ['pop', 
'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], - } + }, + 'skip': 'Removed due to DMCA', }, { 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', @@ -52,7 +71,8 @@ class VubeIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'categories': ['seraina', 'jessica', 'krewella', 'alive'], - } + }, + 'skip': 'Removed due to DMCA', }, { 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', 'md5': '0584fc13b50f887127d9d1007589d27f', @@ -70,7 +90,8 @@ class VubeIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], - } + }, + 'skip': 'Removed due to DMCA', } ] From 6a5af6acb9131d702b0d206242053b202440dbb9 Mon Sep 17 00:00:00 2001 From: Mats <d912e3@gmail.com> Date: Thu, 25 Sep 2014 16:25:53 +0200 Subject: [PATCH 071/652] [golem] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/golem.py | 131 +++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 youtube_dl/extractor/golem.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1f1fc0eb2..71fe38ca0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -135,6 +135,7 @@ from .gametrailers import GametrailersIE from .gdcvault import GDCVaultIE from .generic import GenericIE from .godtube import GodTubeIE +from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py new file mode 100644 index 000000000..afb620b1c --- /dev/null +++ b/youtube_dl/extractor/golem.py @@ -0,0 +1,131 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import compat_urlparse + + +class GolemIE(InfoExtractor): + _VALID_URL = r'^https?://video\.golem\.de/.+?/(?P<id>.+?)/' + _TEST = { + 'url': 'http://video.golem.de/handy/14095/iphone-6-und-6-plus-test.html', + 'md5': 'c1a2c0a3c863319651c7c992c5ee29bf', + 'info_dict': { + 'id': '14095', + 'format_id': 'high', + 'ext': 'mp4', + 'title': 'iPhone 6 und 6 Plus - Test', + 'duration': 300, + 'filesize': 65309548, + } + } + + _CONFIG = 'https://video.golem.de/xml/{}.xml' + _PREFIX = 'http://video.golem.de' + + def _warn(self, fmt, *args): + self.report_warning(fmt.format(*args), self._id) + + def _extract_format(self, elem): + format_id = elem.tag + + url = elem.findtext('./url') + if url == '': + self._warn("{}: url: empty, skipping", format_id) + return None + + fmt = { + 'format_id': format_id, + 'url': compat_urlparse.urljoin(self._PREFIX, url) + } + + try: + _, ext = elem.findtext('./filename', '').rsplit('.', 1) + except ValueError: + self._warn('{}: ext: missing extension', format_id) + else: + fmt['ext'] = ext + + filesize = elem.findtext('./filesize') + if filesize is not None: + try: + fmt['filesize'] = int(filesize) + except ValueError as e: + self._warn('{}: filesize: {}', format_id, e) + + width = elem.get('width') + if width is not None: + try: + fmt['width'] = int(width) + except ValueError as e: + self._warn('{}: width: {}', format_id, e) + + height = elem.get('height') + if height is not None: + try: + fmt['height'] = int(height) + except ValueError as e: + self._warn('{}: height: {}', format_id, e) + + return fmt + + def _extract_thumbnail(self, elem): + url = 
elem.findtext('./url') + if url == '': + return None + thumb = { + 'url': compat_urlparse.urljoin(self._PREFIX, url) + } + + width = elem.get('width') + if width is not None: + try: + thumb['width'] = int(width) + except ValueError as e: + self._warn('thumbnail: width: {}', e) + + height = elem.get('height') + if height is not None: + try: + thumb['height'] = int(height) + except ValueError as e: + self._warn('thumbnail: height: {}', e) + + return thumb + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + self._id = mobj.group('id') + + config = self._download_xml(self._CONFIG.format(self._id), self._id) + + info = { + 'id': self._id, + 'title': config.findtext('./title', 'golem') + } + + formats = [] + for e in config.findall('./*[url]'): + fmt = self._extract_format(e) + if fmt is not None: + formats.append(fmt) + self._sort_formats(formats) + info['formats'] = formats + + thumbnails = [] + for e in config.findall('.//teaser[url]'): + thumb = self._extract_thumbnail(e) + if thumb is not None: + thumbnails.append(thumb) + info['thumbnails'] = thumbnails + + playtime = config.findtext('./playtime') + if playtime is not None: + try: + info['duration'] = round(float(playtime)) + except ValueError as e: + self._warn('duration: {}', e) + + return info From 11b3ce85097430e1d26ddff0f51aa895c9d5af43 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Thu, 25 Sep 2014 17:57:38 +0300 Subject: [PATCH 072/652] [crunchyroll] Allow to list subtitles (fixes #3805) --- youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4903764f7..f99888ecc 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -9,7 +9,7 @@ import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor -from .common import InfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..utils import ( ExtractorError, compat_urllib_parse, @@ -26,7 +26,7 @@ from ..aes import ( ) -class CrunchyrollIE(InfoExtractor): +class CrunchyrollIE(SubtitlesInfoExtractor): _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:[^/]*/[^/?&]*?|media/\?id=)(?P<video_id>[0-9]+))(?:[/?&]|$)' _TEST = { 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', @@ -271,6 +271,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: subtitles[lang_code] = self._convert_subtitles_to_srt(subtitle) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, subtitles) + return + return { 'id': video_id, 'title': video_title, From 8e6f8051f084f445015140f1f88ac770f3c0f43d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 27 Sep 2014 10:53:02 +0200 Subject: [PATCH 073/652] [vbox7] Don't set the extension to 'flv' (fixes #3836) --- youtube_dl/extractor/vbox7.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index df115d251..ebd64f0f5 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -19,7 +19,7 @@ class Vbox7IE(InfoExtractor): 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { 'id': '249bb972c2', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Смях! 
Чудо - чист за секунди - Скрита камера', }, } @@ -50,7 +50,6 @@ class Vbox7IE(InfoExtractor): return { 'id': video_id, 'url': final_url, - 'ext': 'flv', 'title': title, 'thumbnail': thumbnail_url, }
From 497339fa0e633c8b1dcebf3f70670f6d96ee2d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 27 Sep 2014 22:29:27 +0700 Subject: [PATCH 074/652] [anysex] Fix extraction --- youtube_dl/extractor/anysex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/youtube_dl/extractor/anysex.py b/youtube_dl/extractor/anysex.py index bc64423a3..ad86d6e58 100644 --- a/youtube_dl/extractor/anysex.py +++ b/youtube_dl/extractor/anysex.py @@ -35,7 +35,7 @@ class AnySexIE(InfoExtractor): title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title') description = self._html_search_regex( - r'<div class="description">([^<]+)</div>', webpage, 'description', fatal=False) + r'<div class="description"[^>]*>([^<]+)</div>', webpage, 'description', fatal=False) thumbnail = self._html_search_regex( r'preview_url\s*:\s*\'(.*?)\'', webpage, 'thumbnail', fatal=False) @@ -43,7 +43,7 @@ class AnySexIE(InfoExtractor):
r'<a href="http://anysex\.com/categories/[^"]+" title="[^"]+">([^<]+)</a>', webpage) duration = parse_duration(self._search_regex( - r'<b>Duration:</b> (\d+:\d+)', webpage, 'duration', fatal=False)) + r'<b>Duration:</b> (?:<q itemprop="duration">)?(\d+:\d+)', webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( r'<b>Views:</b> (\d+)', webpage, 'view count', fatal=False))
From 2f9e8776df664e21aee18b05c468a56b03fe4417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2%80%A4?= <dstftw@gmail.com> Date: Sat, 27 Sep 2014 22:36:53 +0700 Subject: [PATCH 075/652] [extremetube] Fix extraction --- youtube_dl/extractor/extremetube.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 14a196ffc..aacbf1414 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -7,6 +7,7 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urllib_parse, + str_to_int, ) @@ -20,6 +21,7 @@ class ExtremeTubeIE(InfoExtractor): 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', + 'view_count': int, 'age_limit': 18, } }, { @@ -39,8 +41,12 @@ class ExtremeTubeIE(InfoExtractor): video_title = self._html_search_regex(
r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( - r'>Posted by:(?=<)(?:\s|<[^>]*>)*(.+?)\|', webpage, 'uploader', - fatal=False) + r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', + webpage, 'uploader', fatal=False) + view_count = str_to_int(self._html_search_regex( + r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', + webpage, 'view count', fatal=False)) + video_url = compat_urllib_parse.unquote(self._html_search_regex( r'video_url=(.+?)&amp;', webpage, 'video_url')) path = compat_urllib_parse_urlparse(video_url).path @@ -51,6 +57,7 @@ 'id': video_id, 'title': video_title, 'uploader': uploader, + 'view_count': view_count, 'url': video_url, 'format': format, 'format_id': format,
From 2a1325fdde5c88fc052710b3c42fcc0d73153901 Mon Sep 17 00:00:00 2001 From: net Date: Sat, 27 Sep 2014 20:11:22 +0300 Subject: [PATCH 076/652] [ynet] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/ynet.py | 47 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/ynet.py
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1f1fc0eb2..944e356ae 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -450,6 +450,7 @@ from .yahoo import ( YahooNewsIE, YahooSearchIE, ) +from .ynet import YnetIE from .youjizz import YouJizzIE from .youku import YoukuIE from .youporn import YouPornIE diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py new file mode 100644 index 000000000..94d253679 --- /dev/null +++ b/youtube_dl/extractor/ynet.py @@ -0,0 +1,47 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import base64 +import json + +from .common import InfoExtractor +from youtube_dl.utils import compat_urllib_parse_urlparse, compat_urllib_parse + +class YnetIE(InfoExtractor): + _VALID_URL = r'http://.*ynet\.co\.il/.*/0,7340,(?P<id>L(-[0-9]+)+),00\.html' + _TEST = { + 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', + 'info_dict': { + 'id': 'L-11659-99244', + 'ext': 'flv', + 'title': 'md5:3dba12d2837ee2ad9652cc64af652b16', + 'thumbnail': 'http://hot.ynet.co.il/PicServer4/2014/09/23/5606015/AMERICAN_COMMUNE1_T.jpg', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + id = mobj.group('id') + + webpage = self._download_webpage(url, id) + + content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage).decode('utf-8')) + + player_url = re.match('(http.*\.swf)\?'
,content).group(1) + + config = json.loads(re.match('.*config\=(.*)' ,content).group(1)) + + f4m_url = config['clip']['url'] + + title = re.sub(': Video$', '', self._og_search_title(webpage)) + + return { + 'id': id, + 'title': title, + 'formats': self._extract_f4m_formats(f4m_url, id), + 'thumbnail': self._og_search_thumbnail(webpage), + 'player_url': player_url, + } + From b66745288e50cff42ff711e63242b5d97e80cd4f Mon Sep 17 00:00:00 2001 From: net Date: Sat, 27 Sep 2014 20:21:46 +0300 Subject: [PATCH 077/652] [sport5] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/sport5.py | 70 ++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 youtube_dl/extractor/sport5.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1f1fc0eb2..c3a4d3c9a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -340,6 +340,7 @@ from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .sportdeutschland import SportDeutschlandIE +from .sport5 import Sport5IE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py new file mode 100644 index 000000000..9a4e39a43 --- /dev/null +++ b/youtube_dl/extractor/sport5.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from youtube_dl.utils import compat_str, compat_urlretrieve + + + +class Sport5IE(InfoExtractor): + _VALID_URL = r'http://.*sport5\.co\.il' + _TESTS = [{ + 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', + 'info_dict': { + 'id': 's5-Y59xx1-GUh2', + 'ext': 'mp4', + 'title': 'md5:4a2a5eba7e7dc88fdc446cbca8a41c79', + } + }, { + 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', + 'info_dict': { + 'id': 's5-SiXxx1-hKh2', + 'ext': 'mp4', + 'title': 'md5:5cb1c6bfc0f16086e59f6683013f8e02', + } + } + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + webpage = self._download_webpage(url, '') + + media_id = self._html_search_regex('clipId=(s5-\w+-\w+)', webpage, 'media id') + + xml = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % media_id, + media_id, 'Downloading media XML') + + title = xml.find('./Title').text + duration = xml.find('./Duration').text + description = xml.find('./Description').text + thumbnail = xml.find('./PosterLinks/PosterIMG').text + player_url = xml.find('./PlaybackLinks/PlayerUrl').text + file_els = xml.findall('./PlaybackLinks/FileURL') + + formats = [] + + for file_el in file_els: + bitrate = file_el.attrib.get('bitrate') + width = int(file_el.attrib.get('width')) + height = int(file_el.attrib.get('height')) + formats.append({ + 'url': compat_str(file_el.text), + 'ext': 'mp4', + 'height': height, + 'width': width + }) + + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + 'player_url': player_url, + } \ No newline at end of file From 0155549d6cec6f49279ebe4a5a73cf6dcc6716fe Mon Sep 17 00:00:00 2001 From: Mats Date: Sat, 27 Sep 2014 19:28:01 +0200 Subject: [PATCH 078/652] [heise] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/heise.py | 120 +++++++++++++++++++++++++++++++ 2 files changed, 121 
insertions(+) create mode 100644 youtube_dl/extractor/heise.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 1f1fc0eb2..d0417a1f2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -141,6 +141,7 @@ from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .grooveshark import GroovesharkIE from .hark import HarkIE +from .heise import HeiseIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hornbunny import HornBunnyIE diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py new file mode 100644 index 000000000..b3cb10fde --- /dev/null +++ b/youtube_dl/extractor/heise.py @@ -0,0 +1,120 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + compat_urllib_parse, + get_meta_content, + parse_iso8601, +) + + +class HeiseIE(InfoExtractor): + _VALID_URL = ( + r'^https?://(?:www\.)?heise\.de/video/artikel/' + + r'.+?(?P[0-9]+)\.html$' + ) + _TEST = { + 'url': ( + 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' + + 'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' + ), + 'md5': 'ffed432483e922e88545ad9f2f15d30e', + 'info_dict': { + 'id': '2404147', + 'ext': 'mp4', + 'title': ( + "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " + + "Peilsender Smartphone" + ), + 'format_id': 'mp4_720', + 'timestamp': 1411812600, + 'upload_date': '20140927', + } + } + + _CONFIG = ( + r'".+?\?sequenz=(?P.+?)&container=(?P.+?)' + + r'(?:&hd=(?P.+?))?(?:&signature=(?P.+?))?&callback=\?"' + ) + _PREFIX = 'http://www.heise.de/videout/info?' + + def _warn(self, fmt, *args): + self.report_warning(fmt.format(*args), self._id) + + def _parse_config_url(self, html): + m = re.search(self._CONFIG, html) + if not m: + raise ExtractorError('No config found') + + qs = compat_urllib_parse.urlencode(dict((k, v) for k, v + in m.groupdict().items() + if v is not None)) + return self._PREFIX + qs + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + self._id = mobj.group('id') + + html = self._download_webpage(url, self._id) + config = self._download_json(self._parse_config_url(html), self._id) + + info = { + 'id': self._id + } + + title = get_meta_content('fulltitle', html) + if title: + info['title'] = title + elif config.get('title'): + info['title'] = config['title'] + else: + self._warn('title: not found') + info['title'] = 'heise' + + if (not config.get('formats') or + not hasattr(config['formats'], 'items')): + raise ExtractorError('No formats found') + + formats = [] + for t, rs in config['formats'].items(): + if not rs or not hasattr(rs, 'items'): + self._warn('formats: {0}: no resolutions', t) + continue + + for res, obj in rs.items(): + format_id = '{0}_{1}'.format(t, res) + + if (not obj or not obj.get('url') or + not isinstance(obj['url'], str)): + self._warn('formats: {0}: no url', format_id) + continue + + fmt = { + 'url': obj['url'], + 'format_id': format_id + } + try: + fmt['height'] = int(res) + except ValueError as e: + self._warn('formats: {0}: height: {1}', t, e) + + formats.append(fmt) + + self._sort_formats(formats) + info['formats'] = formats + + if config.get('poster') and isinstance(config['poster'], str): + info['thumbnail'] = config['poster'] + + date = get_meta_content('date', html) + if date and isinstance(date, str): + try: + info['timestamp'] = parse_iso8601(date) + except ValueError as e: + self._warn('timestamp: {0}', e) + + 
return info From 70752ccefd2dcb54d131644aea38c324c81ff168 Mon Sep 17 00:00:00 2001 From: Mats Date: Sat, 27 Sep 2014 19:35:55 +0200 Subject: [PATCH 079/652] [golem] Don't omit positional argument specifiers Required by Python 2.6. --- youtube_dl/extractor/golem.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index afb620b1c..6a64b5d95 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -22,7 +22,7 @@ class GolemIE(InfoExtractor): } } - _CONFIG = 'https://video.golem.de/xml/{}.xml' + _CONFIG = 'https://video.golem.de/xml/{0}.xml' _PREFIX = 'http://video.golem.de' def _warn(self, fmt, *args): @@ -33,7 +33,7 @@ class GolemIE(InfoExtractor): url = elem.findtext('./url') if url == '': - self._warn("{}: url: empty, skipping", format_id) + self._warn("{0}: url: empty, skipping", format_id) return None fmt = { @@ -44,7 +44,7 @@ class GolemIE(InfoExtractor): try: _, ext = elem.findtext('./filename', '').rsplit('.', 1) except ValueError: - self._warn('{}: ext: missing extension', format_id) + self._warn('{0}: ext: missing extension', format_id) else: fmt['ext'] = ext @@ -53,21 +53,21 @@ class GolemIE(InfoExtractor): try: fmt['filesize'] = int(filesize) except ValueError as e: - self._warn('{}: filesize: {}', format_id, e) + self._warn('{0}: filesize: {1}', format_id, e) width = elem.get('width') if width is not None: try: fmt['width'] = int(width) except ValueError as e: - self._warn('{}: width: {}', format_id, e) + self._warn('{0}: width: {1}', format_id, e) height = elem.get('height') if height is not None: try: fmt['height'] = int(height) except ValueError as e: - self._warn('{}: height: {}', format_id, e) + self._warn('{0}: height: {1}', format_id, e) return fmt @@ -84,14 +84,14 @@ class GolemIE(InfoExtractor): try: thumb['width'] = int(width) except ValueError as e: - self._warn('thumbnail: width: {}', e) + self._warn('thumbnail: width: {0}', e) height = elem.get('height') if height is not None: try: thumb['height'] = int(height) except ValueError as e: - self._warn('thumbnail: height: {}', e) + self._warn('thumbnail: height: {0}', e) return thumb @@ -126,6 +126,6 @@ class GolemIE(InfoExtractor): try: info['duration'] = round(float(playtime)) except ValueError as e: - self._warn('duration: {}', e) + self._warn('duration: {0}', e) return info From 68b09730461de20395cee9427dc469fa9edc4022 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 02:07:42 +0700 Subject: [PATCH 080/652] [YoutubeDL] Expect all kind of strings in urlopen Now it doesn't fail if req is python2's str --- youtube_dl/YoutubeDL.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a1713dc5a..b485dbdf1 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1250,12 +1250,13 @@ class YoutubeDL(object): # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) # To work around aforementioned issue we will replace request's original URL with # percent-encoded one - url = req if isinstance(req, compat_str) else req.get_full_url() + req_is_string = isinstance(req, basestring) + url = req if req_is_string else req.get_full_url() url_escaped = escape_url(url) # Substitute URL if any change after escaping if url != url_escaped: - if isinstance(req, compat_str): + if req_is_string: req = url_escaped else: req = compat_urllib_request.Request( From 
7b7518124ee433484b485502671e011017bc1897 Mon Sep 17 00:00:00 2001 From: Mats Date: Sat, 27 Sep 2014 21:12:23 +0200 Subject: [PATCH 081/652] [heise] Don't check string type Before Python 3 could be unicode, so don't check at all. --- youtube_dl/extractor/heise.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index b3cb10fde..73c953181 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -88,8 +88,7 @@ class HeiseIE(InfoExtractor): for res, obj in rs.items(): format_id = '{0}_{1}'.format(t, res) - if (not obj or not obj.get('url') or - not isinstance(obj['url'], str)): + if not obj or not obj.get('url'): self._warn('formats: {0}: no url', format_id) continue @@ -107,11 +106,11 @@ class HeiseIE(InfoExtractor): self._sort_formats(formats) info['formats'] = formats - if config.get('poster') and isinstance(config['poster'], str): + if config.get('poster'): info['thumbnail'] = config['poster'] date = get_meta_content('date', html) - if date and isinstance(date, str): + if date: try: info['timestamp'] = parse_iso8601(date) except ValueError as e: From 0b75c2a88ba56a84322db6cc1a298d7e52b44b2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 02:31:14 +0700 Subject: [PATCH 082/652] [sport5] Capture error message and improve --- youtube_dl/extractor/sport5.py | 88 +++++++++++++++++++++------------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/sport5.py b/youtube_dl/extractor/sport5.py index 9a4e39a43..3f680bfc6 100644 --- a/youtube_dl/extractor/sport5.py +++ b/youtube_dl/extractor/sport5.py @@ -4,67 +4,89 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from youtube_dl.utils import compat_str, compat_urlretrieve - +from ..utils import ExtractorError class Sport5IE(InfoExtractor): - _VALID_URL = r'http://.*sport5\.co\.il' - _TESTS = [{ + _VALID_URL = r'http://(?:www|vod)?\.sport5\.co\.il/.*\b(?:Vi|docID)=(?P\d+)' + _TESTS = [ + { 'url': 'http://vod.sport5.co.il/?Vc=147&Vi=176331&Page=1', 'info_dict': { 'id': 's5-Y59xx1-GUh2', 'ext': 'mp4', - 'title': 'md5:4a2a5eba7e7dc88fdc446cbca8a41c79', - } + 'title': 'ולנסיה-קורדובה 0:3', + 'description': 'אלקאסר, גאייה ופגולי סידרו לקבוצה של נונו ניצחון על קורדובה ואת המקום הראשון בליגה', + 'duration': 228, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', }, { 'url': 'http://www.sport5.co.il/articles.aspx?FolderID=3075&docID=176372&lang=HE', 'info_dict': { 'id': 's5-SiXxx1-hKh2', 'ext': 'mp4', - 'title': 'md5:5cb1c6bfc0f16086e59f6683013f8e02', - } + 'title': 'GOALS_CELTIC_270914.mp4', + 'description': '', + 'duration': 87, + 'categories': list, + }, + 'skip': 'Blocked outside of Israel', } ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + media_id = mobj.group('id') - webpage = self._download_webpage(url, '') + webpage = self._download_webpage(url, media_id) - media_id = self._html_search_regex('clipId=(s5-\w+-\w+)', webpage, 'media id') + video_id = self._html_search_regex('clipId=([\w-]+)', webpage, 'video id') - xml = self._download_xml( - 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % media_id, - media_id, 'Downloading media XML') + metadata = self._download_xml( + 'http://sport5-metadata-rr-d.nsacdn.com/vod/vod/%s/HDS/metadata.xml' % video_id, + video_id) - title = xml.find('./Title').text - duration = xml.find('./Duration').text - description = 
xml.find('./Description').text - thumbnail = xml.find('./PosterLinks/PosterIMG').text - player_url = xml.find('./PlaybackLinks/PlayerUrl').text - file_els = xml.findall('./PlaybackLinks/FileURL') + error = metadata.find('./Error') + if error is not None: + raise ExtractorError( + '%s returned error: %s - %s' % ( + self.IE_NAME, + error.find('./Name').text, + error.find('./Description').text), + expected=True) - formats = [] + title = metadata.find('./Title').text + description = metadata.find('./Description').text + duration = int(metadata.find('./Duration').text) - for file_el in file_els: - bitrate = file_el.attrib.get('bitrate') - width = int(file_el.attrib.get('width')) - height = int(file_el.attrib.get('height')) - formats.append({ - 'url': compat_str(file_el.text), - 'ext': 'mp4', - 'height': height, - 'width': width - }) + posters_el = metadata.find('./PosterLinks') + thumbnails = [{ + 'url': thumbnail.text, + 'width': int(thumbnail.get('width')), + 'height': int(thumbnail.get('height')), + } for thumbnail in posters_el.findall('./PosterIMG')] if posters_el is not None else [] + categories_el = metadata.find('./Categories') + categories = [ + cat.get('name') for cat in categories_el.findall('./Category') + ] if categories_el is not None else [] + + formats = [{ + 'url': fmt.text, + 'ext': 'mp4', + 'vbr': int(fmt.get('bitrate')), + 'width': int(fmt.get('width')), + 'height': int(fmt.get('height')), + } for fmt in metadata.findall('./PlaybackLinks/FileURL')] self._sort_formats(formats) return { - 'id': media_id, + 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, + 'thumbnails': thumbnails, 'duration': duration, + 'categories': categories, 'formats': formats, - 'player_url': player_url, } \ No newline at end of file From f776d8f6081b305ba7ccc1bda323aa510a01db7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 02:35:46 +0700 Subject: [PATCH 083/652] [sport5] Keep alphanumeric order --- youtube_dl/extractor/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index c3a4d3c9a..5a14540a3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -339,8 +339,8 @@ from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE -from .sportdeutschland import SportDeutschlandIE from .sport5 import Sport5IE +from .sportdeutschland import SportDeutschlandIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE From ee0d90707a38537355bab8527edd9a42d6514aa9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 02:48:41 +0700 Subject: [PATCH 084/652] [YoutubeDL] Fix string check for python3 --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index b485dbdf1..4a9610355 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1250,7 +1250,7 @@ class YoutubeDL(object): # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) # To work around aforementioned issue we will replace request's original URL with # percent-encoded one - req_is_string = isinstance(req, basestring) + req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str) url = req if req_is_string else 
req.get_full_url() url_escaped = escape_url(url) From c6641823238ac70091520fe9b4b02ec3d41cb1a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 28 Sep 2014 03:26:38 +0700 Subject: [PATCH 085/652] [ynet] Remove unused stuff, simplify and improve --- youtube_dl/extractor/ynet.py | 63 ++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 94d253679..66d53962a 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -2,46 +2,53 @@ from __future__ import unicode_literals import re -import base64 import json from .common import InfoExtractor -from youtube_dl.utils import compat_urllib_parse_urlparse, compat_urllib_parse +from ..utils import compat_urllib_parse + class YnetIE(InfoExtractor): - _VALID_URL = r'http://.*ynet\.co\.il/.*/0,7340,(?PL(-[0-9]+)+),00\.html' - _TEST = { - 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', - 'info_dict': { - 'id': 'L-11659-99244', - 'ext': 'flv', - 'title': 'md5:3dba12d2837ee2ad9652cc64af652b16', - 'thumbnail': 'http://hot.ynet.co.il/PicServer4/2014/09/23/5606015/AMERICAN_COMMUNE1_T.jpg', + _VALID_URL = r'http://.*ynet\.co\.il/.*/0,7340,(?PL(?:-[0-9]+)+),00\.html' + _TESTS = [ + { + 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', + 'md5': '002b44ee2f33d50363a1c153bed524cf', + 'info_dict': { + 'id': 'L-11659-99244', + 'ext': 'flv', + 'title': 'איש לא יודע מאיפה באנו', + 'thumbnail': 're:^https?://.*\.jpg', + } + }, { + 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', + 'md5': '6455046ae1b48cf7e2b7cae285e53a16', + 'info_dict': { + 'id': 'L-8859-84418', + 'ext': 'flv', + 'title': "צפו: הנשיקה הלוהטת של תורגי' ויוליה פלוטקין", + 'thumbnail': 're:^https?://.*\.jpg', + } } - } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') - id = mobj.group('id') - - webpage = self._download_webpage(url, id) + webpage = self._download_webpage(url, video_id) - content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage).decode('utf-8')) - - player_url = re.match('(http.*\.swf)\?' 
,content).group(1) - - config = json.loads(re.match('.*config\=(.*)' ,content).group(1)) - - f4m_url = config['clip']['url'] - - title = re.sub(': Video$', '', self._og_search_title(webpage)) + content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) + config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config')) + f4m_url = config['clip']['url'] + title = self._og_search_title(webpage) + m = re.search(r'ynet - HOT -- (["\']+)(?P.+?)\1', title) + if m: + title = m.group('title') return { - 'id': id, + 'id': video_id, 'title': title, - 'formats': self._extract_f4m_formats(f4m_url, id), + 'formats': self._extract_f4m_formats(f4m_url, video_id), 'thumbnail': self._og_search_thumbnail(webpage), - 'player_url': player_url, - } - + } \ No newline at end of file From a89435a7a8e0574239531bfeedc437ae14b13902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Sep 2014 03:30:41 +0700 Subject: [PATCH 086/652] [ynet] Improve _VALID_URL --- youtube_dl/extractor/ynet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 66d53962a..24872861a 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -9,7 +9,7 @@ from ..utils import compat_urllib_parse class YnetIE(InfoExtractor): - _VALID_URL = r'http://.*ynet\.co\.il/.*/0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' + _VALID_URL = r'http://(?:.+?\.)?ynet\.co\.il/(?:.+?/)?0,7340,(?P<id>L(?:-[0-9]+)+),00\.html' _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', From 5e43e3803c462d7a0f5ac85f8b54ab24f271cb0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Sep 2014 03:45:15 +0700 Subject: [PATCH 087/652] Credit @lenaten for ynet (#3840) and sport5 (#3841) --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 20d7a57ce..7f2b4dfcc 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -78,6 +78,7 @@ __authors__ = ( 'Hari Padmanaban', 'Carlos Ramos', '5moufl', + 'lenaten', ) __license__ = 'Public Domain' From c95eeb7b80e5007259df260b64874b675a802431 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 08:49:03 +0200 Subject: [PATCH 088/652] [eitb] Modernize --- youtube_dl/extractor/eitb.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 4ba323148..2cba82532 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,4 +1,6 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -7,20 +9,20 @@ from ..utils import ExtractorError class EitbIE(InfoExtractor): - IE_NAME = u'eitb.tv' + IE_NAME = 'eitb.tv' _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' _TEST = { - u'add_ie': ['Brightcove'], - u'url': u'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', - u'md5': u'edf4436247185adee3ea18ce64c47998', - u'info_dict': { - u'id': u'2743577154001', - u'ext': u'mp4', - u'title': u'60 minutos (Lasa y Zabala, 30 años)', + 'add_ie': ['Brightcove'], + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'md5': 
'edf4436247185adee3ea18ce64c47998', + 'info_dict': { + 'id': '2743577154001', + 'ext': 'mp4', + 'title': '60 minutos (Lasa y Zabala, 30 años)', # All videos from eitb has this description in the brightcove info - u'description': u'.', - u'uploader': u'Euskal Telebista', + 'description': '.', + 'uploader': 'Euskal Telebista', }, } @@ -30,7 +32,7 @@ class EitbIE(InfoExtractor): webpage = self._download_webpage(url, chapter_id) bc_url = BrightcoveIE._extract_brightcove_url(webpage) if bc_url is None: - raise ExtractorError(u'Could not extract the Brightcove url') + raise ExtractorError('Could not extract the Brightcove url') # The BrightcoveExperience object doesn't contain the video id, we set # it manually bc_url += '&%40videoPlayer={0}'.format(chapter_id) From f4b1c7adb81555fde0dff390b48e4139438b4071 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 08:53:52 +0200 Subject: [PATCH 089/652] [muenchentv] Move live title generation to common --- youtube_dl/extractor/common.py | 7 +++++++ youtube_dl/extractor/muenchentv.py | 5 +---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 60cab6f4e..403791e6b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import base64 +import datetime import hashlib import json import netrc @@ -705,6 +706,12 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + def _live_title(self, name): + """ Generate the title for a live video """ + now = datetime.datetime.now() + now_str = now.strftime("%Y-%m-%d %H:%M") + return name + ' ' + now_str + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index 3a938861b..7cb6749be 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import json from .common import InfoExtractor @@ -33,9 +32,7 @@ class MuenchenTVIE(InfoExtractor): display_id = 'live' webpage = self._download_webpage(url, display_id) - now = datetime.datetime.now() - now_str = now.strftime("%Y-%m-%d %H:%M") - title = self._og_search_title(webpage) + ' ' + now_str + title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( r'(?s)\nplaylist:\s*(\[.*?}\]),related:', From ed9266db90023500e687aa634b55e11742c2e18c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 09:31:58 +0200 Subject: [PATCH 090/652] [common] Add new helper function _match_id --- youtube_dl/extractor/abc.py | 3 +-- youtube_dl/extractor/common.py | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 7d89f44ee..69f89320c 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -22,8 +22,7 @@ class ABCIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) urls_info_json = self._search_regex( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 403791e6b..8d6a6f601 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -165,6 +165,14 @@ class InfoExtractor(object): cls._VALID_URL_RE = 
re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) is not None + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') + @classmethod def working(cls): """Getter method for _WORKING.""" From 394599f422b11f54efa78123296867efa45a1a2c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 09:48:51 +0200 Subject: [PATCH 091/652] [oktoberfesttv] Add new extractor (Fixes #3845) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/oktoberfesttv.py | 47 +++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 youtube_dl/extractor/oktoberfesttv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 079221567..629280215 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -261,6 +261,7 @@ from .nrk import ( from .ntv import NTVIE from .nytimes import NYTimesIE from .nuvid import NuvidIE +from .oktoberfesttv import OktoberfestTVIE from .ooyala import OoyalaIE from .orf import ( ORFTVthekIE, diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py new file mode 100644 index 000000000..4a41c0542 --- /dev/null +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -0,0 +1,47 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OktoberfestTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P<id>[^/?#]+)' + + _TEST = { + 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', + 'info_dict': { + 'id': 'hb-zelt', + 'ext': 'mp4', + 'title': 're:^Live-Kamera: Hofbräuzelt [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 're:^https?://.*\.jpg$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._live_title(self._html_search_regex( + r'<h1><strong>.*?</strong>(.*?)</h1>', webpage, 'title')) + + clip = self._search_regex( + r"clip:\s*\{\s*url:\s*'([^']+)'", webpage, 'clip') + ncurl = self._search_regex( + r"netConnectionUrl:\s*'([^']+)'", webpage, 'rtmp base') + video_url = ncurl + clip + thumbnail = self._search_regex( + r"canvas:\s*\{\s*backgroundImage:\s*'url\(([^)]+)\)'", webpage, + 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'ext': 'mp4', + 'is_live': True, + 'thumbnail': thumbnail, + } From 88fbe4c2ccdae7f917b6c9a2655f0878b6e4308c Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 09:49:42 +0200 Subject: [PATCH 092/652] release 2014.09.28 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c17701d6a..e62bef2cf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.25' +__version__ = '2014.09.28' From b14f3a4c1da00cbee8775904c24c4d0547018ae0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:34:55 +0200 Subject: [PATCH 093/652] [golem] Simplify (#3828) --- youtube_dl/extractor/common.py | 23 ++++++ youtube_dl/extractor/golem.py | 124 +++++++++------------------------ 2 files changed, 56 insertions(+), 91 deletions(-) diff --git 
a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8d6a6f601..f43a0a569 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -22,6 +22,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, @@ -720,6 +721,28 @@ class InfoExtractor(object): now_str = now.strftime("%Y-%m-%d %H:%M") return name + ' ' + now_str + def _int(self, v, name, fatal=False, **kwargs): + res = int_or_none(v, **kwargs) + if 'get_attr' in kwargs: + print(getattr(v, kwargs['get_attr'])) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + + def _float(self, v, name, fatal=False, **kwargs): + res = float_or_none(v, **kwargs) + if res is None: + msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + if fatal: + raise ExtractorError(msg) + else: + self._downloader.report_warning(msg) + return res + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index 6a64b5d95..a237f19ee 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import compat_urlparse +from ..utils import ( + compat_urlparse, + determine_ext, +) class GolemIE(InfoExtractor): @@ -17,115 +20,54 @@ class GolemIE(InfoExtractor): 'format_id': 'high', 'ext': 'mp4', 'title': 'iPhone 6 und 6 Plus - Test', - 'duration': 300, + 'duration': 300.44, 'filesize': 65309548, } } - _CONFIG = 'https://video.golem.de/xml/{0}.xml' _PREFIX = 'http://video.golem.de' - def _warn(self, fmt, *args): - self.report_warning(fmt.format(*args), self._id) - - def _extract_format(self, elem): - format_id = elem.tag - - url = elem.findtext('./url') - if url == '': - self._warn("{0}: url: empty, skipping", format_id) - return None - - fmt = { - 'format_id': format_id, - 'url': compat_urlparse.urljoin(self._PREFIX, url) - } - - try: - _, ext = elem.findtext('./filename', '').rsplit('.', 1) - except ValueError: - self._warn('{0}: ext: missing extension', format_id) - else: - fmt['ext'] = ext - - filesize = elem.findtext('./filesize') - if filesize is not None: - try: - fmt['filesize'] = int(filesize) - except ValueError as e: - self._warn('{0}: filesize: {1}', format_id, e) - - width = elem.get('width') - if width is not None: - try: - fmt['width'] = int(width) - except ValueError as e: - self._warn('{0}: width: {1}', format_id, e) - - height = elem.get('height') - if height is not None: - try: - fmt['height'] = int(height) - except ValueError as e: - self._warn('{0}: height: {1}', format_id, e) - - return fmt - - def _extract_thumbnail(self, elem): - url = elem.findtext('./url') - if url == '': - return None - thumb = { - 'url': compat_urlparse.urljoin(self._PREFIX, url) - } - - width = elem.get('width') - if width is not None: - try: - thumb['width'] = int(width) - except ValueError as e: - self._warn('thumbnail: width: {0}', e) - - height = elem.get('height') - if height is not None: - try: - thumb['height'] = int(height) - except ValueError as e: - self._warn('thumbnail: height: {0}', e) - - return thumb - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - self._id = mobj.group('id') + video_id = self._match_id(url) - config = self._download_xml(self._CONFIG.format(self._id), 
self._id) + config = self._download_xml( + 'https://video.golem.de/xml/{0}.xml'.format(video_id), video_id) info = { - 'id': self._id, - 'title': config.findtext('./title', 'golem') + 'id': video_id, + 'title': config.findtext('./title', 'golem'), + 'duration': self._float(config.findtext('./playtime'), 'duration'), } formats = [] for e in config.findall('./*[url]'): - fmt = self._extract_format(e) - if fmt is not None: - formats.append(fmt) + url = e.findtext('./url') + if not url: + self._downloader.report_warning( + "{0}: url: empty, skipping".format(e.tag)) + continue + + formats.append({ + 'format_id': e.tag, + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'height': self._int(e.get('height'), 'height'), + 'width': self._int(e.get('width'), 'width'), + 'filesize': self._int(e.findtext('filesize'), 'filesize'), + 'ext': determine_ext(e.findtext('./filename')), + }) self._sort_formats(formats) info['formats'] = formats thumbnails = [] for e in config.findall('.//teaser[url]'): - thumb = self._extract_thumbnail(e) - if thumb is not None: - thumbnails.append(thumb) + url = e.findtext('./url') + if not url: + continue + thumbnails.append({ + 'url': compat_urlparse.urljoin(self._PREFIX, url), + 'width': self._int(e.get('width'), 'thumbnail width'), + 'height': self._int(e.get('height'), 'thumbnail height'), + }) info['thumbnails'] = thumbnails - playtime = config.findtext('./playtime') - if playtime is not None: - try: - info['duration'] = round(float(playtime)) - except ValueError as e: - self._warn('duration: {0}', e) - return info From 5a8b77551d930d4672159a015f553e64be111492 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:40:49 +0200 Subject: [PATCH 094/652] [heise] Simplify (#3842) --- youtube_dl/extractor/heise.py | 92 ++++++++++------------------------- 1 file changed, 26 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 73c953181..05d4efb8c 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -1,34 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - ExtractorError, - compat_urllib_parse, get_meta_content, parse_iso8601, ) class HeiseIE(InfoExtractor): - _VALID_URL = ( - r'^https?://(?:www\.)?heise\.de/video/artikel/' + - r'.+?(?P<id>[0-9]+)\.html$' - ) + _VALID_URL = r'''(?x) + https?://(?:www\.)?heise\.de/video/artikel/ + .+?(?P<id>[0-9]+)\.html(?:$|[?#]) + ''' _TEST = { 'url': ( - 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' + - 'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' + 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html' ), 'md5': 'ffed432483e922e88545ad9f2f15d30e', 'info_dict': { 'id': '2404147', 'ext': 'mp4', 'title': ( - "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " + - "Peilsender Smartphone" + "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone" ), 'format_id': 'mp4_720', 'timestamp': 1411812600, @@ -36,84 +30,50 @@ class HeiseIE(InfoExtractor): } } - _CONFIG = ( - r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' + - r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"' - ) - _PREFIX = 'http://www.heise.de/videout/info?' 
- - def _warn(self, fmt, *args): - self.report_warning(fmt.format(*args), self._id) - - def _parse_config_url(self, html): - m = re.search(self._CONFIG, html) - if not m: - raise ExtractorError('No config found') - - qs = compat_urllib_parse.urlencode(dict((k, v) for k, v - in m.groupdict().items() - if v is not None)) - return self._PREFIX + qs - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - self._id = mobj.group('id') + video_id = self._match_id(url) - html = self._download_webpage(url, self._id) - config = self._download_json(self._parse_config_url(html), self._id) + webpage = self._download_webpage(url, video_id) + json_url = self._search_regex( + r'json_url:\s*"([^"]+)"', webpage, 'json URL') + config = self._download_json(json_url, video_id) info = { - 'id': self._id + 'id': video_id, + 'thumbnail': config.get('poster'), + 'timestamp': parse_iso8601(get_meta_content('date', webpage)), } - title = get_meta_content('fulltitle', html) + title = get_meta_content('fulltitle', webpage) if title: info['title'] = title elif config.get('title'): info['title'] = config['title'] else: - self._warn('title: not found') - info['title'] = 'heise' - - if (not config.get('formats') or - not hasattr(config['formats'], 'items')): - raise ExtractorError('No formats found') + info['title'] = self._og_search_title(webpage) formats = [] for t, rs in config['formats'].items(): if not rs or not hasattr(rs, 'items'): - self._warn('formats: {0}: no resolutions', t) + self._downloader.report_warning( + 'formats: {0}: no resolutions'.format(t)) continue - for res, obj in rs.items(): - format_id = '{0}_{1}'.format(t, res) + for height_str, obj in rs.items(): + format_id = '{0}_{1}'.format(t, height_str) if not obj or not obj.get('url'): - self._warn('formats: {0}: no url', format_id) + self._downloader.report_warning( + 'formats: {0}: no url'.format(format_id)) continue - fmt = { + formats.append({ 'url': obj['url'], - 'format_id': format_id - } - try: - fmt['height'] = int(res) - except ValueError as e: - self._warn('formats: {0}: height: {1}', t, e) - - formats.append(fmt) + 'format_id': format_id, + 'height': self._int(height_str, 'height'), + }) self._sort_formats(formats) info['formats'] = formats - if config.get('poster'): - info['thumbnail'] = config['poster'] - - date = get_meta_content('date', html) - if date: - try: - info['timestamp'] = parse_iso8601(date) - except ValueError as e: - self._warn('timestamp: {0}', e) - return info From c121a75b368a0c75de7416cbb36d9b9f40a7f1a7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:49:12 +0200 Subject: [PATCH 095/652] [heise] Add support for description --- youtube_dl/extractor/heise.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 05d4efb8c..f97b1e085 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -27,6 +27,7 @@ class HeiseIE(InfoExtractor): 'format_id': 'mp4_720', 'timestamp': 1411812600, 'upload_date': '20140927', + 'description': 'In uplink-Episode 3.3 geht es darum, wie man sich von Cloud-Anbietern emanzipieren kann, worauf man beim Kauf einer Tastatur achten sollte und was Smartphones über uns verraten.', } } @@ -42,6 +43,7 @@ class HeiseIE(InfoExtractor): 'id': video_id, 'thumbnail': config.get('poster'), 'timestamp': parse_iso8601(get_meta_content('date', webpage)), + 'description': self._og_search_description(webpage), } title = get_meta_content('fulltitle', webpage) From 
c84178977268df1fb705bc8fd8cf3aa73158139a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:49:58 +0200 Subject: [PATCH 096/652] [muenchentv] Add thumbnail --- youtube_dl/extractor/muenchentv.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index 7cb6749be..c7f6beb9c 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -22,6 +22,7 @@ class MuenchenTVIE(InfoExtractor): 'ext': 'mp4', 'title': 're:^münchen.tv-Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'thumbnail': 're:^https?://.*\.jpg$' }, 'params': { 'skip_download': True, @@ -70,5 +71,6 @@ class MuenchenTVIE(InfoExtractor): 'title': title, 'formats': formats, 'is_live': True, + 'thumbnail': thumbnail, } From 51ee08c4bb044bb670c8a6b855ba48a91892d27b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:50:43 +0200 Subject: [PATCH 097/652] Remove unused imports --- youtube_dl/extractor/ard.py | 2 -- youtube_dl/extractor/golem.py | 2 -- youtube_dl/extractor/vevo.py | 1 - youtube_dl/extractor/youku.py | 1 - 4 files changed, 6 deletions(-) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 54cec1c2f..8de9c11ea 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -8,8 +8,6 @@ from ..utils import ( determine_ext, ExtractorError, qualities, - compat_urllib_parse_urlparse, - compat_urllib_parse, int_or_none, parse_duration, unified_strdate, diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index a237f19ee..bebfe8568 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -1,8 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( compat_urlparse, diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index ebab8b86c..5b1a3ec78 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -5,7 +5,6 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_HTTPError, compat_urllib_request, ExtractorError, ) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 07ed7cbd1..48d47a245 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals -import json import math import random import re From 38c4d41b744660463abbb333737e031d9c87243e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:55:12 +0200 Subject: [PATCH 098/652] [played] Simplify (#3798) --- youtube_dl/extractor/played.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index a396e62e5..db40da43b 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -14,7 +14,7 @@ from ..utils import ( class PlayedIE(InfoExtractor): IE_NAME = 'played.to' - _VALID_URL = r'https?://played\.to/(?P<id>[a-zA-Z0-9_-]+)' + _VALID_URL = r'https?://(?:www\.)?played\.to/(?P<id>[a-zA-Z0-9_-]+)' _TEST = { 'url': 'http://played.to/j2f2sfiiukgt', @@ -27,15 +27,14 @@ class PlayedIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) orig_webpage = self._download_webpage(url, 
video_id) - fields = re.findall(r'type="hidden" name="(.+?)"\s* value="?(.+?)">', orig_webpage) + fields = re.findall( + r'type="hidden" name="([^"]+)"\s+value="([^"]+)">', orig_webpage) data = dict(fields) - self.to_screen('%s: Waiting for timeout' % video_id) - time.sleep(2) + self._sleep(2, video_id) post = compat_urllib_parse.urlencode(data) headers = { @@ -54,4 +53,4 @@ class PlayedIE(InfoExtractor): 'id': video_id, 'title': title, 'url': video_url, - } \ No newline at end of file + } From 76e7d1e74b10b99ed9289b0c30c5f4933f9d841e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 10:56:36 +0200 Subject: [PATCH 099/652] [played] Remove unused import --- youtube_dl/extractor/played.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/played.py b/youtube_dl/extractor/played.py index db40da43b..645a1e06d 100644 --- a/youtube_dl/extractor/played.py +++ b/youtube_dl/extractor/played.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import time import os.path from .common import InfoExtractor From d6e6a4225650ff220c7fe0687d883552e4b45bde Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 12:14:16 +0200 Subject: [PATCH 100/652] [vimeo:likes] Add new extractor (Fixes #3835) --- test/test_download.py | 4 +++- youtube_dl/extractor/__init__.py | 5 +++-- youtube_dl/extractor/generic.py | 10 ++++------ youtube_dl/extractor/vimeo.py | 33 ++++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 2b8ac6975..8178015ea 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -139,7 +139,9 @@ def generator(test_case): if is_playlist: self.assertEqual(res_dict['_type'], 'playlist') + self.assertTrue('entries' in res_dict) expect_info_dict(self, test_case.get('info_dict', {}), res_dict) + if 'playlist_mincount' in test_case: assertGreaterEqual( self, @@ -188,7 +190,7 @@ def generator(test_case): expect_info_dict(self, tc.get('info_dict', {}), info_dict) finally: try_rm_tcs_files() - if is_playlist and res_dict is not None: + if is_playlist and res_dict is not None and res_dict.get('entries'): # Remove all other files that may have been extracted if the # extractor returns full results even with extract_flat res_tcs = [{'info_dict': e} for e in res_dict['entries']] diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6ab3eeaf5..86bff185b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -412,11 +412,12 @@ from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vimeo import ( VimeoIE, - VimeoChannelIE, - VimeoUserIE, VimeoAlbumIE, + VimeoChannelIE, VimeoGroupsIE, + VimeoLikesIE, VimeoReviewIE, + VimeoUserIE, VimeoWatchLaterIE, ) from .vimple import VimpleIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 367f930dd..0dfa4853d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -397,12 +397,6 @@ class GenericIE(InfoExtractor): }, ] - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning('Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following 
redirect to %s' % new_url) @@ -502,6 +496,7 @@ class GenericIE(InfoExtractor): url, smuggled_data = unsmuggle_url(url) force_videoid = None + is_intentional = smuggled_data and smuggled_data.get('to_generic') if smuggled_data and 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid @@ -544,6 +539,9 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } + if not self._downloader.params.get('test', False) and not is_intentional: + self._downloader.report_warning('Falling back on generic information extractor.') + try: webpage = self._download_webpage(url, video_id) except ValueError: diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index bc01d7fbf..4be1b8785 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -15,6 +15,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, RegexNotFoundError, + smuggle_url, std_headers, unsmuggle_url, urlencode_postdata, @@ -529,3 +530,35 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): def _real_extract(self, url): return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') + + +class VimeoLikesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])' + IE_NAME = 'vimeo:likes' + IE_DESC = 'Vimeo user likes' + _TEST = { + 'url': 'https://vimeo.com/user20132939/likes', + 'playlist_mincount': 4, + 'add_ies': ['Generic'], + "info_dict": { + "description": "Videos Philipp Hagemeister likes on Vimeo.", + "title": "Vimeo / Philipp Hagemeister's likes", + }, + 'params': { + 'extract_flat': False, + }, + } + + def _real_extract(self, url): + user_id = self._match_id(url) + rss_url = '%s//vimeo.com/user%s/likes/rss' % ( + self.http_scheme(), user_id) + surl = smuggle_url(rss_url, { + 'force_videoid': '%s_likes' % user_id, + 'to_generic': True, + }) + + return { + '_type': 'url', + 'url': surl, + } From 22dd3fad8623472cfe681fdfbaa346e0c8f5fb84 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 12:14:25 +0200 Subject: [PATCH 101/652] release 2014.09.28.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index e62bef2cf..eb4356811 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.28' +__version__ = '2014.09.28.1' From 4bc77c8417ca0340d09dcebb311d06aa7d5ba0ac Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 28 Sep 2014 13:52:21 +0200 Subject: [PATCH 102/652] [README] Use _match_id helper function --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 5d15decb5..5e0d07997 100644 --- a/README.md +++ b/README.md @@ -442,8 +442,6 @@ If you want to add support for a new site, you can follow this quick list (assum # coding: utf-8 from __future__ import unicode_literals - import re - from .common import InfoExtractor @@ -466,8 +464,7 @@ If you want to add support for a new site, you can follow this quick list (assum } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # TODO more code goes here, for example ... 
webpage = self._download_webpage(url, video_id) From 7f5c0c4a19cf72b6ede80ee0fea4611d8bd45010 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 28 Sep 2014 22:10:20 +0700 Subject: [PATCH 103/652] [README] Clarify test's md5 filesize (#3846) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5e0d07997..0f7442906 100644 --- a/README.md +++ b/README.md @@ -449,7 +449,7 @@ If you want to add support for a new site, you can follow this quick list (assum _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)' _TEST = { 'url': 'http://yourextractor.com/watch/42', - 'md5': 'TODO: md5 sum of the first 10KiB of the video file', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', 'ext': 'mp4', From dfee83234b642a94255d52d992295b980ce2a5f7 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sun, 28 Sep 2014 19:25:28 +0300 Subject: [PATCH 104/652] [nfl] Prefer progressive downloads --- youtube_dl/extractor/nfl.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 963c4587c..668d99512 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -17,11 +17,11 @@ class NFLIE(InfoExtractor): _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' _TEST = { 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - # 'md5': '5eb8c40a727dda106d510e5d6ffa79e5', # md5 checksum fluctuates + 'md5': '394ef771ddcd1354f665b471d78ec4c6', 'info_dict': { 'id': '0ap3000000398478', 'ext': 'mp4', - 'title': 'Week 3: Washington Redskins vs. Philadelphia Eagles highlights', + 'title': 'Week 3: Redskins vs. 
Eagles highlights', 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', 'upload_date': '20140921', 'timestamp': 1411337580, @@ -66,9 +66,9 @@ class NFLIE(InfoExtractor): ) if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): preference = -1 + elif 'prog' in name.lower(): + preference = 1 else: preference = 0 @@ -94,7 +94,7 @@ class NFLIE(InfoExtractor): return { 'id': video_id, - 'title': video_data.get('storyHeadline'), + 'title': video_data.get('headline'), 'formats': formats, 'description': video_data.get('caption'), 'duration': video_data.get('duration'), From 5f4c318844180d51745303979682a0a482f05328 Mon Sep 17 00:00:00 2001 From: Naglis Jonaitis <njonaitis@gmail.com> Date: Sun, 28 Sep 2014 21:48:26 +0300 Subject: [PATCH 105/652] [nfl] Support team micro-sites (fixes #3831) --- youtube_dl/extractor/nfl.py | 159 +++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 59 deletions(-) diff --git a/youtube_dl/extractor/nfl.py b/youtube_dl/extractor/nfl.py index 668d99512..4832b3ce4 100644 --- a/youtube_dl/extractor/nfl.py +++ b/youtube_dl/extractor/nfl.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, + compat_urllib_parse, int_or_none, remove_end, ) @@ -13,76 +14,116 @@ from ..utils import ( class NFLIE(InfoExtractor): IE_NAME = 'nfl.com' - _VALID_URL = r'(?x)https?://(?:www\.)?nfl\.com/(?:videos/(?:.+)/|.*?\#video=)(?P<id>\d..[0-9]+)' - _PLAYER_CONFIG_URL = 'http://www.nfl.com/static/content/static/config/video/config.json' - _TEST = { - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - 'md5': '394ef771ddcd1354f665b471d78ec4c6', - 'info_dict': { - 'id': '0ap3000000398478', - 'ext': 'mp4', - 'title': 'Week 3: Redskins vs. Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, - 'thumbnail': 're:^https?://.*\.jpg$', + _VALID_URL = r'''(?x)https?:// + (?P<host>(?:www\.)?(?:nfl\.com|.*?\.clubs\.nfl\.com))/ + (?:.+?/)* + (?P<id>(?:\d[a-z]{2}\d{13}|\w{8}\-(?:\w{4}\-){3}\w{12}))''' + _TESTS = [ + { + 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', + 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'info_dict': { + 'id': '0ap3000000398478', + 'ext': 'mp4', + 'title': 'Week 3: Redskins vs. Eagles highlights', + 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', + 'upload_date': '20140921', + 'timestamp': 1411337580, + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, + { + 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'info_dict': { + 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', + 'ext': 'mp4', + 'title': 'LIVE: Post Game vs. 
Browns', + 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', + 'upload_date': '20131229', + 'timestamp': 1388354455, + 'thumbnail': 're:^https?://.*\.jpg$', + } + } + ] + + @staticmethod + def prepend_host(host, url): + if not url.startswith('http'): + if not url.startswith('/'): + url = '/%s' % url + url = 'http://{0:}{1:}'.format(host, url) + return url + + @staticmethod + def format_from_stream(stream, protocol, host, path_prefix='', + preference=0, note=None): + url = '{protocol:}://{host:}/{prefix:}{path:}'.format( + protocol=protocol, + host=host, + prefix=path_prefix, + path=stream.get('path'), + ) + return { + 'url': url, + 'vbr': int_or_none(stream.get('rate', 0), 1000), + 'preference': preference, + 'format_note': note, } - } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id, host = mobj.group('id'), mobj.group('host') - config = self._download_json(self._PLAYER_CONFIG_URL, video_id, + webpage = self._download_webpage(url, video_id) + + config_url = NFLIE.prepend_host(host, self._search_regex( + r'(?:config|configURL)\s*:\s*"([^"]+)"', webpage, 'config URL')) + config = self._download_json(config_url, video_id, note='Downloading player config') - url_template = 'http://nfl.com{contentURLTemplate:s}'.format(**config) - video_data = self._download_json(url_template.format(id=video_id), video_id) - - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) + url_template = NFLIE.prepend_host( + host, '{contentURLTemplate:}'.format(**config)) + video_data = self._download_json( + url_template.format(id=video_id), video_id) formats = [] - streams = video_data.get('cdnData', {}).get('bitrateInfo', []) - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue - - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - - path_prefix = cdn.get('pathprefix', '') - if path_prefix and not path_prefix.endswith('/'): - path_prefix = '%s/' % path_prefix - - get_url = lambda p: '{protocol:s}://{host:s}/{prefix:s}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=p, - ) - - if protocol == 'rtmp': - preference = -1 - elif 'prog' in name.lower(): - preference = 1 - else: - preference = 0 - + cdn_data = video_data.get('cdnData', {}) + streams = cdn_data.get('bitrateInfo', []) + if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': + parts = compat_urllib_parse.urlparse(cdn_data.get('uri')) + protocol, host = parts.scheme, parts.netloc for stream in streams: - path = stream.get('path') - if not path: + formats.append( + NFLIE.format_from_stream(stream, protocol, host)) + else: + cdns = config.get('cdns') + if not cdns: + raise ExtractorError('Failed to get CDN data', expected=True) + + for name, cdn in cdns.items(): + # LimeLight streams don't seem to work + if cdn.get('name') == 'LIMELIGHT': continue - formats.append({ - 'url': get_url(path), - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': name, - }) + protocol = cdn.get('protocol') + host = remove_end(cdn.get('host', ''), '/') + if not (protocol and host): + continue + + prefix = cdn.get('pathprefix', '') + if prefix and not prefix.endswith('/'): + prefix = '%s/' % prefix + + preference = 0 + if protocol == 'rtmp': + preference = -2 + elif 'prog' in name.lower(): + preference = 1 + + for stream in streams: + formats.append( + 
NFLIE.format_from_stream(stream, protocol, host, + prefix, preference, name)) self._sort_formats(formats) From 67077b182b698ac56cec9525a2669d5cee394226 Mon Sep 17 00:00:00 2001 From: Anton Larionov <diffident.cat@gmail.com> Date: Sun, 28 Sep 2014 23:36:55 +0400 Subject: [PATCH 106/652] [thvideo] Add support for playlists --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/thvideo.py | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 86bff185b..89a9d8106 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -371,7 +371,10 @@ from .thisav import ThisAVIE from .tinypic import TinyPicIE from .tlc import TlcIE, TlcDeIE from .tnaflix import TNAFlixIE -from .thvideo import THVideoIE +from .thvideo import ( + THVideoIE, + THVideoPlaylistIE +) from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 607e947bb..0ae20ea30 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -57,3 +57,27 @@ class THVideoIE(InfoExtractor): 'description': description, 'upload_date': upload_date } + + +class THVideoPlaylistIE(InfoExtractor): + _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://thvideo.tv/mylist2', + 'info_dict': { + 'id': '2', + 'title': '幻想万華鏡', + }, + 'playlist_mincount': 23, + } + + def _real_extract(self, url): + webpage = self._download_webpage(url, 'playlist') + mobj = re.match(self._VALID_URL, url) + list_id = mobj.group('id') + list_title = self._html_search_regex(r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title') + + entries = [ + self.url_result('http://thvideo.tv/v/th' + id, 'THVideo') + for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)] + + return self.playlist_result(entries, list_id, list_title) \ No newline at end of file From d2e32f7df56ab497175437bffdcdfedbd71ca8d9 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 00:23:41 +0200 Subject: [PATCH 107/652] Do not use HTML characters in output This messes up the format when people paste it outside of code tags. 
--- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 44dcb1e34..f651337ad 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -87,7 +87,7 @@ def parseOpts(overrideArguments=None): for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: try: i = opts.index(private_opt) - opts[i+1] = '<PRIVATE>' + opts[i+1] = 'PRIVATE' except ValueError: pass return opts From 9c44d2429b90dece734df778c63b04c15e91c1ca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 00:36:06 +0200 Subject: [PATCH 108/652] [vimeo:likes] Support large like lists (Fixes #3847) --- test/test_utils.py | 9 ++++- youtube_dl/extractor/vimeo.py | 66 ++++++++++++++++++++++----------- youtube_dl/extractor/youtube.py | 4 +- youtube_dl/utils.py | 39 +++++++++++++++++-- 4 files changed, 89 insertions(+), 29 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3efbed29d..6419b3ca9 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -22,7 +22,8 @@ from youtube_dl.utils import ( fix_xml_ampersands, get_meta_content, orderedSet, - PagedList, + OnDemandPagedList, + InAdvancePagedList, parse_duration, read_batch_urls, sanitize_filename, @@ -246,10 +247,14 @@ class TestUtil(unittest.TestCase): for i in range(firstid, upto): yield i - pl = PagedList(get_page, pagesize) + pl = OnDemandPagedList(get_page, pagesize) got = pl.getslice(*sliceargs) self.assertEqual(got, expected) + iapl = InAdvancePagedList(get_page, size // pagesize + 1, pagesize) + got = iapl.getslice(*sliceargs) + self.assertEqual(got, expected) + testPL(5, 2, (), [0, 1, 2, 3, 4]) testPL(5, 2, (1,), [1, 2, 3, 4]) testPL(5, 2, (2,), [2, 3, 4]) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4be1b8785..403d0bb28 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,18 +8,19 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + clean_html, compat_HTTPError, compat_urllib_parse, compat_urllib_request, - clean_html, - get_element_by_attribute, + compat_urlparse, ExtractorError, + get_element_by_attribute, + InAdvancePagedList, + int_or_none, RegexNotFoundError, - smuggle_url, std_headers, unsmuggle_url, urlencode_postdata, - int_or_none, ) @@ -533,32 +534,55 @@ class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): class VimeoLikesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?vimeo\.com/user(?P<id>[0-9]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' _TEST = { - 'url': 'https://vimeo.com/user20132939/likes', - 'playlist_mincount': 4, - 'add_ies': ['Generic'], + 'url': 'https://vimeo.com/user755559/likes/', + 'playlist_mincount': 293, "info_dict": { - "description": "Videos Philipp Hagemeister likes on Vimeo.", - "title": "Vimeo / Philipp Hagemeister's likes", - }, - 'params': { - 'extract_flat': False, + "description": "See all the videos urza likes", + "title": 'Videos urza likes', }, } def _real_extract(self, url): user_id = self._match_id(url) - rss_url = '%s//vimeo.com/user%s/likes/rss' % ( - self.http_scheme(), user_id) - surl = smuggle_url(rss_url, { - 'force_videoid': '%s_likes' % user_id, - 'to_generic': True, - }) + webpage = self._download_webpage(url, user_id) + page_count = self._int( + self._search_regex( + 
r'''(?x)<li><a\s+href="[^"]+"\s+data-page="([0-9]+)"> + .*?</a></li>\s*<li\s+class="pagination_next"> + ''', webpage, 'page count'), + 'page count', fatal=True) + PAGE_SIZE = 12 + title = self._html_search_regex( + r'(?s)<h1>(.+?)</h1>', webpage, 'title', fatal=False) + description = self._html_search_meta('description', webpage) + + def _get_page(idx): + page_url = '%s//vimeo.com/user%s/likes/page:%d/sort:date' % ( + self.http_scheme(), user_id, idx + 1) + webpage = self._download_webpage( + page_url, user_id, + note='Downloading page %d/%d' % (idx + 1, page_count)) + video_list = self._search_regex( + r'(?s)<ol class="js-browse_list[^"]+"[^>]*>(.*?)</ol>', + webpage, 'video content') + paths = re.findall( + r'<li[^>]*>\s*<a\s+href="([^"]+)"', video_list) + for path in paths: + yield { + '_type': 'url', + 'url': compat_urlparse.urljoin(page_url, path), + } + + pl = InAdvancePagedList(_get_page, page_count, PAGE_SIZE) return { - '_type': 'url', - 'url': surl, + '_type': 'playlist', + 'id': 'user%s_likes' % user_id, + 'title': title, + 'description': description, + 'entries': pl, } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 99198e380..045507bc7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -26,7 +26,7 @@ from ..utils import ( get_element_by_attribute, ExtractorError, int_or_none, - PagedList, + OnDemandPagedList, unescapeHTML, unified_strdate, orderedSet, @@ -1341,7 +1341,7 @@ class YoutubeUserIE(InfoExtractor): 'id': video_id, 'title': title, } - url_results = PagedList(download_page, self._GDATA_PAGE_SIZE) + url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE) return self.playlist_result(url_results, playlist_title=username) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b644f4e92..9f49507c1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1384,14 +1384,16 @@ def check_executable(exe, args=[]): class PagedList(object): - def __init__(self, pagefunc, pagesize): - self._pagefunc = pagefunc - self._pagesize = pagesize - def __len__(self): # This is only useful for tests return len(self.getslice()) + +class OnDemandPagedList(PagedList): + def __init__(self, pagefunc, pagesize): + self._pagefunc = pagefunc + self._pagesize = pagesize + def getslice(self, start=0, end=None): res = [] for pagenum in itertools.count(start // self._pagesize): @@ -1430,6 +1432,35 @@ class PagedList(object): return res +class InAdvancePagedList(PagedList): + def __init__(self, pagefunc, pagecount, pagesize): + self._pagefunc = pagefunc + self._pagecount = pagecount + self._pagesize = pagesize + + def getslice(self, start=0, end=None): + res = [] + start_page = start // self._pagesize + end_page = ( + self._pagecount if end is None else (end // self._pagesize + 1)) + skip_elems = start - start_page * self._pagesize + only_more = None if end is None else end - start + for pagenum in range(start_page, end_page): + page = list(self._pagefunc(pagenum)) + if skip_elems: + page = page[skip_elems:] + skip_elems = None + if only_more is not None: + if len(page) < only_more: + only_more -= len(page) + else: + page = page[:only_more] + res.extend(page) + break + res.extend(page) + return res + + def uppercase_escape(s): unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( From 1770ed9e86a147eceb86210dec0aefcf0d94ab52 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 00:38:37 +0200 Subject: [PATCH 109/652] [thvideo] Simplify (#3848) --- 
youtube_dl/extractor/thvideo.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py index 0ae20ea30..496f15d80 100644 --- a/youtube_dl/extractor/thvideo.py +++ b/youtube_dl/extractor/thvideo.py @@ -26,8 +26,7 @@ class THVideoIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) # extract download link from mobile player page webpage_player = self._download_webpage( @@ -71,13 +70,15 @@ class THVideoPlaylistIE(InfoExtractor): } def _real_extract(self, url): - webpage = self._download_webpage(url, 'playlist') - mobj = re.match(self._VALID_URL, url) - list_id = mobj.group('id') - list_title = self._html_search_regex(r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title') + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + list_title = self._html_search_regex( + r'<h1 class="show_title">(.*?)<b id', webpage, 'playlist title', + fatal=False) entries = [ self.url_result('http://thvideo.tv/v/th' + id, 'THVideo') for id in re.findall(r'<dd><a href="http://thvideo.tv/v/th(\d+)/" target=', webpage)] - return self.playlist_result(entries, list_id, list_title) \ No newline at end of file + return self.playlist_result(entries, playlist_id, list_title) From e2dce5378191e315ad86785aac7e786c86a1a121 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 01:39:26 +0200 Subject: [PATCH 110/652] [youtube] Always request webpage in English (Fixes #3844) --- youtube_dl/extractor/youtube.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 045507bc7..61228817e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -655,7 +655,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video webpage url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - video_webpage = self._download_webpage(url, video_id) + req = compat_urllib_request.Request(url) + req.add_header('Cookie', 'PREF=hl=en') + video_webpage = self._download_webpage(req, video_id) # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) From a43ee88c6f888196b47cb1e12463a64ada0ead12 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 01:51:53 +0200 Subject: [PATCH 111/652] release 2014.09.29 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index eb4356811..17e5ea8e2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.28.1' +__version__ = '2014.09.29' From a1f934b171dcc8e1215ee30d0715ce562eb220e3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 02:04:16 +0200 Subject: [PATCH 112/652] [youtube] Correct language cookie handling --- youtube_dl/extractor/youtube.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 61228817e..9041cfa87 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -655,9 +655,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Get video webpage 
url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'PREF=hl=en') - video_webpage = self._download_webpage(req, video_id) + pref_cookies = [ + c for c in self._downloader.cookiejar + if c.domain == '.youtube.com' and c.name == 'PREF'] + for pc in pref_cookies: + if 'hl=' in pc.value: + pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value) + else: + if pc.value: + pc.value += '&' + pc.value += 'hl=en' + video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) From f5b7e6a842b00bab8320d30608dd4a10a4752a17 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 02:04:28 +0200 Subject: [PATCH 113/652] release 2014.09.29.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 17e5ea8e2..885df83c0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.29' +__version__ = '2014.09.29.1' From 27aede907436fa58600cd46bb04e7eae6e1e9279 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 04:48:50 +0200 Subject: [PATCH 114/652] [pbs] Add support for series/jwplayer type video (Fixes #3849) --- youtube_dl/extractor/pbs.py | 39 +++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 2adfde909..8f140d626 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + unified_strdate, US_RATINGS, ) @@ -11,10 +12,10 @@ from ..utils import ( class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: - # Direct video URL - video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | - # Article with embedded player - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+)/?(?:$|[?\#]) | + # Direct video URL + video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
| + # Article with embedded player (or direct video) + (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) @@ -65,10 +66,25 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', } + }, + { + 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', + 'md5': '908f3e5473a693b266b84e25e1cf9703', + 'info_dict': { + 'id': '2365160389', + 'display_id': 'killer-typhoon', + 'ext': 'mp4', + 'description': 'md5:c741d14e979fc53228c575894094f157', + 'title': 'Killer Typhoon', + 'duration': 3172, + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20140122', + } } + ] - def _extract_ids(self, url): + def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) presumptive_id = mobj.group('presumptive_id') @@ -76,15 +92,20 @@ class PBSIE(InfoExtractor): if presumptive_id: webpage = self._download_webpage(url, display_id) + upload_date = unified_strdate(self._search_regex( + r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', + webpage, 'upload date', default=None)) + MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer + r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer ] media_id = self._search_regex( MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: - return media_id, presumptive_id + return media_id, presumptive_id, upload_date url = self._search_regex( r'<iframe\s+(?:class|id)=["\']partnerPlayer["\'].*?\s+src=["\'](.*?)["\']>', @@ -104,10 +125,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id + return video_id, display_id, None def _real_extract(self, url): - video_id, display_id = self._extract_ids(url) + video_id, display_id, upload_date = self._extract_webpage(url) info_url = 'http://video.pbs.org/videoInfo/%s?format=json' % video_id info = self._download_json(info_url, display_id) @@ -119,6 +140,7 @@ class PBSIE(InfoExtractor): return { 'id': video_id, + 'display_id': display_id, 'title': info['title'], 'url': info['alternate_encoding']['url'], 'ext': 'mp4', @@ -126,4 +148,5 @@ class PBSIE(InfoExtractor): 'thumbnail': info.get('image_url'), 'duration': info.get('duration'), 'age_limit': age_limit, + 'upload_date': upload_date, } From 35d3e63d24c524922cb39ba36cb5f6de12400504 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 04:49:11 +0200 Subject: [PATCH 115/652] release 2014.09.29.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 885df83c0..1384b496b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.09.29.1' +__version__ = '2014.09.29.2' From 25930395225c45a9e5045ada291d37817371b086 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 04:58:29 +0200 Subject: [PATCH 116/652] [vimeo] Use regexps to find description This fixes descriptions on 2.6 and makes the code simpler. 
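For context (an illustrative sketch, not part of the change): on Python 2.6 the
HTMLParser-based helper raised on Vimeo's not-quite-well-formed markup, while a
DOTALL regex over the raw page tolerates it. A minimal example using the regex from
this commit, with invented sample HTML:

    import re

    webpage = '<div class="clip description"><p>A short film about trains.</p></div>'

    # Grab the description block without parsing the (possibly broken) HTML tree.
    mobj = re.search(
        r'(?s)<div class="[^"]*description"[^>]*>(.*?)</div>', webpage)
    description = mobj.group(1) if mobj else None
    print(description)  # <p>A short film about trains.</p>

The real extractor wraps this in _html_search_regex, which also strips the
remaining tags from the captured text.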
--- youtube_dl/extractor/vimeo.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 403d0bb28..a002555a9 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -275,18 +275,9 @@ class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): _, video_thumbnail = sorted((int(width if width.isdigit() else 0), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description - video_description = None - try: - video_description = get_element_by_attribute("class", "description_wrapper", webpage) - if video_description: - video_description = clean_html(video_description) - except AssertionError as err: - # On some pages like (http://player.vimeo.com/video/54469442) the - # html tags are not closed, python 2.6 cannot handle it - if err.args[0] == 'we should not get here!': - pass - else: - raise + video_description = self._html_search_regex( + r'(?s)<div class="[^"]*description"[^>]*>(.*?)</div>', + webpage, 'description', fatal=False) # Extract video duration video_duration = int_or_none(config["video"].get("duration")) From 12548cd9330222848f6b49fe9eac91aaff897325 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 29 Sep 2014 05:02:58 +0200 Subject: [PATCH 117/652] [worldstarhiphop] Correct title extraction --- youtube_dl/extractor/worldstarhiphop.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 4e89acd81..bda3870db 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -13,37 +13,35 @@ class WorldStarHipHopIE(InfoExtractor): "info_dict": { "id": "wshh6a7q1ny0G34ZwuIO", "ext": "mp4", - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" + "title": "KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage_src = self._download_webpage(url, video_id) - - m_vevo_id = re.search(r'videoId=(.*?)&?', - webpage_src) + m_vevo_id = re.search(r'videoId=(.*?)&?', webpage) if m_vevo_id is not None: return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') video_url = self._search_regex( - r'so\.addVariable\("file","(.*?)"\)', webpage_src, 'video URL') + r'so\.addVariable\("file","(.*?)"\)', webpage, 'video URL') if 'youtube' in video_url: return self.url_result(video_url, ie='Youtube') video_title = self._html_search_regex( - r"<title>(.*)", webpage_src, 'title') + r'(?s)
<div class="content-heading">\s*<h1>(.*?)</h1>
', + webpage, 'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. thumbnail = self._html_search_regex( - r'rel="image_src" href="(.*)" />', webpage_src, 'thumbnail', + r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', fatal=False) if not thumbnail: - _title = r"""candytitles.*>(.*)""" - mobj = re.search(_title, webpage_src) + _title = r'candytitles.*>(.*)' + mobj = re.search(_title, webpage) if mobj is not None: video_title = mobj.group(1) From 6043f1df4e4f74bd0ade52b3fc0938ff241366dc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:05:06 +0200 Subject: [PATCH 118/652] [ign] Return proper playlist object --- youtube_dl/extractor/ign.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 12e9e61c4..c80185b53 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -89,7 +89,12 @@ class IGNIE(InfoExtractor): ']*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', webpage) if multiple_urls: - return [self.url_result(u, ie='IGN') for u in multiple_urls] + entries = [self.url_result(u, ie='IGN') for u in multiple_urls] + return { + '_type': 'playlist', + 'id': name_or_id, + 'entries': entries, + } video_id = self._find_video_id(webpage) result = self._get_video_info(video_id) From a8eb5a8e610a2b90eac2789d5b5f3cda81f543bb Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:12:57 +0200 Subject: [PATCH 119/652] [generic] Fix testcases --- youtube_dl/extractor/generic.py | 18 +++++++++--------- youtube_dl/extractor/ted.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0dfa4853d..263aa8579 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -180,13 +180,13 @@ class GenericIE(InfoExtractor): # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { - 'id': '981', + 'id': '1969', 'ext': 'mp4', - 'title': 'My web playroom', - 'uploader': 'Ze Frank', - 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embeded Ustream video @@ -295,13 +295,13 @@ class GenericIE(InfoExtractor): { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { - 'id': 'jpSGZsgga_I', + 'id': '4vAffPZIT44', 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', - 'upload_date': '20130821', - 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1cca47771..d5e28efad 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -149,7 +149,7 @@ class TEDIE(SubtitlesInfoExtractor): thumbnail = 'http://' + thumbnail return { 'id': video_id, - 'title': talk_info['title'], + 'title': talk_info['title'].strip(), 'uploader': talk_info['speaker'], 'thumbnail': thumbnail, 'description': self._og_search_description(webpage), From 
dbe3043cd6d6bf468495df1e9f927a8c512e82a0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:15:42 +0200 Subject: [PATCH 120/652] [ynet] Fix test checksums --- youtube_dl/extractor/ynet.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/ynet.py b/youtube_dl/extractor/ynet.py index 24872861a..944d7da38 100644 --- a/youtube_dl/extractor/ynet.py +++ b/youtube_dl/extractor/ynet.py @@ -13,7 +13,7 @@ class YnetIE(InfoExtractor): _TESTS = [ { 'url': 'http://hot.ynet.co.il/home/0,7340,L-11659-99244,00.html', - 'md5': '002b44ee2f33d50363a1c153bed524cf', + 'md5': '4b29cb57c3dddd57642b3f051f535b07', 'info_dict': { 'id': 'L-11659-99244', 'ext': 'flv', @@ -22,7 +22,7 @@ class YnetIE(InfoExtractor): } }, { 'url': 'http://hot.ynet.co.il/home/0,7340,L-8859-84418,00.html', - 'md5': '6455046ae1b48cf7e2b7cae285e53a16', + 'md5': '8194c2ea221e9a639cac96b6b0753dc5', 'info_dict': { 'id': 'L-8859-84418', 'ext': 'flv', @@ -33,9 +33,7 @@ class YnetIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) content = compat_urllib_parse.unquote_plus(self._og_search_video_url(webpage)) From 8ff14175e228a30f9940a69a4e72ca3a2a99aaf6 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:17:16 +0200 Subject: [PATCH 121/652] [sportdeutschland] Fix testcase --- youtube_dl/extractor/sportdeutschland.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/sportdeutschland.py b/youtube_dl/extractor/sportdeutschland.py index 185353bef..abb827783 100644 --- a/youtube_dl/extractor/sportdeutschland.py +++ b/youtube_dl/extractor/sportdeutschland.py @@ -17,11 +17,11 @@ class SportDeutschlandIE(InfoExtractor): 'info_dict': { 'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen', 'ext': 'mp4', - 'title': 'LIVE: Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', + 'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen', 'categories': ['Badminton'], 'view_count': int, 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 're:^Die Badminton-WM 2014 aus Kopenhagen LIVE', + 'description': 're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV', 'timestamp': int, 'upload_date': 're:^201408[23][0-9]$', }, From 761e1645e075ff9f8c5aeb8d0f2a4cfac71fb528 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:18:45 +0200 Subject: [PATCH 122/652] [generic] Remove unstable test checksum --- youtube_dl/extractor/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 263aa8579..742bc2856 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -155,7 +155,6 @@ class GenericIE(InfoExtractor): # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'md5': '7cf780be104d40fea7bae52eed4a470e', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', From 5e4f06197f3d949bf89ee7e156391ca78121bf16 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:19:56 +0200 Subject: [PATCH 123/652] [facebook] Fix test case --- youtube_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 60e68d98a..3ad993751 100644 --- 
a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -35,7 +35,7 @@ class FacebookIE(InfoExtractor): 'id': '637842556329505', 'ext': 'mp4', 'duration': 38, - 'title': 'Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam fin...', + 'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam', } }, { 'note': 'Video without discernible title', From 6be451f422090601e25b6a9b1f801f521f1ca41f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:23:58 +0200 Subject: [PATCH 124/652] [youtube] Remove swf signature test cases These files are now 0 Bytes --- test/test_youtube_signature.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index 604e76ab6..df2cb09f2 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -47,18 +47,6 @@ _TESTS = [ '2ACFC7A61CA478CD21425E5A57EBD73DDC78E22A.2094302436B2D377D14A3BBA23022D023B8BC25AA', 'A52CB8B320D22032ABB3A41D773D2B6342034902.A22E87CDD37DBE75A5E52412DC874AC16A7CFCA2', ), - ( - 'http://s.ytimg.com/yts/swfbin/player-vfl5vIhK2/watch_as3.swf', - 'swf', - 86, - 'O1I3456789abcde0ghijklmnopqrstuvwxyzABCDEFGHfJKLMN2PQRSTUVWXY\\!"#$%&\'()*+,-./:;<=>?' - ), - ( - 'http://s.ytimg.com/yts/swfbin/player-vflmDyk47/watch_as3.swf', - 'swf', - 'F375F75BF2AFDAAF2666E43868D46816F83F13E81C46.3725A8218E446A0DECD33F79DC282994D6AA92C92C9', - '9C29AA6D499282CD97F33DCED0A644E8128A5273.64C18E31F38361864D86834E6662FAADFA2FB57F' - ), ( 'https://s.ytimg.com/yts/jsbin/html5player-en_US-vflBb0OQx.js', 'js', From e50e2fcd4deaab50da506a0abf1bafed16085cd7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:40:20 +0200 Subject: [PATCH 125/652] [br] fix test case --- youtube_dl/extractor/br.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index 4e2960c62..2e277c8c3 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -26,6 +26,8 @@ class BRIE(InfoExtractor): 'title': 'Wenn das Traditions-Theater wackelt', 'description': 'Heimatsound-Festival 2014: Wenn das Traditions-Theater wackelt', 'duration': 34, + 'uploader': 'BR', + 'upload_date': '20140802', } }, { @@ -66,8 +68,7 @@ class BRIE(InfoExtractor): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') + display_id = self._match_id(url) page = self._download_webpage(url, display_id) xml_url = self._search_regex( r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') From 8157ae39042298831afc8f8e5d67619d21e3e00b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:48:56 +0200 Subject: [PATCH 126/652] [golem] Fix under 2.6 It's a sad story; 2.6 does not support any non-trivial xpaths. 
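To spell that out (an illustrative sketch, not part of the change): the XPath
predicate used before, ./*[url], needs path syntax that the ElementTree bundled
with Python 2.6 does not implement, so the fix simply iterates the children and
skips entries that have no <url> text. A standalone example with invented XML:

    import xml.etree.ElementTree as ET

    config = ET.fromstring(
        '<config>'
        '<teaser><url>http://example.com/teaser.jpg</url></teaser>'
        '<broken/>'
        '<hd><url>http://example.com/video_hd.mp4</url></hd>'
        '</config>')

    # Instead of config.findall('./*[url]'), walk the children and filter by
    # hand; this behaves the same on 2.6 and on newer interpreters.
    urls = []
    for e in config:
        url = e.findtext('./url')
        if not url:
            continue
        urls.append((e.tag, url))

    print(urls)
    # [('teaser', 'http://example.com/teaser.jpg'), ('hd', 'http://example.com/video_hd.mp4')]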
--- youtube_dl/extractor/golem.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/golem.py b/youtube_dl/extractor/golem.py index bebfe8568..53714f47f 100644 --- a/youtube_dl/extractor/golem.py +++ b/youtube_dl/extractor/golem.py @@ -38,11 +38,9 @@ class GolemIE(InfoExtractor): } formats = [] - for e in config.findall('./*[url]'): + for e in config: url = e.findtext('./url') if not url: - self._downloader.report_warning( - "{0}: url: empty, skipping".format(e.tag)) continue formats.append({ @@ -57,7 +55,7 @@ class GolemIE(InfoExtractor): info['formats'] = formats thumbnails = [] - for e in config.findall('.//teaser[url]'): + for e in config.findall('.//teaser'): url = e.findtext('./url') if not url: continue From 2a7b4681c6628e2a17b9b980333af1011e482058 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 05:51:41 +0200 Subject: [PATCH 127/652] [godtube] Fix on Python 2.6 --- youtube_dl/extractor/godtube.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/godtube.py b/youtube_dl/extractor/godtube.py index 73bd6d890..363dc6608 100644 --- a/youtube_dl/extractor/godtube.py +++ b/youtube_dl/extractor/godtube.py @@ -36,16 +36,16 @@ class GodTubeIE(InfoExtractor): 'http://www.godtube.com/resource/mediaplayer/%s.xml' % video_id.lower(), video_id, 'Downloading player config XML') - video_url = config.find('.//file').text - uploader = config.find('.//author').text - timestamp = parse_iso8601(config.find('.//date').text) - duration = parse_duration(config.find('.//duration').text) - thumbnail = config.find('.//image').text + video_url = config.find('file').text + uploader = config.find('author').text + timestamp = parse_iso8601(config.find('date').text) + duration = parse_duration(config.find('duration').text) + thumbnail = config.find('image').text media = self._download_xml( 'http://www.godtube.com/media/xml/?v=%s' % video_id, video_id, 'Downloading media XML') - title = media.find('.//title').text + title = media.find('title').text return { 'id': video_id, From 989b4b2b86588c011314200c0d30db965f79105e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 06:15:46 +0200 Subject: [PATCH 128/652] [utils:YoutubeDLHandler] Work around brain-dead Python 2.6 httplib In 2.6, the httplib sends fragments! Remove those (fixes generic_26 on 2.6). 
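In other words (a sketch, not part of the change): a URL fragment is only meaningful
on the client side, yet Python 2.6's httplib put it on the request line (e.g.
"GET /videos/1234.html#player HTTP/1.1"), which many servers do not expect. The
change below therefore drops the fragment from the Request before it is sent; the
patch does this on the Request's internal attributes, but the idea is just this
(invented URL, using urldefrag instead of str.partition):

    try:
        from urllib.parse import urldefrag  # Python 3
    except ImportError:
        from urlparse import urldefrag      # Python 2

    url = 'http://example.com/videos/1234.html#player'

    # Strip the '#player' part so only the path reaches the server.
    clean_url, fragment = urldefrag(url)
    print(clean_url)   # http://example.com/videos/1234.html
    print(fragment)    # player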
--- youtube_dl/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9f49507c1..950cd1a7a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -799,6 +799,12 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): del req.headers['User-agent'] req.headers['User-agent'] = req.headers['Youtubedl-user-agent'] del req.headers['Youtubedl-user-agent'] + + if sys.version_info < (2, 7) and '#' in req.get_full_url(): + # Python 2.6 is brain-dead when it comes to fragments + req._Request__original = req._Request__original.partition('#')[0] + req._Request__r_type = req._Request__r_type.partition('#')[0] + return req def http_response(self, req, resp): From 8c23945c727dab01eabbc6e134cbb80db34d3120 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 06:19:18 +0200 Subject: [PATCH 129/652] [eporner] Adapt to changed default format --- youtube_dl/extractor/eporner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index 522aa3d63..bb231ecb1 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -14,11 +14,11 @@ class EpornerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P\d+)/(?P[\w-]+)' _TEST = { 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', - 'md5': '3b427ae4b9d60619106de3185c2987cd', + 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { 'id': '95008', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', 'duration': 194, 'view_count': int, From 80bcefcd77415eff62b722c0a432e5c217a1d64f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 06:22:54 +0200 Subject: [PATCH 130/652] [cliphunter] Remove duration --- youtube_dl/extractor/cliphunter.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 65c12136a..d4227e6eb 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -35,7 +35,6 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - 'duration': 1317, } } @@ -86,14 +85,11 @@ class CliphunterIE(InfoExtractor): thumbnail = self._search_regex( r"var\s+mov_thumb\s*=\s*'([^']+)';", webpage, 'thumbnail', fatal=False) - duration = int_or_none(self._search_regex( - r'pl_dur\s*=\s*([0-9]+)', webpage, 'duration', fatal=False)) return { 'id': video_id, 'title': video_title, 'formats': formats, - 'duration': duration, 'age_limit': self._rta_search(webpage), 'thumbnail': thumbnail, } From 937f935db0932fcbd6402068c0147f07f78af4ed Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Mon, 29 Sep 2014 12:15:23 +0200 Subject: [PATCH 131/652] [jukebox] Remove md5 sum, it fluctuates --- youtube_dl/extractor/jukebox.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py index 9b553b9fa..5aa32bf09 100644 --- a/youtube_dl/extractor/jukebox.py +++ b/youtube_dl/extractor/jukebox.py @@ -11,10 +11,9 @@ from ..utils import ( class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P[a-z0-9\-]+)\.html' + _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P[a-z0-9\-]+)\.html' _TEST = { 'url': 
'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'md5': '1574e9b4d6438446d5b7dbcdf2786276', 'info_dict': { 'id': 'r303r', 'ext': 'flv', @@ -24,8 +23,7 @@ class JukeboxIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = self._match_id(url) html = self._download_webpage(url, video_id) iframe_url = unescapeHTML(self._search_regex(r'