From f870544302f75bee0d96f6a8623c8ff270beca89 Mon Sep 17 00:00:00 2001 From: fnord Date: Mon, 13 Jul 2015 07:41:38 -0500 Subject: [PATCH 0001/1214] Add support for democracynow.org Supports downloading clips or entire shows. Subtitle support --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/democracynow.py | 100 +++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 youtube_dl/extractor/democracynow.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index cbaa07391..5cc03b875 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -112,6 +112,7 @@ from .daum import DaumIE from .dbtv import DBTVIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py new file mode 100644 index 000000000..1c9b36052 --- /dev/null +++ b/youtube_dl/extractor/democracynow.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import time +import hmac +import hashlib +import itertools +import re +from ..utils import ( + ExtractorError, + int_or_none, + parse_age_limit, + parse_iso8601, +) +from ..compat import compat_urllib_request +from .common import InfoExtractor + + +class DemocracynowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P[^\?]*)' + IE_NAME = 'democracynow' + _TESTS = [{ + 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': 'July 03, 2015 - Democracy Now!', + 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes 
Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + },{ + 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'info_dict': { + 'id': '2015-0703-001', + 'ext': 'mp4', + 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', + 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', + 'uploader': 'Democracy Now', + 'upload_date': None, + }, + + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + base_host = re.search(r'^(.+?://[^/]+)', url).group(1) + if display_id == '': + display_id = 'home' + webpage = self._download_webpage(url, display_id) + re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) + if video_id == None: + video_id = purl.group('fn') + if js.get('start') != None: + url += '&' if purl.group('hasparams') == '?' else '?' + url = url + 'start='+str(js.get('start')) + formats.append({ + 'format_id': purl.group('dir'), + 'ext': purl.group('ext'), + 'url': url, + }) + self._sort_formats(formats) + ret = { + 'id': video_id, + 'title': js.get('title'), + 'description': description, + 'uploader': 'Democracy Now', +# 'thumbnails': thumbnails, + 'subtitles': subtitles, + 'formats': formats, + } + return ret +# \ No newline at end of file From eb08081330f5ef52d66140589137ae1bb05eee5f Mon Sep 17 00:00:00 2001 From: fnord Date: Fri, 17 Jul 2015 02:57:08 -0500 Subject: [PATCH 0002/1214] democracynow: correct syntax --- youtube_dl/extractor/democracynow.py | 43 +++++++++------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 1c9b36052..973bb437b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -1,19 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals 
-import json -import time -import hmac -import hashlib -import itertools import re -from ..utils import ( - ExtractorError, - int_or_none, - parse_age_limit, - parse_iso8601, -) -from ..compat import compat_urllib_request from .common import InfoExtractor @@ -30,7 +18,7 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - },{ + }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', 'info_dict': { 'id': '2015-0703-001', @@ -40,7 +28,6 @@ class DemocracynowIE(InfoExtractor): 'uploader': 'Democracy Now', 'upload_date': None, }, - }] def _real_extract(self, url): @@ -49,7 +36,7 @@ class DemocracynowIE(InfoExtractor): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)',url) - if video_id == None: + purl = re.search(r'/(?P[^/]+)/(?:dn)?(?P[^/]+?)\.(?P[^\.\?]+)(?P\?|$)', url) + if video_id is None: video_id = purl.group('fn') - if js.get('start') != None: + if js.get('start') is not None: url += '&' if purl.group('hasparams') == '?' else '?' 
- url = url + 'start='+str(js.get('start')) + url = url + 'start=' + str(js.get('start')) formats.append({ 'format_id': purl.group('dir'), 'ext': purl.group('ext'), @@ -92,9 +79,7 @@ class DemocracynowIE(InfoExtractor): 'title': js.get('title'), 'description': description, 'uploader': 'Democracy Now', -# 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, } return ret -# \ No newline at end of file From 984e4d487520bd2a860b31b3165416c879b28096 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 24 Jun 2015 01:13:23 +0100 Subject: [PATCH 0003/1214] [googledrive] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/googledrive.py | 106 ++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) create mode 100644 youtube_dl/extractor/googledrive.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3cfa804ec..6655d7eb5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,6 +209,7 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE +from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py new file mode 100644 index 000000000..8c611fa47 --- /dev/null +++ b/youtube_dl/extractor/googledrive.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import RegexNotFoundError + +class GoogleDriveIE(InfoExtractor): + _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P.+?)(?:&|/|$)' + _TEST = { + 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'info_dict': { + 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'ext': 'mp4', + 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + } 
+ } + _formats = { + '5': {'ext': 'flv'}, + '6': {'ext': 'flv'}, + '13': {'ext': '3gp'}, + '17': {'ext': '3gp'}, + '18': {'ext': 'mp4'}, + '22': {'ext': 'mp4'}, + '34': {'ext': 'flv'}, + '35': {'ext': 'flv'}, + '36': {'ext': '3gp'}, + '37': {'ext': 'mp4'}, + '38': {'ext': 'mp4'}, + '43': {'ext': 'webm'}, + '44': {'ext': 'webm'}, + '45': {'ext': 'webm'}, + '46': {'ext': 'webm'}, + '59': {'ext': 'mp4'} + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' + ) + try: + title = self._html_search_regex( + r'"title","(?P.*?)"', + webpage, + 'title', + group='title' + ) + fmt_stream_map = self._html_search_regex( + r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + webpage, + 'fmt_stream_map', + group='fmt_stream_map' + ) + fmt_list = self._html_search_regex( + r'"fmt_list","(?P<fmt_list>.*?)"', + webpage, + 'fmt_list', + group='fmt_list' + ) +# timestamp = self._html_search_regex( +# r'"timestamp","(?P<timestamp>.*?)"', +# webpage, +# 'timestamp', +# group='timestamp' +# ) + length_seconds = self._html_search_regex( + r'"length_seconds","(?P<length_seconds>.*?)"', + webpage, + 'length_seconds', + group='length_seconds' + ) + except RegexNotFoundError: + try: + reason = self._html_search_regex( + r'"reason","(?P<reason>.*?)"', + webpage, + 'reason', + group='reason' + ) + self.report_warning(reason) + return + except RegexNotFoundError: + self.report_warning('not a video') + return + + fmt_stream_map = fmt_stream_map.split(',') + fmt_list = fmt_list.split(',') + formats = [] + for i in range(len(fmt_stream_map)): + fmt_id, fmt_url = fmt_stream_map[i].split('|') + resolution = fmt_list[i].split('/')[1] + width, height = resolution.split('x') + formats.append({ + 'url': fmt_url, + 'format_id': fmt_id, + 'resolution': resolution, + 'width': int(width), + 'height': int(height), + 'ext': self._formats[fmt_id]['ext'] + }) + 
self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, +# 'timestamp': int(timestamp), + 'duration': int(length_seconds), + 'formats': formats + } From f120a7ab5e9c560a8114f9662e2f213243a945b0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 24 Jun 2015 14:56:19 +0100 Subject: [PATCH 0004/1214] change the _TEST info --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 8c611fa47..e3d5c3418 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,11 +4,11 @@ from ..utils import RegexNotFoundError class GoogleDriveIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' _TEST = { - 'url': 'https://drive.google.com/file/d/0BzpExh0WzJF0NlR5WUlxdEVsY0U/edit?pli=1', + 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { - 'id': '0BzpExh0WzJF0NlR5WUlxdEVsY0U', + 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', - 'title': '[AHSH] Fairy Tail S2 - 01 [720p].mp4', + 'title': 'Big Buck Bunny.mp4', } } _formats = { From 3e5f3df1729846a33631dd38a887cd1d81a727c1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:53:21 +0100 Subject: [PATCH 0005/1214] move the embed to a separate class --- youtube_dl/extractor/googledrive.py | 31 ++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index e3d5c3418..ac891b275 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,8 +1,37 @@ +import re + from .common import InfoExtractor from ..utils import RegexNotFoundError +class GoogleDriveEmbedIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _TEST = { + 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', + 'info_dict': { + 'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE', + 'ext': 'mp4', + 'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv', + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url', + 'ie-key': 'GoogleDrive', + 'url': 'https://drive.google.com/file/d/%s' % video_id + } + class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P<id>.+?)(?:&|/|$)' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From 2d651a2d02885cddf1752b45497e9113d3a3d403 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 07:55:44 +0100 Subject: [PATCH 0006/1214] import google drive embed class --- youtube_dl/extractor/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6655d7eb5..02e18a0da 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -209,7 +209,10 @@ from .globo import GloboIE from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE -from .googledrive import GoogleDriveIE +from .googledrive import ( + GoogleDriveEmbedIE, + GoogleDriveIE, +) 
from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .gorillavid import GorillaVidIE From 653789afc72d1a225b971541fb633dd768d58942 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 29 Jun 2015 08:01:30 +0100 Subject: [PATCH 0007/1214] add google drive embeds --- youtube_dl/extractor/generic.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6d2efb22e..3f7b094db 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -48,6 +48,7 @@ from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE +from .googledrive import GoogleDriveEmbedIE class GenericIE(InfoExtractor): @@ -1599,6 +1600,11 @@ class GenericIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + # Look for Google Drive embeds + google_drive_url = GoogleDriveEmbedIE._extract_url(webpage) + if google_drive_url: + return self.url_result(google_drive_url, 'GoogleDrive') + # Look for UDN embeds mobj = re.search( r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage) From 3b3d531965f0f36c20f5fa8557481c144170653f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:17:19 +0100 Subject: [PATCH 0008/1214] fix embed regex --- youtube_dl/extractor/googledrive.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index ac891b275..c82c9037f 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -4,7 +4,7 @@ from .common import InfoExtractor from ..utils import RegexNotFoundError class GoogleDriveEmbedIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)' + _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview', 'info_dict': { @@ -17,7 +17,7 @@ class GoogleDriveEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9-]{28})(?:/preview)', + r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id') @@ -31,7 +31,7 @@ class GoogleDriveEmbedIE(InfoExtractor): } class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9-]{28})' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})' _TEST = { 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'info_dict': { From d1cc05e17eccccb7ee6473574c6a4f887104baeb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:37:21 +0100 Subject: [PATCH 0009/1214] remove unnecessary regex group names --- youtube_dl/extractor/googledrive.py | 32 ++++++++++++----------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index c82c9037f..6d9bcfefd 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -62,46 +62,40 @@ class GoogleDriveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/'+video_id, 
video_id, encoding='unicode_escape' + 'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape' ) try: title = self._html_search_regex( - r'"title","(?P<title>.*?)"', + r'"title"\s+,\s+"[^"]+', webpage, - 'title', - group='title' + 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', + r'"fmt_stream_map"\s+,\s+"[^"]+', webpage, - 'fmt_stream_map', - group='fmt_stream_map' + 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list","(?P<fmt_list>.*?)"', + r'"fmt_list"\s+,\s+"[^"]+', webpage, - 'fmt_list', - group='fmt_list' + 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp","(?P<timestamp>.*?)"', +# r'"timestamp"\s+,\s+"[^"]+', # webpage, -# 'timestamp', -# group='timestamp' +# 'timestamp' # ) length_seconds = self._html_search_regex( - r'"length_seconds","(?P<length_seconds>.*?)"', + r'"length_seconds"\s+,\s+"[^"]+', webpage, - 'length_seconds', - group='length_seconds' + 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","(?P<reason>.*?)"', + r'"reason","[^"]+', webpage, - 'reason', - group='reason' + 'reason' ) self.report_warning(reason) return From 36dbca87848fc5698d3e0b89380c7bcec741ceaf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 17 Jul 2015 14:52:01 +0100 Subject: [PATCH 0010/1214] fix recursive error --- youtube_dl/extractor/googledrive.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 6d9bcfefd..a3d9b4450 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -26,7 +26,7 @@ class GoogleDriveEmbedIE(InfoExtractor): video_id = self._match_id(url) return { '_type': 'url', - 'ie-key': 'GoogleDrive', + 'ie_key': 'GoogleDrive', 'url': 'https://drive.google.com/file/d/%s' % video_id } @@ -66,34 +66,34 @@ class 
GoogleDriveIE(InfoExtractor): ) try: title = self._html_search_regex( - r'"title"\s+,\s+"[^"]+', + r'"title"\s*,\s*"([^"]+)', webpage, 'title' ) fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map"\s+,\s+"[^"]+', + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt_stream_map' ) fmt_list = self._html_search_regex( - r'"fmt_list"\s+,\s+"[^"]+', + r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list' ) # timestamp = self._html_search_regex( -# r'"timestamp"\s+,\s+"[^"]+', +# r'"timestamp"\s*,\s*"([^"]+)', # webpage, # 'timestamp' # ) length_seconds = self._html_search_regex( - r'"length_seconds"\s+,\s+"[^"]+', + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length_seconds' ) except RegexNotFoundError: try: reason = self._html_search_regex( - r'"reason","[^"]+', + r'"reason","([^"]+)', webpage, 'reason' ) From 8e92d21ebf6f17e14c9e916f22e49f27529556af Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 18 Jul 2015 23:31:14 +0100 Subject: [PATCH 0011/1214] [googledrive] raise ExtractorError instead of warning --- youtube_dl/extractor/googledrive.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index a3d9b4450..7bc7b7a0d 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,7 +1,10 @@ import re from .common import InfoExtractor -from ..utils import RegexNotFoundError +from ..utils import ( + RegexNotFoundError, + ExtractorError, +) class GoogleDriveEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})' @@ -97,10 +100,10 @@ class GoogleDriveIE(InfoExtractor): webpage, 'reason' ) - self.report_warning(reason) + raise ExtractorError(reason) return except RegexNotFoundError: - self.report_warning('not a video') + raise ExtractorError('not a video') return fmt_stream_map = fmt_stream_map.split(',') From 
9f4921bfa0ce3a48d2f93b4946f361116cfde5e9 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 3 Sep 2015 00:29:53 +0100 Subject: [PATCH 0012/1214] [dcn] add show extraction and support for other types of urls --- youtube_dl/extractor/__init__.py | 6 ++- youtube_dl/extractor/dcn.py | 81 ++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 39b05ce8f..d4a3e8ab0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -118,7 +118,11 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE -from .dcn import DCNIE +from .dcn import ( + DCNGeneralIE, + DCNVideoIE, + DCNShowIE, +) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .dfb import DFBIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 82261e25c..352d35c7a 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import ( compat_urllib_parse, @@ -12,10 +14,33 @@ from ..utils import ( ) -class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P<id>\d+)' +class DCNGeneralIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P<show_id>\d+)/[^/]+(?:/(?P<video_id>\d+)/(?P<season_id>\d+))?' 
+ + def _real_extract(self, url): + show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() + url = '' + ie_key = '' + if video_id and int(video_id) > 0: + url = 'http://www.dcndigital.ae/#/media/%s' % video_id + ie_key = 'DCNVideo' + else: + ie_key = 'DCNShow' + if season_id and int(season_id) > 0: + url = 'http://www.dcndigital.ae/#/program/season/%s' % season_id + else: + url = 'http://www.dcndigital.ae/#/program/%s' % show_id + return { + 'url': url, + '_type': 'url', + 'ie_key': ie_key + } + + +class DCNVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media)/(?P<id>\d+)' _TEST = { - 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', + 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { 'id': '17375', @@ -82,3 +107,53 @@ class DCNIE(InfoExtractor): 'timestamp': timestamp, 'formats': formats, } + + +class DCNShowIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' + _TEST = { + 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', + 'info_dict': + { + 'id': '205024', + 'title': 'محاضرات الشيخ الشعراوي', + 'description': '', + }, + 'playlist_mincount': 27, + } + + def _real_extract(self, url): + show_id, season_id = re.match(self._VALID_URL, url).groups() + data = {} + if season_id: + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + season = self._download_json(request, season_id) + show_id = season['id'] + data['season'] = season_id + data['show_id'] = show_id + request = 
compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/show', + compat_urllib_parse.urlencode(data), + { + 'Origin': 'http://www.dcndigital.ae', + 'Content-Type': 'application/x-www-form-urlencoded' + }) + show = self._download_json(request, show_id) + title = show['cat'].get('title_en') or show['cat']['title_ar'] + description = show['cat'].get('description_en') or show['cat'].get('description_ar') + entries = [] + for video in show['videos']: + entries.append({ + 'url': 'http://www.dcndigital.ae/#/media/%s' % video['id'], + '_type': 'url', + 'ie_key': 'DCNVideo', + }) + return { + 'id': show_id, + 'title': title, + 'description': description, + 'entries': entries, + '_type': 'playlist', + } From b477da2094db30a232f67edf3b342dc460aa14d4 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 3 Sep 2015 16:59:10 +0100 Subject: [PATCH 0013/1214] correct the extractor name and id and remove unnecessary request --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/dcn.py | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index d4a3e8ab0..4e41d9bf9 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,7 +121,7 @@ from .dbtv import DBTVIE from .dcn import ( DCNGeneralIE, DCNVideoIE, - DCNShowIE, + DCNSeasonIE, ) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 352d35c7a..8a36c10f6 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -11,6 +11,8 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + smuggle_url, + unsmuggle_url, ) @@ -25,9 +27,9 @@ class DCNGeneralIE(InfoExtractor): url = 'http://www.dcndigital.ae/#/media/%s' % video_id ie_key = 'DCNVideo' else: - ie_key = 'DCNShow' + ie_key = 'DCNSeason' if season_id and 
int(season_id) > 0: - url = 'http://www.dcndigital.ae/#/program/season/%s' % season_id + url = smuggle_url('http://www.dcndigital.ae/#/program/season/%s' % season_id, {'show_id': show_id}) else: url = 'http://www.dcndigital.ae/#/program/%s' % show_id return { @@ -38,6 +40,7 @@ class DCNGeneralIE(InfoExtractor): class DCNVideoIE(InfoExtractor): + IE_NAME = 'dcn:video' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', @@ -109,13 +112,14 @@ class DCNVideoIE(InfoExtractor): } -class DCNShowIE(InfoExtractor): +class DCNSeasonIE(InfoExtractor): + IE_NAME = 'dcn:season' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' _TEST = { 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', 'info_dict': { - 'id': '205024', + 'id': '7910', 'title': 'محاضرات الشيخ الشعراوي', 'description': '', }, @@ -123,15 +127,18 @@ class DCNShowIE(InfoExtractor): } def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) show_id, season_id = re.match(self._VALID_URL, url).groups() data = {} if season_id: - request = compat_urllib_request.Request( - 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, - headers={'Origin': 'http://www.dcndigital.ae'}) - season = self._download_json(request, season_id) - show_id = season['id'] data['season'] = season_id + show_id = smuggled_data.get('show_id') + if show_id is None: + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + season = self._download_json(request, season_id) + show_id = season['id'] data['show_id'] 
= show_id request = compat_urllib_request.Request( 'http://admin.mangomolo.com/analytics/index.php/plus/show', @@ -141,6 +148,7 @@ class DCNShowIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded' }) show = self._download_json(request, show_id) + season_id = season_id or show['default_season'] title = show['cat'].get('title_en') or show['cat']['title_ar'] description = show['cat'].get('description_en') or show['cat'].get('description_ar') entries = [] @@ -151,7 +159,7 @@ class DCNShowIE(InfoExtractor): 'ie_key': 'DCNVideo', }) return { - 'id': show_id, + 'id': season_id, 'title': title, 'description': description, 'entries': entries, From 8e2898edf930830260ab6b294c8866e7651a01a6 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 4 Sep 2015 15:42:09 +0100 Subject: [PATCH 0014/1214] [dcn] add support for live streams and catchup videos --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/dcn.py | 62 +++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4e41d9bf9..677c75564 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -121,6 +121,7 @@ from .dbtv import DBTVIE from .dcn import ( DCNGeneralIE, DCNVideoIE, + DCNLiveIE, DCNSeasonIE, ) from .dctp import DctpTvIE diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 8a36c10f6..2e8fff660 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import re +import base64 from .common import InfoExtractor from ..compat import ( @@ -41,7 +42,7 @@ class DCNGeneralIE(InfoExtractor): class DCNVideoIE(InfoExtractor): IE_NAME = 'dcn:video' - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media)/(?P<id>\d+)' + _VALID_URL = 
r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P<id>\d+)' _TEST = { 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': @@ -112,6 +113,65 @@ class DCNVideoIE(InfoExtractor): } +class DCNLiveIE(InfoExtractor): + IE_NAME = 'dcn:live' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.dcndigital.ae/#/live/6/dubai-tv', + 'info_dict': + { + 'id': '6', + 'ext': 'mp4', + 'title': 'Dubai Al Oula', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + request = compat_urllib_request.Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + channel = self._download_json(request, channel_id) + title = channel.get('title_en') or channel['title_ar'] + + webpage = self._download_webpage( + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
+ + compat_urllib_parse.urlencode({ + 'id': base64.b64encode(channel['user_id'].encode()).decode(), + 'channelid': base64.b64encode(channel['id'].encode()).decode(), + 'signature': channel['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), channel_id) + + m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') + formats = self._extract_m3u8_formats( + m3u8_url, channel_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + + rtsp_url = self._search_regex( + r'<a[^>]+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } + + class DCNSeasonIE(InfoExtractor): IE_NAME = 'dcn:season' _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P<show_id>\d+)|season/(?P<season_id>\d+))' From 486375154cb7d79bd084879467bc70550104b555 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 5 Sep 2015 11:30:42 +0100 Subject: [PATCH 0015/1214] correct season info extraction and simplify --- youtube_dl/extractor/dcn.py | 64 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 2e8fff660..8b360a9d7 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -25,19 +25,13 @@ class DCNGeneralIE(InfoExtractor): url = '' ie_key = '' if video_id and int(video_id) > 0: - url = 'http://www.dcndigital.ae/#/media/%s' % video_id - ie_key = 'DCNVideo' + return self.url_result('http://www.dcndigital.ae/#/media/%s' % video_id, 'DCNVideo') else: - ie_key = 'DCNSeason' if season_id and int(season_id) > 0: url = smuggle_url('http://www.dcndigital.ae/#/program/season/%s' % season_id, {'show_id': show_id}) else: url = 'http://www.dcndigital.ae/#/program/%s' % show_id - return { - 'url': url, - 
'_type': 'url', - 'ie_key': ie_key - } + return self.url_result(url, 'DCNSeason') class DCNVideoIE(InfoExtractor): @@ -71,6 +65,11 @@ class DCNVideoIE(InfoExtractor): video = self._download_json(request, video_id) title = video.get('title_en') or video['title_ar'] + img = video.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video.get('duration')) + description = video.get('description_en') or video.get('description_ar') + timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' @@ -96,12 +95,6 @@ class DCNVideoIE(InfoExtractor): self._sort_formats(formats) - img = video.get('img') - thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None - duration = int_or_none(video.get('duration')) - description = video.get('description_en') or video.get('description_ar') - timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') - return { 'id': video_id, 'title': title, @@ -122,7 +115,9 @@ class DCNLiveIE(InfoExtractor): { 'id': '6', 'ext': 'mp4', - 'title': 'Dubai Al Oula', + 'title': 're:^Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'thumbnail': 're:^https?://.*\.png$', + 'is_live': True, }, 'params': { # m3u8 download @@ -139,10 +134,14 @@ class DCNLiveIE(InfoExtractor): channel = self._download_json(request, channel_id) title = channel.get('title_en') or channel['title_ar'] + img = channel.get('thumbnail') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + description = channel.get('description_en') or channel.get('description_ar') + timestamp = parse_iso8601(channel.get('create_time') or channel.get('update_time'), ' ') webpage = self._download_webpage( - 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' 
- + compat_urllib_parse.urlencode({ + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + + compat_urllib_parse.urlencode({ 'id': base64.b64encode(channel['user_id'].encode()).decode(), 'channelid': base64.b64encode(channel['id'].encode()).decode(), 'signature': channel['signature'], @@ -166,7 +165,9 @@ class DCNLiveIE(InfoExtractor): return { 'id': channel_id, - 'title': title, + 'title': self._live_title(title), + 'description': description, + 'thumbnail': thumbnail, 'formats': formats, 'is_live': True, } @@ -181,7 +182,6 @@ class DCNSeasonIE(InfoExtractor): { 'id': '7910', 'title': 'محاضرات الشيخ الشعراوي', - 'description': '', }, 'playlist_mincount': 27, } @@ -189,6 +189,7 @@ class DCNSeasonIE(InfoExtractor): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) show_id, season_id = re.match(self._VALID_URL, url).groups() + data = {} if season_id: data['season'] = season_id @@ -207,21 +208,18 @@ class DCNSeasonIE(InfoExtractor): 'Origin': 'http://www.dcndigital.ae', 'Content-Type': 'application/x-www-form-urlencoded' }) + show = self._download_json(request, show_id) season_id = season_id or show['default_season'] - title = show['cat'].get('title_en') or show['cat']['title_ar'] - description = show['cat'].get('description_en') or show['cat'].get('description_ar') + season = {} + for _ in show['seasons']: + if _['id'] == season_id: + season = _ + break + title = season.get('title_en') or season['title_ar'] + entries = [] for video in show['videos']: - entries.append({ - 'url': 'http://www.dcndigital.ae/#/media/%s' % video['id'], - '_type': 'url', - 'ie_key': 'DCNVideo', - }) - return { - 'id': season_id, - 'title': title, - 'description': description, - 'entries': entries, - '_type': 'playlist', - } + entries.append(self.url_result('http://www.dcndigital.ae/#/media/%s' % video['id'], 'DCNVideo')) + + return self.playlist_result(entries, season_id, title) From 8b55cadc83f198e0fa6bac7158f9b05826f39257 Mon Sep 17 00:00:00 
2001 From: remitamine <remitamine@gmail.com> Date: Mon, 7 Sep 2015 16:39:01 +0100 Subject: [PATCH 0016/1214] [canal13cl] fix info extraction --- youtube_dl/extractor/__init__.py | 2 +- youtube_dl/extractor/canal13cl.py | 48 ------------------- youtube_dl/extractor/tele13.py | 77 +++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 49 deletions(-) delete mode 100644 youtube_dl/extractor/canal13cl.py create mode 100644 youtube_dl/extractor/tele13.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5d2ea39d0..661b53e63 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -67,7 +67,6 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) -from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE @@ -612,6 +611,7 @@ from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .tele13 import Tele13IE from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py deleted file mode 100644 index 93241fefe..000000000 --- a/youtube_dl/extractor/canal13cl.py +++ /dev/null @@ -1,48 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class Canal13clIE(InfoExtractor): - _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'md5': '4cb1fa38adcad8fea88487a078831755', - 'info_dict': { - 'id': '1403022125', - 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', - 'ext': 'mp4', - 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', - 'description': '(Foto: 
Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) - description = self._html_search_meta( - 'twitter:description', webpage, 'description') - url = self._html_search_regex( - r'articuloVideo = \"(.*?)\"', webpage, 'url') - real_id = self._search_regex( - r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) - thumbnail = self._html_search_regex( - r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') - - return { - 'id': real_id, - 'display_id': display_id, - 'url': url, - 'title': title, - 'description': description, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py new file mode 100644 index 000000000..5d89e757f --- /dev/null +++ b/youtube_dl/extractor/tele13.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class Tele13IE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?t13\.cl/videos(?:/[^/]+)+/(?P<id>[\w-]+)' + _TESTS = [ + { + 'url': 'http://www.t13.cl/videos/actualidad/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El c\u00edrculo de hierro de Michelle Bachelet en su regreso a La Moneda', + } + }, + { + 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', + 'md5': 
'65d1ae54812c96f4b345dd21d3bb1adc', + 'info_dict': { + 'id': 'rOoKv2OMpOw', + 'ext': 'mp4', + 'title': 'Shooting star seen on 7-Sep-2015', + 'description': 'md5:a1cd2e74f6ee6851552c9cf5851d6b06', + 'uploader': 'Porjai Jaturongkhakun', + 'upload_date': '20150906', + 'uploader_id': 'UCnLY_3ezwNcDSC_Wc6suZxw', + }, + 'add_ie': ['Youtube'], + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + setup_js = self._parse_json( + js_to_json( + self._search_regex( + r"jwplayer\('player-vivo'\).setup\((\{.*?\})\)", + webpage, + 'setup code', + flags=re.DOTALL + ).replace('\n//', '') + ), + display_id + ) + title = setup_js['title'] + thumbnail = setup_js.get('image') or setup_js['playlist'][0].get('image') + description = self._html_search_meta( + 'description', webpage, 'description') + + formats = [] + for f in setup_js['playlist'][0]['sources']: + format_url = f['file'] + if format_url != '': + if '.m3u8' in format_url: + formats.extend(self._extract_m3u8_formats(format_url, display_id)) + else: + if 'youtube.com' in format_url: + return self.url_result(format_url, 'Youtube') + else: + formats.append({'url': format_url, 'format_id': f.get('label')}) + + return { + 'id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, + } From 436416afe2ea70dd6b55f8c9d699ddb0bdc1ec5f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Mon, 7 Sep 2015 21:13:49 +0100 Subject: [PATCH 0017/1214] [tele13] skip test --- youtube_dl/extractor/tele13.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tele13.py b/youtube_dl/extractor/tele13.py index 5d89e757f..f1764eb2f 100644 --- a/youtube_dl/extractor/tele13.py +++ b/youtube_dl/extractor/tele13.py @@ -16,8 +16,12 @@ class Tele13IE(InfoExtractor): 'info_dict': { 'id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', 'ext': 
'mp4', - 'title': 'El c\u00edrculo de hierro de Michelle Bachelet en su regreso a La Moneda', - } + 'title': 'El círculo de hierro de Michelle Bachelet en su regreso a La Moneda', + }, + 'params': { + # HTTP Error 404: Not Found + 'skip_download': True, + }, }, { 'url': 'http://www.t13.cl/videos/mundo/tendencias/video-captan-misteriosa-bola-fuego-cielos-bangkok', From 689fb748ee1ba8e61f99d21a3bcb1bc83b708649 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 11 Sep 2015 04:44:17 +0100 Subject: [PATCH 0018/1214] [utlis] add extract_attributes for extracting html tags attributes --- youtube_dl/utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 206dd56bc..bcebf9cc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -248,6 +248,14 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'): + attributes = re.findall(attributes_regex, attributes_str) + attributes_dict = {} + if attributes: + attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + return attributes_dict + + def clean_html(html): """Clean an HTML snippet into a readable string""" From ed1269000f24a6ddc683a295ff402ef3ded5c4fb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 11 Sep 2015 04:46:21 +0100 Subject: [PATCH 0019/1214] [brightcove] add support for brightcove in page embed(fixes #6824) --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/brightcove.py | 92 ++++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 21 ++++++- 3 files changed, 116 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 57f55b479..fcd9edec3 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -59,7 +59,10 @@ 
from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..a07c0888f 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -22,6 +22,10 @@ from ..utils import ( fix_xml_ampersands, unescapeHTML, unsmuggle_url, + js_to_json, + int_or_none, + parse_iso8601, + extract_attributes, ) @@ -346,3 +350,91 @@ class BrightcoveIE(InfoExtractor): if 'url' not in info and not info.get('formats'): raise ExtractorError('Unable to extract video url for %s' % info['id']) return info + + +class BrightcoveInPageEmbedIE(InfoExtractor): + _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P<video_id>\d+)' + TEST = { + 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'info_dict': { + 'id': '4463358922001', + 'ext': 'flv', + 'title': 'Meet the man behind Popcorn Time', + 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'duration': 165768, + } + } + + @staticmethod + def _extract_url(webpage): + video_attributes = re.search(r'(?s)<video([^>]*)>.*?</(?:video|audio)>', webpage) + if video_attributes: + video_attributes = extract_attributes(video_attributes.group(), r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']') + account_id = video_attributes.get('account') + player_id = video_attributes.get('player') + embed = video_attributes.get('embed') + video_id = video_attributes.get('video-id') + if account_id and player_id and embed and video_id: + return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % 
(account_id, player_id, embed, video_id) + return None + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + account_id, player_id, embed, video_id = mobj.groups() + + webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) + + catalog = self._parse_json( + js_to_json( + self._search_regex( + r'catalog\(({[^}]+})\);', + webpage, + 'catalog' + ) + ), + video_id + ) + policy_key = catalog['policyKey'] + + req = compat_urllib_request.Request( + 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id), + headers={'Accept': 'application/json;pk=%s' % policy_key}) + json_data = self._download_json(req, video_id) + + title = json_data['name'] + description = json_data.get('description') + thumbnail = json_data.get('name') + timestamp = parse_iso8601(json_data.get('published_at')) + duration = int_or_none(json_data.get('duration')) + + formats = [] + for source in json_data.get('sources'): + source_type = source.get('type') + if source_type == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) + else: + src = source.get('src') + if src: + formats.append({ + 'url': src, + 'abr': source.get('avg_bitrate'), + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'filesize': source.get('size'), + 'container': source.get('container'), + 'vcodec': source.get('container'), + }) + else: + formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'duration': duration, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ec748ed9f..7a3a7f66b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ 
-29,7 +29,10 @@ from ..utils import ( url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveIE, + BrightcoveInPageEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -1012,6 +1015,17 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, } ] @@ -1288,6 +1302,11 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for Brightcove In Page Embed: + brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage) + if brightcove_in_page_embed_url: + return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed') + # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', From b306c439d7f2997ebf2a88385c73fe2d92227b76 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 23 Sep 2015 13:28:05 +0100 Subject: [PATCH 0020/1214] [cnet] fix extraction and extract more formats --- youtube_dl/extractor/cnet.py | 54 +++++++++++++++--------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 5dd69bff7..2fac0d79d 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -4,9 +4,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) +from .theplatform import ThePlatformIE class CNETIE(InfoExtractor): @@ -15,29 +13,22 @@ class CNETIE(InfoExtractor): 'url': 
'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', - 'thumbnail': 're:^http://.*/flmswindows8.jpg$', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', 'uploader': 'Sarah Mitroff', }, - 'params': { - 'skip_download': 'requires rtmpdump', - } }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', 'info_dict': { 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. #TDPothole', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', }, - 'params': { - 'skip_download': True, # requires rtmpdump - }, }] def _real_extract(self, url): @@ -45,26 +36,13 @@ class CNETIE(InfoExtractor): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", + r"<div class=\"videoPlayer\"\s+.*?data-cnet-video-uvp-options='([^']+)'", webpage, 'data json') data = json.loads(data_json) - vdata = data['video'] - if not vdata: - vdata = data['videos'][0] - if not vdata: - raise ExtractorError('Cannot find video data') - - mpx_account = data['config']['players']['default']['mpx_account'] - vid = vdata['files'].get('rtmp', vdata['files']['hds']) - tp_link = 'http://link.theplatform.com/s/%s/%s' % (mpx_account, vid) + vdata = data['videos'][0] video_id = vdata['id'] - title = vdata.get('headline') - 
if title is None: - title = vdata.get('title') - if title is None: - raise ExtractorError('Cannot find title!') - thumbnail = vdata.get('image', {}).get('path') + title = vdata['title'] author = vdata.get('author') if author: uploader = '%s %s' % (author['firstName'], author['lastName']) @@ -73,13 +51,27 @@ class CNETIE(InfoExtractor): uploader = None uploader_id = None + mpx_account = data['config']['uvpConfig']['default']['mpx_account'] + tp = ThePlatformIE(self._downloader) + formats = [] + subtitles = {} + description = vdata.get('description') + + for vid in vdata['files'].values(): + result = tp.extract(('http://link.theplatform.com/s/%s/%s' % (mpx_account, vid))) + formats.extend(result['formats']) + subtitles = self._merge_subtitles(subtitles, result['subtitles']) + description = description or result.get('description') + + self._sort_formats(formats) + return { - '_type': 'url_transparent', - 'url': tp_link, 'id': video_id, 'display_id': display_id, 'title': title, + 'description': description, 'uploader': uploader, 'uploader_id': uploader_id, - 'thumbnail': thumbnail, + 'subtitles': subtitles, + 'formats': formats, } From 53407e3f383ed80c67db9e06b8c3480257aa3184 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 23 Sep 2015 14:02:13 +0100 Subject: [PATCH 0021/1214] [brightcove] fix streaming_src extraction --- youtube_dl/extractor/brightcove.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index a07c0888f..e4a7befee 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,7 +413,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): if source_type == 'application/x-mpegURL': formats.extend(self._extract_m3u8_formats(source.get('src'), video_id)) else: - src = source.get('src') + src = source.get('src') or source.get('streaming_src') if src: formats.append({ 'url': src, @@ -424,8 +424,6 @@ class 
BrightcoveInPageEmbedIE(InfoExtractor): 'container': source.get('container'), 'vcodec': source.get('container'), }) - else: - formats.extend(self._extract_f4m_formats(source.get('streaming_src'), video_id)) self._sort_formats(formats) From 6aeba407db84a636fc2522b4f2344eac9e0c1fdb Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 25 Sep 2015 10:52:48 +0100 Subject: [PATCH 0022/1214] [jukebox] remove extractor and handle it using generic extractor --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/generic.py | 21 +++++++ youtube_dl/extractor/jukebox.py | 59 ------------------ youtube_dl/extractor/ultimedia.py | 99 +++++++++++++------------------ 4 files changed, 61 insertions(+), 119 deletions(-) delete mode 100644 youtube_dl/extractor/jukebox.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7272859db..1813c7e1b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -262,7 +262,6 @@ from .izlesene import IzleseneIE from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE -from .jukebox import JukeboxIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8881a8a23..4d1f75e63 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -50,6 +50,7 @@ from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE +from .ultimedia import UltimediaIE class GenericIE(InfoExtractor): @@ -1029,6 +1030,21 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # Ultimedia embed + { + 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', + 'md5': '25551df6e7c7ab8096ceeeae048c5f64', + 'info_dict': { + 'id': 'r303r', + 
'ext': 'mp4', + 'title': 'Kosheen - Pride (live)', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 293, + 'upload_date': '20081103', + 'timestamp': 1225733392, + 'uploader_id': '33m03', + }, } ] @@ -1751,6 +1767,11 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia') + # Look for Ulltimedia embeds + ultimedia_url = UltimediaIE._extract_url(webpage) + if ultimedia_url: + return self.url_result(self._proto_relative_url(ultimedia_url), 'Ultimedia') + # Look for AdobeTVVideo embeds mobj = re.search( r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', diff --git a/youtube_dl/extractor/jukebox.py b/youtube_dl/extractor/jukebox.py deleted file mode 100644 index da8068efc..000000000 --- a/youtube_dl/extractor/jukebox.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - RegexNotFoundError, - unescapeHTML, -) - - -class JukeboxIE(InfoExtractor): - _VALID_URL = r'^http://www\.jukebox?\..+?\/.+[,](?P<id>[a-z0-9\-]+)\.html' - _TEST = { - 'url': 'http://www.jukebox.es/kosheen/videoclip,pride,r303r.html', - 'info_dict': { - 'id': 'r303r', - 'ext': 'flv', - 'title': 'Kosheen-En Vivo Pride', - 'uploader': 'Kosheen', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - html = self._download_webpage(url, video_id) - iframe_url = unescapeHTML(self._search_regex(r'<iframe .*src="([^"]*)"', html, 'iframe url')) - - iframe_html = self._download_webpage(iframe_url, video_id, 'Downloading iframe') - if re.search(r'class="jkb_waiting"', iframe_html) is not None: - raise ExtractorError('Video is not available(in your country?)!') - - self.report_extraction(video_id) - - try: - video_url = self._search_regex(r'"config":{"file":"(?P<video_url>http:[^"]+\?mdtk=[0-9]+)"', - iframe_html, 'video url') - video_url = 
unescapeHTML(video_url).replace('\/', '/') - except RegexNotFoundError: - youtube_url = self._search_regex( - r'config":{"file":"(http:\\/\\/www\.youtube\.com\\/watch\?v=[^"]+)"', - iframe_html, 'youtube url') - youtube_url = unescapeHTML(youtube_url).replace('\/', '/') - self.to_screen('Youtube video detected') - return self.url_result(youtube_url, ie='Youtube') - - title = self._html_search_regex(r'<h1 class="inline">([^<]+)</h1>', - html, 'title') - artist = self._html_search_regex(r'<span id="infos_article_artist">([^<]+)</span>', - html, 'artist') - - return { - 'id': video_id, - 'url': video_url, - 'title': artist + '-' + title, - 'uploader': artist, - } diff --git a/youtube_dl/extractor/ultimedia.py b/youtube_dl/extractor/ultimedia.py index c4751050e..45201332d 100644 --- a/youtube_dl/extractor/ultimedia.py +++ b/youtube_dl/extractor/ultimedia.py @@ -4,102 +4,83 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import ( - ExtractorError, - qualities, - unified_strdate, - clean_html, -) +from ..utils import int_or_none class UltimediaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/default/index/video[^/]+/id/(?P<id>[\d+a-z]+)' + _VALID_URL = r'https?://(?:www\.)?ultimedia\.com/deliver/(?P<type>generic|musique)(?:/[^/]+)*/(?:src|article)/(?P<id>[\d+a-z]+)' _TESTS = [{ # news - 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', + 'url': 'https://www.ultimedia.com/deliver/generic/iframe/mdtk/01601930/zone/1/src/s8uk0r/autoplay/yes/ad/no/width/714/height/435', 'md5': '276a0e49de58c7e85d32b057837952a2', 'info_dict': { 'id': 's8uk0r', 'ext': 'mp4', 'title': 'Loi sur la fin de vie: le texte prévoit un renforcement des directives anticipées', - 'description': 'md5:3e5c8fd65791487333dda5db8aed32af', 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 74, 'upload_date': '20150317', + 'timestamp': 1426604939, + 'uploader_id': 
'3fszv', }, }, { # music - 'url': 'https://www.ultimedia.com/default/index/videomusic/id/xvpfp8', + 'url': 'https://www.ultimedia.com/deliver/musique/iframe/mdtk/01601930/zone/1/article/xvpfp8/autoplay/yes/ad/no/width/714/height/435', 'md5': '2ea3513813cf230605c7e2ffe7eca61c', 'info_dict': { 'id': 'xvpfp8', 'ext': 'mp4', - 'title': "Two - C'est la vie (Clip)", - 'description': 'Two', + 'title': 'Two - C\'est La Vie (clip)', 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 233, 'upload_date': '20150224', + 'timestamp': 1424760500, + 'uploader_id': '3rfzk', }, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', + webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_type, video_id = re.match(self._VALID_URL, url).groups() - deliver_url = self._proto_relative_url(self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?ultimedia\.com/deliver/[^"]+)"', - webpage, 'deliver URL'), compat_urllib_parse_urlparse(url).scheme + ':') - - deliver_page = self._download_webpage( - deliver_url, video_id, 'Downloading iframe page') - - if '>This video is currently not available' in deliver_page: - raise ExtractorError( - 'Video %s is currently not available' % video_id, expected=True) - - player = self._parse_json( - self._search_regex( - r"jwplayer\('player(?:_temp)?'\)\.setup\(({.+?})\)\.on", - deliver_page, 'player'), + deliver_info = self._download_json( + 'http://www.ultimedia.com/deliver/video?video=%s&topic=%s' % (video_id, video_type), video_id) - quality = qualities(['flash', 'html5']) + yt_id = deliver_info.get('yt_id') + if yt_id: + return self.url_result(yt_id, 'Youtube') + + jwconf = deliver_info['jwconf'] + formats = [] - for mode in player['modes']: - video_url = 
mode.get('config', {}).get('file') - if not video_url: - continue - if re.match(r'https?://www\.youtube\.com/.+?', video_url): - return self.url_result(video_url, 'Youtube') + for source in jwconf['playlist'][0]['sources']: formats.append({ - 'url': video_url, - 'format_id': mode.get('type'), - 'quality': quality(mode.get('type')), + 'url': source['file'], + 'format_id': source.get('label'), }) + self._sort_formats(formats) - thumbnail = player.get('image') - - title = clean_html(( - self._html_search_regex( - r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>', - webpage, 'title', default=None) or - self._search_regex( - r"var\s+nameVideo\s*=\s*'([^']+)'", - deliver_page, 'title'))) - - description = clean_html(self._html_search_regex( - r'(?s)<span>Description</span>(.+?)</p>', webpage, - 'description', fatal=False)) - - upload_date = unified_strdate(self._search_regex( - r'Ajouté le\s*<span>([^<]+)', webpage, - 'upload date', fatal=False)) + title = deliver_info['title'] + thumbnail = jwconf.get('image') + duration = int_or_none(deliver_info.get('duration')) + timestamp = int_or_none(deliver_info.get('release_time')) + uploader_id = deliver_info.get('owner_id') return { 'id': video_id, 'title': title, - 'description': description, 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'duration': duration, + 'timestamp': timestamp, + 'uploader_id': uploader_id, 'formats': formats, } From c01e1a96aa964ef6d5f0bf7675dbe34096b1d2c8 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 30 Sep 2015 11:20:43 +0100 Subject: [PATCH 0023/1214] [brightcove] fix test and fields extraction --- youtube_dl/extractor/brightcove.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index e4a7befee..b41cee91b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -354,14 +354,18 @@ class BrightcoveIE(InfoExtractor): 
class BrightcoveInPageEmbedIE(InfoExtractor): _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index.html?.*videoId=(?P<video_id>\d+)' - TEST = { + _TEST = { 'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001', + 'md5': 'c8100925723840d4b0d243f7025703be', 'info_dict': { 'id': '4463358922001', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Meet the man behind Popcorn Time', - 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', + 'description': 'md5:eac376a4fe366edc70279bfb681aea16', + 'timestamp': 1441391203, + 'upload_date': '20150904', 'duration': 165768, + 'uploader_id': '929656772001', } } @@ -403,7 +407,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): title = json_data['name'] description = json_data.get('description') - thumbnail = json_data.get('name') + thumbnail = json_data.get('thumbnail') timestamp = parse_iso8601(json_data.get('published_at')) duration = int_or_none(json_data.get('duration')) @@ -417,12 +421,13 @@ class BrightcoveInPageEmbedIE(InfoExtractor): if src: formats.append({ 'url': src, - 'abr': source.get('avg_bitrate'), + 'tbr': source.get('avg_bitrate'), 'width': int_or_none(source.get('width')), 'height': int_or_none(source.get('height')), 'filesize': source.get('size'), 'container': source.get('container'), - 'vcodec': source.get('container'), + 'vcodec': source.get('codec'), + 'ext': source.get('container').lower(), }) self._sort_formats(formats) @@ -435,4 +440,5 @@ class BrightcoveInPageEmbedIE(InfoExtractor): 'timestamp': timestamp, 'duration': duration, 'formats': formats, + 'uploader_id': account_id, } From 8fc226ef994a82f7b1050cdb72ec38922d3ab9cf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 2 Oct 2015 17:24:30 +0100 Subject: [PATCH 0024/1214] [nba] extract all video formats and extract more info --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/nba.py | 102 
+++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..78478b38b 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -351,7 +351,10 @@ from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import NationalGeographicIE from .naver import NaverIE -from .nba import NBAIE +from .nba import ( + NBAIE, + NBAWatchIE, +) from .nbc import ( NBCIE, NBCNewsIE, diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 944096e1c..36ece5b64 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -2,62 +2,100 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - remove_end, parse_duration, + parse_iso8601, + int_or_none, ) -class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' +class NBABaseIE(InfoExtractor): + def _get_formats(self, video_id): + base_url = 'http://nba.cdn.turner.com/nba/big%s' % video_id + return [{ + 'url': base_url + '_nba_android_high.mp4', + 'width': 480, + 'height': 320, + 'format_id': '320p', + },{ + 'url': base_url + '_640x360_664b.mp4', + 'width': 640, + 'height': 360, + 'format_id': '360p', + },{ + 'url': base_url + '_768x432_1404.mp4', + 'width': 768, + 'height': 432, + 'format_id': '432p', + },{ + 'url': base_url + '_1280x720.mp4', + 'width': 1280, + 'height': 720, + 'format_id': '720p', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + ret = self._extract_metadata(webpage, video_id) + ret['id'] = video_id.rpartition('/')[2] + ret['formats'] = self._get_formats(video_id) + return ret + + +class NBAIE(NBABaseIE): + IE_NAME = 'nba' + _VALID_URL = 
r'https?://(?:www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': 'c0edcfc37607344e2ff8f13c378c88a4', + 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { 'id': '0021200253-okc-bkn-recap.nba', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, + 'timestamp': 1354680189, + 'upload_date': '20121205', }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, - }, { + }] + + def _extract_metadata(self, webpage, video_id): + return { + 'title': self._html_search_meta('name', webpage), + 'description': self._html_search_meta('description', webpage), + 'duration': parse_duration(self._html_search_meta('duration', webpage)), + 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), + 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)) + } + +class NBAWatchIE(NBABaseIE): + IE_NAME = 'nba:watch' + _VALID_URL = r'https?://watch.nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' + _TESTS = [{ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { 'id': '0041400301-cle-atl-recap.nba', 'ext': 'mp4', - 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'title': 'Hawks vs. 
Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - }, - 'params': { - 'skip_download': True, + 'timestamp': 1432094400, + 'upload_date': '20150520', } }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' - - shortened_video_id = video_id.rpartition('/')[2] - title = remove_end( - self._og_search_title(webpage, default=shortened_video_id), ' : NBA.com') - - description = self._og_search_description(webpage) - duration_str = self._html_search_meta( - 'duration', webpage, 'duration', default=None) - if not duration_str: - duration_str = self._html_search_regex( - r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) - duration = parse_duration(duration_str) - + def _extract_metadata(self, webpage, video_id): + program_id = self._search_regex(r'var\s+programId\s*=\s*(\d+);', webpage, 'program id') + metadata = self._download_json( + 'http://smbsolr.cdnak.neulion.com/solr_nbav6/nba/nba/mlt/?wt=json&fl=name,description,image,runtime,releaseDate&q=sequence%3A' + program_id, video_id)['match']['docs'][0] return { - 'id': shortened_video_id, - 'url': video_url, - 'title': title, - 'description': description, - 'duration': duration, + 'title': metadata['name'], + 'description': metadata.get('description'), + 'duration': int_or_none(metadata.get('runtime')), + 'thumbnail': metadata.get('image'), + 'timestamp': parse_iso8601(metadata.get('releaseDate')) } From adccf33632c51def397cdfb08c1271de6d6ec95e Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 2 Oct 2015 21:58:20 +0100 Subject: [PATCH 0025/1214] [ign] add support for pcmag and extract all formats and more metadata --- youtube_dl/extractor/__init__.py | 6 +- youtube_dl/extractor/ign.py | 117 +++++++++++++++++++++++-------- 2 files changed, 93 insertions(+), 30 deletions(-) diff --git 
a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index a73a1317e..191661390 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -231,7 +231,11 @@ from .howstuffworks import HowStuffWorksIE from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE -from .ign import IGNIE, OneUPIE +from .ign import ( + IGNIE, + OneUPIE, + PCMagIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index bf2d2041b..fa4e67394 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -3,6 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class IGNIE(InfoExtractor): @@ -11,25 +15,23 @@ class IGNIE(InfoExtractor): Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?P<type>videos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P<name_or_id>.+)' + _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?P<type>videos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P<name_or_id>.+)' IE_NAME = 'ign.com' - _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' - _DESCRIPTION_RE = [ - r'<span class="page-object-description">(.+?)</span>', - r'id="my_show_video">.*?<p>(.*?)</p>', - r'<meta name="description" content="(.*?)"', - ] + _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' + _EMBED_RE = r'<iframe[^>]+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' _TESTS = [ { 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'eac8bdc1890980122c3b66f14bdd02e9', + 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', 'info_dict': { 'id': '8f862beef863986b2785559b9e1aa599', 'ext': 'mp4', 'title': 'The Last of Us Review', 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'timestamp': 1370440800, + 'upload_date': '20130605', } }, { 
@@ -44,6 +46,8 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': 'GTA 5 Video Review', 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', }, }, { @@ -52,6 +56,8 @@ class IGNIE(InfoExtractor): 'ext': 'mp4', 'title': '26 Twisted Moments from GTA 5 in Slow Motion', 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', }, }, ], @@ -66,10 +72,9 @@ class IGNIE(InfoExtractor): 'id': '078fdd005f6d3c02f63d795faa1b984f', 'ext': 'mp4', 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': ( - 'Giant skeletons, bloody hunts, and captivating' - ' natural beauty take our breath away.' - ), + 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', + 'timestamp': 1408047180, + 'upload_date': '20140814', }, }, ] @@ -82,7 +87,7 @@ class IGNIE(InfoExtractor): r'<object id="vid_(.+?)"', r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"', ] - return self._search_regex(res_id, webpage, 'video id') + return self._search_regex(res_id, webpage, 'video id', default=None) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -102,22 +107,45 @@ class IGNIE(InfoExtractor): } video_id = self._find_video_id(webpage) - result = self._get_video_info(video_id) - description = self._html_search_regex(self._DESCRIPTION_RE, - webpage, 'video description', flags=re.DOTALL) - result['description'] = description - return result + if not video_id: + return self.url_result(self._search_regex(self._EMBED_RE, webpage, 'embed url')) + return self._get_video_info(video_id) def _get_video_info(self, video_id): - config_url = self._CONFIG_URL_TEMPLATE % video_id - config = self._download_json(config_url, video_id) - media = config['playlist']['media'] + api_data = self._download_json(self._API_URL_TEMPLATE % video_id, video_id) + + formats = 
[] + m3u8_url = api_data['refs'].get('m3uUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats(m3u8_url, video_id)) + f4m_url = api_data['refs'].get('f4mUrl') + if f4m_url: + formats.extend(self._extract_f4m_formats(f4m_url, video_id)) + for asset in api_data['assets']: + formats.append({ + 'url': asset['url'], + 'tbr': asset.get('actual_bitrate_kbps'), + 'fps': asset.get('frame_rate'), + 'height': int_or_none(asset.get('height')), + 'width': int_or_none(asset.get('width')), + }) + self._sort_formats(formats) + + thumbnails = [] + for thumbnail in api_data['thumbnails']: + thumbnails.append({'url': thumbnail['url']}) + + metadata = api_data['metadata'] return { - 'id': media['metadata']['videoId'], - 'url': media['url'], - 'title': media['metadata']['title'], - 'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'), + 'id': api_data.get('videoId') or video_id, + 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], + 'description': metadata.get('description'), + 'timestamp': parse_iso8601(metadata.get('publishDate')), + 'duration': int_or_none(metadata.get('duration')), + 'display_id': metadata.get('slug') or video_id, + 'thumbnails': thumbnails, + 'formats': formats, } @@ -125,16 +153,16 @@ class OneUPIE(IGNIE): _VALID_URL = r'https?://gamevideos\.1up\.com/(?P<type>video)/id/(?P<name_or_id>.+)\.html' IE_NAME = '1up.com' - _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>' - _TESTS = [{ 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': '68a54ce4ebc772e4b71e3123d413163d', + 'md5': 'c9cc69e07acb675c31a16719f909e347', 'info_dict': { 'id': '34976', 'ext': 'mp4', 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', + 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', + 'timestamp': 1313099220, + 'upload_date': '20110811', } }] @@ -143,3 +171,34 @@ class OneUPIE(IGNIE): result = super(OneUPIE, self)._real_extract(url) result['id'] = 
mobj.group('name_or_id') return result + + +class PCMagIE(IGNIE): + _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?P<type>videos|article2)(/.+)?/(?P<name_or_id>.+)' + IE_NAME = 'pcmag' + + _EMBED_RE = r'iframe.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content.html?[^"]*url=([^"]+)["&]' + + _TESTS = [{ + 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', + 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'info_dict': { + 'id': 'ee10d774b508c9b8ec07e763b9125b91', + 'ext': 'mp4', + 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', + 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', + 'timestamp': 1420571160, + 'upload_date': '20150106', + } + },{ + 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', + 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'info_dict': { + 'id': '042e560ba94823d43afcb12ddf7142ca', + 'ext': 'mp4', + 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', + 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', + 'timestamp': 1412953920, + 'upload_date': '20141010', + } + }] From 28809ab07a8d10f9cafc3d712414c7b355c27166 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 3 Oct 2015 09:47:19 +0100 Subject: [PATCH 0026/1214] [nba] extract more formats --- youtube_dl/extractor/nba.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 36ece5b64..8844b61a5 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -10,28 +10,60 @@ from ..utils import ( class NBABaseIE(InfoExtractor): def _get_formats(self, video_id): + formats = self._extract_m3u8_formats( + 'http://nbavod-f.akamaihd.net/i/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/master.m3u8' % video_id, + video_id, + m3u8_id='hls') + formats.extend(self._extract_f4m_formats( + 
'http://nbavod-f.akamaihd.net/z/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/manifest.f4m?hdcore=3.4.1.1' % video_id, + video_id, + f4m_id='hds')) base_url = 'http://nba.cdn.turner.com/nba/big%s' % video_id - return [{ + formats.extend([{ + 'url': base_url + '_nba_ipad.mp4', + 'width': 400, + 'height': 224, + 'format_id': '224p', + 'preference': 1, + },{ 'url': base_url + '_nba_android_high.mp4', 'width': 480, 'height': 320, 'format_id': '320p', + 'preference': 2, + },{ + 'url': base_url + '_nba_576x324.mp4', + 'width': 576, + 'height': 324, + 'format_id': '324p', + 'preference': 3, },{ 'url': base_url + '_640x360_664b.mp4', 'width': 640, 'height': 360, 'format_id': '360p', + 'preference': 4, },{ 'url': base_url + '_768x432_1404.mp4', 'width': 768, 'height': 432, 'format_id': '432p', + 'preference': 5, + },{ + 'url': base_url + '_960x540_2104.mp4', + 'width': 960, + 'height': 540, + 'format_id': '540p', + 'preference': 6, },{ 'url': base_url + '_1280x720.mp4', 'width': 1280, 'height': 720, 'format_id': '720p', - }] + 'preference': 7, + }]) + self._sort_formats(formats) + return formats def _real_extract(self, url): video_id = self._match_id(url) From c233e6bcc398f9734d7138854978c1cb00fe757f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 3 Oct 2015 12:30:05 +0100 Subject: [PATCH 0027/1214] [nba] extract video info from xml feed --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/nba.py | 224 +++++++++++++++++-------------- 2 files changed, 126 insertions(+), 103 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 78478b38b..a73a1317e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -351,10 +351,7 @@ from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import NationalGeographicIE from .naver import NaverIE -from .nba import ( - NBAIE, - NBAWatchIE, -) +from .nba 
import NBAIE from .nbc import ( NBCIE, NBCNewsIE, diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 8844b61a5..3d38d080e 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -3,131 +3,157 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( parse_duration, - parse_iso8601, int_or_none, ) -class NBABaseIE(InfoExtractor): - def _get_formats(self, video_id): - formats = self._extract_m3u8_formats( - 'http://nbavod-f.akamaihd.net/i/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/master.m3u8' % video_id, - video_id, - m3u8_id='hls') - formats.extend(self._extract_f4m_formats( - 'http://nbavod-f.akamaihd.net/z/nba/big%s_,640x360_664m,768x432_996,768x432_1404,960x540_2104,1280x720,.mp4.csmil/manifest.f4m?hdcore=3.4.1.1' % video_id, - video_id, - f4m_id='hds')) - base_url = 'http://nba.cdn.turner.com/nba/big%s' % video_id - formats.extend([{ - 'url': base_url + '_nba_ipad.mp4', - 'width': 400, - 'height': 224, - 'format_id': '224p', - 'preference': 1, - },{ - 'url': base_url + '_nba_android_high.mp4', - 'width': 480, - 'height': 320, - 'format_id': '320p', - 'preference': 2, - },{ - 'url': base_url + '_nba_576x324.mp4', - 'width': 576, - 'height': 324, - 'format_id': '324p', - 'preference': 3, - },{ - 'url': base_url + '_640x360_664b.mp4', - 'width': 640, - 'height': 360, - 'format_id': '360p', - 'preference': 4, - },{ - 'url': base_url + '_768x432_1404.mp4', - 'width': 768, - 'height': 432, - 'format_id': '432p', - 'preference': 5, - },{ - 'url': base_url + '_960x540_2104.mp4', - 'width': 960, - 'height': 540, - 'format_id': '540p', - 'preference': 6, - },{ - 'url': base_url + '_1280x720.mp4', - 'width': 1280, - 'height': 720, - 'format_id': '720p', - 'preference': 7, - }]) - self._sort_formats(formats) - return formats - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - ret = 
self._extract_metadata(webpage, video_id) - ret['id'] = video_id.rpartition('/')[2] - ret['formats'] = self._get_formats(video_id) - return ret - - -class NBAIE(NBABaseIE): - IE_NAME = 'nba' - _VALID_URL = r'https?://(?:www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' +class NBAIE(InfoExtractor): + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video/(?P<id>[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { - 'id': '0021200253-okc-bkn-recap.nba', + 'id': '0021200253-okc-bkn-recap', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, - 'timestamp': 1354680189, - 'upload_date': '20121205', + 'timestamp': 1354638466, + 'upload_date': '20121204', }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, - }] - - def _extract_metadata(self, webpage, video_id): - return { - 'title': self._html_search_meta('name', webpage), - 'description': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), - 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), - 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)) - } - -class NBAWatchIE(NBABaseIE): - IE_NAME = 'nba:watch' - _VALID_URL = r'https?://watch.nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' - _TESTS = [{ + },{ 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap.nba', + 'id': '0041400301-cle-atl-recap', 'ext': 'mp4', 'title': 'Hawks vs. 
Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - 'timestamp': 1432094400, + 'timestamp': 1432134543, 'upload_date': '20150520', } }] - def _extract_metadata(self, webpage, video_id): - program_id = self._search_regex(r'var\s+programId\s*=\s*(\d+);', webpage, 'program id') - metadata = self._download_json( - 'http://smbsolr.cdnak.neulion.com/solr_nbav6/nba/nba/mlt/?wt=json&fl=name,description,image,runtime,releaseDate&q=sequence%3A' + program_id, video_id)['match']['docs'][0] + _BASE_PATHS = { + 'turner': 'http://nba.cdn.turner.com/nba/big', + 'akamai': 'http://nbavod-f.akamaihd.net', + } + + _QUALITIES = { + '420mp4': { + 'width': 400, + 'height': 224, + 'preference': 1, + }, + '416x234': { + 'width': 416, + 'height': 234, + 'preference': 2, + }, + '556': { + 'width': 416, + 'height': 234, + 'preference': 3, + }, + '480x320_910': { + 'width': 480, + 'height': 320, + 'preference': 4, + }, + 'nba_576x324': { + 'width': 576, + 'height': 324, + 'preference': 5, + }, + 'nba_640x360': { + 'width': 640, + 'height': 360, + 'preference': 6, + }, + '640x360_664b': { + 'width': 640, + 'height': 360, + 'preference': 7, + }, + '640x360_664m': { + 'width': 640, + 'height': 360, + 'preference': 8, + }, + '768x432_996': { + 'width': 768, + 'height': 432, + 'preference': 9, + }, + '768x432_1404': { + 'width': 768, + 'height': 432, + 'preference': 10, + }, + '960x540_2104': { + 'width': 960, + 'height': 540, + 'preference': 11, + }, + '1280x720_3072': { + 'width': 1280, + 'height': 720, + 'preference': 12, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._download_xml('http://www.nba.com/video/%s.xml' % video_id, video_id) + video_id = video_info.find('slug').text + title = video_info.find('headline').text + description = video_info.find('description').text + duration = parse_duration(video_info.find('length').text) + timestamp = int_or_none(video_info.find('dateCreated').attrib.get('uts')) + 
+ thumbnails = [] + for image in video_info.find('images'): + thumbnails.append({ + 'id': image.attrib.get('cut'), + 'url': image.text, + 'width': int_or_none(image.attrib.get('width')), + 'height': int_or_none(image.attrib.get('height')), + }) + + formats = [] + for video_file in video_info.find('files').iter('file'): + video_url = video_file.text + if not video_url.startswith('http://'): + if video_url.endswith('.m3u8') or video_url.endswith('.f4m'): + video_url = self._BASE_PATHS['akamai'] + video_url + else: + video_url = self._BASE_PATHS['turner'] + video_url + if video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id)) + elif video_url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id)) + else: + key = video_file.attrib.get('bitrate') + quality = self._QUALITIES[key] + formats.append({ + 'format_id': key, + 'url': video_url, + 'width': quality['width'], + 'height': quality['height'], + 'preference': quality['preference'], + }) + self._sort_formats(formats) + return { - 'title': metadata['name'], - 'description': metadata.get('description'), - 'duration': int_or_none(metadata.get('runtime')), - 'thumbnail': metadata.get('image'), - 'timestamp': parse_iso8601(metadata.get('releaseDate')) + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, } From 30787f7259c4e6a08f691cc691f14fa0c8fe4b87 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 3 Oct 2015 19:28:48 +0100 Subject: [PATCH 0028/1214] [cspan] correct the clip info extraction --- youtube_dl/extractor/cspan.py | 58 ++++++++++++++++------------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index fbefd37d0..994e080d5 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -18,22 
+18,21 @@ class CSpanIE(InfoExtractor): IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '8e44ce11f0f725527daccc453f553eb0', + 'md5': '067803f994e049b455a58b16e5aab442', 'info_dict': { 'id': '315139', 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder spoke to reporters following the Supreme Court decision in Shelby County v. Holder in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced until Congress established new guidelines for review.', + 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - # For whatever reason, the served video alternates between - # two different ones + 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', 'info_dict': { - 'id': '340723', + 'id': 'c4486943', 'ext': 'mp4', - 'title': 'International Health Care Models', + 'title': 'CSPAN - International Health Care Models', 'description': 'md5:7a985a2d595dba00af3d9c9f0783c967', } }, { @@ -44,7 +43,7 @@ class CSpanIE(InfoExtractor): 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', 'duration': 14848, - 'description': 'md5:70c7c3b8fa63fa60d42772440596034c' + 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, }, { # Video from senate.gov @@ -57,36 +56,33 @@ class CSpanIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_id = mobj.group('id') - webpage = self._download_webpage(url, page_id) - video_id = self._search_regex(r'progid=\'?([0-9]+)\'?>', webpage, 'video id') + video_id = self._match_id(url) + webpage = 
self._download_webpage(url, video_id) + matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) + if matches: + video_type, video_id = matches.groups() + if video_type == 'prog': + video_type = 'program' + else: + senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + if senate_isvp_url: + title = self._og_search_title(webpage) + surl = smuggle_url(senate_isvp_url, {'force_title': title}) + return self.url_result(surl, 'SenateISVP', video_id, title) - description = self._html_search_regex( - [ - # The full description - r'<div class=\'expandable\'>(.*?)<a href=\'#\'', - # If the description is small enough the other div is not - # present, otherwise this is a stripped version - r'<p class=\'initial\'>(.*?)</p>' - ], - webpage, 'description', flags=re.DOTALL, default=None) - - info_url = 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=program&id=' + video_id - data = self._download_json(info_url, video_id) + data = self._download_json( + 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id) doc = self._download_xml( - 'http://www.c-span.org/common/services/flashXml.php?programid=' + video_id, + 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), video_id) + description = self._html_search_meta('description', webpage) + title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - surl = smuggle_url(senate_isvp_url, {'force_title': title}) - return self.url_result(surl, 'SenateISVP', video_id, title) - files = data['video']['files'] try: capfile = data['video']['capfile']['#text'] @@ -112,12 +108,12 @@ class CSpanIE(InfoExtractor): if len(entries) == 1: entry = dict(entries[0]) - entry['id'] = video_id + entry['id'] = 'c' + video_id if video_type == 
'clip' else video_id return entry else: return { '_type': 'playlist', 'entries': entries, 'title': title, - 'id': video_id, + 'id': 'c' + video_id if video_type == 'clip' else video_id, } From 139f27827e1d771aba5cf7f1473129073686f5ab Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 7 Oct 2015 06:53:19 +0100 Subject: [PATCH 0029/1214] [nba] skip Legacy Video Files --- youtube_dl/extractor/nba.py | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 3d38d080e..73116c7c6 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -38,11 +38,6 @@ class NBAIE(InfoExtractor): } }] - _BASE_PATHS = { - 'turner': 'http://nba.cdn.turner.com/nba/big', - 'akamai': 'http://nbavod-f.akamaihd.net', - } - _QUALITIES = { '420mp4': { 'width': 400, @@ -54,55 +49,50 @@ class NBAIE(InfoExtractor): 'height': 234, 'preference': 2, }, - '556': { - 'width': 416, - 'height': 234, - 'preference': 3, - }, '480x320_910': { 'width': 480, 'height': 320, - 'preference': 4, + 'preference': 3, }, 'nba_576x324': { 'width': 576, 'height': 324, - 'preference': 5, + 'preference': 4, }, 'nba_640x360': { 'width': 640, 'height': 360, - 'preference': 6, + 'preference': 5, }, '640x360_664b': { 'width': 640, 'height': 360, - 'preference': 7, + 'preference': 6, }, '640x360_664m': { 'width': 640, 'height': 360, - 'preference': 8, + 'preference': 7, }, '768x432_996': { 'width': 768, 'height': 432, - 'preference': 9, + 'preference': 8, }, '768x432_1404': { 'width': 768, 'height': 432, - 'preference': 10, + 'preference': 9, }, '960x540_2104': { 'width': 960, 'height': 540, - 'preference': 11, + 'preference': 10, }, '1280x720_3072': { 'width': 1280, 'height': 720, - 'preference': 12, + 'preference': 11, }, } @@ -127,11 +117,8 @@ class NBAIE(InfoExtractor): formats = [] for video_file in video_info.find('files').iter('file'): video_url = video_file.text - if 
not video_url.startswith('http://'): - if video_url.endswith('.m3u8') or video_url.endswith('.f4m'): - video_url = self._BASE_PATHS['akamai'] + video_url - else: - video_url = self._BASE_PATHS['turner'] + video_url + if video_url.startswith('/'): + continue if video_url.endswith('.m3u8'): formats.extend(self._extract_m3u8_formats(video_url, video_id)) elif video_url.endswith('.f4m'): From ecf6de5b02ad3996f770efd33f9b400d04ac8a85 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 7 Oct 2015 07:09:45 +0100 Subject: [PATCH 0030/1214] [nba] extract width,height and bitrate from format key --- youtube_dl/extractor/nba.py | 68 ++++--------------------------------- 1 file changed, 6 insertions(+), 62 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 73116c7c6..ea1482fc8 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_duration, @@ -38,64 +40,6 @@ class NBAIE(InfoExtractor): } }] - _QUALITIES = { - '420mp4': { - 'width': 400, - 'height': 224, - 'preference': 1, - }, - '416x234': { - 'width': 416, - 'height': 234, - 'preference': 2, - }, - '480x320_910': { - 'width': 480, - 'height': 320, - 'preference': 3, - }, - 'nba_576x324': { - 'width': 576, - 'height': 324, - 'preference': 4, - }, - 'nba_640x360': { - 'width': 640, - 'height': 360, - 'preference': 5, - }, - '640x360_664b': { - 'width': 640, - 'height': 360, - 'preference': 6, - }, - '640x360_664m': { - 'width': 640, - 'height': 360, - 'preference': 7, - }, - '768x432_996': { - 'width': 768, - 'height': 432, - 'preference': 8, - }, - '768x432_1404': { - 'width': 768, - 'height': 432, - 'preference': 9, - }, - '960x540_2104': { - 'width': 960, - 'height': 540, - 'preference': 10, - }, - '1280x720_3072': { - 'width': 1280, - 'height': 720, - 'preference': 11, - }, - } - def _real_extract(self, url): 
video_id = self._match_id(url) video_info = self._download_xml('http://www.nba.com/video/%s.xml' % video_id, video_id) @@ -125,13 +69,13 @@ class NBAIE(InfoExtractor): formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id)) else: key = video_file.attrib.get('bitrate') - quality = self._QUALITIES[key] + width, height, bitrate = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key).groups() formats.append({ 'format_id': key, 'url': video_url, - 'width': quality['width'], - 'height': quality['height'], - 'preference': quality['preference'], + 'width': int_or_none(width), + 'height': int_or_none(height), + 'tbr': int_or_none(bitrate), }) self._sort_formats(formats) From 6a11bb77baf9f70da76f2595b74061b31223d4ff Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 7 Oct 2015 12:17:32 +0100 Subject: [PATCH 0031/1214] [nba] add support for team subsites --- youtube_dl/extractor/nba.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index ea1482fc8..a0cc58c12 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -10,13 +10,13 @@ from ..utils import ( class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video/(?P<id>[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', + 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { 'id': '0021200253-okc-bkn-recap', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Thunder vs. 
Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, @@ -27,7 +27,7 @@ class NBAIE(InfoExtractor): 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, },{ - 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { 'id': '0041400301-cle-atl-recap', @@ -41,8 +41,8 @@ class NBAIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_xml('http://www.nba.com/video/%s.xml' % video_id, video_id) + path, video_id = re.match(self._VALID_URL, url).groups() + video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) video_id = video_info.find('slug').text title = video_info.find('headline').text description = video_info.find('description').text @@ -64,9 +64,9 @@ class NBAIE(InfoExtractor): if video_url.startswith('/'): continue if video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id)) + formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls')) elif video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id)) + formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds')) else: key = video_file.attrib.get('bitrate') width, height, bitrate = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key).groups() From 1ef1563649374568870e9334cce7055f7c83a817 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 9 Oct 2015 20:08:37 +0100 Subject: [PATCH 0032/1214] [srgssr] Add generic extractor for SRGSSR Group sites --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/srf.py | 104 
------------------------- youtube_dl/extractor/srgssr.py | 130 +++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 105 deletions(-) delete mode 100644 youtube_dl/extractor/srf.py create mode 100644 youtube_dl/extractor/srgssr.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 3ace1cc2c..042ad3678 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -591,7 +591,10 @@ from .sportbox import ( SportBoxEmbedIE, ) from .sportdeutschland import SportDeutschlandIE -from .srf import SrfIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) from .srmediathek import SRMediathekIE from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py deleted file mode 100644 index 77eec0bc7..000000000 --- a/youtube_dl/extractor/srf.py +++ /dev/null @@ -1,104 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - parse_iso8601, - xpath_text, -) - - -class SrfIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/tv/[^/]+/video/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})' - _TESTS = [{ - 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': '4cd93523723beff51bb4bee974ee238d', - 'info_dict': { - 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'display_id': 'snowden-beantragt-asyl-in-russland', - 'ext': 'm4v', - 'upload_date': '20130701', - 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': 'd97e236e80d1d24729e5d0953d276a4f', - 'info_dict': { - 'id': 
'677f5829-e473-4823-ac83-a1087fe97faa', - 'display_id': 'jaguar-xk120-shadow-und-tornado-dampflokomotive', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'timestamp': 1373493600, - }, - }, { - 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, - }, { - 'url': 'https://tp.srgssr.ch/p/flash?urn=urn:srf:ais:video:28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - display_id = re.match(self._VALID_URL, url).group('display_id') or video_id - - video_data = self._download_xml( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/video/play/%s.xml' % video_id, - display_id) - - title = xpath_text( - video_data, './AssetMetadatas/AssetMetadata/title', fatal=True) - thumbnails = [{ - 'url': s.text - } for s in video_data.findall('.//ImageRepresentation/url')] - timestamp = parse_iso8601(xpath_text(video_data, './createdDate')) - # The <duration> field in XML is different from the exact duration, skipping - - formats = [] - for item in video_data.findall('./Playlists/Playlist') + video_data.findall('./Downloads/Download'): - for url_node in item.findall('url'): - quality = url_node.attrib['quality'] - full_url = url_node.text - original_ext = determine_ext(full_url) - format_id = '%s-%s' % (quality, item.attrib['protocol']) - if original_ext == 'f4m': - formats.extend(self._extract_f4m_formats( - full_url + '?hdcore=3.4.0', display_id, f4m_id=format_id)) - elif original_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - full_url, display_id, 'mp4', m3u8_id=format_id)) - else: - formats.append({ - 'url': full_url, - 'ext': original_ext, - 'format_id': format_id, - 'quality': 0 if 'HD' in quality else -1, - 'preference': 1, - }) - - self._sort_formats(formats) - - subtitles = {} - subtitles_data = 
video_data.find('Subtitles') - if subtitles_data is not None: - subtitles_list = [{ - 'url': sub.text, - 'ext': determine_ext(sub.text), - } for sub in subtitles_data] - if subtitles_list: - subtitles['de'] = subtitles_list - - return { - 'id': video_id, - 'display_id': display_id, - 'formats': formats, - 'title': title, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'subtitles': subtitles, - } diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py new file mode 100644 index 000000000..addf4d26e --- /dev/null +++ b/youtube_dl/extractor/srgssr.py @@ -0,0 +1,130 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_iso8601, + qualities, +) + + +class SRGSSRIE(InfoExtractor): + _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=)?urn:(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + + _ERRORS = { + 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', + 'AGERATING18': 'To protect children under the age of 18, this video is only available between 11 p.m. and 5 a.m.', +# 'ENDDATE': 'For legal reasons, this video was only available for a specified period of time.', + 'GEOBLOCK': 'For legal reasons, this video is only available in Switzerland.', + 'LEGAL': 'The video cannot be transmitted for legal reasons.', + 'STARTDATE': 'This video is not yet available. 
Please try again later.', + } + + def _real_extract(self, url): + bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + + media_data = self._download_json( + 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), + media_id)[media_type.capitalize()] + + if media_data.get('block') and media_data['block'] in self._ERRORS: + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS[media_data['block']]), + expected=True) + + metadata = media_data['AssetMetadatas']['AssetMetadata'][0] + title = metadata['title'] + description = metadata.get('description') + created_date = media_data.get('createdDate') or metadata.get('createdDate') + timestamp = parse_iso8601(created_date) + + thumbnails = [] + for image in media_data['Image']['ImageRepresentations']['ImageRepresentation']: + thumbnails.append({ + 'id': image.get('id'), + 'url': image['url'], + }) + + preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) + formats = [] + for source in media_data['Playlists']['Playlist']: + assets = {} + protocol = source.get('@protocol') + if protocol in ('HTTP-HDS', 'HTTP-HLS'): + for quality in source['url']: + assets[quality['@quality']] = quality['text'] + asset_url = assets.get('HD') or assets.get('HQ') or assets.get('SD') or assets.get('MQ') or assets.get('LQ') + if '.f4m' in asset_url: + formats.extend(self._extract_f4m_formats(asset_url + '?hdcore=3.4.0', media_id, f4m_id='hds')) + elif '.m3u8' in asset_url: + formats.extend(self._extract_m3u8_formats(asset_url, media_id, m3u8_id='hls')) + else: + for asset in source['url']: + asset_url = asset['text'] + ext = None + if asset_url.startswith('rtmp'): + ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext') + formats.append({ + 'url': asset_url, + 'preference': preference(asset['@quality']), + 'ext': ext, + }) + + downloads = media_data.get('Downloads') + if downloads: + for source in downloads['Download']: + for asset in source['url']: + 
formats.append({ + 'url': asset['text'], + 'preference': preference(asset['@quality']) + }) + self._sort_formats(formats) + + return { + 'id': media_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, + } + + +class SRGSSRPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swi)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + + _TESTS = [{ + 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'md5': '4cd93523723beff51bb4bee974ee238d', + 'info_dict': { + 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'ext': 'm4v', + 'upload_date': '20130701', + 'title': 'Snowden beantragt Asyl in Russland', + 'timestamp': 1372713995, + } + }, { + # No Speichern (Save) button + 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', + 'md5': '0a274ce38fda48c53c01890651985bc6', + 'info_dict': { + 'id': '677f5829-e473-4823-ac83-a1087fe97faa', + 'ext': 'flv', + 'upload_date': '20130710', + 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', + 'description': 'md5:88604432b60d5a38787f152dec89cd56', + 'timestamp': 1373493600, + }, + }, { + 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', + 'only_matching': True, + }] + + def _real_extract(self, url): + bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + return self.url_result('urn:%s:%s:%s' % (bu, media_type, media_id), 'SRGSSR') From 05ad5409b4fd044169ea0f67b9ae92d555564c4e Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 9 Oct 2015 20:34:03 +0100 Subject: [PATCH 0033/1214] [srgssr] fix regex for swissinfo.ch --- youtube_dl/extractor/srgssr.py | 7 ++----- 1 file changed, 2 insertions(+), 5 
deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index addf4d26e..3b5dcc503 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -96,7 +96,7 @@ class SRGSSRIE(InfoExtractor): class SRGSSRPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swi)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', @@ -120,11 +120,8 @@ class SRGSSRPlayIE(InfoExtractor): 'description': 'md5:88604432b60d5a38787f152dec89cd56', 'timestamp': 1373493600, }, - }, { - 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'only_matching': True, }] def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - return self.url_result('urn:%s:%s:%s' % (bu, media_type, media_id), 'SRGSSR') + return self.url_result('urn:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From e09f58b3bc3af6ce1e541fb7d034fe869fba6e82 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 14 Oct 2015 10:40:54 +0100 Subject: [PATCH 0034/1214] [srgssr] change the url chortcut, fix image extraction ,add a test and extract format id --- youtube_dl/extractor/srgssr.py | 35 ++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py index 3b5dcc503..f759e5600 100644 --- a/youtube_dl/extractor/srgssr.py +++ b/youtube_dl/extractor/srgssr.py @@ -12,7 +12,7 @@ from ..utils import ( class SRGSSRIE(InfoExtractor): - _VALID_URL = 
r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=)?urn:(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' _ERRORS = { 'AGERATING12': 'To protect children under the age of 12, this video is only available between 8 p.m. and 6 a.m.', @@ -44,11 +44,12 @@ class SRGSSRIE(InfoExtractor): timestamp = parse_iso8601(created_date) thumbnails = [] - for image in media_data['Image']['ImageRepresentations']['ImageRepresentation']: - thumbnails.append({ - 'id': image.get('id'), - 'url': image['url'], - }) + if 'Image' in media_data: + for image in media_data['Image']['ImageRepresentations']['ImageRepresentation']: + thumbnails.append({ + 'id': image.get('id'), + 'url': image['url'], + }) preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] @@ -70,16 +71,17 @@ class SRGSSRIE(InfoExtractor): if asset_url.startswith('rtmp'): ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext') formats.append({ + 'format_id': asset['@quality'], 'url': asset_url, 'preference': preference(asset['@quality']), 'ext': ext, }) - downloads = media_data.get('Downloads') - if downloads: - for source in downloads['Download']: + if 'Downloads' in media_data: + for source in media_data['Downloads']['Download']: for asset in source['url']: formats.append({ + 'format_id': asset['@quality'], 'url': asset['text'], 'preference': preference(asset['@quality']) }) @@ -120,8 +122,21 @@ class SRGSSRPlayIE(InfoExtractor): 'description': 'md5:88604432b60d5a38787f152dec89cd56', 'timestamp': 1373493600, }, + },{ + 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'info_dict': { + 'id': '63cb0778-27f8-49af-9284-8c7a8c6d15fc', + 'ext': 'mp3', + 'upload_date': '20151013', + 'title': 'Saira: Tujetsch 
- tuttina cuntinuar cun Sedrun Mustér Turissem', + 'timestamp': 1444750398, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, }] def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() - return self.url_result('urn:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') + return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From db8e38b8cff2e67a9ff51104c4a7b33c20650204 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 14 Oct 2015 11:55:03 +0100 Subject: [PATCH 0035/1214] [ign] add tests for me.ign specific language urls --- youtube_dl/extractor/ign.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index fa4e67394..fb2753738 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -77,6 +77,14 @@ class IGNIE(InfoExtractor): 'upload_date': '20140814', }, }, + { + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'only_matching': True, + }, + { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, ] def _find_video_id(self, webpage): From 90bddb6cdd59107d137c13970dc50a6193d204a7 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 15 Oct 2015 14:28:56 +0100 Subject: [PATCH 0036/1214] [ooyala] extract more formats and metadata --- youtube_dl/extractor/ooyala.py | 151 ++++++++++++--------------------- 1 file changed, 53 insertions(+), 98 deletions(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index a262a9f6d..592cdc564 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,108 +1,64 @@ from __future__ import unicode_literals import re -import json import base64 from .common import InfoExtractor from ..utils import ( - unescapeHTML, - ExtractorError, - 
determine_ext, int_or_none, + float_or_none, ) class OoyalaBaseIE(InfoExtractor): - def _extract_result(self, info, more_info): - embedCode = info['embedCode'] - video_url = info.get('ipad_url') or info['url'] - - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') - else: - formats = [{ - 'url': video_url, - 'ext': 'mp4', - }] - - return { - 'id': embedCode, - 'title': unescapeHTML(info['title']), - 'formats': formats, - 'description': unescapeHTML(more_info['description']), - 'thumbnail': more_info['promo'], + def _extract(self, player_url, video_id): + print(player_url) + content_tree = self._download_json(player_url, video_id)['content_tree'] + metadata = content_tree[list(content_tree)[0]] + embed_code = metadata['embed_code'] + pcode = metadata.get('asset_pcode') or embed_code + video_info = { + 'id': embed_code, + 'title': metadata['title'], + 'description': metadata.get('description'), + 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), + 'duration': int_or_none(metadata.get('duration')), } - def _extract(self, player_url, video_id): - player = self._download_webpage(player_url, video_id) - mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', - player, 'mobile player url') - # Looks like some videos are only available for particular devices - # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 - # is only available for ipad) - # Working around with fetching URLs for all the devices found starting with 'unknown' - # until we succeed or eventually fail for each device. 
- devices = re.findall(r'device\s*=\s*"([^"]+)";', player) - devices.remove('unknown') - devices.insert(0, 'unknown') - for device in devices: - mobile_player = self._download_webpage( - '%s&device=%s' % (mobile_url, device), video_id, - 'Downloading mobile player JS for %s device' % device) - videos_info = self._search_regex( - r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, 'info', fatal=False, default=None) - if videos_info: - break - - if not videos_info: - formats = [] + formats = [] + for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id), - video_id) + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=%s' % (pcode, embed_code, supported_format), + video_id, 'Downloading %s JSON' % supported_format) - cur_auth_data = auth_data['authorization_data'][video_id] + cur_auth_data = auth_data['authorization_data'][embed_code] for stream in cur_auth_data['streams']: - formats.append({ - 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), - 'ext': stream.get('delivery_type'), - 'format': stream.get('video_codec'), - 'format_id': stream.get('profile'), - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - }) - if formats: - return { - 'id': video_id, - 'formats': formats, - 'title': 'Ooyala video', - } + url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') + delivery_type = stream['delivery_type'] + if delivery_type == 'remote_asset': + video_info['url'] = url + return video_info + if delivery_type == 'hls': + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', 0, 
m3u8_id='hls', fatal=False)) + elif delivery_type == 'hds': + formats.extend(self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False)) + else: + formats.append({ + 'url': url, + 'ext': stream.get('delivery_type'), + 'vcodec': stream.get('video_codec'), + 'format_id': stream.get('profile'), + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + 'fps': float_or_none(stream.get('framerate')), + }) + self._sort_formats(formats) - if not cur_auth_data['authorized']: - raise ExtractorError(cur_auth_data['message'], expected=True) - - if not videos_info: - raise ExtractorError('Unable to extract info') - videos_info = videos_info.replace('\\"', '"') - videos_more_info = self._search_regex( - r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') - videos_info = json.loads(videos_info) - videos_more_info = json.loads(videos_more_info) - - if videos_more_info.get('lineup'): - videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] - return { - '_type': 'playlist', - 'id': video_id, - 'title': unescapeHTML(videos_more_info['title']), - 'entries': videos, - } - else: - return self._extract_result(videos_info[0], videos_more_info) + video_info['formats'] = formats + return video_info class OoyalaIE(OoyalaBaseIE): @@ -117,6 +73,7 @@ class OoyalaIE(OoyalaBaseIE): 'ext': 'mp4', 'title': 'Explaining Data Recovery from Hard Drives and SSDs', 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? 
Apparently, smashed to bits.', + 'duration': 853386, }, }, { # Only available for ipad @@ -125,7 +82,7 @@ class OoyalaIE(OoyalaBaseIE): 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', 'ext': 'mp4', 'title': 'Simulation Overview - Levels of Simulation', - 'description': '', + 'duration': 194948, }, }, { @@ -136,7 +93,8 @@ class OoyalaIE(OoyalaBaseIE): 'info_dict': { 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', 'ext': 'mp4', - 'title': 'Ooyala video', + 'title': 'Divide Tool Path.mp4', + 'duration': 204405, } } ] @@ -152,8 +110,8 @@ class OoyalaIE(OoyalaBaseIE): def _real_extract(self, url): embed_code = self._match_id(url) - player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - return self._extract(player_url, embed_code) + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/embed_code/%s/%s' % (embed_code, embed_code) + return self._extract(content_tree_url, embed_code) class OoyalaExternalIE(OoyalaBaseIE): @@ -170,7 +128,7 @@ class OoyalaExternalIE(OoyalaBaseIE): .*?&pcode= ) (?P<pcode>.+?) 
- (&|$) + (?:&|$) ''' _TEST = { @@ -179,7 +137,7 @@ class OoyalaExternalIE(OoyalaBaseIE): 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'ext': 'mp4', 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'description': '', + 'duration': 1302000, }, 'params': { # m3u8 download @@ -188,9 +146,6 @@ class OoyalaExternalIE(OoyalaBaseIE): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - pcode = mobj.group('pcode') - player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode) - return self._extract(player_url, video_id) + partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups() + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/external_id/%s/%s:%s' % (pcode, partner_id, video_id) + return self._extract(content_tree_url, video_id) From 497ca088a60fdd0a98f16e22a9d4fec135a26ab0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 15 Oct 2015 14:37:05 +0100 Subject: [PATCH 0037/1214] [ooyala] remove print statment --- youtube_dl/extractor/ooyala.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 592cdc564..df99a39f4 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -12,7 +12,6 @@ from ..utils import ( class OoyalaBaseIE(InfoExtractor): def _extract(self, player_url, video_id): - print(player_url) content_tree = self._download_json(player_url, video_id)['content_tree'] metadata = content_tree[list(content_tree)[0]] embed_code = metadata['embed_code'] From 77302fe5c989b9cafcb675c0a03642b80fa557ff Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 15 Oct 2015 23:27:46 +0100 Subject: [PATCH 0038/1214] [bliptv] remove extractor and add support for site replacement(makertv) --- youtube_dl/extractor/__init__.py | 3 +- youtube_dl/extractor/bliptv.py | 292 
--------------------------- youtube_dl/extractor/cinemassacre.py | 18 +- youtube_dl/extractor/generic.py | 6 - youtube_dl/extractor/jwplatform.py | 67 ++++++ youtube_dl/extractor/makertv.py | 27 +++ 6 files changed, 103 insertions(+), 310 deletions(-) delete mode 100644 youtube_dl/extractor/bliptv.py create mode 100644 youtube_dl/extractor/jwplatform.py create mode 100644 youtube_dl/extractor/makertv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 462717b1e..f9c40e6cd 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,7 +54,6 @@ from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE from .blinkx import BlinkxIE -from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE @@ -263,6 +262,7 @@ from .jadorecettepub import JadoreCettePubIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE from .jukebox import JukeboxIE +from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE from .kanalplay import KanalPlayIE @@ -317,6 +317,7 @@ from .lynda import ( from .m6 import M6IE from .macgamestore import MacGameStoreIE from .mailru import MailRuIE +from .makertv import MakerTVIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py deleted file mode 100644 index c3296283d..000000000 --- a/youtube_dl/extractor/bliptv.py +++ /dev/null @@ -1,292 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - -from ..compat import ( - compat_urllib_request, - compat_urlparse, -) -from ..utils import ( - clean_html, - int_or_none, - parse_iso8601, - unescapeHTML, - xpath_text, - xpath_with_ns, -) - - -class BlipTVIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z+_]+)))' - - _TESTS = [ - { - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': '80baf1ec5c3d2019037c1c707d676b9f', - 'info_dict': { - 'id': '5779306', - 'ext': 'm4v', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'timestamp': 1323138843, - 'upload_date': '20111206', - 'uploader': 'cbr', - 'uploader_id': '679425', - 'duration': 81, - } - }, - { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'title': 'Red vs. Blue Season 11 Episode 1', - 'description': 'One-Zero-One', - 'timestamp': 1371261608, - 'upload_date': '20130615', - 'uploader': 'redvsblue', - 'uploader_id': '792887', - 'duration': 279, - } - }, - { - # https://bugzilla.redhat.com/show_bug.cgi?id=967465 - 'url': 'http://a.blip.tv/api.swf#h6Uag5KbVwI', - 'md5': '314e87b1ebe7a48fcbfdd51b791ce5a6', - 'info_dict': { - 'id': '6573122', - 'ext': 'mov', - 'upload_date': '20130520', - 'description': 'Two hapless space marines argue over what to do when they realize they have an astronomically huge problem on their hands.', - 'title': 'Red vs. 
Blue Season 11 Trailer', - 'timestamp': 1369029609, - 'uploader': 'redvsblue', - 'uploader_id': '792887', - } - }, - { - 'url': 'http://blip.tv/play/gbk766dkj4Yn', - 'md5': 'fe0a33f022d49399a241e84a8ea8b8e3', - 'info_dict': { - 'id': '1749452', - 'ext': 'mp4', - 'upload_date': '20090208', - 'description': 'Witness the first appearance of the Nostalgia Critic character, as Doug reviews the movie Transformers.', - 'title': 'Nostalgia Critic: Transformers', - 'timestamp': 1234068723, - 'uploader': 'NostalgiaCritic', - 'uploader_id': '246467', - } - }, - { - # https://github.com/rg3/youtube-dl/pull/4404 - 'note': 'Audio only', - 'url': 'http://blip.tv/hilarios-productions/weekly-manga-recap-kingdom-7119982', - 'md5': '76c0a56f24e769ceaab21fbb6416a351', - 'info_dict': { - 'id': '7103299', - 'ext': 'flv', - 'title': 'Weekly Manga Recap: Kingdom', - 'description': 'And then Shin breaks the enemy line, and he's all like HWAH! And then he slices a guy and it's all like FWASHING! And... it's really hard to describe the best parts of this series without breaking down into sound effects, okay?', - 'timestamp': 1417660321, - 'upload_date': '20141204', - 'uploader': 'The Rollo T', - 'uploader_id': '407429', - 'duration': 7251, - 'vcodec': 'none', - } - }, - { - # missing duration - 'url': 'http://blip.tv/rss/flash/6700880', - 'info_dict': { - 'id': '6684191', - 'ext': 'm4v', - 'title': 'Cowboy Bebop: Gateway Shuffle Review', - 'description': 'md5:3acc480c0f9ae157f5fe88547ecaf3f8', - 'timestamp': 1386639757, - 'upload_date': '20131210', - 'uploader': 'sfdebris', - 'uploader_id': '706520', - } - } - ] - - @staticmethod - def _extract_url(webpage): - mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) - if mobj: - return 'http://blip.tv/a/a-' + mobj.group(1) - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) - if mobj: - return mobj.group(1) - - def _real_extract(self, 
url): - mobj = re.match(self._VALID_URL, url) - lookup_id = mobj.group('lookup_id') - - # See https://github.com/rg3/youtube-dl/issues/857 and - # https://github.com/rg3/youtube-dl/issues/4197 - if lookup_id: - urlh = self._request_webpage( - 'http://blip.tv/play/%s' % lookup_id, lookup_id, 'Resolving lookup id') - url = compat_urlparse.urlparse(urlh.geturl()) - qs = compat_urlparse.parse_qs(url.query) - mobj = re.match(self._VALID_URL, qs['file'][0]) - - video_id = mobj.group('id') - - rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') - - def _x(p): - return xpath_with_ns(p, { - 'blip': 'http://blip.tv/dtd/blip/1.0', - 'media': 'http://search.yahoo.com/mrss/', - 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', - }) - - item = rss.find('channel/item') - - video_id = xpath_text(item, _x('blip:item_id'), 'video id') or lookup_id - title = xpath_text(item, 'title', 'title', fatal=True) - description = clean_html(xpath_text(item, _x('blip:puredescription'), 'description')) - timestamp = parse_iso8601(xpath_text(item, _x('blip:datestamp'), 'timestamp')) - uploader = xpath_text(item, _x('blip:user'), 'uploader') - uploader_id = xpath_text(item, _x('blip:userid'), 'uploader id') - duration = int_or_none(xpath_text(item, _x('blip:runtime'), 'duration')) - media_thumbnail = item.find(_x('media:thumbnail')) - thumbnail = (media_thumbnail.get('url') if media_thumbnail is not None - else xpath_text(item, 'image', 'thumbnail')) - categories = [category.text for category in item.findall('category') if category is not None] - - formats = [] - subtitles_urls = {} - - media_group = item.find(_x('media:group')) - for media_content in media_group.findall(_x('media:content')): - url = media_content.get('url') - role = media_content.get(_x('blip:role')) - msg = self._download_webpage( - url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', - video_id, 'Resolving URL for %s' % role) - 
real_url = compat_urlparse.parse_qs(msg.strip())['message'][0] - - media_type = media_content.get('type') - if media_type == 'text/srt' or url.endswith('.srt'): - LANGS = { - 'english': 'en', - } - lang = role.rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles_urls[langcode] = url - elif media_type.startswith('video/'): - formats.append({ - 'url': real_url, - 'format_id': role, - 'format_note': media_type, - 'vcodec': media_content.get(_x('blip:vcodec')) or 'none', - 'acodec': media_content.get(_x('blip:acodec')), - 'filesize': media_content.get('filesize'), - 'width': int_or_none(media_content.get('width')), - 'height': int_or_none(media_content.get('height')), - }) - self._check_formats(formats, video_id) - self._sort_formats(formats) - - subtitles = self.extract_subtitles(video_id, subtitles_urls) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, - 'subtitles': subtitles, - } - - def _get_subtitles(self, video_id, subtitles_urls): - subtitles = {} - for lang, url in subtitles_urls.items(): - # For some weird reason, blip.tv serves a video instead of subtitles - # when we request with a common UA - req = compat_urllib_request.Request(url) - req.add_header('User-Agent', 'youtube-dl') - subtitles[lang] = [{ - # The extension is 'srt' but it's actually an 'ass' file - 'ext': 'ass', - 'data': self._download_webpage(req, None, note=False), - }] - return subtitles - - -class BlipTVUserIE(InfoExtractor): - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?blip\.tv/)|bliptvuser:)(?!api\.swf)([^/]+)/*$' - _PAGE_SIZE = 12 - IE_NAME = 'blip.tv:user' - _TEST = { - 'url': 'http://blip.tv/actone', - 'info_dict': { - 'id': 'actone', - 'title': 'Act One: The Series', - }, - 'playlist_count': 5, - } - - def _real_extract(self, url): - mobj = 
re.match(self._VALID_URL, url) - username = mobj.group(1) - - page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' - - page = self._download_webpage(url, username, 'Downloading user page') - mobj = re.search(r'data-users-id="([^"]+)"', page) - page_base = page_base % mobj.group(1) - title = self._og_search_title(page) - - # Download video ids using BlipTV Ajax calls. Result size per - # query is limited (currently to 12 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 1 - - while True: - url = page_base + "&page=" + str(pagenum) - page = self._download_webpage( - url, username, 'Downloading video ids from page %d' % pagenum) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(r'href="/([^"]+)"', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(unescapeHTML(mobj.group(1))) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. 
- - if len(ids_in_page) < self._PAGE_SIZE: - break - - pagenum += 1 - - urls = ['http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] - return self.playlist_result( - url_entries, playlist_title=title, playlist_id=username) diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index fd1770dac..6d9cd8abd 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ExtractorError -from .bliptv import BlipTVIE from .screenwavemedia import ScreenwaveMediaIE @@ -34,18 +33,17 @@ class CinemassacreIE(InfoExtractor): }, }, { - # blip.tv embedded video + # Youtube embedded video 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ca9b3c8dd5a66f9375daeb5135f5a3de', + 'md5': 'df4cf8a1dcedaec79a73d96d83b99023', 'info_dict': { - 'id': '4065369', - 'ext': 'flv', + 'id': 'OEVzPCY2T-g', + 'ext': 'mp4', 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', 'upload_date': '20061207', - 'uploader': 'cinemassacre', - 'uploader_id': '250778', - 'timestamp': 1283233867, - 'description': 'md5:0a108c78d130676b207d0f6d029ecffd', + 'uploader': 'Cinemassacre', + 'uploader_id': 'JamesNintendoNerd', + 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', } }, { @@ -88,8 +86,6 @@ class CinemassacreIE(InfoExtractor): r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', ], webpage, 'player data URL', default=None, group='url') - if not playerdata_url: - playerdata_url = BlipTVIE._extract_url(webpage) if not playerdata_url: raise ExtractorError('Unable to find player data') diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..285c0ff66 100644 --- a/youtube_dl/extractor/generic.py +++ 
b/youtube_dl/extractor/generic.py @@ -41,7 +41,6 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -1389,11 +1388,6 @@ class GenericIE(InfoExtractor): 'id': match.group('id') } - # Look for embedded blip.tv player - bliptv_url = BlipTVIE._extract_url(webpage) - if bliptv_url: - return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py new file mode 100644 index 000000000..3a3dc439a --- /dev/null +++ b/youtube_dl/extractor/jwplatform.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class JWPlatformIE(InfoExtractor): + _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _TEST = { + 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', + 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', + 'info_dict': { + 'id': 'nPripu9l', + 'ext': 'mov', + 'title': 'Big Buck Bunny Trailer', + 'description': 'Big Buck Bunny is a short animated film by the Blender Institute. 
It is made using free and open source software.', + 'upload_date': '20081127', + 'timestamp': 1227796140, + } + } + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<script[^>]+?src=["\'](?P<url>(?:https?:)?//content.jwplatform.com/players/[a-zA-Z0-9]{8}', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id) + video_data = json_data['playlist'][0] + subtitles = {} + for track in video_data['tracks']: + if track['kind'] == 'captions': + subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + source_type = source.get('type') or '' + if source_type == 'application/vnd.apple.mpegurl': + formats.extend(self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None)) + elif source_type.startswith('audio'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + }) + else: + formats.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_data['mediaid'], + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py new file mode 100644 index 000000000..0256e4e24 --- /dev/null +++ b/youtube_dl/extractor/makertv.py @@ -0,0 +1,27 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MakerTVIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)?video|http://makerplayer.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})' + _TEST = { + 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc', + 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e', + 'info_dict': { + 'id': 'brOEcGut', + 'ext': 'mp4', + 'title': 'Maze Runner: The Scorch Trials Official Movie Review', + 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a', + 'upload_date': '20150918', + 'timestamp': 1442549540, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + jwplatform_id = self._search_regex([r'jwid="([^"]+)"', r'Maker.jw_id\s*=\s*"([^"]+)";'], webpage, 'jwplatform id') + + return self.url_result('jwplatform:%s' % jwplatform_id, 'JWPlatform') From dd414c970bcc493358ff6a76f6544a0417125594 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 16 Oct 2015 10:12:42 +0100 Subject: [PATCH 0039/1214] [ooyala] fix sorting and format id --- youtube_dl/extractor/ooyala.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index df99a39f4..075b594ce 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -39,15 +39,15 @@ class OoyalaBaseIE(InfoExtractor): video_info['url'] = url return video_info if delivery_type == 'hls': - formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', 0, m3u8_id='hls', fatal=False)) + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif delivery_type == 'hds': - formats.extend(self._extract_f4m_formats(url, embed_code, f4m_id='hds', fatal=False)) + formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) else: formats.append({ 'url': url, 'ext': stream.get('delivery_type'), 'vcodec': stream.get('video_codec'), - 'format_id': stream.get('profile'), + 'format_id': '%s-%s-%sp' % 
(stream.get('profile'), delivery_type, stream.get('height')), 'width': int_or_none(stream.get('width')), 'height': int_or_none(stream.get('height')), 'abr': int_or_none(stream.get('audio_bitrate')), From cce9d15d0115e8b4cd1f6e2a327b5e9dbdf0ee54 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 16 Oct 2015 16:02:40 +0100 Subject: [PATCH 0040/1214] [ooyala] extract domain,handle errors and change related tests --- youtube_dl/extractor/byutv.py | 5 ++- youtube_dl/extractor/generic.py | 9 ++-- youtube_dl/extractor/groupon.py | 2 + youtube_dl/extractor/howcast.py | 1 + youtube_dl/extractor/ooyala.py | 60 ++++++++++++++----------- youtube_dl/extractor/teachingchannel.py | 1 + youtube_dl/extractor/vice.py | 1 + 7 files changed, 48 insertions(+), 31 deletions(-) diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 3b2de517e..ce25816f0 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -14,9 +14,10 @@ class BYUtvIE(InfoExtractor): 'info_dict': { 'id': 'studio-c-season-5-episode-5', 'ext': 'mp4', - 'description': 'md5:5438d33774b6bdc662f9485a340401cc', + 'description': 'md5:e07269172baff037f8e8bf9956bc9747', 'title': 'Season 5 Episode 5', - 'thumbnail': 're:^https?://.*\.jpg$' + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1486486, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..805677364 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -335,6 +335,7 @@ class GenericIE(InfoExtractor): 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238231, }, 'add_ie': ['Ooyala'], }, @@ -346,6 +347,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '"Steve Jobs: Man in the Machine" trailer', 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 
'duration': 135427, }, 'params': { 'skip_download': True, @@ -943,8 +945,9 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: Index/Match versus VLOOKUP.', + 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', + 'duration': 191933, }, 'params': { # m3u8 downloads @@ -1454,7 +1457,7 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(mobj.group('ec')) + return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1462,7 +1465,7 @@ class GenericIE(InfoExtractor): embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') + embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py index 8b9e0e2f8..22ff7182f 100644 --- a/youtube_dl/extractor/groupon.py +++ b/youtube_dl/extractor/groupon.py @@ -18,6 +18,8 @@ class GrouponIE(InfoExtractor): 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf', 'ext': 'mp4', 'title': 'Bikram Yoga Huntington Beach | Orange County', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 44961, }, }], 'params': { diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 16677f179..165b9f39e 100644 --- a/youtube_dl/extractor/howcast.py 
+++ b/youtube_dl/extractor/howcast.py @@ -16,6 +16,7 @@ class HowcastIE(InfoExtractor): 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3', 'timestamp': 1276081287, 'upload_date': '20100609', + 'duration': 56823, }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 075b594ce..3b692e903 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -6,13 +6,16 @@ from .common import InfoExtractor from ..utils import ( int_or_none, float_or_none, + ExtractorError, + unsmuggle_url, ) +from ..compat import compat_urllib_parse class OoyalaBaseIE(InfoExtractor): - def _extract(self, player_url, video_id): - content_tree = self._download_json(player_url, video_id)['content_tree'] + def _extract(self, content_tree_url, video_id, domain='example.org'): + content_tree = self._download_json(content_tree_url, video_id)['content_tree'] metadata = content_tree[list(content_tree)[0]] embed_code = metadata['embed_code'] pcode = metadata.get('asset_pcode') or embed_code @@ -27,33 +30,36 @@ class OoyalaBaseIE(InfoExtractor): formats = [] for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=%s' % (pcode, embed_code, supported_format), + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?' 
% (pcode, embed_code) + compat_urllib_parse.urlencode({'domain': domain, 'supportedFormats': supported_format}), video_id, 'Downloading %s JSON' % supported_format) cur_auth_data = auth_data['authorization_data'][embed_code] - for stream in cur_auth_data['streams']: - url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') - delivery_type = stream['delivery_type'] - if delivery_type == 'remote_asset': - video_info['url'] = url - return video_info - if delivery_type == 'hls': - formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - elif delivery_type == 'hds': - formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) - else: - formats.append({ - 'url': url, - 'ext': stream.get('delivery_type'), - 'vcodec': stream.get('video_codec'), - 'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - 'fps': float_or_none(stream.get('framerate')), - }) + if cur_auth_data['authorized']: + for stream in cur_auth_data['streams']: + url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') + delivery_type = stream['delivery_type'] + if delivery_type == 'remote_asset': + video_info['url'] = url + return video_info + if delivery_type == 'hls': + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif delivery_type == 'hds': + formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) + else: + formats.append({ + 'url': url, + 'ext': stream.get('delivery_type'), + 'vcodec': stream.get('video_codec'), + 'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), + 'width': int_or_none(stream.get('width')), + 'height': 
int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + 'fps': float_or_none(stream.get('framerate')), + }) + else: + raise ExtractorError('%s said: %s' % (self.IE_NAME, cur_auth_data['message']), expected=True) self._sort_formats(formats) video_info['formats'] = formats @@ -108,9 +114,11 @@ class OoyalaIE(OoyalaBaseIE): ie=cls.ie_key()) def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) embed_code = self._match_id(url) + domain = smuggled_data.get('domain') content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/embed_code/%s/%s' % (embed_code, embed_code) - return self._extract(content_tree_url, embed_code) + return self._extract(content_tree_url, embed_code, domain) class OoyalaExternalIE(OoyalaBaseIE): diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index 117afa9bf..36a6fc679 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -16,6 +16,7 @@ class TeachingChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'A History of Teaming', 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + 'duration': 422255, }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 01af7a995..7df87c31c 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -15,6 +15,7 @@ class ViceIE(InfoExtractor): 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', 'ext': 'mp4', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + 'duration': 725983, }, 'params': { # Requires ffmpeg (m3u8 manifest) From d90e40305bee84f5e3cd4927c729e1d16bbd3dc6 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 17 Oct 2015 17:28:09 +0100 Subject: [PATCH 0041/1214] [bilibili] fix info extraction --- youtube_dl/extractor/bilibili.py | 151 ++++++++++++------------------- 1 file changed, 57 
insertions(+), 94 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index ecc17ebeb..8f23a30ab 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -1,15 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import re -import itertools import json import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( int_or_none, - unified_strdate, ExtractorError, ) @@ -21,12 +18,15 @@ class BiliBiliIE(InfoExtractor): 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '2c301e4dab317596e837c3e7633e7d86', 'info_dict': { - 'id': '1074402_part1', + 'id': '1554319', 'ext': 'flv', 'title': '【金坷垃】金泡沫', - 'duration': 308, + 'duration': 308313, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'timestamp': 1397983878, + 'uploader': '菊子桑', }, }, { 'url': 'http://www.bilibili.com/video/av1041170/', @@ -34,109 +34,72 @@ class BiliBiliIE(InfoExtractor): 'id': '1041170', 'title': '【BD1080P】刀语【诸神&异域】', }, - 'playlist_count': 9, + 'playlist_count': 12, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + def _extract_video_info(self, cid, view_data, page_num=1, num_pages=1): + title = view_data['title'] - if '(此视频不存在或被删除)' in webpage: - raise ExtractorError( - 'The video does not exist or was deleted', expected=True) - - if '>你没有权限浏览! 
由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: - raise ExtractorError( - 'The video is not available in your region due to copyright reasons', - expected=True) - - video_code = self._search_regex( - r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code') - - title = self._html_search_meta( - 'media:title', video_code, 'title', fatal=True) - duration_str = self._html_search_meta( - 'duration', video_code, 'duration') - if duration_str is None: - duration = None - else: - duration_mobj = re.match( - r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$', - duration_str) - duration = ( - int_or_none(duration_mobj.group('hours'), default=0) * 3600 + - int(duration_mobj.group('minutes')) * 60 + - int(duration_mobj.group('seconds'))) - upload_date = unified_strdate(self._html_search_meta( - 'uploadDate', video_code, fatal=False)) - thumbnail = self._html_search_meta( - 'thumbnailUrl', video_code, 'thumbnail', fatal=False) - - cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') - - entries = [] - - lq_page = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, - video_id, - note='Downloading LQ video info' + page = self._download_webpage( + 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, + cid, + 'Downloading page %d/%d' % (page_num, num_pages) ) try: - err_info = json.loads(lq_page) + err_info = json.loads(page) raise ExtractorError( 'BiliBili said: ' + err_info['error_text'], expected=True) except ValueError: pass - lq_doc = ET.fromstring(lq_page) - lq_durls = lq_doc.findall('./durl') + doc = ET.fromstring(page) + durls = doc.findall('./durl') - hq_doc = self._download_xml( - 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, - video_id, - note='Downloading HQ video info', - fatal=False, - ) - if hq_doc is not False: - hq_durls = hq_doc.findall('./durl') - assert len(lq_durls) == len(hq_durls) - else: - hq_durls = itertools.repeat(None) - - i = 1 - for lq_durl, 
hq_durl in zip(lq_durls, hq_durls): - formats = [{ - 'format_id': 'lq', - 'quality': 1, - 'url': lq_durl.find('./url').text, - 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), - }] - if hq_durl is not None: - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, - 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), - }) - self._sort_formats(formats) + entries = [] + for durl in durls: entries.append({ - 'id': '%s_part%d' % (video_id, i), + 'id': '%s_part%s' % (cid, durl.find('./order').text), 'title': title, - 'formats': formats, - 'duration': duration, - 'upload_date': upload_date, - 'thumbnail': thumbnail, + 'url': durl.find('./url').text, + 'filesize': int_or_none(durl.find('./filesize').text), + 'ext': 'flv', + 'duration': int_or_none(durl.find('./length').text) // 1000, }) - i += 1 - - return { - '_type': 'multi_video', - 'entries': entries, - 'id': video_id, - 'title': title + info = { + 'id': cid, + 'title': title, + 'description': view_data.get('description'), + 'thumbnail': view_data.get('pic'), + 'uploader': view_data.get('author'), + 'timestamp': int_or_none(view_data.get('created')), + 'view_count': view_data.get('play'), + 'duration': int_or_none(doc.find('./timelength').text), } + + if len(entries) == 1: + entries[0].update(info) + return entries[0] + else: + info.update({ + '_type': 'multi_video', + 'entries': entries, + }) + return info + + def _real_extract(self, url): + video_id = self._match_id(url) + view_data = self._download_json('http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s' % video_id, video_id) + + num_pages = int_or_none(view_data['pages']) + if num_pages > 1: + play_list_title = view_data['title'] + page_list = self._download_json('http://www.bilibili.com/widget/getPageList?aid=%s' % video_id, video_id, 'Downloading page list metadata') + entries = [] + for page in page_list: + view_data['title'] = page['pagename'] + 
entries.append(self._extract_video_info(str(page['cid']), view_data, page['page'], num_pages)) + return self.playlist_result(entries, video_id, play_list_title, view_data.get('description')) + else: + return self._extract_video_info(str(view_data['cid']), view_data) From 55af2b26e0f169bef2f10a7b5f6ec8e34c6dbb6d Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 17 Oct 2015 18:30:51 +0100 Subject: [PATCH 0042/1214] [bilibili] extract backup url --- youtube_dl/extractor/bilibili.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 8f23a30ab..85156ce49 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -58,13 +58,21 @@ class BiliBiliIE(InfoExtractor): entries = [] for durl in durls: + formats = [] + backup_url = durl.find('./backup_url') + if backup_url is not None: + formats.append({'url': backup_url.find('./url').text}) + size = durl.find('./filesize|./size') + formats.append({ + 'url': durl.find('./url').text, + 'filesize': int_or_none(size.text) if size else None, + 'ext': 'flv', + }) entries.append({ 'id': '%s_part%s' % (cid, durl.find('./order').text), 'title': title, - 'url': durl.find('./url').text, - 'filesize': int_or_none(durl.find('./filesize').text), - 'ext': 'flv', 'duration': int_or_none(durl.find('./length').text) // 1000, + 'formats': formats, }) info = { From 355c7ad361aa3c8a57ff83e3f702a496dce59e65 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 17 Oct 2015 21:30:38 +0100 Subject: [PATCH 0043/1214] [cspan] handle error massages and extract qualities --- youtube_dl/extractor/cspan.py | 67 +++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 994e080d5..c74b35fd9 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ 
-9,16 +9,21 @@ from ..utils import ( find_xpath_attr, smuggle_url, determine_ext, + ExtractorError, ) from .senateisvp import SenateISVPIE +def get_text_attr(d, attr): + return d.get(attr, {}).get('#text') + + class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' IE_DESC = 'C-SPAN' _TESTS = [{ 'url': 'http://www.c-span.org/video/?313572-1/HolderonV', - 'md5': '067803f994e049b455a58b16e5aab442', + 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', 'ext': 'mp4', @@ -28,7 +33,7 @@ class CSpanIE(InfoExtractor): 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - 'md5': '4eafd1e91a75d2b1e6a3cbd0995816a2', + 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', @@ -37,7 +42,7 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '446562a736c6bf97118e389433ed88d4', + 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', 'ext': 'mp4', @@ -71,8 +76,10 @@ class CSpanIE(InfoExtractor): return self.url_result(surl, 'SenateISVP', video_id, title) data = self._download_json( - 'http://c-spanvideo.org/videoLibrary/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), - video_id) + 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), + video_id)['video'] + if data['@status'] != 'Success': + raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -83,28 +90,36 @@ class CSpanIE(InfoExtractor): title = find_xpath_attr(doc, './/string', 'name', 'title').text thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text - files = data['video']['files'] - 
try: - capfile = data['video']['capfile']['#text'] - except KeyError: - capfile = None + files = data['files'] + capfile = get_text_attr(data, 'capfile') - entries = [{ - 'id': '%s_%d' % (video_id, partnum + 1), - 'title': ( - title if len(files) == 1 else - '%s part %d' % (title, partnum + 1)), - 'url': unescapeHTML(f['path']['#text']), - 'description': description, - 'thumbnail': thumbnail, - 'duration': int_or_none(f.get('length', {}).get('#text')), - 'subtitles': { - 'en': [{ - 'url': capfile, - 'ext': determine_ext(capfile, 'dfxp') - }], - } if capfile else None, - } for partnum, f in enumerate(files)] + entries = [] + for partnum, f in enumerate(files): + formats = [] + for quality in f['qualities']: + formats.append({ + 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), + 'url': unescapeHTML(get_text_attr(quality, 'file')), + 'height': int_or_none(get_text_attr(quality, 'height')), + 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + }) + self._sort_formats(formats) + entries.append({ + 'id': '%s_%d' % (video_id, partnum + 1), + 'title': ( + title if len(files) == 1 else + '%s part %d' % (title, partnum + 1)), + 'formats': formats, + 'description': description, + 'thumbnail': thumbnail, + 'duration': int_or_none(get_text_attr(f, 'length')), + 'subtitles': { + 'en': [{ + 'url': capfile, + 'ext': determine_ext(capfile, 'dfxp') + }], + } if capfile else None, + }) if len(entries) == 1: entry = dict(entries[0]) From 520e753390aab5845bb257b964ebcd6f818455df Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 17 Oct 2015 23:12:58 +0100 Subject: [PATCH 0044/1214] [bilibili] add support for specefic page extraction --- youtube_dl/extractor/bilibili.py | 45 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 85156ce49..e00efb6eb 100644 --- a/youtube_dl/extractor/bilibili.py +++ 
b/youtube_dl/extractor/bilibili.py @@ -1,18 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals +import re import json import xml.etree.ElementTree as ET from .common import InfoExtractor from ..utils import ( int_or_none, + unescapeHTML, ExtractorError, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/' + _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>\d+)(?:/index_(?P<page_num>\d+).html)?' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -33,17 +35,31 @@ class BiliBiliIE(InfoExtractor): 'info_dict': { 'id': '1041170', 'title': '【BD1080P】刀语【诸神&异域】', + 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'uploader': '枫叶逝去', + 'timestamp': 1396501299, }, - 'playlist_count': 12, + 'playlist_count': 9, }] - def _extract_video_info(self, cid, view_data, page_num=1, num_pages=1): - title = view_data['title'] + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + page_num = mobj.group('page_num') or '1' + + view_data = self._download_json( + 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), + video_id) + if 'error' in view_data: + raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) + + cid = view_data['cid'] + title = unescapeHTML(view_data['title']) page = self._download_webpage( 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, cid, - 'Downloading page %d/%d' % (page_num, num_pages) + 'Downloading page %s/%s' % (page_num, view_data['pages']) ) try: err_info = json.loads(page) @@ -76,7 +92,7 @@ class BiliBiliIE(InfoExtractor): }) info = { - 'id': cid, + 'id': str(cid), 'title': title, 'description': view_data.get('description'), 'thumbnail': view_data.get('pic'), @@ -92,22 +108,7 @@ class BiliBiliIE(InfoExtractor): else: info.update({ '_type': 'multi_video', + 'id': video_id, 'entries': entries, }) return 
info - - def _real_extract(self, url): - video_id = self._match_id(url) - view_data = self._download_json('http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s' % video_id, video_id) - - num_pages = int_or_none(view_data['pages']) - if num_pages > 1: - play_list_title = view_data['title'] - page_list = self._download_json('http://www.bilibili.com/widget/getPageList?aid=%s' % video_id, video_id, 'Downloading page list metadata') - entries = [] - for page in page_list: - view_data['title'] = page['pagename'] - entries.append(self._extract_video_info(str(page['cid']), view_data, page['page'], num_pages)) - return self.playlist_result(entries, video_id, play_list_title, view_data.get('description')) - else: - return self._extract_video_info(str(view_data['cid']), view_data) From 4bf56141950f3c24000381403417d20095f04460 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Tue, 20 Oct 2015 07:43:39 +0100 Subject: [PATCH 0045/1214] [cspan] move get_text_attr to CSpanIE --- youtube_dl/extractor/cspan.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index c74b35fd9..388460a32 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -14,10 +14,6 @@ from ..utils import ( from .senateisvp import SenateISVPIE -def get_text_attr(d, attr): - return d.get(attr, {}).get('#text') - - class CSpanIE(InfoExtractor): _VALID_URL = r'http://(?:www\.)?c-span\.org/video/\?(?P<id>[0-9a-f]+)' IE_DESC = 'C-SPAN' @@ -60,6 +56,9 @@ class CSpanIE(InfoExtractor): } }] + def get_text_attr(self, d, attr): + return d.get(attr, {}).get('#text') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -79,7 +78,7 @@ class CSpanIE(InfoExtractor): 'http://www.c-span.org/assets/player/ajax-player.php?os=android&html5=%s&id=%s' % (video_type, video_id), video_id)['video'] if 
data['@status'] != 'Success': - raise ExtractorError('%s said: %s' % (self.IE_NAME, get_text_attr(data, 'error')), expected=True) + raise ExtractorError('%s said: %s' % (self.IE_NAME, self.get_text_attr(data, 'error')), expected=True) doc = self._download_xml( 'http://www.c-span.org/common/services/flashXml.php?%sid=%s' % (video_type, video_id), @@ -91,17 +90,17 @@ class CSpanIE(InfoExtractor): thumbnail = find_xpath_attr(doc, './/string', 'name', 'poster').text files = data['files'] - capfile = get_text_attr(data, 'capfile') + capfile = self.get_text_attr(data, 'capfile') entries = [] for partnum, f in enumerate(files): formats = [] for quality in f['qualities']: formats.append({ - 'format_id': '%s-%sp' % (get_text_attr(quality, 'bitrate'), get_text_attr(quality, 'height')), - 'url': unescapeHTML(get_text_attr(quality, 'file')), - 'height': int_or_none(get_text_attr(quality, 'height')), - 'tbr': int_or_none(get_text_attr(quality, 'bitrate')), + 'format_id': '%s-%sp' % (self.get_text_attr(quality, 'bitrate'), self.get_text_attr(quality, 'height')), + 'url': unescapeHTML(self.get_text_attr(quality, 'file')), + 'height': int_or_none(self.get_text_attr(quality, 'height')), + 'tbr': int_or_none(self.get_text_attr(quality, 'bitrate')), }) self._sort_formats(formats) entries.append({ @@ -112,7 +111,7 @@ class CSpanIE(InfoExtractor): 'formats': formats, 'description': description, 'thumbnail': thumbnail, - 'duration': int_or_none(get_text_attr(f, 'length')), + 'duration': int_or_none(self.get_text_attr(f, 'length')), 'subtitles': { 'en': [{ 'url': capfile, From 497f5fd93fe1efd0df8dc58d518c328ed1409457 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Wed, 21 Oct 2015 08:24:05 +0100 Subject: [PATCH 0046/1214] [bilibili] extract multiple backup_urls --- youtube_dl/extractor/bilibili.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 
e00efb6eb..935fcc55c 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -69,21 +69,22 @@ class BiliBiliIE(InfoExtractor): pass doc = ET.fromstring(page) - durls = doc.findall('./durl') entries = [] - for durl in durls: - formats = [] - backup_url = durl.find('./backup_url') - if backup_url is not None: - formats.append({'url': backup_url.find('./url').text}) + for durl in doc.findall('./durl'): size = durl.find('./filesize|./size') - formats.append({ + formats = [{ 'url': durl.find('./url').text, 'filesize': int_or_none(size.text) if size else None, 'ext': 'flv', - }) + }] + backup_urls = durl.find('./backup_url') + if backup_urls is not None: + for backup_url in backup_urls.findall('./url'): + formats.append({'url': backup_url.text}) + formats.reverse() + entries.append({ 'id': '%s_part%s' % (cid, durl.find('./order').text), 'title': title, From 36e6f62cd0883f0f486d1666d010e5d9e6d515bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 25 Oct 2015 20:04:55 +0100 Subject: [PATCH 0047/1214] Use a wrapper around xml.etree.ElementTree.fromstring in python 2.x (#7178) Attributes aren't unicode objects, so they couldn't be directly used in info_dict fields (for example '--write-description' doesn't work with bytes). 
--- test/test_compat.py | 7 +++++++ test/test_utils.py | 11 +++++++---- youtube_dl/compat.py | 25 +++++++++++++++++++++++++ youtube_dl/downloader/f4m.py | 4 ++-- youtube_dl/extractor/bbc.py | 8 +++++--- youtube_dl/extractor/bilibili.py | 6 ++++-- youtube_dl/extractor/brightcove.py | 4 ++-- youtube_dl/extractor/common.py | 4 ++-- youtube_dl/extractor/crunchyroll.py | 4 ++-- youtube_dl/extractor/vevo.py | 6 +++--- youtube_dl/utils.py | 3 ++- 11 files changed, 61 insertions(+), 21 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 4ee0dc99d..2b0860479 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,8 +13,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_etree_fromstring, compat_expanduser, compat_shlex_split, + compat_str, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, ) @@ -71,5 +73,10 @@ class TestCompat(unittest.TestCase): def test_compat_shlex_split(self): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) + def test_compat_etree_fromstring(self): + xml = '<el foo="bar"></el>' + doc = compat_etree_fromstring(xml.encode('utf-8')) + self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 918a7a9ef..a9e0fed7e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -68,6 +68,9 @@ from youtube_dl.utils import ( cli_valueless_option, cli_bool_option, ) +from youtube_dl.compat import ( + compat_etree_fromstring, +) class TestUtil(unittest.TestCase): @@ -242,7 +245,7 @@ class TestUtil(unittest.TestCase): <node x="b" y="d" /> <node x="" /> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n'), None) 
self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) @@ -263,7 +266,7 @@ class TestUtil(unittest.TestCase): <url>http://server.com/download.mp3</url> </media:song> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) find = lambda p: doc.find(xpath_with_ns(p, {'media': 'http://example.com/'})) self.assertTrue(find('media:song') is not None) self.assertEqual(find('media:song/media:author').text, 'The Author') @@ -285,7 +288,7 @@ class TestUtil(unittest.TestCase): <p>Foo</p> </div> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_text(doc, 'div/p'), 'Foo') self.assertEqual(xpath_text(doc, 'div/bar', default='default'), 'default') self.assertTrue(xpath_text(doc, 'div/bar') is None) @@ -297,7 +300,7 @@ class TestUtil(unittest.TestCase): <p x="a">Foo</p> </div> </root>''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d103ab9ad..cf10835ca 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -14,6 +14,7 @@ import socket import subprocess import sys import itertools +import xml.etree.ElementTree try: @@ -212,6 +213,29 @@ try: except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +if sys.version_info[0] >= 3: + compat_etree_fromstring = xml.etree.ElementTree.fromstring +else: + # on python 2.x the the attributes of a node are str objects instead of + # unicode + etree = xml.etree.ElementTree + + # on 2.6 XML doesn't have a parser argument, function copied from CPython + # 2.7 source + def _XML(text, parser=None): + if not parser: + parser = etree.XMLParser(target=etree.TreeBuilder()) + parser.feed(text) + 
return parser.close() + + def _element_factory(*args, **kwargs): + el = etree.Element(*args, **kwargs) + for k, v in el.items(): + el.set(k, v.decode('utf-8')) + return el + + def compat_etree_fromstring(text): + return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) try: from urllib.parse import parse_qs as compat_parse_qs @@ -507,6 +531,7 @@ __all__ = [ 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_etree_fromstring', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 7f6143954..6170cc155 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -5,10 +5,10 @@ import io import itertools import os import time -import xml.etree.ElementTree as etree from .fragment import FragmentFD from ..compat import ( + compat_etree_fromstring, compat_urlparse, compat_urllib_error, compat_urllib_parse_urlparse, @@ -290,7 +290,7 @@ class F4mFD(FragmentFD): man_url = urlh.geturl() manifest = urlh.read() - doc = etree.fromstring(manifest) + doc = compat_etree_fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) for f in self._get_unencrypted_media(doc)] if requested_bitrate is None: diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 2cdce1eb9..a55a6dbc9 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -14,7 +13,10 @@ from ..utils import ( remove_end, unescapeHTML, ) -from ..compat import compat_HTTPError +from ..compat import ( + compat_etree_fromstring, + compat_HTTPError, +) class BBCCoUkIE(InfoExtractor): @@ -344,7 +346,7 @@ class BBCCoUkIE(InfoExtractor): url, programme_id, 'Downloading media selection XML') except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and 
ee.cause.code == 403: - media_selection = xml.etree.ElementTree.fromstring(ee.cause.read().decode('utf-8')) + media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8')) else: raise return self._process_media_selector(media_selection, programme_id) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index ecc17ebeb..6c66a1236 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re import itertools import json -import xml.etree.ElementTree as ET from .common import InfoExtractor +from ..compat import ( + compat_etree_fromstring, +) from ..utils import ( int_or_none, unified_strdate, @@ -88,7 +90,7 @@ class BiliBiliIE(InfoExtractor): except ValueError: pass - lq_doc = ET.fromstring(lq_page) + lq_doc = compat_etree_fromstring(lq_page) lq_durls = lq_doc.findall('./durl') hq_doc = self._download_xml( diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 4721c2293..1686cdde1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -3,10 +3,10 @@ from __future__ import unicode_literals import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_str, compat_urllib_parse, @@ -119,7 +119,7 @@ class BrightcoveIE(InfoExtractor): object_str = fix_xml_ampersands(object_str) try: - object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) + object_doc = compat_etree_fromstring(object_str.encode('utf-8')) except compat_xml_parse_error: return diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..52523d7b2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,7 +10,6 @@ import re import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, 
@@ -23,6 +22,7 @@ from ..compat import ( compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -461,7 +461,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index f8ce10111..0c9b8ca02 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -5,12 +5,12 @@ import re import json import base64 import zlib -import xml.etree.ElementTree from hashlib import sha1 from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_parse, compat_urllib_parse_unquote, compat_urllib_request, @@ -234,7 +234,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text return output def _extract_subtitles(self, subtitle): - sub_root = xml.etree.ElementTree.fromstring(subtitle) + sub_root = compat_etree_fromstring(subtitle) return [{ 'ext': 'srt', 'data': self._convert_subtitles_to_srt(sub_root), diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c17094f81..4c0de354f 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import xml.etree.ElementTree from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_urllib_request, ) from ..utils import ( @@ -97,7 +97,7 @@ class VevoIE(InfoExtractor): if last_version['version'] == -1: raise ExtractorError('Unable to extract last version of the video') - renditions = xml.etree.ElementTree.fromstring(last_version['data']) + renditions = 
compat_etree_fromstring(last_version['data']) formats = [] # Already sorted from worst to best quality for rend in renditions.findall('rendition'): @@ -114,7 +114,7 @@ class VevoIE(InfoExtractor): def _formats_from_smil(self, smil_xml): formats = [] - smil_doc = xml.etree.ElementTree.fromstring(smil_xml.encode('utf-8')) + smil_doc = compat_etree_fromstring(smil_xml.encode('utf-8')) els = smil_doc.findall('.//{http://www.w3.org/2001/SMIL20/Language}video') for el in els: src = el.attrib['src'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a61e47646..7d846d680 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -36,6 +36,7 @@ import zlib from .compat import ( compat_basestring, compat_chr, + compat_etree_fromstring, compat_html_entities, compat_http_client, compat_kwargs, @@ -1974,7 +1975,7 @@ def dfxp2srt(dfxp_data): return out - dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) + dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') From 387db16a789fea25795433538d80513c18d0f699 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 25 Oct 2015 20:30:54 +0100 Subject: [PATCH 0048/1214] [compat] compat_etree_fromstring: only decode bytes objects --- test/test_compat.py | 3 ++- youtube_dl/compat.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 2b0860479..834f4bc55 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,9 +74,10 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): - xml = '<el foo="bar"></el>' + xml = '<el foo="bar" spam="中文"></el>' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], 
compat_str)) + self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index cf10835ca..f39d4e9a9 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,8 +216,7 @@ except ImportError: # Python 2.6 if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node are str objects instead of - # unicode + # on python 2.x the the attributes of a node aren't always unicode objects etree = xml.etree.ElementTree # on 2.6 XML doesn't have a parser argument, function copied from CPython @@ -231,7 +230,8 @@ else: def _element_factory(*args, **kwargs): el = etree.Element(*args, **kwargs) for k, v in el.items(): - el.set(k, v.decode('utf-8')) + if isinstance(v, bytes): + el.set(k, v.decode('utf-8')) return el def compat_etree_fromstring(text): From f78546272cf7c4b10c8003870728ab69bec982fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 26 Oct 2015 16:41:24 +0100 Subject: [PATCH 0049/1214] [compat] compat_etree_fromstring: also decode the text attribute Deletes parse_xml from utils, because it also does it. 
--- test/test_compat.py | 11 ++++++++++- youtube_dl/compat.py | 18 ++++++++++++++++-- youtube_dl/extractor/ard.py | 4 ++-- youtube_dl/extractor/generic.py | 4 ++-- youtube_dl/utils.py | 23 ----------------------- 5 files changed, 30 insertions(+), 30 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 834f4bc55..b6bfad05e 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -74,10 +74,19 @@ class TestCompat(unittest.TestCase): self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) def test_compat_etree_fromstring(self): - xml = '<el foo="bar" spam="中文"></el>' + xml = ''' + <root foo="bar" spam="中文"> + <normal>foo</normal> + <chinese>中文</chinese> + <foo><bar>spam</bar></foo> + </root> + ''' doc = compat_etree_fromstring(xml.encode('utf-8')) self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) + self.assertTrue(isinstance(doc.find('normal').text, compat_str)) + self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) + self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f39d4e9a9..2d43ec852 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,9 +216,19 @@ except ImportError: # Python 2.6 if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the the attributes of a node aren't always unicode objects + # on python 2.x the attributes and text of a node aren't always unicode + # objects etree = xml.etree.ElementTree + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + # on 2.6 XML doesn't have a parser argument, function copied from CPython # 2.7 source def _XML(text, parser=None): @@ -235,7 +245,11 @@ else: 
return el def compat_etree_fromstring(text): - return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc try: from urllib.parse import parse_qs as compat_parse_qs diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 6f465789b..73be6d204 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -14,8 +14,8 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, - parse_xml, ) +from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): @@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor): raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) if re.search(r'[\?&]rss($|[=&])', url): - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..1de96b268 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -9,6 +9,7 @@ import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( + compat_etree_fromstring, compat_urllib_parse_unquote, compat_urllib_request, compat_urlparse, @@ -21,7 +22,6 @@ from ..utils import ( HEADRequest, is_html, orderedSet, - parse_xml, smuggle_url, unescapeHTML, unified_strdate, @@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor): # Is it an RSS feed, a SMIL file or a XSPF playlist? 
try: - doc = parse_xml(webpage) + doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7d846d680..c761ea22a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'): return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) -try: - etree_iter = xml.etree.ElementTree.Element.iter -except AttributeError: # Python <=2.6 - etree_iter = lambda n: n.findall('.//*') - - -def parse_xml(s): - class TreeBuilder(xml.etree.ElementTree.TreeBuilder): - def doctype(self, name, pubid, system): - pass # Ignore doctypes - - parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) - kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} - tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) - # Fix up XML parser in Python 2.x - if sys.version_info < (3, 0): - for n in etree_iter(tree): - if n.text is not None: - if not isinstance(n.text, compat_str): - n.text = n.text.decode('utf-8') - return tree - - US_RATINGS = { 'G': 0, 'PG': 10, From ae37338e681319a28d98dc551253d9fa1830969a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Thu, 29 Oct 2015 13:58:40 +0100 Subject: [PATCH 0050/1214] [compat] compat_etree_fromstring: clarify comment --- youtube_dl/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 2d43ec852..a3e85264a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -216,8 +216,8 @@ except ImportError: # Python 2.6 if sys.version_info[0] >= 3: compat_etree_fromstring = xml.etree.ElementTree.fromstring else: - # on python 2.x the attributes and text of a node aren't always unicode - # objects + # python 2.x tries to encode unicode 
strings with ascii (see the + # XMLParser._fixtext method) etree = xml.etree.ElementTree try: From 30bd1c16c8e2a767246cc115f12c1d2e99e2f8ec Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 29 Oct 2015 19:44:26 +0100 Subject: [PATCH 0051/1214] [adobetv] use api for extraction and add support specific language videos --- youtube_dl/extractor/adobetv.py | 60 +++++++++++++-------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 5e43adc51..3d25caad6 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -1,23 +1,26 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_duration, unified_strdate, str_to_int, + int_or_none, float_or_none, ISO639Utils, ) class AdobeTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.adobe\.com/watch/[^/]+/(?P<id>[^/]+)' + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)' _TEST = { 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', 'info_dict': { - 'id': 'quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop', + 'id': '10981', 'ext': 'mp4', 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', @@ -29,46 +32,31 @@ class AdobeTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + language, show_urlname, urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' - player = self._parse_json( - self._search_regex(r'html5player:\s*({.+?})\s*\n', webpage, 'player'), - video_id) - - title = player.get('title') or self._search_regex( - 
r'data-title="([^"]+)"', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate( - self._html_search_meta('datepublished', webpage, 'upload date')) - - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') or - self._search_regex( - r'Runtime:\s*(\d{2}:\d{2}:\d{2})', - webpage, 'duration', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'<div class="views">\s*Views?:\s*([\d,.]+)\s*</div>', - webpage, 'view count')) + video_data = self._download_json( + 'http://tv.adobe.com/api/v4/episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname), + urlname)['data'][0] formats = [{ - 'url': source['src'], - 'format_id': source.get('quality') or source['src'].split('-')[-1].split('.')[0] or None, - 'tbr': source.get('bitrate'), - } for source in player['sources']] + 'url': source['url'], + 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('video_data_rate')), + } for source in video_data['videos']] self._sort_formats(formats) return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, + 'id': str(video_data['id']), + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'upload_date': unified_strdate(video_data.get('start_date')), + 'duration': parse_duration(video_data.get('duration')), + 'view_count': str_to_int(video_data.get('playcount')), 'formats': formats, } From 402ca40c9d0b20f1d04a0035b4cda0cc1184c689 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 29 Oct 2015 19:55:04 +0100 Subject: 
[PATCH 0052/1214] [adobetv] extract AdobeTVVideo info from json directly --- youtube_dl/extractor/adobetv.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 3d25caad6..383c89485 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, float_or_none, ISO639Utils, + determine_ext, ) @@ -79,28 +80,25 @@ class AdobeTVVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - player_params = self._parse_json(self._search_regex( - r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), - video_id) + video_data = self._download_json(url + '?format=json', video_id) formats = [{ + 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), 'url': source['src'], - 'width': source.get('width'), - 'height': source.get('height'), - 'tbr': source.get('bitrate'), - } for source in player_params['sources']] + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('bitrate')), + } for source in video_data['sources']] + self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. 
I just pick the max one duration = max(filter(None, [ float_or_none(source.get('duration'), scale=1000) - for source in player_params['sources']])) + for source in video_data['sources']])) subtitles = {} - for translation in player_params.get('translations', []): + for translation in video_data.get('translations', []): lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) if lang_id not in subtitles: subtitles[lang_id] = [] @@ -112,8 +110,9 @@ class AdobeTVVideoIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'title': player_params['title'], - 'description': self._og_search_description(webpage), + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': video_data['video'].get('poster'), 'duration': duration, 'subtitles': subtitles, } From 9a605c8859e5ecf164719b890ea62b76afb0b874 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 29 Oct 2015 20:00:27 +0100 Subject: [PATCH 0053/1214] [adobetv] add support for show and channel extraction --- youtube_dl/extractor/__init__.py | 2 + youtube_dl/extractor/adobetv.py | 72 ++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6318ac4a2..d4b42dc25 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,6 +6,8 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .adobetv import ( AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, AdobeTVVideoIE, ) from .adultswim import AdultSwimIE diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 383c89485..d0bfafa45 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -62,6 +62,78 @@ class AdobeTVIE(InfoExtractor): } +class AdobeTVPlaylistBaseIE(InfoExtractor): + def _parse_page_data(self, page_data): + return 
[self.url_result(self._get_element_url(element_data)) for element_data in page_data] + + def _extract_playlist_entries(self, url, display_id): + page = self._download_json(url, display_id) + entries = self._parse_page_data(page['data']) + for page_num in range(2, page['paging']['pages'] + 1): + entries.extend(self._parse_page_data( + self._download_json(url + '&page=%d' % page_num, display_id)['data'])) + return entries + + +class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)' + + _TEST = { + 'url': 'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', + 'info_dict': { + 'id': '36', + 'title': 'The Complete Picture with Julieanne Kost', + 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', + }, + 'playlist_mincount': 136, + } + + def _get_element_url(self, element_data): + return element_data['urls'][0] + + def _real_extract(self, url): + language, show_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = 'language=%s&show_urlname=%s' % (language, show_urlname) + + show_data = self._download_json( + 'http://tv.adobe.com/api/v4/show/get/?%s' % query, show_urlname)['data'][0] + + return self.playlist_result( + self._extract_playlist_entries('http://tv.adobe.com/api/v4/episode/?%s' % query, show_urlname), + str(show_data['id']), + show_data['show_name'], + show_data['show_description']) + + +class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?' 
+ + _TEST = { + 'url': 'http://tv.adobe.com/channel/development', + 'info_dict': { + 'id': 'development', + }, + 'playlist_mincount': 96, + } + + def _get_element_url(self, element_data): + return element_data['url'] + + def _real_extract(self, url): + language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = 'language=%s&channel_urlname=%s' % (language, channel_urlname) + if category_urlname: + query += '&category_urlname=%s' % category_urlname + + return self.playlist_result( + self._extract_playlist_entries('http://tv.adobe.com/api/v4/show/?%s' % query, channel_urlname), + channel_urlname) + + class AdobeTVVideoIE(InfoExtractor): _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' From 00d24327efcac74b11dbc4d813aed74da9a501e0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 30 Oct 2015 09:48:56 +0100 Subject: [PATCH 0054/1214] [vgtv] extract videos from FTV, Aftenposten, Aftonbladet using VGTVIE --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/aftenposten.py | 23 ----------- youtube_dl/extractor/vgtv.py | 60 ++++++++++++++++++++++------- 3 files changed, 46 insertions(+), 38 deletions(-) delete mode 100644 youtube_dl/extractor/aftenposten.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f98e6487e..f7dcabcf7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -9,7 +9,6 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE diff --git a/youtube_dl/extractor/aftenposten.py b/youtube_dl/extractor/aftenposten.py deleted file mode 100644 index 0c00acfb5..000000000 --- a/youtube_dl/extractor/aftenposten.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - 
-from .common import InfoExtractor - - -class AftenpostenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aftenposten\.no/webtv/(?:#!/)?video/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', - 'md5': 'fd828cd29774a729bf4d4425fe192972', - 'info_dict': { - 'id': '21039', - 'ext': 'mov', - 'title': 'TRAILER: "Sweatshop" - I can´t take any more', - 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', - 'timestamp': 1416927969, - 'upload_date': '20141125', - } - } - - def _real_extract(self, url): - return self.url_result('xstream:ap:%s' % self._match_id(url), 'Xstream') diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index f38a72fde..17213d9b6 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -11,16 +11,17 @@ from ..utils import ( class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV and BTTV' + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten, Aftonbladet' _VALID_URL = r'''(?x) (?: vgtv:| http://(?:www\.)? 
) - (?P<host>vgtv|bt) + (?P<host>vgtv.no|(?:bt.no|aftenbladet.no)/tv|fvn.no/fvntv|aftenposten.no/webtv) (?: :| - \.no/(?:tv/)?\#!/(?:video|live)/ + /\#!/(?:video|live)/| + /embed?id= ) (?P<id>[0-9]+) ''' @@ -59,17 +60,18 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Video is no longer available', }, { - # streamType: live + # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', 'info_dict': { 'id': '113063', - 'ext': 'flv', - 'title': 're:^DIREKTE: V75 fra Solvalla [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'ext': 'mp4', + 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 25966, 'timestamp': 1432975582, 'upload_date': '20150530', 'view_count': int, @@ -78,26 +80,56 @@ class VGTVIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + },{ + 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', + 'md5': '7fbc265a3ca4933a423c7a66aa879a67', + 'info_dict': { + 'id': '21039', + 'ext': 'mp4', + 'title': 'TRAILER: «SWEATSHOP» - I can´t take any more', + 'description': 'md5:21891f2b0dd7ec2f78d84a50e54f8238', + 'duration': 66, + 'timestamp': 1417002452, + 'upload_date': '20141126', + 'view_count': int, + } }, { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, }, ] + _HOST_WEBSITES = { + 'vgtv.no': { + 'vendor': 'vgtv', + 'appname': 'vgtv', + }, + 'bt.no/tv': { + 'vendor': 'bt', + 'appname': 'bttv', + }, + 'aftenbladet.no/tv': { + 'vendor': 'sa', + 'appname': 'satv', + }, + 'fvn.no/fvntv': { + 'vendor': 'fvn', + 'appname': 'fvntv', + }, + 'aftenposten.no/webtv': { + 'vendor': 'ap', + 'appname': 'aptv', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') host = mobj.group('host') - HOST_WEBSITES = { - 'vgtv': 
'vgtv', - 'bt': 'bttv', - } - data = self._download_json( 'http://svp.vg.no/svp/api/v1/%s/assets/%s?appName=%s-website' - % (host, video_id, HOST_WEBSITES[host]), + % (self._HOST_WEBSITES[host]['vendor'], video_id, self._HOST_WEBSITES[host]['appname']), video_id, 'Downloading media JSON') if data.get('status') == 'inactive': @@ -144,7 +176,7 @@ class VGTVIE(InfoExtractor): return { 'id': video_id, - 'title': self._live_title(data['title']), + 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'], 'description': data['description'], 'thumbnail': data['images']['main'] + '?t[]=900x506q80', 'timestamp': data['published'], From 804afc5871a88eaa32a6c161df67e6b37383d7d1 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 30 Oct 2015 10:20:38 +0100 Subject: [PATCH 0055/1214] [vgtv] improve _VALID_URL regex --- youtube_dl/extractor/vgtv.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 17213d9b6..e8039ec7f 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -11,17 +11,19 @@ from ..utils import ( class VGTVIE(InfoExtractor): - IE_DESC = 'VGTV, BTTV, FTV, Aftenposten, Aftonbladet' + IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' _VALID_URL = r'''(?x) - (?: - vgtv:| - http://(?:www\.)? + http://(?:www\.)? 
+ (?P<host> + vgtv.no| + (?:bt|aftenbladet).no/tv| + fvn.no/fvntv| + aftenposten.no/webtv ) - (?P<host>vgtv.no|(?:bt.no|aftenbladet.no)/tv|fvn.no/fvntv|aftenposten.no/webtv) + / (?: - :| - /\#!/(?:video|live)/| - /embed?id= + \#!/(?:video|live)/| + embed?.*id= ) (?P<id>[0-9]+) ''' @@ -211,7 +213,7 @@ class BTArticleIE(InfoExtractor): webpage = self._download_webpage(url, self._match_id(url)) video_id = self._search_regex( r'SVP\.Player\.load\(\s*(\d+)', webpage, 'video id') - return self.url_result('vgtv:bt:%s' % video_id, 'VGTV') + return self.url_result('http://bt.no/tv/embed?id=%s' % video_id, 'VGTV') class BTVestlendingenIE(InfoExtractor): From 957e0db1d2d1d4b2e9fdd1c314b0c46e68ca0cb3 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 30 Oct 2015 13:56:21 +0100 Subject: [PATCH 0056/1214] [baidu] improve info extraction --- youtube_dl/extractor/baidu.py | 45 +++++++++++++---------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/baidu.py b/youtube_dl/extractor/baidu.py index e37ee4440..84fab551b 100644 --- a/youtube_dl/extractor/baidu.py +++ b/youtube_dl/extractor/baidu.py @@ -14,8 +14,8 @@ class BaiduVideoIE(InfoExtractor): 'url': 'http://v.baidu.com/comic/1069.htm?frp=bdbrand&q=%E4%B8%AD%E5%8D%8E%E5%B0%8F%E5%BD%93%E5%AE%B6', 'info_dict': { 'id': '1069', - 'title': '中华小当家 TV版 (全52集)', - 'description': 'md5:395a419e41215e531c857bb037bbaf80', + 'title': '中华小当家 TV版国语', + 'description': 'md5:40a9c1b1c7f4e05d642e7bb1c84eeda0', }, 'playlist_count': 52, }, { @@ -25,45 +25,32 @@ class BaiduVideoIE(InfoExtractor): 'title': 're:^奔跑吧兄弟', 'description': 'md5:1bf88bad6d850930f542d51547c089b8', }, - 'playlist_mincount': 3, + 'playlist_mincount': 12, }] + def _call_api(self, path, category, playlist_id): + return self._download_json('http://app.video.baidu.com/%s/?worktype=adnative%s&id=%s' % (path, category, playlist_id), playlist_id) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, 
url) - playlist_id = mobj.group('id') - category = category2 = mobj.group('type') + category, playlist_id = re.match(self._VALID_URL, url).groups() if category == 'show': - category2 = 'tvshow' + category = 'tvshow' + if category == 'tv': + category = 'tvplay' - webpage = self._download_webpage(url, playlist_id) + playlist_detail = self._call_api('xqinfo', category, playlist_id) - playlist_title = self._html_search_regex( - r'title\s*:\s*(["\'])(?P<title>[^\']+)\1', webpage, - 'playlist title', group='title') - playlist_description = self._html_search_regex( - r'<input[^>]+class="j-data-intro"[^>]+value="([^"]+)"/>', webpage, - playlist_id, 'playlist description') + playlist_title = playlist_detail['title'] + playlist_description = playlist_detail.get('intro') - site = self._html_search_regex( - r'filterSite\s*:\s*["\']([^"]*)["\']', webpage, - 'primary provider site') - api_result = self._download_json( - 'http://v.baidu.com/%s_intro/?dtype=%sPlayUrl&id=%s&site=%s' % ( - category, category2, playlist_id, site), - playlist_id, 'Get playlist links') + episodes_detail = self._call_api('xqsingle', category, playlist_id) entries = [] - for episode in api_result[0]['episodes']: + for episode in episodes_detail['videos']: episode_id = '%s_%s' % (playlist_id, episode['episode']) - redirect_page = self._download_webpage( - compat_urlparse.urljoin(url, episode['url']), episode_id, - note='Download Baidu redirect page') - real_url = self._html_search_regex( - r'location\.replace\("([^"]+)"\)', redirect_page, 'real URL') - entries.append(self.url_result( - real_url, video_title=episode['single_title'])) + episode['url'], video_title=episode['title'])) return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) From 240384afe6983b71d3a6cb8891312185a15619f6 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 30 Oct 2015 20:06:38 +0100 Subject: [PATCH 0057/1214] [clipfish] improve info extraction --- 
youtube_dl/extractor/clipfish.py | 56 ++++++++++++-------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 7af903571..d142e326f 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -4,11 +4,8 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, int_or_none, - js_to_json, - parse_iso8601, - remove_end, + unified_strdate, ) @@ -21,48 +18,37 @@ class ClipfishIE(InfoExtractor): 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', - 'timestamp': 1370938118, + 'description': 'Video zu FIFA 14: E3 2013 Trailer', 'upload_date': '20130611', 'duration': 82, + 'view_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video_info = self._download_json('http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, video_id)['items'][0] - video_info = self._parse_json( - js_to_json(self._html_search_regex( - '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), - video_id) - - formats = [] - for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.append({ - 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), - 'ext': 'mp4', - 'format_id': 'hls', - }) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - }) - self._sort_formats(formats) - - title = remove_end(self._og_search_title(webpage), ' - Video') - thumbnail = self._og_search_thumbnail(webpage) - duration = int_or_none(video_info.get('length')) - timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date')) + formats = [{ + 'url': video_info['media_videourl_hls'].replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), + 'ext': 'mp4', + 'format_id': 'hls', + },{ + 'url': video_info['media_videourl'], + 
'format_id': 'mp4', + 'width': int_or_none(video_info.get('width')), + 'height': int_or_none(video_info.get('height')), + 'tbr': int_or_none(video_info.get('bitrate')), + }] return { 'id': video_id, - 'title': title, + 'title': video_info['title'], + 'description': video_info.get('descr'), 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), + 'duration': int_or_none(video_info.get('media_length')), + 'upload_date': unified_strdate(video_info.get('pubDate')), + 'view_count': int_or_none(video_info.get('media_views')) } From 720334659ac13d7f5bc0759662ba0534f48a1ce4 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 01:08:37 +0100 Subject: [PATCH 0058/1214] [daum] improve info extraction --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/daum.py | 123 +++++++++++++++++++------------ 2 files changed, 79 insertions(+), 49 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f98e6487e..99b4be002 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -119,7 +119,10 @@ from .dailymotion import ( DailymotionUserIE, DailymotionCloudIE, ) -from .daum import DaumIE +from .daum import ( + DaumIE, + DaumClipIE, +) from .dbtv import DBTVIE from .dcn import DCNIE from .dctp import DctpTvIE diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 934da765e..0f5686e07 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -2,16 +2,73 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) +from ..compat import compat_urllib_parse +from ..utils import int_or_none class DaumIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:v/|.*?clipid=)(?P<id>[^?#&]+)' + 
_VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/v/(?P<id>[^?#&]+)' + IE_NAME = 'daum.net' + + _TESTS = [{ + 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', + 'info_dict': { + 'id': 'vab4dyeDBysyBssyukBUjBz', + 'ext': 'mp4', + 'title': '마크 헌트 vs 안토니오 실바', + 'description': 'Mark Hunt vs Antonio Silva', + 'upload_date': '20131217', + 'duration': 2117, + }, + }, { + 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + query = compat_urllib_parse.urlencode({'vid': video_id}) + info = self._download_xml( + 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, + 'Downloading video info') + movie_data = self._download_json( + 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query, + video_id, 'Downloading video formats info') + + formats = [] + for format_el in movie_data['output_list']['output_list']: + profile = format_el['profile'] + format_query = compat_urllib_parse.urlencode({ + 'vid': video_id, + 'profile': profile, + }) + url_doc = self._download_xml( + 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' 
+ format_query, + video_id, note='Downloading video data for %s format' % profile) + format_url = url_doc.find('result/url').text + formats.append({ + 'url': format_url, + 'format_id': profile, + 'width': int_or_none(format_el.get('width')), + 'height': int_or_none(format_el.get('height')), + 'filesize': int_or_none(format_el.get('filesize')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': info.find('TITLE').text, + 'formats': formats, + 'thumbnail': info.find('THUMB_URL').text, + 'description': info.find('CONTENTS').text, + 'duration': int_or_none(info.find('DURATION').text), + 'upload_date': info.find('REGDTTM').text[:8], + } + + +class DaumClipIE(InfoExtractor): + _VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/.*?clipid=(?P<id>\d+)' IE_NAME = 'daum.net' _TESTS = [{ @@ -23,53 +80,23 @@ class DaumIE(InfoExtractor): 'description': 'DOTA 2GETHER 시즌2 6회 - 2부', 'upload_date': '20130831', 'duration': 3868, + 'view_count': int, }, - }, { - 'url': 'http://tvpot.daum.net/v/vab4dyeDBysyBssyukBUjBz', - 'only_matching': True, - }, { - 'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24', - 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - canonical_url = 'http://tvpot.daum.net/v/%s' % video_id - webpage = self._download_webpage(canonical_url, video_id) - full_id = self._search_regex( - r'src=["\']http://videofarm\.daum\.net/controller/video/viewer/Video\.html\?.*?vid=(.+?)[&"\']', - webpage, 'full id') - query = compat_urllib_parse.urlencode({'vid': full_id}) - info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id, - 'Downloading video info') - urls = self._download_xml( - 'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' 
+ query, - video_id, 'Downloading video formats info') - - formats = [] - for format_el in urls.findall('result/output_list/output_list'): - profile = format_el.attrib['profile'] - format_query = compat_urllib_parse.urlencode({ - 'vid': full_id, - 'profile': profile, - }) - url_doc = self._download_xml( - 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query, - video_id, note='Downloading video data for %s format' % profile) - format_url = url_doc.find('result/url').text - formats.append({ - 'url': format_url, - 'format_id': profile, - }) + video_id = self._match_id(url) + clip_info = self._download_json('http://tvpot.daum.net/mypot/json/GetClipInfo.do?clipid=%s' % video_id, video_id)['clip_bean'] return { + '_type': 'url_transparent', 'id': video_id, - 'title': info.find('TITLE').text, - 'formats': formats, - 'thumbnail': self._og_search_thumbnail(webpage), - 'description': info.find('CONTENTS').text, - 'duration': int(info.find('DURATION').text), - 'upload_date': info.find('REGDTTM').text[:8], + 'url': 'http://tvpot.daum.net/v/%s' % clip_info['vid'], + 'title': clip_info['title'], + 'thumbnail': clip_info.get('thumb_url'), + 'description': clip_info.get('contents'), + 'duration': int_or_none(clip_info.get('duration')), + 'upload_date': clip_info.get('up_date')[:8], + 'view_count': int_or_none(clip_info.get('play_count')), + 'ie_key': 'Daum', } From 47f2d01a5ac074f6959aa12e8bc00310f18a54e8 Mon Sep 17 00:00:00 2001 From: Lucas <mikotosc@gmail.com> Date: Thu, 24 Sep 2015 22:19:09 +0200 Subject: [PATCH 0059/1214] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/kika.py | 115 +++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 youtube_dl/extractor/kika.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f98e6487e..5ad4e9c36 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,6 +274,7 @@ 
from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE +from .kika import KikaIE from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py new file mode 100644 index 000000000..db0f333ff --- /dev/null +++ b/youtube_dl/extractor/kika.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class KikaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*' + + _TESTS = [ + { + 'url': 'http://www.kika.de/baumhaus/videos/video9572.html', + 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'info_dict': { + 'id': '9572', + 'ext': 'mp4', + 'title': 'Baumhaus vom 29. Oktober 2014', + 'description': None + } + }, + { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' + } + }, + { + 'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html', + 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'info_dict': { + 'id': '9572', + 'ext': 'mp4', + 'title': 'Baumhaus vom 29. 
Oktober 2014', + 'description': None + } + }, + { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' + } + } + ] + + def _real_extract(self, url): + # broadcast_id may be the same as the video_id + broadcast_id = self._match_id(url) + webpage = self._download_webpage(url, broadcast_id) + + xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' + video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) + if not video_id: + # Video is not available online + err_msg = 'Video %s is not available online' % broadcast_id + raise ExtractorError(err_msg, expected=True) + + xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) + xml_tree = self._download_xml(xml_url, video_id) + + title = xml_tree.find('title').text + webpage_url = xml_tree.find('htmlUrl').text + + # Try to get the description, not available for all videos + try: + broadcast_elem = xml_tree.find('broadcast') + description = broadcast_elem.find('broadcastDescription').text + except AttributeError: + # No description available + description = None + + # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) + tmp = xml_tree.find('duration').text.split(':') + duration = int(tmp[0]) * 60 + int(tmp[1]) + + formats_list = [] + for elem in xml_tree.find('assets'): + format_dict = {} + format_dict['url'] = elem.find('progressiveDownloadUrl').text + format_dict['ext'] = elem.find('mediaType').text.lower() + format_dict['format'] = elem.find('profileName').text + width = int(elem.find('frameWidth').text) + height = int(elem.find('frameHeight').text) + format_dict['width'] = width + format_dict['height'] = height + format_dict['resolution'] = '%dx%d' % (width, height) + format_dict['abr'] = int(elem.find('bitrateAudio').text) + format_dict['vbr'] = int(elem.find('bitrateVideo').text) + format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] + format_dict['filesize'] = int(elem.find('fileSize').text) + + # append resolution and dict for sorting by resolution + formats_list.append((width * height, format_dict)) + + # Sort by resolution (=quality) + formats_list.sort() + + out_list = [x[1] for x in formats_list] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': out_list, + 'duration': duration, + 'webpage_url': webpage_url + } From 892015b088fa21915270b0a05937fcc7063ccdd2 Mon Sep 17 00:00:00 2001 From: Lucas <mikotosc@gmail.com> Date: Mon, 28 Sep 2015 22:00:56 +0200 Subject: [PATCH 0060/1214] replaced inefficient code --- youtube_dl/extractor/kika.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index db0f333ff..871e4ea44 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -87,29 +87,25 @@ class KikaIE(InfoExtractor): format_dict['url'] = elem.find('progressiveDownloadUrl').text format_dict['ext'] = elem.find('mediaType').text.lower() format_dict['format'] = elem.find('profileName').text - width = int(elem.find('frameWidth').text) - height = int(elem.find('frameHeight').text) - format_dict['width'] = 
width - format_dict['height'] = height - format_dict['resolution'] = '%dx%d' % (width, height) + format_dict['width'] = int(elem.find('frameWidth').text) + format_dict['height'] = int(elem.find('frameHeight').text) + format_dict['resolution'] = '%dx%d' % (format_dict['width'], + format_dict['height']) format_dict['abr'] = int(elem.find('bitrateAudio').text) format_dict['vbr'] = int(elem.find('bitrateVideo').text) format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] format_dict['filesize'] = int(elem.find('fileSize').text) - # append resolution and dict for sorting by resolution - formats_list.append((width * height, format_dict)) + formats_list.append(format_dict) # Sort by resolution (=quality) - formats_list.sort() - - out_list = [x[1] for x in formats_list] + formats_list.sort(key=lambda x: x['width'] * x['height']) return { 'id': video_id, 'title': title, 'description': description, - 'formats': out_list, + 'formats': formats_list, 'duration': duration, 'webpage_url': webpage_url } From 78d7ee19dc417b16b26fe2fa1101124866727a85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 31 Oct 2015 22:21:52 +0800 Subject: [PATCH 0061/1214] [democracynow] Fix _TESTS --- youtube_dl/extractor/democracynow.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 973bb437b..05cfc7502 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -36,10 +36,9 @@ class DemocracynowIE(InfoExtractor): if display_id == '': display_id = 'home' webpage = self._download_webpage(url, display_id) - re_desc = re.search(r'<meta property=.og:description. 
content=(["\'])(.+?)\1', webpage, re.DOTALL) - description = re_desc.group(2) if re_desc else '' + description = self._og_search_description(webpage) - jstr = self._search_regex(r'({.+?"related_video_xml".+?})', webpage, 'json', default=None) + jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json') js = self._parse_json(jstr, display_id) video_id = None formats = [] @@ -56,7 +55,7 @@ class DemocracynowIE(InfoExtractor): 'ext': ext, 'url': url, }] - for key in ('file', 'audio'): + for key in ('file', 'audio', 'video'): url = js.get(key, '') if url == '' or url is None: continue From 50b9dd734423231f7a01ed8a156d09ca04a23a31 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 15:40:11 +0100 Subject: [PATCH 0062/1214] [dcn] improve season info extraction --- youtube_dl/extractor/dcn.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 8b360a9d7..a9a5e94f5 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -210,16 +210,14 @@ class DCNSeasonIE(InfoExtractor): }) show = self._download_json(request, show_id) - season_id = season_id or show['default_season'] - season = {} - for _ in show['seasons']: - if _['id'] == season_id: - season = _ - break - title = season.get('title_en') or season['title_ar'] + if not season_id: + season_id = show['default_season'] + for season in show['seasons']: + if season['id'] == season_id: + title = season.get('title_en') or season['title_ar'] - entries = [] - for video in show['videos']: - entries.append(self.url_result('http://www.dcndigital.ae/#/media/%s' % video['id'], 'DCNVideo')) + entries = [] + for video in show['videos']: + entries.append(self.url_result('http://www.dcndigital.ae/#/media/%s' % video['id'], 'DCNVideo')) - return self.playlist_result(entries, season_id, title) + return self.playlist_result(entries, season_id, title) 
From 8c1aa28c27af204c4996260cdc70359e83c2c3d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 31 Oct 2015 16:14:36 +0100 Subject: [PATCH 0063/1214] [kika] Replace non working tests and recognize 'einzelsendung' urls. --- youtube_dl/extractor/kika.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index 871e4ea44..c9169076a 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -6,16 +6,16 @@ from ..utils import ExtractorError class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|sendung)(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' _TESTS = [ { - 'url': 'http://www.kika.de/baumhaus/videos/video9572.html', - 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', 'info_dict': { - 'id': '9572', + 'id': '19636', 'ext': 'mp4', - 'title': 'Baumhaus vom 29. Oktober 2014', + 'title': 'Baumhaus vom 30. Oktober 2015', 'description': None } }, @@ -30,17 +30,17 @@ class KikaIE(InfoExtractor): } }, { - 'url': 'http://www.kika.de/videos/allevideos/video9572_zc-32ca94ad_zs-3f535991.html', - 'md5': '94fc748cf5d64916571d275a07ffe2d5', + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', 'info_dict': { - 'id': '9572', + 'id': '19636', 'ext': 'mp4', - 'title': 'Baumhaus vom 29. Oktober 2014', + 'title': 'Baumhaus vom 30. 
Oktober 2015', 'description': None } }, { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/sendung81244_zc-81d703f8_zs-f82d5e31.html', + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', 'info_dict': { 'id': '8182', From c3040bd00a43e111dab0d1ab903df03ac7d19a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 31 Oct 2015 16:32:35 +0100 Subject: [PATCH 0064/1214] [kika] Cleanup Closes #6957. --- youtube_dl/extractor/kika.py | 54 +++++++++++++++--------------------- 1 file changed, 22 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py index c9169076a..5337ac439 100644 --- a/youtube_dl/extractor/kika.py +++ b/youtube_dl/extractor/kika.py @@ -16,8 +16,8 @@ class KikaIE(InfoExtractor): 'id': '19636', 'ext': 'mp4', 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None - } + 'description': None, + }, }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', @@ -26,8 +26,8 @@ class KikaIE(InfoExtractor): 'id': '8182', 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' - } + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', @@ -36,8 +36,8 @@ class KikaIE(InfoExtractor): 'id': '19636', 'ext': 'mp4', 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'description': None - } + 'description': None, + }, }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', @@ -46,9 +46,9 @@ class KikaIE(InfoExtractor): 'id': '8182', 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd' - } - } + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + }, + }, ] def _real_extract(self, url): @@ -59,7 +59,6 @@ class KikaIE(InfoExtractor): xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) if not video_id: - # Video is not available online err_msg = 'Video %s is not available online' % broadcast_id raise ExtractorError(err_msg, expected=True) @@ -74,38 +73,29 @@ class KikaIE(InfoExtractor): broadcast_elem = xml_tree.find('broadcast') description = broadcast_elem.find('broadcastDescription').text except AttributeError: - # No description available description = None # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) tmp = xml_tree.find('duration').text.split(':') duration = int(tmp[0]) * 60 + int(tmp[1]) - formats_list = [] - for elem in xml_tree.find('assets'): - format_dict = {} - format_dict['url'] = elem.find('progressiveDownloadUrl').text - format_dict['ext'] = elem.find('mediaType').text.lower() - format_dict['format'] = elem.find('profileName').text - format_dict['width'] = int(elem.find('frameWidth').text) - format_dict['height'] = int(elem.find('frameHeight').text) - format_dict['resolution'] = '%dx%d' % (format_dict['width'], - format_dict['height']) - format_dict['abr'] = int(elem.find('bitrateAudio').text) - format_dict['vbr'] = int(elem.find('bitrateVideo').text) - format_dict['tbr'] = format_dict['abr'] + format_dict['vbr'] - format_dict['filesize'] = int(elem.find('fileSize').text) - - formats_list.append(format_dict) - - # Sort by resolution (=quality) - formats_list.sort(key=lambda x: x['width'] * x['height']) + formats = [{ + 'url': elem.find('progressiveDownloadUrl').text, + 'ext': elem.find('mediaType').text.lower(), + 'format': elem.find('profileName').text, + 'width': int(elem.find('frameWidth').text), + 'height': int(elem.find('frameHeight').text), + 'abr': int(elem.find('bitrateAudio').text), + 'vbr': int(elem.find('bitrateVideo').text), + 'filesize': int(elem.find('fileSize').text), + } for elem in xml_tree.find('assets')] + self._sort_formats(formats) return { 'id': video_id, 'title': title, 'description': description, - 'formats': formats_list, + 'formats': formats, 'duration': duration, - 'webpage_url': webpage_url + 'webpage_url': webpage_url, } From 2b1b2d83cacfdce19cae5eea2f9bbfd142efc7f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:17:09 +0600 Subject: [PATCH 0065/1214] [mdr] Modernize and include kika.de --- youtube_dl/extractor/__init__.py | 1 - youtube_dl/extractor/kika.py | 101 ------------------ youtube_dl/extractor/mdr.py | 172 +++++++++++++++++++++++-------- 3 files 
changed, 131 insertions(+), 143 deletions(-) delete mode 100644 youtube_dl/extractor/kika.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 5ad4e9c36..f98e6487e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -274,7 +274,6 @@ from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE -from .kika import KikaIE from .keek import KeekIE from .kontrtube import KontrTubeIE from .krasview import KrasViewIE diff --git a/youtube_dl/extractor/kika.py b/youtube_dl/extractor/kika.py deleted file mode 100644 index 5337ac439..000000000 --- a/youtube_dl/extractor/kika.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ExtractorError - - -class KikaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*' - - _TESTS = [ - { - 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - { - 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', - 'md5': '4930515e36b06c111213e80d1e4aad0e', - 'info_dict': { - 'id': '19636', - 'ext': 'mp4', - 'title': 'Baumhaus vom 30. 
Oktober 2015', - 'description': None, - }, - }, - { - 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', - 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', - 'info_dict': { - 'id': '8182', - 'ext': 'mp4', - 'title': 'Beutolomäus und der geheime Weihnachtswunsch', - 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - }, - }, - ] - - def _real_extract(self, url): - # broadcast_id may be the same as the video_id - broadcast_id = self._match_id(url) - webpage = self._download_webpage(url, broadcast_id) - - xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml' - video_id = self._search_regex(xml_re, webpage, "xml_url", default=None) - if not video_id: - err_msg = 'Video %s is not available online' % broadcast_id - raise ExtractorError(err_msg, expected=True) - - xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id) - xml_tree = self._download_xml(xml_url, video_id) - - title = xml_tree.find('title').text - webpage_url = xml_tree.find('htmlUrl').text - - # Try to get the description, not available for all videos - try: - broadcast_elem = xml_tree.find('broadcast') - description = broadcast_elem.find('broadcastDescription').text - except AttributeError: - description = None - - # duration string format is mm:ss (even if it is >= 1 hour, e.g. 
78:42) - tmp = xml_tree.find('duration').text.split(':') - duration = int(tmp[0]) * 60 + int(tmp[1]) - - formats = [{ - 'url': elem.find('progressiveDownloadUrl').text, - 'ext': elem.find('mediaType').text.lower(), - 'format': elem.find('profileName').text, - 'width': int(elem.find('frameWidth').text), - 'height': int(elem.find('frameHeight').text), - 'abr': int(elem.find('bitrateAudio').text), - 'vbr': int(elem.find('bitrateVideo').text), - 'filesize': int(elem.find('fileSize').text), - } for elem in xml_tree.find('assets')] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'formats': formats, - 'duration': duration, - 'webpage_url': webpage_url, - } diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index fc7499958..541ddd909 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -1,64 +1,154 @@ +# coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + determine_ext, + int_or_none, + parse_duration, + parse_iso8601, + xpath_text, +) class MDRIE(InfoExtractor): - _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)' + IE_DESC = 'MDR.DE and KiKA' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html' - # No tests, MDR regularily deletes its videos - _TEST = { + _TESTS = [{ + # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - } + }, { + 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', + 'md5': '4930515e36b06c111213e80d1e4aad0e', + 'info_dict': { + 'id': '19636', + 'ext': 'mp4', + 'title': 'Baumhaus vom 30. 
Oktober 2015', + 'duration': 134, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', + 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', + 'info_dict': { + 'id': '8182', + 'ext': 'mp4', + 'title': 'Beutolomäus und der geheime Weihnachtswunsch', + 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', + 'timestamp': 1419047100, + 'upload_date': '20141220', + 'duration': 4628, + 'uploader': 'KIKA', + }, + }, { + 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', + 'only_matching': True, + }, { + 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html', + 'only_matching': True, + }] def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('video_id') - domain = m.group('domain') + video_id = self._match_id(url) - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title') - xmlurl = self._search_regex( - r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL') + data_url = self._search_regex( + r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1', + webpage, 'data url', group='url') + + doc = self._download_xml( + compat_urlparse.urljoin(url, data_url), video_id) + + title = (xpath_text(doc, './title', 'title', default=None) or + xpath_text(doc, './broadcast/broadcastName', 'title')) - doc = self._download_xml(domain + xmlurl, video_id) formats = [] - for a in doc.findall('./assets/asset'): - url_el = a.find('./progressiveDownloadUrl') - if url_el is None: - continue - abr = int(a.find('bitrateAudio').text) // 1000 - media_type = a.find('mediaType').text - format = { - 'abr': abr, - 'filesize': int(a.find('fileSize').text), - 'url': url_el.text, - } + processed_urls = [] + for asset in 
doc.findall('./assets/asset'): + for source in ( + 'progressiveDownload', + 'dynamicHttpStreamingRedirector', + 'adaptiveHttpStreamingRedirector'): + url_el = asset.find('./%sUrl' % source) + if url_el is None: + continue - vbr_el = a.find('bitrateVideo') - if vbr_el is None: - format.update({ - 'vcodec': 'none', - 'format_id': '%s-%d' % (media_type, abr), - }) - else: - vbr = int(vbr_el.text) // 1000 - format.update({ - 'vbr': vbr, - 'width': int(a.find('frameWidth').text), - 'height': int(a.find('frameHeight').text), - 'format_id': '%s-%d' % (media_type, vbr), - }) - formats.append(format) + video_url = url_el.text + if video_url in processed_urls: + continue + + processed_urls.append(video_url) + + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + + url_formats = [] + + ext = determine_ext(url_el.text) + if ext == 'm3u8': + url_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=0, m3u8_id='HLS', fatal=False) + elif ext == 'f4m': + url_formats = self._extract_f4m_formats( + video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, + preference=0, f4m_id='HDS', fatal=False) + else: + media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') + vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) + abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) + filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + + f = { + 'url': video_url, + 'format_id': '%s-%d' % (media_type, vbr or abr), + 'filesize': filesize, + 'abr': abr, + 'preference': 1, + } + + if vbr: + width = int_or_none(xpath_text(asset, './frameWidth', 'width')) + height = int_or_none(xpath_text(asset, './frameHeight', 'height')) + f.update({ + 'vbr': vbr, + 'width': width, + 'height': height, + }) + + url_formats.append(f) + + if not vbr: + for f in url_formats: + abr = f.get('tbr') or abr + 
if 'tbr' in f: + del f['tbr'] + f.update({ + 'abr': abr, + 'vcodec': 'none', + }) + + if url_formats: + formats.extend(url_formats) self._sort_formats(formats) + description = xpath_text(doc, './broadcast/broadcastDescription', 'description') + timestamp = parse_iso8601( + xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or + xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + duration = parse_duration(xpath_text(doc, './duration', 'duration')) + uploader = xpath_text(doc, './rights', 'uploader') + return { 'id': video_id, 'title': title, + 'description': description, + 'timestamp': timestamp, + 'duration': duration, + 'uploader': uploader, 'formats': formats, } From 8cdb5c845336ad3dc48c85a0558a38bd42972b00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:24:21 +0600 Subject: [PATCH 0066/1214] [mdr] Add audio test --- youtube_dl/extractor/mdr.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 541ddd909..e05577496 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -20,6 +20,17 @@ class MDRIE(InfoExtractor): # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, + }, { + # audio + 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', + 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', + 'info_dict': { + 'id': '1312272', + 'ext': 'mp3', + 'title': 'Feuilleton vom 30. 
Oktober 2015', + 'duration': 250, + 'uploader': 'MITTELDEUTSCHER RUNDFUNK', + }, }, { 'url': 'http://www.kika.de/baumhaus/videos/video19636.html', 'md5': '4930515e36b06c111213e80d1e4aad0e', From 578c074575f45ffdfd032d7b84f6fe449614f511 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:39:44 +0600 Subject: [PATCH 0067/1214] [utils] Support list of xpath in xpath_element --- test/test_utils.py | 7 +++++++ youtube_dl/utils.py | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 0c34f0e55..5a56ad776 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -275,9 +275,16 @@ class TestUtil(unittest.TestCase): p = xml.etree.ElementTree.SubElement(div, 'p') p.text = 'Foo' self.assertEqual(xpath_element(doc, 'div/p'), p) + self.assertEqual(xpath_element(doc, ['div/p']), p) + self.assertEqual(xpath_element(doc, ['div/bar', 'div/p']), p) self.assertEqual(xpath_element(doc, 'div/bar', default='default'), 'default') + self.assertEqual(xpath_element(doc, ['div/bar'], default='default'), 'default') self.assertTrue(xpath_element(doc, 'div/bar') is None) + self.assertTrue(xpath_element(doc, ['div/bar']) is None) + self.assertTrue(xpath_element(doc, ['div/bar'], 'div/baz') is None) self.assertRaises(ExtractorError, xpath_element, doc, 'div/bar', fatal=True) + self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar'], fatal=True) + self.assertRaises(ExtractorError, xpath_element, doc, ['div/bar', 'div/baz'], fatal=True) def test_xpath_text(self): testxml = '''<root> diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 558c9c7d5..89c88a4d3 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -178,10 +178,19 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') + def _find_xpath(xpath): + if 
sys.version_info < (2, 7): # Crazy 2.6 + xpath = xpath.encode('ascii') + return node.find(xpath) + + if isinstance(xpath, (str, compat_str)): + n = _find_xpath(xpath) + else: + for xp in xpath: + n = _find_xpath(xp) + if n is not None: + break - n = node.find(xpath) if n is None: if default is not NO_DEFAULT: return default From 11465da70257663ee52c7be50debe1c1e825ec67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 22:45:45 +0600 Subject: [PATCH 0068/1214] [mdr] Simplify xpath --- youtube_dl/extractor/mdr.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index e05577496..a63257c56 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -74,8 +74,7 @@ class MDRIE(InfoExtractor): doc = self._download_xml( compat_urlparse.urljoin(url, data_url), video_id) - title = (xpath_text(doc, './title', 'title', default=None) or - xpath_text(doc, './broadcast/broadcastName', 'title')) + title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) formats = [] processed_urls = [] @@ -149,8 +148,12 @@ class MDRIE(InfoExtractor): description = xpath_text(doc, './broadcast/broadcastDescription', 'description') timestamp = parse_iso8601( - xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or - xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None)) + xpath_text( + doc, [ + './broadcast/broadcastDate', + './broadcast/broadcastStartDate', + './broadcast/broadcastEndDate'], + 'timestamp', default=None)) duration = parse_duration(xpath_text(doc, './duration', 'duration')) uploader = xpath_text(doc, './rights', 'uploader') From 82b69a5cbb1455d31916be4f19ab327ae63f313c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:00:36 +0600 Subject: [PATCH 0069/1214] [mdr] PEP 8 --- 
youtube_dl/extractor/mdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index a63257c56..a566c6a2c 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -20,7 +20,7 @@ class MDRIE(InfoExtractor): # MDR regularily deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', 'only_matching': True, - }, { + }, { # audio 'url': 'http://www.mdr.de/kultur/audio1312272_zc-15948bad_zs-86171fdd.html', 'md5': '64c4ee50f0a791deb9479cd7bbe9d2fa', From e327b736ca6a6a1c880b93e09a3b310c354c2c7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:05:30 +0600 Subject: [PATCH 0070/1214] [generic] Update test --- youtube_dl/extractor/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index ca5fbafb2..a84135032 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -141,6 +141,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, From ae12bc3ebb4cb377c2b4337ec255e652b36f5143 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:07:37 +0600 Subject: [PATCH 0071/1214] [utils] Make unified_strdate always return unicode string --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 89c88a4d3..764a89cca 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -910,7 +910,7 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return upload_date + return 
compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): From dc519b5421366a8cac681455a817ae25f7f4aa83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 31 Oct 2015 23:12:57 +0600 Subject: [PATCH 0072/1214] [extractor/common] Make ie_key and IE_NAME return unicode string --- youtube_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 10c0d5d1f..59c3fa8dc 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -310,11 +310,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ From 76f0c50d3d3e2eb5903b61da08829699e902916d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 00:01:08 +0600 Subject: [PATCH 0073/1214] [mdr] Fix failed formats processing --- youtube_dl/extractor/mdr.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index a566c6a2c..88334889e 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -96,8 +96,6 @@ class MDRIE(InfoExtractor): vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - url_formats = [] - ext = determine_ext(url_el.text) if ext == 'm3u8': url_formats = self._extract_m3u8_formats( @@ -130,7 +128,10 @@ class MDRIE(InfoExtractor): 'height': height, }) - url_formats.append(f) + url_formats = [f] + + if not url_formats: + 
continue if not vbr: for f in url_formats: @@ -142,8 +143,8 @@ class MDRIE(InfoExtractor): 'vcodec': 'none', }) - if url_formats: - formats.extend(url_formats) + formats.extend(url_formats) + self._sort_formats(formats) description = xpath_text(doc, './broadcast/broadcastDescription', 'description') From dbd82a1d4fff1655920e111cc25a7fd526d7bf9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 00:01:34 +0600 Subject: [PATCH 0074/1214] [extractor/common] Fix m3u8 extraction on failure --- youtube_dl/extractor/common.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 59c3fa8dc..1f09fbb47 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -943,13 +943,14 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc, urlh = self._download_webpage_handle( + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) - if m3u8_doc is False: - return m3u8_doc + if res is False: + return res + m3u8_doc, urlh = res m3u8_url = urlh.geturl() last_info = None last_media = None From 9550ca506fccf9c9d795816cc0a7817ff262ef45 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 19:36:04 +0100 Subject: [PATCH 0075/1214] [utils] change extract_attributes to work in python 2 --- youtube_dl/extractor/brightcove.py | 3 +-- youtube_dl/utils.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index b41cee91b..c6ad1d065 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -383,8 +383,7 @@ class BrightcoveInPageEmbedIE(InfoExtractor): return None def _real_extract(self, url): - mobj 
= re.match(self._VALID_URL, url) - account_id, player_id, embed, video_id = mobj.groups() + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage('http://players.brightcove.net/%s/%s_%s/index.min.js' % (account_id, player_id, embed), video_id) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bcebf9cc5..518cea98b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -252,7 +252,8 @@ def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s attributes = re.findall(attributes_regex, attributes_str) attributes_dict = {} if attributes: - attributes_dict = {attribute_name: attribute_value for (attribute_name, attribute_value) in attributes} + for (attribute_name, attribute_value) in attributes: + attributes_dict[attribute_name] = attribute_value return attributes_dict From 80dcee5cd5cbe623a53e0c582e3e3ae170c63e8d Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 31 Oct 2015 04:02:49 +0100 Subject: [PATCH 0076/1214] [eitb] fix info extraction --- youtube_dl/extractor/eitb.py | 65 ++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 2cba82532..fc8f15544 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -1,39 +1,62 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .brightcove import BrightcoveIE -from ..utils import ExtractorError +from ..compat import compat_urllib_request +from ..utils import ( + int_or_none, + unified_strdate, +) class EitbIE(InfoExtractor): IE_NAME = 'eitb.tv' - _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/(?P<playlist_id>\d+)/(?P<chapter_id>\d+)' + _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' _TEST = { - 'add_ie': ['Brightcove'], - 'url': 
'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/2677100210001/2743577154001/lasa-y-zabala-30-anos/', + 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/', 'md5': 'edf4436247185adee3ea18ce64c47998', 'info_dict': { - 'id': '2743577154001', + 'id': '4090227752001', 'ext': 'mp4', 'title': '60 minutos (Lasa y Zabala, 30 años)', - # All videos from eitb has this description in the brightcove info - 'description': '.', - 'uploader': 'Euskal Telebista', + 'description': '', + 'duration': 3996760, + 'upload_date': '20131014', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - chapter_id = mobj.group('chapter_id') - webpage = self._download_webpage(url, chapter_id) - bc_url = BrightcoveIE._extract_brightcove_url(webpage) - if bc_url is None: - raise ExtractorError('Could not extract the Brightcove url') - # The BrightcoveExperience object doesn't contain the video id, we set - # it manually - bc_url += '&%40videoPlayer={0}'.format(chapter_id) - return self.url_result(bc_url, BrightcoveIE.ie_key()) + video_id = self._match_id(url) + video_data = self._download_json('http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, video_id)['web_media'][0] + + formats = [] + for rendition in video_data['RENDITIONS']: + formats.append({ + 'url': rendition['PMD_URL'], + 'width': int_or_none(rendition.get('FRAME_WIDTH')), + 'height': int_or_none(rendition.get('FRAME_HEIGHT')), + 'tbr': int_or_none(rendition.get('ENCODING_RATE')), + }) + + # TODO: parse f4m manifest + request = compat_urllib_request.Request( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', + headers={'Referer': url}) + token_data = self._download_json(request, video_id, fatal=False) + if token_data: + m3u8_formats = self._extract_m3u8_formats('%s?hdnts=%s' % (video_data['HLS_SURL'], token_data['token']), video_id, m3u8_id='hls', fatal=False) + if 
m3u8_formats: + formats.extend(m3u8_formats) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['NAME_ES'], + 'description': video_data.get('SHORT_DESC_ES'), + 'thumbnail': video_data.get('STILL_URL'), + 'duration': int_or_none(video_data.get('LENGTH')), + 'upload_date': unified_strdate(video_data.get('BROADCST_DATE')), + 'formats': formats, + } From 8a06999ba0f9c948f8d2a1ef89c73eedbfb09cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 01:52:33 +0600 Subject: [PATCH 0077/1214] [eitb] Improve, make more robust and extract f4m formats (Closes #7328) --- youtube_dl/extractor/eitb.py | 71 +++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index fc8f15544..0de8d3dc6 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -4,14 +4,15 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ( + float_or_none, int_or_none, - unified_strdate, + parse_iso8601, ) class EitbIE(InfoExtractor): IE_NAME = 'eitb.tv' - _VALID_URL = r'https?://www\.eitb\.tv/(eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?eitb\.tv/(?:eu/bideoa|es/video)/[^/]+/\d+/(?P<id>\d+)' _TEST = { 'url': 'http://www.eitb.tv/es/video/60-minutos-60-minutos-2013-2014/4104995148001/4090227752001/lasa-y-zabala-30-anos/', @@ -20,43 +21,71 @@ class EitbIE(InfoExtractor): 'id': '4090227752001', 'ext': 'mp4', 'title': '60 minutos (Lasa y Zabala, 30 años)', - 'description': '', - 'duration': 3996760, + 'description': 'Programa de reportajes de actualidad.', + 'duration': 3996.76, + 'timestamp': 1381789200, 'upload_date': '20131014', + 'tags': list, }, } def _real_extract(self, url): video_id = self._match_id(url) - video_data = 
self._download_json('http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, video_id)['web_media'][0] + + video = self._download_json( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/Video/MULTIWEBTV/%s/' % video_id, + video_id, 'Downloading video JSON') + + media = video['web_media'][0] formats = [] - for rendition in video_data['RENDITIONS']: + for rendition in media['RENDITIONS']: + video_url = rendition.get('PMD_URL') + if not video_url: + continue + tbr = float_or_none(rendition.get('ENCODING_RATE'), 1000) + format_id = 'http' + if tbr: + format_id += '-%d' % int(tbr) formats.append({ 'url': rendition['PMD_URL'], + 'format_id': format_id, 'width': int_or_none(rendition.get('FRAME_WIDTH')), 'height': int_or_none(rendition.get('FRAME_HEIGHT')), - 'tbr': int_or_none(rendition.get('ENCODING_RATE')), + 'tbr': tbr, }) - # TODO: parse f4m manifest - request = compat_urllib_request.Request( - 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', - headers={'Referer': url}) - token_data = self._download_json(request, video_id, fatal=False) - if token_data: - m3u8_formats = self._extract_m3u8_formats('%s?hdnts=%s' % (video_data['HLS_SURL'], token_data['token']), video_id, m3u8_id='hls', fatal=False) - if m3u8_formats: - formats.extend(m3u8_formats) + hls_url = media.get('HLS_SURL') + if hls_url: + request = compat_urllib_request.Request( + 'http://mam.eitb.eus/mam/REST/ServiceMultiweb/DomainRestrictedSecurity/TokenAuth/', + headers={'Referer': url}) + token_data = self._download_json( + request, video_id, 'Downloading auth token', fatal=False) + if token_data: + token = token_data.get('token') + if token: + m3u8_formats = self._extract_m3u8_formats( + '%s?hdnts=%s' % (hls_url, token), video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + hds_url = media.get('HDS_SURL').replace('euskalsvod', 'euskalvod') + if hds_url: + f4m_formats = self._extract_f4m_formats( + '%s?hdcore=3.7.0' 
% hds_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) self._sort_formats(formats) return { 'id': video_id, - 'title': video_data['NAME_ES'], - 'description': video_data.get('SHORT_DESC_ES'), - 'thumbnail': video_data.get('STILL_URL'), - 'duration': int_or_none(video_data.get('LENGTH')), - 'upload_date': unified_strdate(video_data.get('BROADCST_DATE')), + 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], + 'description': media.get('SHORT_DESC_ES') or video.get('desc_group') or media.get('SHORT_DESC_EU'), + 'thumbnail': media.get('STILL_URL') or media.get('THUMBNAIL_URL'), + 'duration': float_or_none(media.get('LENGTH'), 1000), + 'timestamp': parse_iso8601(media.get('BROADCST_DATE'), ' '), + 'tags': media.get('TAGS'), 'formats': formats, } From 02fb9804513ce1bfe28ec7c285526db7989e5844 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 02:08:19 +0100 Subject: [PATCH 0078/1214] [flickr] extract more info and formats --- youtube_dl/extractor/flickr.py | 110 ++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 50 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 2fe76d661..5ca754105 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,67 +1,77 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import compat_urllib_request +from ..compat import compat_urllib_parse from ..utils import ( - ExtractorError, - find_xpath_attr, + int_or_none, + qualities, ) class FlickrIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*' + _VALID_URL = r'https?://(?:www\.|secure\.)?flickr\.com/photos/[\w\-_@]+/(?P<id>\d+)' _TEST = { 'url': 'http://www.flickr.com/photos/forestwander-nature-pictures/5645318632/in/photostream/', - 'md5': '6fdc01adbc89d72fc9c4f15b4a4ba87b', + 'md5': 
'164fe3fa6c22e18d448d4d5af2330f31', 'info_dict': { 'id': '5645318632', - 'ext': 'mp4', - "description": "Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.", - "uploader_id": "forestwander-nature-pictures", - "title": "Dark Hollow Waterfalls" + 'ext': 'mpg', + 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', + 'uploader_id': 'forestwander-nature-pictures', + 'title': 'Dark Hollow Waterfalls', + 'duration': 19, + 'timestamp': 1303528740, + 'upload_date': '20110423', + 'uploader_id': '10922353@N03', + 'uploader': 'Forest Wander', + 'comment_count': int, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) + _API_BASE_URL = 'https://api.flickr.com/services/rest?' 
+ _API_KEY = '61b16865f916058e63580a912d9143be' - video_id = mobj.group('id') - video_uploader_id = mobj.group('uploader_id') - webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id - req = compat_urllib_request.Request(webpage_url) - req.add_header( - 'User-Agent', - # it needs a more recent version - 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20150101 Firefox/38.0 (Chrome)') - webpage = self._download_webpage(req, video_id) - - secret = self._search_regex(r'secret"\s*:\s*"(\w+)"', webpage, 'secret') - - first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' - first_xml = self._download_xml(first_url, video_id, 'Downloading first data webpage') - - node_id = find_xpath_attr( - first_xml, './/{http://video.yahoo.com/YEP/1.0/}Item', 'id', - 'id').text - - second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' - second_xml = self._download_xml(second_url, video_id, 'Downloading second data webpage') - - self.report_extraction(video_id) - - stream = second_xml.find('.//STREAM') - if stream is None: - raise ExtractorError('Unable to extract video url') - video_url = stream.attrib['APP'] + stream.attrib['FULLPATH'] - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader_id': video_uploader_id, + def _call_api(self, method, video_id, secret=None): + query = { + 'photo_id': video_id, + 'method': 'flickr.%s' % method, + 'api_key': self._API_KEY, + 'format': 'json', + 'nojsoncallback': 1, } + if secret: + query['secret'] = secret + return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id) + + def _real_extract(self, url): + video_id 
= self._match_id(url) + + video_info = self._call_api('photos.getInfo', video_id)['photo'] + if video_info['media'] == 'video': + streams = self._call_api('video.getStreamInfo', video_id, video_info['secret'])['streams'] + + preference = qualities(['iphone_wifi', '700', 'appletv', 'orig']) + + formats = [] + for stream in streams['stream']: + stream_type = str(stream.get('type')) + formats.append({ + 'format_id': stream_type, + 'url': stream['_content'], + 'preference': preference(stream_type), + }) + self._sort_formats(formats) + + owner = video_info.get('owner', {}) + + return { + 'id': video_id, + 'title': video_info['title']['_content'], + 'description': video_info.get('description', {}).get('_content'), + 'formats': formats, + 'timestamp': int_or_none(video_info.get('dateuploaded')), + 'duration': int_or_none(video_info.get('video', {}).get('duration')), + 'uploader_id': owner.get('nsid'), + 'uploader': owner.get('realname'), + 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), + } From 999079b4543b4cd5e71a235865fbfefd349eb064 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 1 Nov 2015 15:49:11 +0600 Subject: [PATCH 0079/1214] [eitb] Improve hds extraction --- youtube_dl/extractor/eitb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eitb.py b/youtube_dl/extractor/eitb.py index 0de8d3dc6..357a2196c 100644 --- a/youtube_dl/extractor/eitb.py +++ b/youtube_dl/extractor/eitb.py @@ -70,10 +70,11 @@ class EitbIE(InfoExtractor): if m3u8_formats: formats.extend(m3u8_formats) - hds_url = media.get('HDS_SURL').replace('euskalsvod', 'euskalvod') + hds_url = media.get('HDS_SURL') if hds_url: f4m_formats = self._extract_f4m_formats( - '%s?hdcore=3.7.0' % hds_url, video_id, f4m_id='hds', fatal=False) + '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), + video_id, f4m_id='hds', fatal=False) if f4m_formats: formats.extend(f4m_formats) From 
146672254e409bf97c82a302095fbfabf2c48928 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 13:23:23 +0100 Subject: [PATCH 0080/1214] [flickr] extract fresh api key and remove duplication in test --- youtube_dl/extractor/flickr.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 5ca754105..0d5d6b0b9 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -17,7 +17,6 @@ class FlickrIE(InfoExtractor): 'id': '5645318632', 'ext': 'mpg', 'description': 'Waterfalls in the Springtime at Dark Hollow Waterfalls. These are located just off of Skyline Drive in Virginia. They are only about 6/10 of a mile hike but it is a pretty steep hill and a good climb back up.', - 'uploader_id': 'forestwander-nature-pictures', 'title': 'Dark Hollow Waterfalls', 'duration': 19, 'timestamp': 1303528740, @@ -29,26 +28,27 @@ class FlickrIE(InfoExtractor): } _API_BASE_URL = 'https://api.flickr.com/services/rest?' 
- _API_KEY = '61b16865f916058e63580a912d9143be' - def _call_api(self, method, video_id, secret=None): + def _call_api(self, method, video_id, api_key, note, secret=None): query = { 'photo_id': video_id, 'method': 'flickr.%s' % method, - 'api_key': self._API_KEY, + 'api_key': api_key, 'format': 'json', 'nojsoncallback': 1, } if secret: query['secret'] = secret - return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id) + return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) def _real_extract(self, url): video_id = self._match_id(url) - video_info = self._call_api('photos.getInfo', video_id)['photo'] + api_key = self._download_json('https://www.flickr.com/hermes_error_beacon.gne', video_id, 'Downloading api key',)['site_key'] + + video_info = self._call_api('photos.getInfo', video_id, api_key, 'Downloading video info')['photo'] if video_info['media'] == 'video': - streams = self._call_api('video.getStreamInfo', video_id, video_info['secret'])['streams'] + streams = self._call_api('video.getStreamInfo', video_id, api_key, 'Downloading streams info', video_info['secret'])['streams'] preference = qualities(['iphone_wifi', '700', 'appletv', 'orig']) From f3003531a5622cc01501325b9f35dcb2424cfb70 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 13:38:11 +0100 Subject: [PATCH 0081/1214] [flickr] handle error message --- youtube_dl/extractor/flickr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0d5d6b0b9..e97754d36 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( + ExtractorError, int_or_none, qualities, ) @@ -39,7 +40,10 @@ class FlickrIE(InfoExtractor): } if secret: 
query['secret'] = secret - return self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + data = self._download_json(self._API_BASE_URL + compat_urllib_parse.urlencode(query), video_id, note) + if data['stat'] != 'ok': + raise ExtractorError(data['message']) + return data def _real_extract(self, url): video_id = self._match_id(url) From ab6ca0480280abb2a35a54e1b380bbae07a48863 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 1 Nov 2015 14:20:10 +0100 Subject: [PATCH 0082/1214] release 2015.11.01 --- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 03561b87d..805af14a0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -93,6 +93,7 @@ - **Clipsyndicate** - **Cloudy** - **Clubic** + - **Clyp** - **cmt.com** - **CNET** - **CNN** @@ -281,7 +282,7 @@ - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru - **Malemotion** - - **MDR** + - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **metacafe** - **Metacritic** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 125e8ccf5..006b973c0 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.24' +__version__ = '2015.11.01' From c90d16cf36d8edf03f4dc923ee9dbeadca910844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 2 Nov 2015 04:26:20 +0600 Subject: [PATCH 0083/1214] [utils:sanitize_path] Disallow trailing whitespace in path segment (Closes #7332) --- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index efd5f4ae1..7b3f79141 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -366,7 +366,7 @@ def sanitize_path(s): if drive_or_unc: norm_path.pop(0) sanitized_path = [ 
- path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part) + path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part) for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) From eb97f46e8bd9cb04f0fe5f8a5c13aeeaabeefef5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 2 Nov 2015 12:46:10 +0100 Subject: [PATCH 0084/1214] [mitele] Fix extraction and update test checksum (fixes #7343) --- youtube_dl/extractor/mitele.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 3142fcde2..c595f2077 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse +from ..compat import ( + compat_urllib_parse, + compat_urlparse, +) from ..utils import ( encode_dict, get_element_by_attribute, @@ -15,7 +18,7 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '757b0b66cbd7e0a97226d7d3156cb3e9', + 'md5': '0ff1a13aebb35d9bc14081ff633dd324', 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', @@ -34,6 +37,7 @@ class MiTeleIE(InfoExtractor): config_url = self._search_regex( r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') + config_url = compat_urlparse.urljoin(url, config_url) config = self._download_json( config_url, display_id, 'Downloading config JSON') From c514b0ec655b23e7804eb18df04daa863d973f32 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sun, 1 Nov 2015 22:12:20 +0100 Subject: [PATCH 0085/1214] [videofy.me] fix info extraction Closes #7339. 
--- youtube_dl/extractor/videofyme.py | 40 ++++++++++++++++--------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py index 94f9e9be9..cd3f50a63 100644 --- a/youtube_dl/extractor/videofyme.py +++ b/youtube_dl/extractor/videofyme.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - find_xpath_attr, int_or_none, + parse_iso8601, ) @@ -18,33 +18,35 @@ class VideofyMeIE(InfoExtractor): 'id': '1100701', 'ext': 'mp4', 'title': 'This is VideofyMe', - 'description': None, + 'description': '', + 'upload_date': '20130326', + 'timestamp': 1364288959, 'uploader': 'VideofyMe', 'uploader_id': 'thisisvideofyme', 'view_count': int, + 'likes': int, + 'comment_count': int, }, - } def _real_extract(self, url): video_id = self._match_id(url) - config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id, - video_id) - video = config.find('video') - sources = video.find('sources') - url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) - for key in ['on', 'av', 'off']] if node is not None) - video_url = url_node.find('url').text - view_count = int_or_none(self._search_regex( - r'([0-9]+)', video.find('views').text, 'view count', fatal=False)) + + config = self._download_json('http://vf-player-info-loader.herokuapp.com/%s.json' % video_id, video_id)['videoinfo'] + + video = config.get('video') + blog = config.get('blog', {}) return { 'id': video_id, - 'title': video.find('title').text, - 'url': video_url, - 'thumbnail': video.find('thumb').text, - 'description': video.find('description').text, - 'uploader': config.find('blog/name').text, - 'uploader_id': video.find('identifier').text, - 'view_count': view_count, + 'title': video['title'], + 'url': video['sources']['source']['url'], + 'thumbnail': video.get('thumb'), + 'description': video.get('description'), + 'timestamp': 
parse_iso8601(video.get('date')), + 'uploader': blog.get('name'), + 'uploader_id': blog.get('identifier'), + 'view_count': int_or_none(self._search_regex(r'([0-9]+)', video.get('views'), 'view count', fatal=False)), + 'likes': int_or_none(video.get('likes')), + 'comment_count': int_or_none(video.get('nrOfComments')), } From 6a750402787dfc1f39a9ad347f2d78ae1c94c52c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Mon, 2 Nov 2015 14:08:38 +0100 Subject: [PATCH 0086/1214] [utils] unified_strdate: Return None if the date format can't be recognized (fixes #7340) This issue was introduced with ae12bc3ebb4cb377c2b4337ec255e652b36f5143, it returned 'None'. --- test/test_utils.py | 1 + youtube_dl/utils.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 3298315d2..01829f71e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -236,6 +236,7 @@ class TestUtil(unittest.TestCase): unified_strdate('2/2/2015 6:47:40 PM', day_first=False), '20150202') self.assertEqual(unified_strdate('25-09-2014'), '20140925') + self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) def test_find_xpath_attr(self): testxml = '''<root> diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b3f79141..d39f313a4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -911,7 +911,8 @@ def unified_strdate(date_str, day_first=True): timetuple = email.utils.parsedate_tz(date_str) if timetuple: upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - return compat_str(upload_date) + if upload_date is not None: + return compat_str(upload_date) def determine_ext(url, default_ext='unknown_video'): From a230068ff7427c19e29331fc0f2bb17d50003bca Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Mon, 2 Nov 2015 16:18:54 +0100 Subject: [PATCH 0087/1214] release 2015.11.02 --- youtube_dl/version.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 006b973c0..6ef482b78 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.11.01' +__version__ = '2015.11.02' From dde9fe9788f23f168e0bddaf8ab0470f469165fa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 Nov 2015 18:36:54 +0800 Subject: [PATCH 0088/1214] [democracynow] Simplify --- youtube_dl/extractor/democracynow.py | 86 ++++++++++++++-------------- 1 file changed, 44 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 05cfc7502..824b8e2c5 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -2,11 +2,18 @@ from __future__ import unicode_literals import re +import os.path + from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + url_basename, + remove_start, +) class DemocracynowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?democracynow.org/?(?P<id>[^\?]*)' + _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P<id>[^\?]*)' IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', @@ -14,9 +21,7 @@ class DemocracynowIE(InfoExtractor): 'id': '2015-0703-001', 'ext': 'mp4', 'title': 'July 03, 2015 - Democracy Now!', - 'description': 'A daily independent global news hour with Amy Goodman & Juan Gonz\xe1lez "What to the Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', - 'uploader': 'Democracy Now', - 'upload_date': None, + 'description': 'A daily independent global news hour with Amy Goodman & Juan González "What to the 
Slave is 4th of July?": James Earl Jones Reads Frederick Douglass\u2019 Historic Speech : "This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag : "We Shall Overcome": Remembering Folk Icon, Activist Pete Seeger in His Own Words & Songs', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', @@ -25,60 +30,57 @@ class DemocracynowIE(InfoExtractor): 'ext': 'mp4', 'title': '"This Flag Comes Down Today": Bree Newsome Scales SC Capitol Flagpole, Takes Down Confederate Flag', 'description': 'md5:4d2bc4f0d29f5553c2210a4bc7761a21', - 'uploader': 'Democracy Now', - 'upload_date': None, }, }] def _real_extract(self, url): display_id = self._match_id(url) - base_host = re.search(r'^(.+?://[^/]+)', url).group(1) - if display_id == '': - display_id = 'home' webpage = self._download_webpage(url, display_id) description = self._og_search_description(webpage) - jstr = self._search_regex(r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json') - js = self._parse_json(jstr, display_id) + js = self._parse_json(self._search_regex( + r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), + display_id) video_id = None formats = [] + + default_lang = 'en' + subtitles = {} - for key in ('caption_file', '.......'): - # ....... 
= pending vtt support that doesn't clobber srt 'chapter_file': - url = js.get(key, '') - if url == '' or url is None: - continue - if not re.match(r'^https?://', url): - url = base_host + url - ext = re.search(r'\.([^\.]+)$', url).group(1) - subtitles['eng'] = [{ - 'ext': ext, - 'url': url, - }] - for key in ('file', 'audio', 'video'): - url = js.get(key, '') - if url == '' or url is None: - continue - if not re.match(r'^https?://', url): - url = base_host + url - purl = re.search(r'/(?P<dir>[^/]+)/(?:dn)?(?P<fn>[^/]+?)\.(?P<ext>[^\.\?]+)(?P<hasparams>\?|$)', url) - if video_id is None: - video_id = purl.group('fn') - if js.get('start') is not None: - url += '&' if purl.group('hasparams') == '?' else '?' - url = url + 'start=' + str(js.get('start')) - formats.append({ - 'format_id': purl.group('dir'), - 'ext': purl.group('ext'), - 'url': url, + + def add_subtitle_item(lang, info_dict): + if lang not in subtitles: + subtitles[lang] = [] + subtitles[lang].append(info_dict) + + # chapter_file are not subtitles + if 'caption_file' in js: + add_subtitle_item(default_lang, { + 'url': compat_urlparse.urljoin(url, js['caption_file']), }) + + for subtitle_item in js.get('captions', []): + lang = subtitle_item.get('language', '').lower() or default_lang + add_subtitle_item(lang, { + 'url': compat_urlparse.urljoin(url, subtitle_item['url']), + }) + + for key in ('file', 'audio', 'video'): + media_url = js.get(key, '') + if not media_url: + continue + media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) + video_id = video_id or remove_start(os.path.splitext(url_basename(media_url))[0], 'dn') + formats.append({ + 'url': media_url, + }) + self._sort_formats(formats) - ret = { + + return { 'id': video_id, 'title': js.get('title'), 'description': description, - 'uploader': 'Democracy Now', 'subtitles': subtitles, 'formats': formats, } - return ret From fc68d52bb95dc81ed3d05a5c5397cd3f35ee093a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> 
Date: Tue, 3 Nov 2015 21:24:10 +0800 Subject: [PATCH 0089/1214] [democracynow] Add MD5 sums --- youtube_dl/extractor/democracynow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 824b8e2c5..70c364e8b 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -17,6 +17,7 @@ class DemocracynowIE(InfoExtractor): IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', + 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', @@ -25,6 +26,7 @@ class DemocracynowIE(InfoExtractor): }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', + 'md5': 'fbb8fe3d7a56a5e12431ce2f9b2fab0d', 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', From 852fad922ffa931b3c90b0b9fdb2fa1c7f965ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 20:53:17 +0600 Subject: [PATCH 0090/1214] [vimeo] Fix non-ASCII video passwords (Closes #7352) --- youtube_dl/extractor/vimeo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 2437ae1eb..cc0d337e8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -13,6 +13,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, InAdvancePagedList, int_or_none, @@ -208,10 +209,10 @@ class VimeoIE(VimeoBaseInfoExtractor): if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = urlencode_postdata(encode_dict({ 'password': password, 'token': token, - }) + })) if url.startswith('http://'): # vimeo only supports https now, but the user can give an http url url = 
url.replace('http://', 'https://') From 0a0110fc6bbd21850e25541fd0bd4b602ce194e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:01:09 +0600 Subject: [PATCH 0091/1214] [vimeo] Fix non-ASCII video passwords (2) --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index cc0d337e8..fa07bd59c 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -228,7 +228,7 @@ class VimeoIE(VimeoBaseInfoExtractor): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') - data = compat_urllib_parse.urlencode({'password': password}) + data = urlencode_postdata(encode_dict({'password': password})) pass_url = url + '/check-password' password_request = compat_urllib_request.Request(pass_url, data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') From 3fa3ff1bc36aaf82ac0a5e880304cb7aae217b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:06:36 +0600 Subject: [PATCH 0092/1214] [vimeo] Fix non-ASCII login --- youtube_dl/extractor/vimeo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fa07bd59c..46fb36f21 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -41,13 +41,13 @@ class VimeoBaseInfoExtractor(InfoExtractor): self.report_login() webpage = self._download_webpage(self._LOGIN_URL, None, False) token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = urlencode_postdata(encode_dict({ 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - }) + })) login_request = 
compat_urllib_request.Request(self._LOGIN_URL, data) login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') login_request.add_header('Cookie', 'vuid=%s' % vuid) From bfdf891fd36811909aa5d83dc0614eacbb634fcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 3 Nov 2015 21:09:24 +0600 Subject: [PATCH 0093/1214] [vimeo] Fix non-ASCII album passwords --- youtube_dl/extractor/vimeo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 46fb36f21..b608740b8 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -489,7 +489,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): token, vuid = self._extract_xsrft_and_vuid(webpage) fields['token'] = token fields['password'] = password - post = urlencode_postdata(fields) + post = urlencode_postdata(encode_dict(fields)) password_path = self._search_regex( r'action="([^"]+)"', login_form, 'password URL') password_url = compat_urlparse.urljoin(page_url, password_path) From fd8102820c4d14fdb1ff7e090553211717012f67 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:09:55 +0800 Subject: [PATCH 0094/1214] [democracynow] Rename js to json_data --- youtube_dl/extractor/democracynow.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 70c364e8b..72fc75d80 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -40,7 +40,7 @@ class DemocracynowIE(InfoExtractor): webpage = self._download_webpage(url, display_id) description = self._og_search_description(webpage) - js = self._parse_json(self._search_regex( + json_data = self._parse_json(self._search_regex( r'<script[^>]+type="text/json"[^>]*>\s*({[^>]+})', webpage, 'json'), display_id) video_id = None @@ -56,19 +56,19 @@ class 
DemocracynowIE(InfoExtractor): subtitles[lang].append(info_dict) # chapter_file are not subtitles - if 'caption_file' in js: + if 'caption_file' in json_data: add_subtitle_item(default_lang, { - 'url': compat_urlparse.urljoin(url, js['caption_file']), + 'url': compat_urlparse.urljoin(url, json_data['caption_file']), }) - for subtitle_item in js.get('captions', []): + for subtitle_item in json_data.get('captions', []): lang = subtitle_item.get('language', '').lower() or default_lang add_subtitle_item(lang, { 'url': compat_urlparse.urljoin(url, subtitle_item['url']), }) for key in ('file', 'audio', 'video'): - media_url = js.get(key, '') + media_url = json_data.get(key, '') if not media_url: continue media_url = re.sub(r'\?.*', '', compat_urlparse.urljoin(url, media_url)) @@ -81,7 +81,7 @@ class DemocracynowIE(InfoExtractor): return { 'id': video_id, - 'title': js.get('title'), + 'title': json_data.get('title'), 'description': description, 'subtitles': subtitles, 'formats': formats, From 0aeb9a106e1aad37967e0ee666ed816a7d5eb7c2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:13:00 +0800 Subject: [PATCH 0095/1214] [democracynow] Prevent required fields to be None --- youtube_dl/extractor/democracynow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index 72fc75d80..6cd395e11 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -80,8 +80,8 @@ class DemocracynowIE(InfoExtractor): self._sort_formats(formats) return { - 'id': video_id, - 'title': json_data.get('title'), + 'id': video_id or display_id, + 'title': json_data['title'], 'description': description, 'subtitles': subtitles, 'formats': formats, From 66d041f250f7d3e0c4d501e3b98721f2c6588c35 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 4 Nov 2015 00:53:30 +0800 Subject: [PATCH 0096/1214] 
[test/subtitles] Add test for DemocracynowIE --- test/test_subtitles.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 0343967d9..75f0ea75f 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -28,6 +28,7 @@ from youtube_dl.extractor import ( ThePlatformFeedIE, RTVEALaCartaIE, FunnyOrDieIE, + DemocracynowIE, ) @@ -346,5 +347,25 @@ class TestFunnyOrDieSubtitles(BaseTestSubtitles): self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4') +class TestDemocracynowSubtitles(BaseTestSubtitles): + url = 'http://www.democracynow.org/shows/2015/7/3' + IE = DemocracynowIE + + def test_allsubtitles(self): + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + + def test_subtitles_in_page(self): + self.url = 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['en'])) + self.assertEqual(md5(subtitles['en']), 'acaca989e24a9e45a6719c9b3d60815c') + + if __name__ == '__main__': unittest.main() From ad607563a2fbb5275ea39f7a052c09ffa232e271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:46:26 +0600 Subject: [PATCH 0097/1214] [globo] Separate article extractor --- youtube_dl/extractor/__init__.py | 5 +- youtube_dl/extractor/globo.py | 140 +++++++++++++++++-------------- 2 files changed, 79 insertions(+), 66 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 10286aa88..94150a28f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -212,7 +212,10 @@ from .gfycat 
import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE -from .globo import GloboIE +from .globo import ( + GloboIE, + GloboArticleIE, +) from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 33d6432a6..828e40d76 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -18,75 +18,52 @@ from ..utils import ( class GloboIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?P<id>.+)' + _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' - _VIDEOID_REGEXES = [ - r'\bdata-video-id="(\d+)"', - r'\bdata-player-videosids="(\d+)"', - r'<div[^>]+\bid="(\d+)"', - ] - _RESIGN_EXPIRATION = 86400 - _TESTS = [ - { - 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', - 'md5': '03ebf41cb7ade43581608b7d9b71fab0', - 'info_dict': { - 'id': '3654973', - 'ext': 'mp4', - 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', - 'duration': 251.585, - 'uploader': 'SporTV', - 'uploader_id': 698, - 'like_count': int, - } - }, - { - 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', - 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', - 'info_dict': { - 'id': '3607726', - 'ext': 'mp4', - 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', - 'duration': 103.204, - 'uploader': 'Globo.com', - 'uploader_id': 265, - 'like_count': int, - } - }, - { - 'url': 
'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', - 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', - 'info_dict': { - 'id': '3652183', - 'ext': 'mp4', - 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', - 'duration': 110.711, - 'uploader': 'Rede Globo', - 'uploader_id': 196, - 'like_count': int, - } - }, - { - 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'md5': 'c1defca721ce25b2354e927d3e4b3dec', - 'info_dict': { - 'id': '3928201', - 'ext': 'mp4', - 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', - 'duration': 1472.906, - 'uploader': 'Canal Brasil', - 'uploader_id': 705, - 'like_count': int, - } - }, - ] + _TESTS = [{ + 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', + 'md5': '03ebf41cb7ade43581608b7d9b71fab0', + 'info_dict': { + 'id': '3654973', + 'ext': 'mp4', + 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', + 'duration': 251.585, + 'uploader': 'SporTV', + 'uploader_id': 698, + 'like_count': int, + } + }, { + 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', + 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', + 'info_dict': { + 'id': '3607726', + 'ext': 'mp4', + 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', + 'duration': 103.204, + 'uploader': 'Globo.com', + 'uploader_id': 265, + 'like_count': int, + } + }, { + 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', + 'md5': 'c1defca721ce25b2354e927d3e4b3dec', + 'info_dict': { + 'id': '3928201', + 'ext': 'mp4', + 
'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', + 'duration': 1472.906, + 'uploader': 'Canal Brasil', + 'uploader_id': 705, + 'like_count': int, + } + }] - class MD5(): + class MD5: HEX_FORMAT_LOWERCASE = 0 HEX_FORMAT_UPPERCASE = 1 BASE64_PAD_CHARACTER_DEFAULT_COMPLIANCE = '' @@ -353,9 +330,6 @@ class GloboIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') - video = self._download_json( self._API_URL_TEMPLATE % video_id, video_id)['videos'][0] @@ -417,3 +391,39 @@ class GloboIE(InfoExtractor): 'like_count': like_count, 'formats': formats } + + +class GloboArticleIE(InfoExtractor): + _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)\.html' + + _VIDEOID_REGEXES = [ + r'\bdata-video-id=["\'](\d{7,})', + r'\bdata-player-videosids=["\'](\d{7,})', + r'\bvideosIDs\s*:\s*["\'](\d{7,})', + r'\bdata-id=["\'](\d{7,})', + r'<div[^>]+\bid=["\'](\d{7,})', + ] + + _TEST = { + 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', + 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', + 'info_dict': { + 'id': '3652183', + 'ext': 'mp4', + 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', + 'duration': 110.711, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + 'like_count': int, + } + } + + @classmethod + def suitable(cls, url): + return False if GloboIE.suitable(url) else super(GloboArticleIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(self._VIDEOID_REGEXES, webpage, 'video id') + return self.url_result('globo:%s' % video_id, 'Globo') From e3778cce0e912f803ea10cb806406f7fcafe840f Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:51:19 +0600 Subject: [PATCH 0098/1214] [globo] Improve m3u8 extraction --- youtube_dl/extractor/globo.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 828e40d76..c28899011 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -367,7 +367,10 @@ class GloboIE(InfoExtractor): resource_url = resource['url'] signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(signed_url, resource_id, 'mp4')) + m3u8_formats = self._extract_m3u8_formats( + signed_url, resource_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) else: formats.append({ 'url': signed_url, From c3459d24f16056e8ae8f982db2a10871ef18e80a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:53:21 +0600 Subject: [PATCH 0099/1214] [globo] Skip unsupported smooth streaming --- youtube_dl/extractor/globo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index c28899011..ec451bb07 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -338,7 +338,7 @@ class GloboIE(InfoExtractor): formats = [] for resource in video['resources']: resource_id = resource.get('_id') - if not resource_id: + if not resource_id or resource_id.endswith('manifest'): continue security = self._download_json( From 5d235ca7f66af1f82c1a4d753d238f48fc3afa40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 16:55:39 +0600 Subject: [PATCH 0100/1214] [globo] Prefer native m3u8 --- youtube_dl/extractor/globo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index ec451bb07..2a805cbb2 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -368,7 +368,8 @@ class GloboIE(InfoExtractor): signed_url = '%s?h=%s&k=%s' % (resource_url, signed_hash, 'flash') if resource_id.endswith('m3u8') or resource_url.endswith('.m3u8'): m3u8_formats = self._extract_m3u8_formats( - signed_url, resource_id, 'mp4', m3u8_id='hls', fatal=False) + signed_url, resource_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) else: From b4ef6a0038657c1adde565df947e42ad1e1b4195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:01:27 +0600 Subject: [PATCH 0101/1214] [globo] Remove non available test --- youtube_dl/extractor/globo.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 2a805cbb2..8aada01dc 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -26,18 +26,6 @@ class GloboIE(InfoExtractor): _RESIGN_EXPIRATION = 86400 _TESTS = [{ - 'url': 'http://globotv.globo.com/sportv/futebol-nacional/v/os-gols-de-atletico-mg-3-x-2-santos-pela-24a-rodada-do-brasileirao/3654973/', - 'md5': '03ebf41cb7ade43581608b7d9b71fab0', - 'info_dict': { - 'id': '3654973', - 'ext': 'mp4', - 'title': 'Os gols de Atlético-MG 3 x 2 Santos pela 24ª rodada do Brasileirão', - 'duration': 251.585, - 'uploader': 'SporTV', - 'uploader_id': 698, - 'like_count': int, - } - }, { 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', 'md5': 'b3ccc801f75cd04a914d51dadb83a78d', 'info_dict': { From aebb42d32b608eaffb424e5e7c22f1b68a491e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:01:55 +0600 Subject: [PATCH 0102/1214] 
[globo] Remove like count It's no longer provided --- youtube_dl/extractor/globo.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 8aada01dc..dc89e46ac 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -35,7 +35,6 @@ class GloboIE(InfoExtractor): 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': 265, - 'like_count': int, } }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', @@ -47,7 +46,6 @@ class GloboIE(InfoExtractor): 'duration': 1472.906, 'uploader': 'Canal Brasil', 'uploader_id': 705, - 'like_count': int, } }] @@ -370,7 +368,6 @@ class GloboIE(InfoExtractor): self._sort_formats(formats) duration = float_or_none(video.get('duration'), 1000) - like_count = int_or_none(video.get('likes')) uploader = video.get('channel') uploader_id = video.get('channel_id') @@ -380,7 +377,6 @@ class GloboIE(InfoExtractor): 'duration': duration, 'uploader': uploader, 'uploader_id': uploader_id, - 'like_count': like_count, 'formats': formats } @@ -406,7 +402,6 @@ class GloboArticleIE(InfoExtractor): 'duration': 110.711, 'uploader': 'Rede Globo', 'uploader_id': 196, - 'like_count': int, } } From a4a6b7b80f18680ee0a8bba50a24c58edd3f2a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:03:45 +0600 Subject: [PATCH 0103/1214] [globo] Improve http formats --- youtube_dl/extractor/globo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dc89e46ac..64622aa5c 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -361,8 +361,8 @@ class GloboIE(InfoExtractor): else: formats.append({ 'url': signed_url, - 'format_id': resource_id, - 'height': resource.get('height'), + 'format_id': 'http-%s' % 
resource_id, + 'height': int_or_none(resource.get('height')), }) self._sort_formats(formats) From 264cd00fff4f6d7063d43e1d476de46901bd9c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:10:45 +0600 Subject: [PATCH 0104/1214] [globo] Update tests --- youtube_dl/extractor/globo.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 64622aa5c..0337256ed 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -35,18 +35,30 @@ class GloboIE(InfoExtractor): 'duration': 103.204, 'uploader': 'Globo.com', 'uploader_id': 265, - } + }, + }, { + 'url': 'http://globoplay.globo.com/v/4581987/', + 'md5': 'f36a1ecd6a50da1577eee6dd17f67eff', + 'info_dict': { + 'id': '4581987', + 'ext': 'mp4', + 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', + 'duration': 137.973, + 'uploader': 'Rede Globo', + 'uploader_id': 196, + }, + }, { + 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', + 'only_matching': True, + }, { + 'url': 'http://globosatplay.globo.com/globonews/v/4472924/', + 'only_matching': True, + }, { + 'url': 'http://globotv.globo.com/t/programa/v/clipe-sexo-e-as-negas-adeus/3836166/', + 'only_matching': True, }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', - 'md5': 'c1defca721ce25b2354e927d3e4b3dec', - 'info_dict': { - 'id': '3928201', - 'ext': 'mp4', - 'title': 'Ator e diretor argentino, Ricado Darín fala sobre utopias e suas perdas', - 'duration': 1472.906, - 'uploader': 'Canal Brasil', - 'uploader_id': 705, - } + 'only_matching': True, }] class MD5: From e7d34c03f200e178e9d6dfe4ae3f6856e382a4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 
Nov 2015 17:12:42 +0600 Subject: [PATCH 0105/1214] [globo] Force uploader id to be string --- youtube_dl/extractor/globo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 0337256ed..6c0fc54de 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -14,6 +14,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + str_or_none, ) @@ -34,7 +35,7 @@ class GloboIE(InfoExtractor): 'title': 'Mercedes-Benz GLA passa por teste de colisão na Europa', 'duration': 103.204, 'uploader': 'Globo.com', - 'uploader_id': 265, + 'uploader_id': '265', }, }, { 'url': 'http://globoplay.globo.com/v/4581987/', @@ -45,7 +46,7 @@ class GloboIE(InfoExtractor): 'title': 'Acidentes de trânsito estão entre as maiores causas de queda de energia em SP', 'duration': 137.973, 'uploader': 'Rede Globo', - 'uploader_id': 196, + 'uploader_id': '196', }, }, { 'url': 'http://canalbrasil.globo.com/programas/sangue-latino/videos/3928201.html', @@ -381,7 +382,7 @@ class GloboIE(InfoExtractor): duration = float_or_none(video.get('duration'), 1000) uploader = video.get('channel') - uploader_id = video.get('channel_id') + uploader_id = str_or_none(video.get('channel_id')) return { 'id': video_id, From c13722480bebfb1fc33169516790df2e99b3e499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:13:35 +0600 Subject: [PATCH 0106/1214] [globo:article] Fix test --- youtube_dl/extractor/globo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 6c0fc54de..5883be704 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -414,7 +414,7 @@ class GloboArticleIE(InfoExtractor): 'title': 'Receita Federal explica como vai fiscalizar bagagens de quem retorna ao Brasil de avião', 'duration': 110.711, 'uploader': 'Rede Globo', - 
'uploader_id': 196, + 'uploader_id': '196', } } From 5d501a0901c36695c9d6ca3958ac4ccfdea90954 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:42:11 +0600 Subject: [PATCH 0107/1214] [globo] Add more tests --- youtube_dl/extractor/globo.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 5883be704..c65ef6bcf 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -60,6 +60,9 @@ class GloboIE(InfoExtractor): }, { 'url': 'http://globotv.globo.com/canal-brasil/sangue-latino/t/todos-os-videos/v/ator-e-diretor-argentino-ricado-darin-fala-sobre-utopias-e-suas-perdas/3928201/', 'only_matching': True, + }, { + 'url': 'http://canaloff.globo.com/programas/desejar-profundo/videos/4518560.html', + 'only_matching': True, }] class MD5: @@ -405,7 +408,7 @@ class GloboArticleIE(InfoExtractor): r'<div[^>]+\bid=["\'](\d{7,})', ] - _TEST = { + _TESTS = [{ 'url': 'http://g1.globo.com/jornal-nacional/noticia/2014/09/novidade-na-fiscalizacao-de-bagagem-pela-receita-provoca-discussoes.html', 'md5': '307fdeae4390ccfe6ba1aa198cf6e72b', 'info_dict': { @@ -416,7 +419,13 @@ class GloboArticleIE(InfoExtractor): 'uploader': 'Rede Globo', 'uploader_id': '196', } - } + }, { + 'url': 'http://gq.globo.com/Prazeres/Poder/noticia/2015/10/all-o-desafio-assista-ao-segundo-capitulo-da-serie.html', + 'only_matching': True, + }, { + 'url': 'http://gshow.globo.com/programas/tv-xuxa/O-Programa/noticia/2014/01/xuxa-e-junno-namoram-muuuito-em-luau-de-zeze-di-camargo-e-luciano.html', + 'only_matching': True, + }] @classmethod def suitable(cls, url): From 17d1900581ffd12866e56640080ce340d99149a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 17:57:46 +0600 Subject: [PATCH 0108/1214] [vk] Fix view count extraction (Closes #7353) --- youtube_dl/extractor/vk.py | 10 +++++++--- 1 
file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 765e9e6fd..01960b827 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -281,9 +281,13 @@ class VKIE(InfoExtractor): mobj.group(1) + ' ' + mobj.group(2) upload_date = unified_strdate(mobj.group(1) + ' ' + mobj.group(2)) - view_count = str_to_int(self._search_regex( - r'"mv_views_count_number"[^>]*>([\d,.]+) views<', - info_page, 'view count', fatal=False)) + view_count = None + views = self._html_search_regex( + r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', + info_page, 'view count', fatal=False) + if views: + view_count = str_to_int(self._search_regex( + r'([\d,.]+)', views, 'view count', fatal=False)) formats = [{ 'format_id': k, From cb5a470635ea2ad91f18d33e391979aabb0755fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Wed, 4 Nov 2015 16:18:51 +0100 Subject: [PATCH 0109/1214] [vimeo] Remove unused import --- youtube_dl/extractor/vimeo.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index b608740b8..ca716c8f5 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,7 +8,6 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_request, compat_urlparse, ) From 44b2264feae331eeb34e83eed1387def3d61a437 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 22:12:24 +0600 Subject: [PATCH 0110/1214] [youtube] Prefer video_info with token available --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d7eda7aa7..5eeb3c663 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1107,6 +1107,8 
@@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info: video_info = get_video_info if 'token' in get_video_info: + if 'token' not in video_info: + video_info = get_video_info break if 'token' not in video_info: if 'reason' in video_info: From 89ea063eebae84792a7ccb968533ff8bf6a41d56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 4 Nov 2015 22:49:23 +0600 Subject: [PATCH 0111/1214] [youtube] Clarify rationale for preferring a video info with token (#7362) --- youtube_dl/extractor/youtube.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5eeb3c663..e2a43299f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1107,6 +1107,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info: video_info = get_video_info if 'token' in get_video_info: + # Different get_video_info requests may report different results, e.g. + # some may report video unavailability, but some may serve it without + # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, + # the original webpage as well as el=info and el=embedded get_video_info + # requests report video unavailability due to geo restriction while + # el=detailpage succeeds and returns valid data). This is probably + # due to YouTube measures against IP ranges of hosting providers. + # Working around by preferring the first succeeded video_info containing + # the token if no such video_info yet was found. 
if 'token' not in video_info: video_info = get_video_info break From f93ded98522cc1272a8d2210738937132292afc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 01:54:49 +0600 Subject: [PATCH 0112/1214] [prosiebensat1] Add support for .ch domains (Closes #7365) --- youtube_dl/extractor/prosiebensat1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index effcf1db3..baa54a3af 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -20,7 +20,7 @@ from ..utils import ( class ProSiebenSat1IE(InfoExtractor): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' - _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)' _TESTS = [ { From b15c44cd36831f175e9dd4081b82beb8075790b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:51:30 +0600 Subject: [PATCH 0113/1214] [periscope] Add support for videos with broadcast_id (Closes #7359) --- youtube_dl/extractor/periscope.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 8ad936758..0f9d7576f 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -27,9 +27,10 @@ class PeriscopeIE(InfoExtractor): 'skip': 'Expires in 24 hours', } - def _call_api(self, method, token): + def _call_api(self, method, value): + attribute = 'token' if len(value) > 13 else 'broadcast_id' return self._download_json( - 'https://api.periscope.tv/api/v2/%s?token=%s' % (method, token), token) + 'https://api.periscope.tv/api/v2/%s?%s=%s' 
% (method, attribute, value), value) def _real_extract(self, url): token = self._match_id(url) From 2549e113b8750a493917436d4fd15ed74a1a4983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:55:53 +0600 Subject: [PATCH 0114/1214] [periscope] Add test for broadcast_id based URL --- youtube_dl/extractor/periscope.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 0f9d7576f..7621d9e99 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,7 +12,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', 'info_dict': { @@ -25,7 +25,10 @@ class PeriscopeIE(InfoExtractor): 'uploader_id': '1465763', }, 'skip': 'Expires in 24 hours', - } + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, + }] def _call_api(self, method, value): attribute = 'token' if len(value) > 13 else 'broadcast_id' From 53472df85793cc89deb779c2ffc3ae1f47292fd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 02:56:44 +0600 Subject: [PATCH 0115/1214] [periscope] Add note on where to find alive example URLs --- youtube_dl/extractor/periscope.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 7621d9e99..887c8020d 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -12,6 +12,7 @@ from ..utils import parse_iso8601 class PeriscopeIE(InfoExtractor): IE_DESC = 'Periscope' _VALID_URL = r'https?://(?:www\.)?periscope\.tv/w/(?P<id>[^/?#]+)' + 
# Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', From b3613d36da14ab527166326707c0f911d192144d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 5 Nov 2015 04:37:51 +0600 Subject: [PATCH 0116/1214] [YoutubeDL] Sanitize path after output template substitution (Closes #7367) --- youtube_dl/YoutubeDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 12977bf80..1783ce01b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -572,7 +572,7 @@ class YoutubeDL(object): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL)) + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) tmpl = compat_expanduser(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 @@ -580,7 +580,7 @@ class YoutubeDL(object): # to workaround encoding issues with subprocess on python2 @ Windows if sys.version_info < (3, 0) and sys.platform == 'win32': filename = encodeFilename(filename, True).decode(preferredencoding()) - return filename + return sanitize_path(filename) except ValueError as err: self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None From 967c9076a31ca2a2b43fb71082ad1a8db88116bd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 5 Nov 2015 18:01:13 +0100 Subject: [PATCH 0117/1214] raise ExtractorError if the page doesn't contain a video --- youtube_dl/extractor/flickr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index e97754d36..92d2ac553 100644 --- a/youtube_dl/extractor/flickr.py +++ 
b/youtube_dl/extractor/flickr.py @@ -79,3 +79,5 @@ class FlickrIE(InfoExtractor): 'uploader': owner.get('realname'), 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), } + else: + raise ExtractorError('not a video', expected=True) From 6953d8e95a78e83f087693b7353baab96b09fbdd Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 02:09:55 +0100 Subject: [PATCH 0118/1214] [miomio] fix info extraction (fixes #7366) --- youtube_dl/extractor/miomio.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index a784fc5fb..3f812e005 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import random from .common import InfoExtractor +from ..compat import compat_urllib_request from ..utils import ( xpath_text, int_or_none, @@ -60,10 +61,12 @@ class MioMioIE(InfoExtractor): 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/xml.php?id=%s&r=%s' % (id, random.randint(100, 999)), video_id) - # the following xml contains the actual configuration information on the video file(s) - vid_config = self._download_xml( + vid_config_request = compat_urllib_request.Request( 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - video_id) + headers={'Referer': 'http://www.miomio.tv/mioplayer/mioplayer-v3.0.swf'}) + + # the following xml contains the actual configuration information on the video file(s) + vid_config = self._download_xml(vid_config_request, video_id) http_headers = { 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, From e68dd1921ad7528d225a8571066f99b9934b6a06 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 06:33:05 +0100 Subject: [PATCH 0119/1214] [miomio] use the formats urls headers for downloading xml --- youtube_dl/extractor/miomio.py | 8 +++----- 1 file changed, 3 
insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 3f812e005..6f40bf1b9 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -52,6 +52,8 @@ class MioMioIE(InfoExtractor): mioplayer_path = self._search_regex( r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path,} + xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', webpage, 'xml config') @@ -63,15 +65,11 @@ class MioMioIE(InfoExtractor): vid_config_request = compat_urllib_request.Request( 'http://www.miomio.tv/mioplayer/mioplayerconfigfiles/sina.php?{0}'.format(xml_config), - headers={'Referer': 'http://www.miomio.tv/mioplayer/mioplayer-v3.0.swf'}) + headers=http_headers) # the following xml contains the actual configuration information on the video file(s) vid_config = self._download_xml(vid_config_request, video_id) - http_headers = { - 'Referer': 'http://www.miomio.tv%s' % mioplayer_path, - } - if not int_or_none(xpath_text(vid_config, 'timelength')): raise ExtractorError('Unable to load videos!', expected=True) From a641b2459263228fb1dd86dfe05d6047cedbf345 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 07:23:03 +0100 Subject: [PATCH 0120/1214] [cnet] skip hls_phone if hls_tablet is present --- youtube_dl/extractor/cnet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 2fac0d79d..3ecf0efd4 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -57,7 +57,9 @@ class CNETIE(InfoExtractor): subtitles = {} description = vdata.get('description') - for vid in vdata['files'].values(): + for (fkey, vid) in vdata['files'].items(): + if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: + continue result = tp.extract(('http://link.theplatform.com/s/%s/%s' % (mpx_account, vid))) 
formats.extend(result['formats']) subtitles = self._merge_subtitles(subtitles, result['subtitles']) From 5003e4283b35acb82ea9793d91bc3cd1ee679f86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:06:44 +0600 Subject: [PATCH 0121/1214] [ndr] Relax _VALID_URL (Closes #7383) --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index ba06d8a98..a2b51ccb3 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -23,7 +23,7 @@ class NDRBaseIE(InfoExtractor): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', From 01003d072c20c2ed095930d87c5ce3d5610e66b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:07:52 +0600 Subject: [PATCH 0122/1214] [ndr] Add test for #7383 --- youtube_dl/extractor/ndr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index a2b51ccb3..0be866681 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -78,6 +78,9 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', + 'only_matching': True, }] def _extract_embed(self, webpage, display_id): From 1e2eb4b40d46f39d15c067657ecad16fa3b2121d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:08:21 +0600 Subject: [PATCH 0123/1214] [njoy] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 
1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0be866681..7043c7e0f 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -105,7 +105,7 @@ class NDRIE(NDRBaseIE): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P<display_id>[^/?#]+),)?(?P<id>[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', From 81413c01651eddcc5180af379f2ce3689a376051 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:08:52 +0600 Subject: [PATCH 0124/1214] [ndr:embed] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 7043c7e0f..477ce4e6b 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -238,7 +238,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', From 92366d189ef280b8ba0057930c54aa14b0ecdd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:09:17 +0600 Subject: [PATCH 0125/1214] [njoy:embed] Relax _VALID_URL --- youtube_dl/extractor/ndr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ndr.py 
b/youtube_dl/extractor/ndr.py index 477ce4e6b..16213eed9 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -332,7 +332,7 @@ class NDREmbedIE(NDREmbedBaseIE): class NJoyEmbedIE(NDREmbedBaseIE): IE_NAME = 'njoy:embed' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)+(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' _TESTS = [{ # httpVideo 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', From deb85c32bbd32e8d280e1919432a11c0bdaa26bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 21:56:31 +0600 Subject: [PATCH 0126/1214] [postprocessor/ffmpeg] Use ffmpeg as prefix since it's used all over the places (Closes #7371) --- youtube_dl/postprocessor/ffmpeg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 4f320e124..5ed723bc6 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -272,7 +272,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): return [], information try: - self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path) + self._downloader.to_screen('[ffmpeg] Destination: ' + new_path) self.run_ffmpeg(path, new_path, acodec, more_opts) except AudioConversionError as e: raise PostProcessingError( From 179ffab69c3359ab7d0a7b0a2b63c94d8c70af67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:06:13 +0600 Subject: [PATCH 0127/1214] [lynda:course] Force log out (Closes #7361) --- youtube_dl/extractor/lynda.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 5c973e75c..67f2025de 100644 --- 
a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -82,6 +82,11 @@ class LyndaBaseIE(InfoExtractor): expected=True) raise ExtractorError('Unable to log in') + def _logout(self): + self._download_webpage( + 'http://www.lynda.com/ajax/logout.aspx', None, + 'Logging out', 'Unable to log out', fatal=False) + class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' @@ -210,6 +215,8 @@ class LyndaCourseIE(LyndaBaseIE): course_id, 'Downloading course JSON') course_json = json.loads(page) + self._logout() + if 'Status' in course_json and course_json['Status'] == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) From 71bb016160744a80fecaadf5b75b0dc2b1e8089b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:10:07 +0600 Subject: [PATCH 0128/1214] [lynda:course] Modernize and make more robust --- youtube_dl/extractor/lynda.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 67f2025de..98474ded9 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -210,14 +210,13 @@ class LyndaCourseIE(LyndaBaseIE): course_path = mobj.group('coursepath') course_id = mobj.group('courseid') - page = self._download_webpage( + course = self._download_json( 'http://www.lynda.com/ajax/player?courseId=%s&type=course' % course_id, course_id, 'Downloading course JSON') - course_json = json.loads(page) self._logout() - if 'Status' in course_json and course_json['Status'] == 'NotFound': + if course.get('Status') == 'NotFound': raise ExtractorError( 'Course %s does not exist' % course_id, expected=True) @@ -227,12 +226,14 @@ class LyndaCourseIE(LyndaBaseIE): # Might want to extract videos right here from video['Formats'] as it seems 'Formats' is not provided # by single video API anymore - for chapter in course_json['Chapters']: - for video in chapter['Videos']: - if 
video['HasAccess'] is False: + for chapter in course['Chapters']: + for video in chapter.get('Videos', []): + if video.get('HasAccess') is False: unaccessible_videos += 1 continue - videos.append(video['ID']) + video_id = video.get('ID') + if video_id: + videos.append(video_id) if unaccessible_videos > 0: self._downloader.report_warning( @@ -245,6 +246,6 @@ class LyndaCourseIE(LyndaBaseIE): 'Lynda') for video_id in videos] - course_title = course_json['Title'] + course_title = course.get('Title') return self.playlist_result(entries, course_id, course_title) From ea8ed40b2fb70fc2f01aba475128821078873d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:24:39 +0600 Subject: [PATCH 0129/1214] [lynda] Modernize and make more robust --- youtube_dl/extractor/lynda.py | 52 ++++++++++++++++------------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 98474ded9..c8a16842e 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -113,51 +113,47 @@ class LyndaIE(LyndaBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage( + video = self._download_json( 'http://www.lynda.com/ajax/player?videoId=%s&type=video' % video_id, video_id, 'Downloading video JSON') - video_json = json.loads(page) - if 'Status' in video_json: + if 'Status' in video: raise ExtractorError( - 'lynda returned error: %s' % video_json['Message'], expected=True) + 'lynda returned error: %s' % video['Message'], expected=True) - if video_json['HasAccess'] is False: + if video.get('HasAccess') is False: self.raise_login_required('Video %s is only available for members' % video_id) - video_id = compat_str(video_json['ID']) - duration = video_json['DurationInSeconds'] - title = video_json['Title'] + video_id = compat_str(video.get('ID') or video_id) + duration = 
int_or_none(video.get('DurationInSeconds')) + title = video['Title'] formats = [] - fmts = video_json.get('Formats') + fmts = video.get('Formats') if fmts: - formats.extend([ - { - 'url': fmt['Url'], - 'ext': fmt['Extension'], - 'width': fmt['Width'], - 'height': fmt['Height'], - 'filesize': fmt['FileSize'], - 'format_id': str(fmt['Resolution']) - } for fmt in fmts]) + formats.extend([{ + 'url': f['Url'], + 'ext': f.get('Extension'), + 'width': int_or_none(f.get('Width')), + 'height': int_or_none(f.get('Height')), + 'filesize': int_or_none(f.get('FileSize')), + 'format_id': compat_str(f.get('Resolution')) if f.get('Resolution') else None, + } for f in fmts if f.get('Url')]) - prioritized_streams = video_json.get('PrioritizedStreams') + prioritized_streams = video.get('PrioritizedStreams') if prioritized_streams: for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': '%s-%s' % (prioritized_stream_id, format_id), - } for format_id, video_url in prioritized_stream.items() - ]) + formats.extend([{ + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items()]) self._check_formats(formats, video_id) self._sort_formats(formats) - subtitles = self.extract_subtitles(video_id, page) + subtitles = self.extract_subtitles(video_id) return { 'id': video_id, @@ -188,7 +184,7 @@ class LyndaIE(LyndaBaseIE): if srt: return srt - def _get_subtitles(self, video_id, webpage): + def _get_subtitles(self, video_id): url = 'http://www.lynda.com/ajax/player?videoId=%s&type=transcript' % video_id subs = self._download_json(url, None, False) if subs: From ae4ddf9efae816f4d52fc584c93e4f0e3c79c410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:27:38 +0600 Subject: [PATCH 0130/1214] [lynda] PEP 8 --- 
youtube_dl/extractor/lynda.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index c8a16842e..9a207b2cd 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -227,9 +227,8 @@ class LyndaCourseIE(LyndaBaseIE): if video.get('HasAccess') is False: unaccessible_videos += 1 continue - video_id = video.get('ID') - if video_id: - videos.append(video_id) + if video.get('ID'): + videos.append(video['ID']) if unaccessible_videos > 0: self._downloader.report_warning( From 472404953a22811cc8156da110ea872a924f1f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:28:14 +0600 Subject: [PATCH 0131/1214] [miomio] PEP 8 --- youtube_dl/extractor/miomio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 6f40bf1b9..ce391c759 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -52,7 +52,7 @@ class MioMioIE(InfoExtractor): mioplayer_path = self._search_regex( r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') - http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path,} + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', From 0fa6b17dccd2347cb0611651fc04e36839d33a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 6 Nov 2015 23:45:26 +0600 Subject: [PATCH 0132/1214] [pbs] Simplify and speed up player URL search --- youtube_dl/extractor/pbs.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3448736a2..7b868d057 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -191,9 +191,13 @@ class PBSIE(InfoExtractor): if media_id: return media_id, 
presumptive_id, upload_date - url = self._search_regex( - r'(?s)<iframe[^>]+?(?:[a-z-]+?=["\'].*?["\'][^>]+?)*?\bsrc=["\']([^\'"]+partnerplayer[^\'"]+)["\']', - webpage, 'player URL') + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): + url = self._search_regex( + r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, + 'player URL', default=None, group='url') + if url: + break + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') From 686f98816ecbbcb224d1336682688b05cdb051a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 00:39:16 +0600 Subject: [PATCH 0133/1214] [pbs] Add support for flp frontlines (Closes #7369) --- youtube_dl/extractor/pbs.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 7b868d057..3169e9c3f 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,6 +8,7 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + strip_jsonp, unified_strdate, US_RATINGS, ) @@ -191,6 +192,23 @@ class PBSIE(InfoExtractor): if media_id: return media_id, presumptive_id, upload_date + # Fronline video embedded via flp + video_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + if video_id: + # pkg_id calculation is reverse engineered from + # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js + prg_id = self._search_regex( + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:] + if 'q' in prg_id: + prg_id = prg_id.split('q')[1] + prg_id = int(prg_id, 16) + getdir = self._download_json( + 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, + presumptive_id, 'Downloading getdir JSON', + transform_source=strip_jsonp) + return getdir['mid'], presumptive_id, upload_date + for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): url = self._search_regex( 
r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe, From 8b6d9406db1d3361b006016e6aace54b05cb6fea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 00:42:30 +0600 Subject: [PATCH 0134/1214] [pbs] Add test for flp frontline embeds --- youtube_dl/extractor/pbs.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3169e9c3f..a690f9c29 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -154,6 +154,22 @@ class PBSIE(InfoExtractor): 'params': { 'skip_download': True, # requires ffmpeg }, + }, + { + # Frontline video embedded via flp2012.js + 'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists', + 'info_dict': { + 'id': '2070868960', + 'display_id': 'the-atomic-artists', + 'ext': 'mp4', + 'title': 'FRONTLINE - The Atomic Artists', + 'description': 'md5:f5bfbefadf421e8bb8647602011caf8e', + 'duration': 723, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, # requires ffmpeg + }, } ] _ERRORS = { From 21d0c33ecde573db961b97f5f0c37ba9d3c02ff3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 01:08:40 +0600 Subject: [PATCH 0135/1214] [pbs] Make flp embed lookup non fatal --- youtube_dl/extractor/pbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index a690f9c29..8fb9b1849 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -210,7 +210,7 @@ class PBSIE(InfoExtractor): # Fronline video embedded via flp video_id = self._search_regex( - r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid') + r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None) if video_id: # pkg_id calculation is reverse engineered from # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js From 
ee223abb88263bdda2d92c4b2139d1dca60ba3ae Mon Sep 17 00:00:00 2001 From: Mister Hat <misterhat144@gmail.com> Date: Tue, 3 Nov 2015 19:13:27 -0600 Subject: [PATCH 0136/1214] [vidzi] fixed. finds url from hash and host in script Closes #7386. --- youtube_dl/extractor/vidzi.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 08a5a7b8d..2ba9f31df 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -20,8 +20,14 @@ class VidziIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'{\s*file\s*:\s*"([^"]+)"\s*}', webpage, 'video url') + video_host = self._html_search_regex( + r'id=\'vplayer\'><img src="http://(.*?)/i', webpage, + 'video host') + video_hash = self._html_search_regex( + r'\|([a-z0-9]+)\|hls\|type', webpage, 'video_hash') + ext = self._html_search_regex( + r'\|tracks\|([a-z0-9]+)\|', webpage, 'video ext') + video_url = 'http://' + video_host + '/' + video_hash + '/v.' 
+ ext title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') From 5d0f84d32cc038dd71673987cb6efaa85e953474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 06:23:00 +0600 Subject: [PATCH 0137/1214] [beeg] Skip empty URLs (Closes #7392) --- youtube_dl/extractor/beeg.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index e6c928699..61bc2f744 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -33,6 +33,8 @@ class BeegIE(InfoExtractor): formats = [] for format_id, video_url in video.items(): + if not video_url: + continue height = self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None) if not height: From 5214f1e31d5e5ba692fb1ed4803ff71ef4e480e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 19:25:59 +0600 Subject: [PATCH 0138/1214] [crunchyroll] Fix title extraction (Closes #7396) --- youtube_dl/extractor/crunchyroll.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 0c9b8ca02..4243f3e2e 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -287,7 +287,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' 
in webpage: self.raise_login_required() - video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL) + video_title = self._html_search_regex( + r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', + webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') if not video_description: From 2c740cf28d257d2a915195e7cc60f83e6690d2cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 19:29:09 +0600 Subject: [PATCH 0139/1214] [crunchyroll] Simplify description extraction --- youtube_dl/extractor/crunchyroll.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4243f3e2e..9aa5d58b4 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -291,9 +291,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='') - if not video_description: - video_description = None + video_description = self._html_search_regex( + r'"description":"([^"]+)', webpage, 'video_description', default=None) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) From 6d02b9a392d39c114d3fb58bf7965f62196ccecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 7 Nov 2015 20:02:39 +0600 Subject: [PATCH 0140/1214] 
[crunchyroll] Fix description extraction --- youtube_dl/extractor/crunchyroll.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 9aa5d58b4..6e5999c72 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -21,6 +21,7 @@ from ..utils import ( bytes_to_intlist, intlist_to_bytes, int_or_none, + lowercase_escape, remove_end, unified_strdate, urlencode_postdata, @@ -104,7 +105,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'id': '589804', 'ext': 'flv', 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e', + 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Danny Choo Network', 'upload_date': '20120213', @@ -292,7 +293,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) video_description = self._html_search_regex( - r'"description":"([^"]+)', webpage, 'video_description', default=None) + r'<script[^>]*>\s*.+?\[media_id=%s\].+?"description"\s*:\s*"([^"]+)' % video_id, + webpage, 'description', default=None) + if video_description: + video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( [r'<div>Availability for free users:(.+?)</div>', r'<div>[^<>]+<span>\s*(.+?\d{4})\s*</span></div>'], webpage, 'video_upload_date', fatal=False, flags=re.DOTALL) From 3793090b1b1c1e3462b80dd3045a3573545cfb29 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 16:54:35 +0100 Subject: [PATCH 0141/1214] [amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews extractors --- youtube_dl/extractor/amp.py | 84 ++++++++++++++++++++++++++++++ youtube_dl/extractor/dramafever.py | 65 
++++------------------- youtube_dl/extractor/foxnews.py | 64 ++++------------------- 3 files changed, 105 insertions(+), 108 deletions(-) create mode 100644 youtube_dl/extractor/amp.py diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py new file mode 100644 index 000000000..b573b9280 --- /dev/null +++ b/youtube_dl/extractor/amp.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class AMPIE(InfoExtractor): + def _get_media_node(self, item, name, default=None): + media_name = 'media-%s' % name + media_group = item.get('media-group') or item + return media_group.get(media_name) or item.get(media_name) or item.get(name, default) + + # parse Akamai Adaptive Media Player feed + def _extract_feed_info(self, url): + item = self._download_json( + url, None, + 'Downloading Akamai AMP feed', + 'Unable to download Akamai AMP feed' + )['channel']['item'] + + video_id = item['guid'] + + thumbnails = [] + media_thumbnail = self._get_media_node(item, 'thumbnail') + if media_thumbnail: + if isinstance(media_thumbnail, dict): + media_thumbnail = [media_thumbnail] + for thumbnail_data in media_thumbnail: + thumbnail = thumbnail_data['@attributes'] + thumbnails.append({ + 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + subtitles = {} + media_subtitle = self._get_media_node(item, 'subTitle') + if media_subtitle: + if isinstance(media_subtitle, dict): + media_subtitle = [media_subtitle] + for subtitle_data in media_subtitle: + subtitle = subtitle_data['@attributes'] + lang = subtitle.get('lang') or 'en' + subtitles[lang] = [{'url': subtitle['href']}] + + formats = [] + media_content = self._get_media_node(item, 'content') + if isinstance(media_content, dict): + media_content = [media_content] + for media_data in media_content: 
+ media = media_data['@attributes'] + media_type = media['type'] + if media_type == 'video/f4m': + f4m_formats = self._extract_f4m_formats(media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + elif media_type == 'application/x-mpegURL': + m3u8_formats = self._extract_m3u8_formats(media['url'], video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + else: + formats.append({ + 'format_id': media_data['media-category']['@attributes']['label'], + 'url': media['url'], + 'preference': 1, + 'vbr': int_or_none(media.get('bitrate')), + 'filesize': int_or_none(media.get('fileSize')), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._get_media_node(item, 'title'), + 'description': self._get_media_node(item, 'description'), + 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(item.get('pubDate'), ' '), + 'duration': int_or_none(media_content[0].get('@attributes', {}).get('duration')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 38e6597c8..80a928827 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import itertools -from .common import InfoExtractor +from .amp import AMPIE from ..compat import ( compat_HTTPError, compat_urllib_parse, @@ -19,7 +19,7 @@ from ..utils import ( ) -class DramaFeverBaseIE(InfoExtractor): +class DramaFeverBaseIE(AMPIE): _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' _NETRC_MACHINE = 'dramafever' @@ -80,60 +80,24 @@ class DramaFeverIE(DramaFeverBaseIE): 'timestamp': 1404336058, 'upload_date': '20140702', 'duration': 343, - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): video_id = self._match_id(url).replace('/', '.') try: - feed = self._download_json( - 
'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id, - video_id, 'Downloading episode JSON')['channel']['item'] + info = self._extract_feed_info('http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): raise ExtractorError( 'Currently unavailable in your country.', expected=True) raise - media_group = feed.get('media-group', {}) - - formats = [] - for media_content in media_group['media-content']: - src = media_content.get('@attributes', {}).get('url') - if not src: - continue - ext = determine_ext(src) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src, video_id, f4m_id='hds')) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls')) - else: - formats.append({ - 'url': src, - }) - self._sort_formats(formats) - - title = media_group.get('media-title') - description = media_group.get('media-description') - duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration')) - thumbnail = self._proto_relative_url( - media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url')) - timestamp = parse_iso8601(feed.get('pubDate'), ' ') - - subtitles = {} - for media_subtitle in media_group.get('media-subTitle', []): - lang = media_subtitle.get('@attributes', {}).get('lang') - href = media_subtitle.get('@attributes', {}).get('href') - if not lang or not href: - continue - subtitles[lang] = [{ - 'ext': 'ttml', - 'url': href, - }] - series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode @@ -146,21 +110,12 @@ class DramaFeverIE(DramaFeverBaseIE): if value: subfile = value[0].get('subfile') or value[0].get('new_subfile') if subfile and subfile != 'http://www.dramafever.com/st/': - subtitles.setdefault('English', []).append({ + info['subtitiles'].setdefault('English', 
[]).append({ 'ext': 'srt', 'url': subfile, }) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } + return info class DramaFeverSeriesIE(DramaFeverBaseIE): diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index 3a4a59135..0cd0f9fa8 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -2,14 +2,14 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .amp import AMPIE from ..utils import ( parse_iso8601, int_or_none, ) -class FoxNewsIE(InfoExtractor): +class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)' _TESTS = [ @@ -20,10 +20,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3937480', 'ext': 'flv', 'title': 'Frozen in Time', - 'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler', + 'description': '16-year-old girl is size of toddler', 'duration': 265, - 'timestamp': 1304411491, - 'upload_date': '20110503', + #'timestamp': 1304411491, + #'upload_date': '20110503', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -34,10 +34,10 @@ class FoxNewsIE(InfoExtractor): 'id': '3922535568001', 'ext': 'mp4', 'title': "Rep. 
Luis Gutierrez on if Obama's immigration plan is legal", - 'description': "Congressman discusses the president's executive action", + 'description': "Congressman discusses president's plan", 'duration': 292, - 'timestamp': 1417662047, - 'upload_date': '20141204', + #'timestamp': 1417662047, + #'upload_date': '20141204', 'thumbnail': 're:^https?://.*\.jpg$', }, }, @@ -56,48 +56,6 @@ class FoxNewsIE(InfoExtractor): video_id = mobj.group('id') host = mobj.group('host') - video = self._download_json( - 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id) - - item = video['channel']['item'] - title = item['title'] - description = item['description'] - timestamp = parse_iso8601(item['dc-date']) - - media_group = item['media-group'] - duration = None - formats = [] - for media in media_group['media-content']: - attributes = media['@attributes'] - video_url = attributes['url'] - if video_url.endswith('.f4m'): - formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id)) - elif video_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv')) - elif not video_url.endswith('.smil'): - duration = int_or_none(attributes.get('duration')) - formats.append({ - 'url': video_url, - 'format_id': media['media-category']['@attributes']['label'], - 'preference': 1, - 'vbr': int_or_none(attributes.get('bitrate')), - 'filesize': int_or_none(attributes.get('fileSize')) - }) - self._sort_formats(formats) - - media_thumbnail = media_group['media-thumbnail']['@attributes'] - thumbnails = [{ - 'url': media_thumbnail['url'], - 'width': int_or_none(media_thumbnail.get('width')), - 'height': int_or_none(media_thumbnail.get('height')), - }] if media_thumbnail else [] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - } + info = 
self._extract_feed_info('http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) + info['id'] = video_id + return info From 63b728f06f00c2f1a45a67eddebd18bcdc36a753 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 16:56:21 +0100 Subject: [PATCH 0142/1214] [bleacherreport] Add new Extractor --- youtube_dl/extractor/__init__.py | 4 + youtube_dl/extractor/bleacherreport.py | 121 +++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 youtube_dl/extractor/bleacherreport.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..4d65ece94 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,10 @@ from .beatportpro import BeatportProIE from .bet import BetIE from .bild import BildIE from .bilibili import BiliBiliIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py new file mode 100644 index 000000000..a55e696d2 --- /dev/null +++ b/youtube_dl/extractor/bleacherreport.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .amp import AMPIE +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) + + +class BleacherReportIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/articles/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'uploader_id': 3992341, + 'description': 'CFB, ACC, Florida 
State', + 'timestamp': 1434380212, + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + }, + 'add_ie': ['Ooyala'], + },{ + 'url': 'http://bleacherreport.com/articles/2586817-aussie-golfers-get-fright-of-their-lives-after-being-chased-by-angry-kangaroo', + 'md5': 'af5f90dc9c7ba1c19d0a3eac806bbf50', + 'info_dict': { + 'id': '2586817', + 'ext': 'mp4', + 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', + 'timestamp': 1446839961, + 'uploader': 'Sean Fay', + 'description': 'md5:e95afafa43619816552723878b3b0a84', + 'uploader_id': 6466954, + 'upload_date': '20151011', + }, + 'add_ie': ['Youtube'], + },{ + 'url': 'http://bleacherreport.com/articles/2496438-fsu-stat-projections-is-jalen-ramsey-best-defensive-player-in-college-football', + 'md5': 'a3ffc3dc73afdbc2010f02d98f990f20', + 'info_dict': { + 'id': '2496438', + 'ext': 'mp4', + 'title': 'FSU Stat Projections: Is Jalen Ramsey Best Defensive Player in College Football?', + 'upload_date': '20150615', + 'uploader': 'Team Stream Now ', + 'timestamp': 1434380212, + 'description': 'CFB, ACC, Florida State', + 'uploader_id': 3992341, + }, + 'add_ie': ['Vine'], + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + + article_data = self._download_json('http://api.bleacherreport.com/api/v1/articles/%s' % article_id, article_id)['article'] + + thumbnails = [] + primary_photo = article_data.get('primaryPhoto') + if primary_photo: + thumbnails = [{ + 'url': primary_photo['url'], + 'width': primary_photo.get('width'), + 'height': primary_photo.get('height'), + }] + + info = { + '_type': 'url_transparent', + 'id': article_id, + 'title': article_data['title'], + 'uploader': article_data.get('author', {}).get('name'), + 'uploader_id': article_data.get('authorId'), + 'timestamp': parse_iso8601(article_data.get('createdAt')), + 'thumbnails': thumbnails, + 'comment_count': int_or_none(article_data.get('commentsCount')), + 'view_count': 
int_or_none(article_data.get('hitCount')), + } + + video = article_data.get('video') + if video: + video_type = video['type'] + if video_type == 'cms.bleacherreport.com': + info['url'] = 'http://bleacherreport.com/video_embed?id=%s' % video['id'] + elif video_type == 'ooyala.com': + info['url'] = 'ooyala:%s' % video['id'] + elif video_type == 'youtube.com': + info['url'] = video['id'] + elif video_type == 'vine.co': + info['url'] = 'https://vine.co/v/%s' % video['id'] + else: + info['url'] = video_type + video['id'] + return info + else: + raise ExtractorError('no video in the article', expected=True) + + +class BleacherReportCMSIE(AMPIE): + _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' + _TESTS = [{ + 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'md5': 'f0ca220af012d4df857b54f792c586bb', + 'info_dict': { + 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', + 'ext': 'flv', + 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', + 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_feed_info('http://cms.bleacherreport.com/media/items/%s/akamai.json' % video_id) + info['id'] = video_id + return info From cff551c0b0ed8eb55c1ab63ec669c07a51aa4998 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 7 Nov 2015 18:43:22 +0100 Subject: [PATCH 0143/1214] [googleplus] Fix extraction of formats --- youtube_dl/extractor/googleplus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index fcefe54cd..731bacd67 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -61,7 +61,7 @@ class GooglePlusIE(InfoExtractor): 'width': int(width), 'height': int(height), } for width, 
height, video_url in re.findall( - r'\d+,(\d+),(\d+),"(https?://redirector\.googlevideo\.com.*?)"', webpage)] + r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent.com.*?)"', webpage)] self._sort_formats(formats) return { From ee4337d100f68bbb2ae795101d4c391b522ec753 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 20:16:14 +0100 Subject: [PATCH 0144/1214] [videolecture] add support for multi part videos --- youtube_dl/extractor/videolecturesnet.py | 95 +++++++++++++++++------- 1 file changed, 70 insertions(+), 25 deletions(-) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py index 649ac9433..351706362 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/videolecturesnet.py @@ -10,17 +10,19 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, + js_to_json, + parse_iso8601, ) class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?' 
IE_NAME = 'videolectures.net' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': 'promogram_igor_mekjavic_eng', + 'id': '20171_part1', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', @@ -32,7 +34,7 @@ class VideoLecturesNetIE(InfoExtractor): # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': 'russir2010_filippova_nlp', + 'id': '14891_part1', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', @@ -46,37 +48,80 @@ class VideoLecturesNetIE(InfoExtractor): }, { 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { - 'id': 'deeplearning2015_montreal', + 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', - 'description': 'md5:90121a40cc6926df1bf04dcd8563ed3b', + 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'timestamp': 1438560000, }, 'playlist_count': 30, + }, { + # multi part lecture + 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', + 'info_dict': { + 'id': '9737', + 'title': 'Introduction To Bayesian Inference', + 'timestamp': 1251622800, + }, + 'playlist': [{ + 'info_dict': { + 'id': '9737_part1', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }, { + 'info_dict': { + 'id': '9737_part2', + 'ext': 'wmv', + 'title': 'Introduction To Bayesian Inference', + }, + }], + 'playlist_count': 2, }] def _real_extract(self, url): - video_id = self._match_id(url) + lecture_slug, part = re.match(self._VALID_URL, url).groups() - smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id + webpage = self._download_webpage(url, lecture_slug) - try: - smil = self._download_smil(smil_url, video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - # Probably a playlist - webpage = 
self._download_webpage(url, video_id) - entries = [ - self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] - playlist_title = self._html_search_meta('title', webpage, 'title', fatal=True) - playlist_description = self._html_search_meta('description', webpage, 'description') - return self.playlist_result(entries, video_id, playlist_title, playlist_description) + cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) - info = self._parse_smil(smil, smil_url, video_id) + lecture_id = str(cfg['obj_id']) - info['id'] = video_id + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] - switch = smil.find('.//switch') - if switch is not None: - info['duration'] = parse_duration(switch.attrib.get('dur')) + lecture_info = { + 'id': lecture_id, + 'display_id': lecture_slug, + 'title': lecture_data['title'], + 'timestamp': parse_iso8601(lecture_data.get('time')), + 'description': lecture_data.get('description_wiki'), + 'thumbnail': lecture_data.get('thumb'), + } - return info + entries = [] + parts = cfg.get('videos') + if parts: + if len(parts) == 1: + part = str(parts[0]) + if part: + smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil = self._download_smil(smil_url, lecture_id) + info = self._parse_smil(smil, smil_url, lecture_id) + info['id'] = '%s_part%s' % (lecture_id, part) + switch = smil.find('.//switch') + if switch is not None: + info['duration'] = parse_duration(switch.attrib.get('dur')) + return info + else: + for part in parts: + entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + lecture_info['_type'] = 'multi_video' + else: + # Probably a playlist + entries = [ + 
self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') + for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + lecture_info['_type'] = 'playlist' + + lecture_info['entries'] = entries + return lecture_info From a06bf87a2c6009d82ec28afe566f653b3deb11bf Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 6 Nov 2015 21:23:41 +0100 Subject: [PATCH 0145/1214] [viidea] add support for sites using viidea service --- youtube_dl/extractor/__init__.py | 2 +- .../{videolecturesnet.py => viidea.py} | 33 ++++++++++++++----- 2 files changed, 26 insertions(+), 9 deletions(-) rename youtube_dl/extractor/{videolecturesnet.py => viidea.py} (77%) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 94150a28f..0a90da73c 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -724,7 +724,6 @@ from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@ -734,6 +733,7 @@ from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/viidea.py similarity index 77% rename from youtube_dl/extractor/videolecturesnet.py rename to youtube_dl/extractor/viidea.py index 351706362..71fb298e6 100644 --- a/youtube_dl/extractor/videolecturesnet.py +++ b/youtube_dl/extractor/viidea.py @@ -15,9 +15,23 @@ from ..utils import ( ) -class VideoLecturesNetIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?' 
- IE_NAME = 'videolectures.net' +class ViideaIE(InfoExtractor): + _VALID_URL = r'''(?x)http://(?:www\.)?(?: + videolectures\.net| + flexilearn\.viidea\.net| + presentations\.ocwconsortium\.org| + video\.travel-zoom\.si| + video\.pomp-forum\.si| + tv\.nil\.si| + video\.hekovnik.com| + video\.szko\.si| + kpk\.viidea\.com| + inside\.viidea\.net| + video\.kiberpipa\.org| + bvvideo\.si| + kongres\.viidea\.net| + edemokracija\.viidea\.com + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -87,7 +101,9 @@ class VideoLecturesNetIE(InfoExtractor): lecture_id = str(cfg['obj_id']) - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (self._proto_relative_url(cfg['livepipe'], 'http:'), lecture_id), lecture_id)['lecture'][0] + base_url = self._proto_relative_url(cfg['livepipe'], 'http:') + + lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -104,7 +120,7 @@ class VideoLecturesNetIE(InfoExtractor): if len(parts) == 1: part = str(parts[0]) if part: - smil_url = 'http://videolectures.net/%s/video/%s/smil.xml' % (lecture_slug, part) + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) info['id'] = '%s_part%s' % (lecture_id, part) @@ -114,13 +130,14 @@ class VideoLecturesNetIE(InfoExtractor): return info else: for part in parts: - entries.append(self.url_result('http://videolectures.net/%s/video/%s' % (lecture_slug, part), 'VideoLecturesNet')) + entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist + playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ - 
self.url_result(compat_urlparse.urljoin(url, video_url), 'VideoLecturesNet') - for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', webpage)] + self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') + for _, video_url in re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] lecture_info['_type'] = 'playlist' lecture_info['entries'] = entries From 8e3a2bd6200660f9fb9d485b1c924fa5462bd566 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 17:43:23 +0100 Subject: [PATCH 0146/1214] [viidea] fix _VALID_URL regex and tests --- youtube_dl/extractor/viidea.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 71fb298e6..ae9a42737 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -31,7 +31,7 @@ class ViideaIE(InfoExtractor): bvvideo\.si| kongres\.viidea\.net| edemokracija\.viidea\.com - )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?''' + )(?:/lecture)?/(?P<id>[^/]+)(?:/video/(?P<part>\d+))?/*(?:[#?].*)?$''' _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', @@ -130,7 +130,7 @@ class ViideaIE(InfoExtractor): return info else: for part in parts: - entries.append(self.url_result('%s/video/%s' % (base_url, lecture_id, part), 'Viidea')) + entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' else: # Probably a playlist From 6fdb39ded15c6276b49fa67cb517bf1fed63af35 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Sat, 7 Nov 2015 20:38:33 +0100 Subject: [PATCH 0147/1214] [viidia] Cleaup [viidea] extract playlist if lecture is an event [viidia] use compat_str --- youtube_dl/extractor/viidea.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/viidea.py 
b/youtube_dl/extractor/viidea.py index ae9a42737..2541a36ed 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -4,11 +4,10 @@ import re from .common import InfoExtractor from ..compat import ( - compat_HTTPError, compat_urlparse, + compat_str, ) from ..utils import ( - ExtractorError, parse_duration, js_to_json, parse_iso8601, @@ -97,9 +96,9 @@ class ViideaIE(InfoExtractor): webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex(r'cfg\s*:\s*({[^}]+})', webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) - lecture_id = str(cfg['obj_id']) + lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') @@ -118,7 +117,7 @@ class ViideaIE(InfoExtractor): parts = cfg.get('videos') if parts: if len(parts) == 1: - part = str(parts[0]) + part = compat_str(parts[0]) if part: smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) smil = self._download_smil(smil_url, lecture_id) @@ -132,7 +131,7 @@ class ViideaIE(InfoExtractor): for part in parts: entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) lecture_info['_type'] = 'multi_video' - else: + if not parts or lecture_data.get('type') == 'evt': # Probably a playlist playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ From e8ce2375e0851e65c4882002297404825fe1045e Mon Sep 17 00:00:00 2001 From: Sergey M? 
<dstftw@gmail.com> Date: Sun, 8 Nov 2015 06:54:27 +0600 Subject: [PATCH 0148/1214] [viidea] Improve and cleanup (Closes #7390) * Optimize requests for multipart videos * Fix cfg regex * Improve titles and identifiers --- youtube_dl/extractor/viidea.py | 99 ++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 27 deletions(-) diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py index 2541a36ed..525e303d4 100644 --- a/youtube_dl/extractor/viidea.py +++ b/youtube_dl/extractor/viidea.py @@ -35,35 +35,42 @@ class ViideaIE(InfoExtractor): _TESTS = [{ 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', 'info_dict': { - 'id': '20171_part1', + 'id': '20171', + 'display_id': 'promogram_igor_mekjavic_eng', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1372349289, 'upload_date': '20130627', 'duration': 565, - 'thumbnail': 're:http://.*\.jpg', }, }, { # video with invalid direct format links (HTTP 403) 'url': 'http://videolectures.net/russir2010_filippova_nlp/', 'info_dict': { - 'id': '14891_part1', + 'id': '14891', + 'display_id': 'russir2010_filippova_nlp', 'ext': 'flv', 'title': 'NLP at Google', 'description': 'md5:fc7a6d9bf0302d7cc0e53f7ca23747b3', - 'duration': 5352, 'thumbnail': 're:http://.*\.jpg', + 'timestamp': 1284375600, + 'upload_date': '20100913', + 'duration': 5352, }, 'params': { # rtmp download 'skip_download': True, }, }, { + # event playlist 'url': 'http://videolectures.net/deeplearning2015_montreal/', 'info_dict': { 'id': '23181', 'title': 'Deep Learning Summer School, Montreal 2015', 'description': 'md5:0533a85e4bd918df52a01f0e1ebe87b7', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1438560000, }, 'playlist_count': 30, @@ -72,37 +79,54 @@ class ViideaIE(InfoExtractor): 'url': 'http://videolectures.net/mlss09uk_bishop_ibi/', 'info_dict': { 'id': '9737', + 'display_id': 
'mlss09uk_bishop_ibi', 'title': 'Introduction To Bayesian Inference', + 'thumbnail': 're:http://.*\.jpg', 'timestamp': 1251622800, }, 'playlist': [{ 'info_dict': { 'id': '9737_part1', + 'display_id': 'mlss09uk_bishop_ibi_part1', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 1)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 4622, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }, { 'info_dict': { 'id': '9737_part2', + 'display_id': 'mlss09uk_bishop_ibi_part2', 'ext': 'wmv', - 'title': 'Introduction To Bayesian Inference', + 'title': 'Introduction To Bayesian Inference (Part 2)', + 'thumbnail': 're:http://.*\.jpg', + 'duration': 5641, + 'timestamp': 1251622800, + 'upload_date': '20090830', }, }], 'playlist_count': 2, }] def _real_extract(self, url): - lecture_slug, part = re.match(self._VALID_URL, url).groups() + lecture_slug, explicit_part_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, lecture_slug) - cfg = self._parse_json(self._search_regex([r'cfg\s*:\s*({.+?}),[\da-zA-Z_]:\(?function', r'cfg\s*:\s*({[^}]+})'], webpage, 'cfg'), lecture_slug, js_to_json) + cfg = self._parse_json(self._search_regex( + [r'cfg\s*:\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*:\s*\(?\s*function', + r'cfg\s*:\s*({[^}]+})'], + webpage, 'cfg'), lecture_slug, js_to_json) lecture_id = compat_str(cfg['obj_id']) base_url = self._proto_relative_url(cfg['livepipe'], 'http:') - lecture_data = self._download_json('%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), lecture_id)['lecture'][0] + lecture_data = self._download_json( + '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id), + lecture_id)['lecture'][0] lecture_info = { 'id': lecture_id, @@ -113,31 +137,52 @@ class ViideaIE(InfoExtractor): 'thumbnail': lecture_data.get('thumb'), } - entries = [] - parts = cfg.get('videos') + playlist_entries = [] + lecture_type = lecture_data.get('type') + parts = 
[compat_str(video) for video in cfg.get('videos', [])] if parts: - if len(parts) == 1: - part = compat_str(parts[0]) - if part: - smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part) + multipart = len(parts) > 1 + + def extract_part(part_id): + smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) - info['id'] = '%s_part%s' % (lecture_id, part) + info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) + info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) + if multipart: + info['title'] += ' (Part %s)' % part_id switch = smil.find('.//switch') if switch is not None: info['duration'] = parse_duration(switch.attrib.get('dur')) - return info + item_info = lecture_info.copy() + item_info.update(info) + return item_info + + if explicit_part_id or not multipart: + result = extract_part(explicit_part_id or parts[0]) else: - for part in parts: - entries.append(self.url_result('%s/%s/video/%s' % (base_url, lecture_slug, part), 'Viidea')) - lecture_info['_type'] = 'multi_video' - if not parts or lecture_data.get('type') == 'evt': - # Probably a playlist - playlist_webpage = self._download_webpage('%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) + result = { + '_type': 'multi_video', + 'entries': [extract_part(part) for part in parts], + } + result.update(lecture_info) + + # Immediately return explicitly requested part or non event item + if explicit_part_id or lecture_type != 'evt': + return result + + playlist_entries.append(result) + + # It's probably a playlist + if not parts or lecture_type == 'evt': + playlist_webpage = self._download_webpage( + '%s/site/ajax/drilldown/?id=%s' % (base_url, lecture_id), lecture_id) entries = [ self.url_result(compat_urlparse.urljoin(url, video_url), 'Viidea') - for _, video_url in 
re.findall(r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] - lecture_info['_type'] = 'playlist' + for _, video_url in re.findall( + r'<a[^>]+href=(["\'])(.+?)\1[^>]+id=["\']lec=\d+', playlist_webpage)] + playlist_entries.extend(entries) - lecture_info['entries'] = entries - return lecture_info + playlist = self.playlist_result(playlist_entries, lecture_id) + playlist.update(lecture_info) + return playlist From d5c181a14e08198e400932d591b47683a630c8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 8 Nov 2015 11:49:51 +0100 Subject: [PATCH 0149/1214] [movieclips] Fix extraction (fixes #7404) They use theplatform now. Changed the test, because the old one seems to be georestricted. --- youtube_dl/extractor/movieclips.py | 77 ++++++++---------------------- 1 file changed, 19 insertions(+), 58 deletions(-) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index 04e17d055..e06828b55 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,80 +1,41 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - clean_html, + compat_urllib_request, ) class MovieClipsIE(InfoExtractor): - _VALID_URL = r'https?://movieclips\.com/(?P<id>[\da-zA-Z]+)(?:-(?P<display_id>[\da-z-]+))?' 
+ _VALID_URL = r'https?://(?:www.)?movieclips\.com/videos/(?P<id>[^/?#]+)' _TEST = { - 'url': 'http://movieclips.com/Wy7ZU-my-week-with-marilyn-movie-do-you-love-me/', + 'url': 'http://www.movieclips.com/videos/warcraft-trailer-1-561180739597?autoPlay=true&playlistId=5', 'info_dict': { - 'id': 'Wy7ZU', - 'display_id': 'my-week-with-marilyn-movie-do-you-love-me', + 'id': 'pKIGmG83AqD9', + 'display_id': 'warcraft-trailer-1-561180739597', 'ext': 'mp4', - 'title': 'My Week with Marilyn - Do You Love Me?', - 'description': 'md5:e86795bd332fe3cff461e7c8dc542acb', + 'title': 'Warcraft Trailer 1', + 'description': 'Watch Trailer 1 from Warcraft (2016). Legendary’s WARCRAFT is a 3D epic adventure of world-colliding conflict based.', 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - } + 'add_ie': ['ThePlatform'], } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - show_id = display_id or video_id + display_id = self._match_id(url) - config = self._download_xml( - 'http://config.movieclips.com/player/config/%s' % video_id, - show_id, 'Downloading player config') - - if config.find('./country-region').text == 'false': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, config.find('./region_alert').text), expected=True) - - properties = config.find('./video/properties') - smil_file = properties.attrib['smil_file'] - - smil = self._download_xml(smil_file, show_id, 'Downloading SMIL') - base_url = smil.find('./head/meta').attrib['base'] - - formats = [] - for video in smil.findall('./body/switch/video'): - vbr = int(video.attrib['system-bitrate']) / 1000 - src = video.attrib['src'] - formats.append({ - 'url': base_url, - 'play_path': src, - 'ext': src.split(':')[0], - 'vbr': vbr, - 'format_id': '%dk' % vbr, - }) - - self._sort_formats(formats) - - title = '%s - %s' % (properties.attrib['clip_movie_title'], 
properties.attrib['clip_title']) - description = clean_html(compat_str(properties.attrib['clip_description'])) - thumbnail = properties.attrib['image'] - categories = properties.attrib['clip_categories'].split(',') + req = compat_urllib_request.Request(url) + # it doesn't work if it thinks the browser it's too old + req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/43.0 (Chrome)') + webpage = self._download_webpage(req, display_id) + theplatform_link = self._html_search_regex(r'src="(http://player.theplatform.com/p/.*?)"', webpage, 'theplatform link') + title = self._html_search_regex(r'<title[^>]*>([^>]+)-\s*\d+\s*|\s*Movieclips.com', webpage, 'title') + description = self._html_search_meta('description', webpage) return { - 'id': video_id, - 'display_id': display_id, + '_type': 'url_transparent', + 'url': theplatform_link, 'title': title, + 'display_id': display_id, 'description': description, - 'thumbnail': thumbnail, - 'categories': categories, - 'formats': formats, } From 937511dfc01c3d00c35a00f78c2b6f989b4d46e3 Mon Sep 17 00:00:00 2001 From: Frans de Jonge Date: Sat, 7 Nov 2015 22:55:02 +0100 Subject: [PATCH 0150/1214] Added support for the RTBF OUFtivi subpage --- youtube_dl/extractor/rtbf.py | 41 ++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index 04a66df90..e75b45112 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,17 +9,36 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www.rtbf.be/video/[^\?]+\?id=(?P\d+)' - _TEST = { - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - } + _VALID_URL = r'''(?x) + https?://www\.rtbf\.be/ + (?: + 
video/[^\?]+\?id=| + ouftivi/heros/[^&]+&videoId= + ) + (?P\d+) + ''' + _TESTS = [ + { + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, + { + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'md5': '25aea17e949e1e0c7c41270d60d25f22', + 'info_dict': { + 'id': '2057442', + 'ext': 'mp4', + 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', + 'duration': 1279, + } + }, + ] _QUALITIES = [ ('mobile', 'mobile'), From fda2717ef9d429358d5816582590d15d18f9109f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 16:56:20 +0600 Subject: [PATCH 0151/1214] [movieclips] Add coding cookie --- youtube_dl/extractor/movieclips.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/movieclips.py b/youtube_dl/extractor/movieclips.py index e06828b55..b8c43a163 100644 --- a/youtube_dl/extractor/movieclips.py +++ b/youtube_dl/extractor/movieclips.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor From 114e6025b09e12bd01b5ce22bd2c43a3ef0ba460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:01:45 +0600 Subject: [PATCH 0152/1214] [rtbf] Expand _VALID_URL (Closes #7402) --- youtube_dl/extractor/rtbf.py | 48 ++++++++++++++---------------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index e75b45112..acf10e253 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,36 +9,24 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://www\.rtbf\.be/ - (?: - video/[^\?]+\?id=| - ouftivi/heros/[^&]+&videoId= - ) - (?P\d+) - ''' - _TESTS = [ - { - 'url': 
'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '799f334ddf2c0a582ba80c44655be570', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'duration': 3099, - } - }, - { - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'md5': '25aea17e949e1e0c7c41270d60d25f22', - 'info_dict': { - 'id': '2057442', - 'ext': 'mp4', - 'title': 'Scooby-Doo, myst\xe8res associ\xe9s', - 'duration': 1279, - } - }, - ] + _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '799f334ddf2c0a582ba80c44655be570', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'duration': 3099, + } + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }] _QUALITIES = [ ('mobile', 'mobile'), From aa8d2d5be6a99542b85a85af3310fab1bf641e86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 17:03:21 +0600 Subject: [PATCH 0153/1214] [rtbf] Make www optional in _VALID_URL --- youtube_dl/extractor/rtbf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py index acf10e253..e42b319a3 100644 --- a/youtube_dl/extractor/rtbf.py +++ b/youtube_dl/extractor/rtbf.py @@ -9,7 +9,7 @@ from ..utils import ( class RTBFIE(InfoExtractor): - _VALID_URL = r'https?://www\.rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?rtbf\.be/(?:video/[^?]+\?.*\bid=|ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=)(?P\d+)' _TESTS = [{ 
'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', 'md5': '799f334ddf2c0a582ba80c44655be570', From 50506cb60798fe4d2ebb9603798b3fb1cb81e55f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:01:37 +0600 Subject: [PATCH 0154/1214] [extremetube] Fix extraction (Closes #7163) --- youtube_dl/extractor/extremetube.py | 45 +++++++++++++++++++---------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index c826a5404..3e11e3299 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -3,12 +3,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, -) +from ..compat import compat_urllib_request from ..utils import ( - qualities, + int_or_none, str_to_int, ) @@ -49,20 +46,36 @@ class ExtremeTubeIE(InfoExtractor): r'Views:\s*\s*([\d,\.]+)', webpage, 'view count', fatal=False)) - flash_vars = compat_parse_qs(self._search_regex( - r']+?name="flashvars"[^>]+?value="([^"]+)"', webpage, 'flash vars')) + flash_vars = self._parse_json( + self._search_regex( + r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), + video_id) formats = [] - quality = qualities(['180p', '240p', '360p', '480p', '720p', '1080p']) - for k, vals in flash_vars.items(): - m = re.match(r'quality_(?P[0-9]+p)$', k) - if m is not None: - formats.append({ - 'format_id': m.group('quality'), - 'quality': quality(m.group('quality')), - 'url': vals[0], + for quality_key, video_url in flash_vars.items(): + height = int_or_none(self._search_regex( + r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) + if not height: + continue + f = { + 'url': video_url, + } + mobj = re.search( + r'/(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) + if mobj: + height = int(mobj.group('height')) + bitrate = int(mobj.group('bitrate')) 
+ f.update({ + 'format_id': '%dp-%dk' % (height, bitrate), + 'height': height, + 'tbr': bitrate, }) - + else: + f.update({ + 'format_id': '%dp' % height, + 'height': height, + }) + formats.append(f) self._sort_formats(formats) return { From cc8034cc4c52fcbfaf9f8edf34d562c481860193 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:14:39 +0600 Subject: [PATCH 0155/1214] [extremetube] Modernize --- youtube_dl/extractor/extremetube.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3e11e3299..c5677c82b 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -11,12 +11,12 @@ from ..utils import ( class ExtremeTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<url>extremetube\.com/.*?video/.+?(?P<id>[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', 'info_dict': { - 'id': '652431', + 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', @@ -26,12 +26,16 @@ class ExtremeTubeIE(InfoExtractor): }, { 'url': 'http://www.extremetube.com/gay/video/abcde-1234', 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/latina-slut-fucked-by-fat-black-dick', + 'only_matching': True, + }, { + 'url': 'http://www.extremetube.com/video/652431', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.'
+ mobj.group('url') + video_id = self._match_id(url) req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') From f09a767d3198823e5c0ac187a91284c8d2736eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 Nov 2015 19:19:13 +0600 Subject: [PATCH 0156/1214] [mit] Allow external embeds (Closes #7406) --- youtube_dl/extractor/mit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py index f088ab9e2..29ca45778 100644 --- a/youtube_dl/extractor/mit.py +++ b/youtube_dl/extractor/mit.py @@ -86,7 +86,7 @@ class MITIE(TechTVMITIE): webpage = self._download_webpage(url, page_title) embed_url = self._search_regex( r'