From 8bb56eeeea8154f811076c0a9093203fab224003 Mon Sep 17 00:00:00 2001 From: Brian Foley Date: Sat, 2 Jan 2016 19:49:59 +0000 Subject: [PATCH 01/42] [utils] Add extract_attributes for extracting html tag attributes This is much more robust than just using regexps, and handles all the common scenarios, such as empty/no values, repeated attributes, entity decoding, mixed case names, and the different possible value quoting schemes. --- test/test_utils.py | 40 ++++++++++++++++++++++++++++++++++++++++ youtube_dl/compat.py | 6 ++++++ youtube_dl/utils.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 97587ad2f..cb85e18f0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import ( encodeFilename, escape_rfc3986, escape_url, + extract_attributes, ExtractorError, find_xpath_attr, fix_xml_ampersands, @@ -75,6 +76,7 @@ from youtube_dl.utils import ( cli_bool_option, ) from youtube_dl.compat import ( + compat_chr, compat_etree_fromstring, ) @@ -591,6 +593,44 @@ class TestUtil(unittest.TestCase): on = js_to_json('{"abc": "def",}') self.assertEqual(json.loads(on), {'abc': 'def'}) + def test_extract_attributes(self): + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': "a 'b' c"}) + self.assertEqual(extract_attributes(''), {'x': 'a "b" c'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '&'}) # XML + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '&foo'}) + self.assertEqual(extract_attributes(''), {'x': 
"'"}) + self.assertEqual(extract_attributes(''), {'x': '"'}) + self.assertEqual(extract_attributes(''), {'x': None}) + self.assertEqual(extract_attributes(''), {'x': 'y', 'a': None}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'y': '2', 'x': '3'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': 'y'}) + self.assertEqual(extract_attributes(""), {'x': 'y'}) + self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'x': '2'}) + self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) + self.assertEqual(extract_attributes(''), {'x': 'Fáilte 世界 \U0001f600'}) + self.assertEqual(extract_attributes(''), {'x': 'décompose\u0301'}) + # "Narrow" Python builds don't support unicode code points outside BMP. + try: + compat_chr(0x10000) + supports_outside_bmp = True + except ValueError: + supports_outside_bmp = False + if supports_outside_bmp: + self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b497da696..7b9afc36d 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -77,6 +77,11 @@ try: except ImportError: # Python 2 from urllib import urlretrieve as compat_urlretrieve +try: + from html.parser import HTMLParser as compat_HTMLParser +except ImportError: # Python 2 + from HTMLParser import HTMLParser as compat_HTMLParser + try: from subprocess import DEVNULL @@ -540,6 +545,7 @@ else: from tokenize import generate_tokens as compat_tokenize_tokenize __all__ = [ + 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', 'compat_chr', diff --git a/youtube_dl/utils.py 
b/youtube_dl/utils.py index 210c47fce..a0234a3a8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,7 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParser, compat_basestring, compat_chr, compat_etree_fromstring, @@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) +class HTMLAttributeParser(compat_HTMLParser): + """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): + self.attrs = { } + compat_HTMLParser.__init__(self) + + def handle_starttag(self, tag, attrs): + self.attrs = dict(attrs) + +def extract_attributes(html_element): + """Given a string for an HTML element such as + + Decode and return a dictionary of attributes. + { + 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz', + 'empty': '', 'noval': None, 'entity': '&', + 'sq': '"', 'dq': '\'' + }. + NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, + but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. 
+ """ + parser = HTMLAttributeParser() + parser.feed(html_element) + parser.close() + return parser.attrs def clean_html(html): """Clean an HTML snippet into a readable string""" From 48254c3f2cb315c4b9d2b679a6126f1e1208fbf8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 16 Mar 2016 09:14:37 +0100 Subject: [PATCH 02/42] [brightcove] some improvements and fixes - use FFmpeg downloader to download m3u8 formats extracted from BrightcoveNew(some of the m3u8 media playlists use AES-128) - update comment and update_url_query to handle url query --- youtube_dl/extractor/brightcove.py | 32 +++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f56b642ab..304fb89e3 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -24,16 +24,16 @@ from ..utils import ( js_to_json, int_or_none, parse_iso8601, - sanitized_Request, unescapeHTML, unsmuggle_url, + update_url_query, ) class BrightcoveLegacyIE(InfoExtractor): IE_NAME = 'brightcove:legacy' _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P.*)' - _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated' _TESTS = [ { @@ -156,7 +156,7 @@ class BrightcoveLegacyIE(InfoExtractor): # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey - # The three fields hold the id of the video + # These fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer @@ -185,8 +185,7 @@ class BrightcoveLegacyIE(InfoExtractor): @classmethod def _make_brightcove_url(cls, params): - data = compat_urllib_parse.urlencode(params) - return cls._FEDERATED_URL_TEMPLATE % data + 
return update_url_query(cls._FEDERATED_URL, params) @classmethod def _extract_brightcove_url(cls, webpage): @@ -240,7 +239,7 @@ class BrightcoveLegacyIE(InfoExtractor): # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) return self._get_video_info( - videoPlayer[0], query_str, query, referer=referer) + videoPlayer[0], query, referer=referer) elif 'playerKey' in query: player_key = query['playerKey'] return self._get_playlist_info(player_key[0]) @@ -249,15 +248,14 @@ class BrightcoveLegacyIE(InfoExtractor): 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?', expected=True) - def _get_video_info(self, video_id, query_str, query, referer=None): - request_url = self._FEDERATED_URL_TEMPLATE % query_str - req = sanitized_Request(request_url) + def _get_video_info(self, video_id, query, referer=None): + headers = {} linkBase = query.get('linkBaseURL') if linkBase is not None: referer = linkBase[0] if referer is not None: - req.add_header('Referer', referer) - webpage = self._download_webpage(req, video_id) + headers['Referer'] = referer + webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query) error_msg = self._html_search_regex( r"

We're sorry.

([\s\n]*

.*?

)+", webpage, @@ -459,12 +457,11 @@ class BrightcoveNewIE(InfoExtractor): r'policyKey\s*:\s*(["\'])(?P.+?)\1', webpage, 'policy key', group='pk') - req = sanitized_Request( - 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' - % (account_id, video_id), - headers={'Accept': 'application/json;pk=%s' % policy_key}) + api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) try: - json_data = self._download_json(req, video_id) + json_data = self._download_json(api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: json_data = self._parse_json(e.cause.read().decode(), video_id) @@ -482,8 +479,7 @@ class BrightcoveNewIE(InfoExtractor): if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', m3u8_id='hls', fatal=False)) elif source_type == 'application/dash+xml': if not src: continue From 23edc49509052e06afe7032802a0f4deb6710b47 Mon Sep 17 00:00:00 2001 From: remitamine Date: Wed, 16 Mar 2016 10:47:39 +0100 Subject: [PATCH 03/42] [tv3] Add new extractor(closes #8059) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/tv3.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 youtube_dl/extractor/tv3.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bf9fa17c9..9502d07a4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -784,6 +784,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv3 import TV3IE from .tv4 import TV4IE from .tvc import ( TVCIE, diff --git a/youtube_dl/extractor/tv3.py b/youtube_dl/extractor/tv3.py new file mode 100644 index 000000000..d3f690dc7 --- /dev/null +++ b/youtube_dl/extractor/tv3.py @@ -0,0 +1,33 @@ +# coding: utf-8 
+from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TV3IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx' + _TEST = { + 'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx', + 'info_dict': { + 'id': '4659127992001', + 'ext': 'mp4', + 'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3', + 'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.', + 'uploader_id': '3812193411001', + 'upload_date': '20151213', + 'timestamp': 1449975272, + }, + 'expected_warnings': [ + 'Failed to download MPD manifest' + ], + 'params': { + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + brightcove_id = self._search_regex(r' Date: Wed, 16 Mar 2016 11:46:53 +0100 Subject: [PATCH 04/42] [brightcove:new] extract protocol-less embed URLs(closes #2914) --- youtube_dl/extractor/brightcove.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 304fb89e3..3ab383461 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -413,8 +413,8 @@ class BrightcoveNewIE(InfoExtractor): # Look for iframe embeds [1] for _, url in re.findall( - r']+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): - entries.append(url) + r']+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): + entries.append(url if url.startswith('http') else 'http:' + url) # Look for embed_in_page embeds [2] for video_id, 
account_id, player_id, embed in re.findall( From a7ba57dc176efaa50b5121a1f63963f4fc0111e7 Mon Sep 17 00:00:00 2001 From: Quan Hua Date: Wed, 16 Mar 2016 10:15:39 +0700 Subject: [PATCH 05/42] [udemy] Update course id regex to cover v4 layout (Closes #8753, closes #8868, closes #8870) --- youtube_dl/extractor/udemy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index f5b5e7fd6..0fd2a0a0a 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -144,7 +144,8 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( - r'data-course-id=["\'](\d+)', webpage, 'course id') + (r'data-course-id=["\'](\d+)', r'"id": (\d+)'), + webpage, 'course id') try: lecture = self._download_lecture(course_id, lecture_id) From 70cab344c48598904fde657620156be62b70ee0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Mar 2016 21:46:09 +0600 Subject: [PATCH 06/42] [udemy] Improve course id v4 regex --- youtube_dl/extractor/udemy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 0fd2a0a0a..74cc36ece 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -144,7 +144,7 @@ class UdemyIE(InfoExtractor): webpage = self._download_webpage(url, lecture_id) course_id = self._search_regex( - (r'data-course-id=["\'](\d+)', r'"id": (\d+)'), + (r'data-course-id=["\'](\d+)', r'"id"\s*:\s*(\d+)'), webpage, 'course id') try: From 96f4f796fb02e3ef13fa6584b8f77ebafaabf59f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Mar 2016 21:47:51 +0600 Subject: [PATCH 07/42] [brightcover] Remove unused import --- youtube_dl/extractor/brightcove.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 
3ab383461..59e8008f9 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -9,7 +9,6 @@ from ..compat import ( compat_etree_fromstring, compat_parse_qs, compat_str, - compat_urllib_parse, compat_urllib_parse_urlparse, compat_urlparse, compat_xml_parse_error, From c5229f3926d64bce101d328fc5acf25bda83e0d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 16 Mar 2016 21:50:04 +0600 Subject: [PATCH 08/42] [utils] PEP 8 --- test/test_utils.py | 6 +++--- youtube_dl/utils.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5a0109977..9a3a8ddff 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -641,8 +641,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(''), {'x': '&'}) # XML self.assertEqual(extract_attributes(''), {'x': '"'}) - self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 - self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 + self.assertEqual(extract_attributes(''), {'x': '£'}) # HTML 3.2 + self.assertEqual(extract_attributes(''), {'x': 'λ'}) # HTML 4.0 self.assertEqual(extract_attributes(''), {'x': '&foo'}) self.assertEqual(extract_attributes(''), {'x': "'"}) self.assertEqual(extract_attributes(''), {'x': '"'}) @@ -654,7 +654,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) self.assertEqual(extract_attributes(''), {'x': '\ny\n'}) - self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased + self.assertEqual(extract_attributes(''), {'caps': 'x'}) # Names lowercased self.assertEqual(extract_attributes(''), {'x': '2'}) self.assertEqual(extract_attributes(''), {'x': '2'}) self.assertEqual(extract_attributes(''), {'_:funny-name1': '1'}) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 
ec186918c..8ec1bd469 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -273,15 +273,17 @@ def get_element_by_attribute(attribute, value, html): return unescapeHTML(res) + class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" def __init__(self): - self.attrs = { } + self.attrs = {} compat_HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): self.attrs = dict(attrs) + def extract_attributes(html_element): """Given a string for an HTML element such as Date: Wed, 16 Mar 2016 21:26:25 +0100 Subject: [PATCH 09/42] [bravotv] Add new extractor(#4657) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/bravotv.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 youtube_dl/extractor/bravotv.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 9502d07a4..725ebec04 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -81,6 +81,7 @@ from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bpb import BpbIE from .br import BRIE +from .bravotv import BravoTVIE from .breakcom import BreakIE from .brightcove import ( BrightcoveLegacyIE, diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py new file mode 100644 index 000000000..69d00b466 --- /dev/null +++ b/youtube_dl/extractor/bravotv.py @@ -0,0 +1,28 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import smuggle_url + + +class BravoTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P[^/?]+)' + _TEST = { + 'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale', + 'md5': 'd60cdf68904e854fac669bd26cccf801', + 'info_dict': { + 'id': 'LitrBdX64qLn', + 'ext': 'mp4', + 'title': 'Last Chance Kitchen Returns', + 'description': 'S13: Last Chance Kitchen Returns 
for Top Chef Season 13', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') + release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') + return self.url_result(smuggle_url( + 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true&switch=progressive' % (account_pid, release_pid), + {'force_smil_url': True}), 'ThePlatform', release_pid) From a646a8cf980a946cfc15d2286fcec6ee3987886f Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 02:02:18 +0100 Subject: [PATCH 10/42] [sbs] improve extraction(fixes #3811) - extract error messages - force the platform smil url(previously the manifest param in the query is not respected which make theplatform return non working mp4 files for some videos) --- youtube_dl/extractor/sbs.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/sbs.py b/youtube_dl/extractor/sbs.py index d6ee2d9e2..2f96477ca 100644 --- a/youtube_dl/extractor/sbs.py +++ b/youtube_dl/extractor/sbs.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + smuggle_url, + ExtractorError, +) class SBSIE(InfoExtractor): @@ -31,21 +35,28 @@ class SBSIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + player_params = self._download_json( + 'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id) - webpage = self._download_webpage( - 'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id) - - player_params = self._parse_json( - self._search_regex( - r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'), - video_id) + error = player_params.get('error') + if error: + error_message = 'Sorry, The video you are looking for does not exist.' 
+ video_data = error.get('results') or {} + error_code = error.get('errorCode') + if error_code == 'ComingSoon': + error_message = '%s is not yet available.' % video_data.get('title', '') + elif error_code in ('Forbidden', 'intranetAccessOnly'): + error_message = 'Sorry, This video cannot be accessed via this website' + elif error_code == 'Expired': + error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '') + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) urls = player_params['releaseUrls'] - theplatform_url = (urls.get('progressive') or urls.get('standard') or - urls.get('html') or player_params['relatedItemsURL']) + theplatform_url = (urls.get('progressive') or urls.get('html') or + urls.get('standard') or player_params['relatedItemsURL']) return { '_type': 'url_transparent', 'id': video_id, - 'url': theplatform_url, + 'url': smuggle_url(theplatform_url, {'force_smil_url': True}), } From 11f12195af73a2b0a09de928247cb87aed6dd693 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 17 Mar 2016 19:25:37 +0800 Subject: [PATCH 11/42] [youtube] Added itag 91 Seen in https://www.youtube.com/watch?v=jMN4cxyhJjk --- youtube_dl/extractor/youtube.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27e67feb4..466f5da2e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -309,6 +309,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20}, # Apple HTTP Live Streaming + '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10}, '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 
128, 'vcodec': 'h264', 'preference': -10}, '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10}, From 0436ec0e7a4683539bc7844511ba76fbcab03f7b Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:05:31 +0100 Subject: [PATCH 12/42] [once] Add new format extractor --- youtube_dl/extractor/once.py | 37 ++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/once.py diff --git a/youtube_dl/extractor/once.py b/youtube_dl/extractor/once.py new file mode 100644 index 000000000..403f8c0af --- /dev/null +++ b/youtube_dl/extractor/once.py @@ -0,0 +1,37 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class OnceIE(InfoExtractor): + _VALID_URL = r'https?://once\.unicornmedia\.com/now/[^/]+/[^/]+/(?P[^/]+)/(?P[^/]+)/(?:[^/]+/)?(?P[^/]+)/content\.(?:once|m3u8|mp4)' + ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' + PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' + + def _extract_once_formats(self, url): + domain_id, application_id, media_item_id = re.match( + OnceIE._VALID_URL, url).groups() + adaptive_formats = self._extract_m3u8_formats( + self.ADAPTIVE_URL_TEMPLATE % ( + domain_id, application_id, media_item_id), + media_item_id, 'mp4', m3u8_id='hls', fatal=False) + formats = [] + formats.extend(adaptive_formats) + for adaptive_format in adaptive_formats: + rendition_id = self._search_regex( + r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', + adaptive_format['url'], 'redition id', default=None) + if rendition_id: + progressive_format = adaptive_format.copy() + progressive_format.update({ + 'url': self.PROGRESSIVE_URL_TEMPLATE % ( + domain_id, application_id, rendition_id, media_item_id), + 'format_id': adaptive_format['format_id'].replace( + 'hls', 'http'), + 'protocol': 
'http', + }) + formats.append(progressive_format) + return formats From 9f02ff537c6ddfd3f1ea3586f3e44f0ec07a2aea Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:06:25 +0100 Subject: [PATCH 13/42] [theplatform] extract brightcove once formats --- youtube_dl/extractor/theplatform.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a57b49df..ffe7c57ad 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -8,13 +8,12 @@ import binascii import hashlib -from .common import InfoExtractor +from .once import OnceIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, ) from ..utils import ( - determine_ext, ExtractorError, float_or_none, int_or_none, @@ -29,7 +28,7 @@ default_ns = 'http://www.w3.org/2005/SMIL21/Language' _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) -class ThePlatformBaseIE(InfoExtractor): +class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note) error_element = find_xpath_attr( @@ -38,17 +37,19 @@ class ThePlatformBaseIE(InfoExtractor): if error_element is not None: raise ExtractorError(error_element.attrib['abstract'], expected=True) - formats = self._parse_smil_formats( + smil_formats = self._parse_smil_formats( meta, smil_url, video_id, namespace=default_ns, # the parameters are from syfy.com, other sites may use others, # they also work for nbc.com f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'}, transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src)) - for _format in formats: - ext = determine_ext(_format['url']) - if ext == 'once': - _format['ext'] = 'mp4' + formats = [] + for _format in smil_formats: + if OnceIE.suitable(_format['url']): + formats.extend(self._extract_once_formats(_format['url'])) + else: + 
formats.append(_format) self._sort_formats(formats) @@ -125,7 +126,7 @@ class ThePlatformIE(ThePlatformBaseIE): 'only_matching': True, }, { 'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701', - 'md5': '734f3790fb5fc4903da391beeebc4836', + 'md5': 'fb96bb3d85118930a5b055783a3bd992', 'info_dict': { 'id': 'tdy_or_siri_150701', 'ext': 'mp4', @@ -135,7 +136,6 @@ class ThePlatformIE(ThePlatformBaseIE): 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1435752600, 'upload_date': '20150701', - 'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"], }, }, { # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 @@ -250,7 +250,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): _TEST = { # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207 'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207', - 'md5': '22d2b84f058d3586efcd99e57d59d314', + 'md5': '6e32495b5073ab414471b615c5ded394', 'info_dict': { 'id': 'n_hardball_5biden_140207', 'ext': 'mp4', From 574b2a7393ef389792d5010704e505ef0eaaa5e8 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 17 Mar 2016 16:07:36 +0100 Subject: [PATCH 14/42] [nbc:nbcnews] improve extraction(fixes #6922) - extract more metadata and formats - relax regex --- youtube_dl/extractor/nbc.py | 137 ++++++++++++++++++++++++------------ 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 2202cfa33..bb0817e34 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from .theplatform import ThePlatformIE from ..utils import ( - ExtractorError, find_xpath_attr, 
lowercase_escape, smuggle_url, unescapeHTML, + update_url_query, + int_or_none, + HEADRequest, + parse_iso8601, ) @@ -131,10 +134,10 @@ class NBCSportsIE(InfoExtractor): NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer') -class NBCNewsIE(InfoExtractor): +class NBCNewsIE(ThePlatformIE): _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/ (?:video/.+?/(?P\d+)| - (?:watch|feature|nightly-news)/[^/]+/(?P.+)) + ([^/]+/)*(?P<display_id>[^/?]+)) ''' _TESTS = [ @@ -149,15 +152,14 @@ class NBCNewsIE(InfoExtractor): }, }, { - 'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', - 'md5': 'b2421750c9f260783721d898f4c42063', + 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', + 'md5': 'af1adfa51312291a017720403826bb64', 'info_dict': { - 'id': 'I1wpAI_zmhsQ', + 'id': '269389891880', 'ext': 'mp4', 'title': 'How Twitter Reacted To The Snowden Interview', 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', }, - 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', @@ -168,17 +170,29 @@ class NBCNewsIE(InfoExtractor): 'title': 'FULL EPISODE: Family Business', 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', }, + 'skip': 'This page is unavailable.', }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', + 'md5': '73135a2e0ef819107bbb55a5a9b2a802', 'info_dict': { - 'id': 'sekXqyTVnmN3', + 'id': '394064451844', 'ext': 'mp4', 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', }, }, + { + 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', + 'md5': 'a49e173825e5fcd15c13fc297fced39d', + 'info_dict': { 
+ 'id': '529953347624', + 'ext': 'mp4', + 'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'', + 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', + }, + 'expected_warnings': ['http-6000 is not available'] + }, { 'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952', 'only_matching': True, @@ -202,49 +216,80 @@ class NBCNewsIE(InfoExtractor): } else: # "feature" and "nightly-news" pages use theplatform.com - title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + info = None bootstrap_json = self._search_regex( - r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', - webpage, 'bootstrap json', flags=re.MULTILINE) - bootstrap = self._parse_json(bootstrap_json, video_id) - info = bootstrap['results'][0]['video'] - mpxid = info['mpxId'] + r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', + webpage, 'bootstrap json', default=None) + if bootstrap_json: + bootstrap = self._parse_json(bootstrap_json, display_id) + info = bootstrap['results'][0]['video'] + else: + player_instance_json = self._search_regex( + r'videoObj\s*:\s*({.+})', webpage, 'player instance') + info = self._parse_json(player_instance_json, display_id) + video_id = info['mpxId'] + title = info['title'] - base_urls = [ - info['fallbackPlaylistUrl'], - info['associatedPlaylistUrl'], - ] + subtitles = {} + caption_links = info.get('captionLinks') + if caption_links: + for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')): + sub_url = caption_links.get(sub_key) + if sub_url: + subtitles.setdefault('en', []).append({ + 'url': sub_url, + 'ext': sub_ext, + }) - for base_url in base_urls: - if not base_url: + formats = [] + for video_asset in info['videoAssets']: + video_url = video_asset.get('publicUrl') + if not video_url: continue - playlist_url = base_url + '?form=MPXNBCNewsAPI' - - try: - 
all_videos = self._download_json(playlist_url, title) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError): - continue - raise - - if not all_videos or 'videos' not in all_videos: + container = video_asset.get('format') + asset_type = video_asset.get('assetType') or '' + if container == 'ISM' or asset_type == 'FireTV-Once': continue - - try: - info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid) - break - except StopIteration: - continue - - if info is None: - raise ExtractorError('Could not find video in playlists') + elif asset_type == 'OnceURL': + tp_formats, tp_subtitles = self._extract_theplatform_smil( + video_url, video_id) + formats.extend(tp_formats) + subtitles = self._merge_subtitles(subtitles, tp_subtitles) + else: + tbr = int_or_none(video_asset.get('bitRate'), 1000) + format_id = 'http%s' % ('-%d' % tbr if tbr else '') + video_url = update_url_query( + video_url, {'format': 'redirect'}) + # resolve the url so that we can check availability and detect the correct extension + head = self._request_webpage( + HEADRequest(video_url), video_id, + 'Checking %s url' % format_id, + '%s is not available' % format_id, + fatal=False) + if head: + video_url = head.geturl() + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'width': int_or_none(video_asset.get('width')), + 'height': int_or_none(video_asset.get('height')), + 'tbr': tbr, + 'container': video_asset.get('format'), + }) + self._sort_formats(formats) return { - '_type': 'url', - # We get the best quality video - 'url': info['videoAssets'][-1]['publicUrl'], - 'ie_key': 'ThePlatform', + 'id': video_id, + 'title': title, + 'description': info.get('description'), + 'thumbnail': info.get('description'), + 'thumbnail': info.get('thumbnail'), + 'duration': int_or_none(info.get('duration')), + 'timestamp': parse_iso8601(info.get('pubDate')), + 'formats': formats, + 'subtitles': subtitles, } From cf45ed786e580999afe864724c3b7d16abadb4e1 Mon Sep 17 00:00:00 2001 
From: remitamine <remitamine@gmail.com> Date: Thu, 17 Mar 2016 17:48:17 +0100 Subject: [PATCH 15/42] [wistia] extract more metadata --- youtube_dl/extractor/wistia.py | 42 ++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index 41061dd31..8b14840a2 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, sanitized_Request, + int_or_none, ) @@ -18,6 +19,9 @@ class WistiaIE(InfoExtractor): 'id': 'sh7fpupwlt', 'ext': 'mov', 'title': 'Being Resourceful', + 'description': 'a Clients From Hell Video Series video from worldwidewebhosting', + 'upload_date': '20131204', + 'timestamp': 1386185018, 'duration': 117, }, } @@ -32,35 +36,43 @@ class WistiaIE(InfoExtractor): raise ExtractorError('Error while getting the playlist', expected=True) data = data_json['media'] + title = data['name'] formats = [] thumbnails = [] for a in data['assets']: + astatus = a.get('status') atype = a.get('type') - if atype == 'still': + if (astatus is not None and astatus != 2) or atype == 'preview': + continue + elif atype in ('still', 'still_image'): thumbnails.append({ 'url': a['url'], 'resolution': '%dx%d' % (a['width'], a['height']), }) - continue - if atype == 'preview': - continue - formats.append({ - 'format_id': atype, - 'url': a['url'], - 'width': a['width'], - 'height': a['height'], - 'filesize': a['size'], - 'ext': a['ext'], - 'preference': 1 if atype == 'original' else None, - }) + else: + formats.append({ + 'format_id': atype, + 'url': a['url'], + 'tbr': int_or_none(a.get('bitrate')), + 'vbr': int_or_none(a.get('opt_vbitrate')), + 'width': int_or_none(a.get('width')), + 'height': int_or_none(a.get('height')), + 'filesize': int_or_none(a.get('size')), + 'vcodec': a.get('codec'), + 'container': a.get('container'), + 'ext': a.get('ext'), + 'preference': 1 if atype == 
'original' else None, + }) self._sort_formats(formats) return { 'id': video_id, - 'title': data['name'], + 'title': title, + 'description': data.get('seoDescription'), 'formats': formats, 'thumbnails': thumbnails, - 'duration': data.get('duration'), + 'duration': int_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('createdAt')), } From cc162f6a0aa63a3e050c55cec9da728aa2cb9100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Mar 2016 22:55:04 +0600 Subject: [PATCH 16/42] [crunchyroll] Fix custom _download_webpage (Closes #8883) --- youtube_dl/extractor/crunchyroll.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index c7032ffa2..85fa7a725 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor): def _real_initialize(self): self._login() - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, *args, **kwargs): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else sanitized_Request(url_or_request)) # Accept-Language must be set explicitly to accept any language to avoid issues @@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor): # Crunchyroll to not work in georestriction cases in some browsers that don't place # the locale lang first in header. However allowing any language seems to workaround the issue. 
request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage( - request, video_id, note, errnote, fatal, tries, timeout, encoding) + return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) @staticmethod def _add_skip_wall(url): From 8c97e7efb6ab273f0b7c91f0aa9ac6869c911bf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Mar 2016 23:43:14 +0600 Subject: [PATCH 17/42] [animeondemand] Expand episode title regex (Closes #8875) --- youtube_dl/extractor/animeondemand.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index a7d8daf7b..3dbbe2a62 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -93,7 +93,7 @@ class AnimeOnDemandIE(InfoExtractor): for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage): m = re.search( - r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html) + r'class="episodebox-title"[^>]+title="(?:Episode|Film)\s*(?P<number>\d+)\s*-\s*(?P<title>.+?)"', episode_html) if not m: continue From b57fecfdddc78b5ef5cfd1c3302f7b79ab1bf64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 17 Mar 2016 23:50:10 +0600 Subject: [PATCH 18/42] [animeondemand] Add test --- youtube_dl/extractor/animeondemand.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 3dbbe2a62..6cb3a84f9 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -18,7 +18,7 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' - 
_TEST = { + _TESTS = [{ 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -26,7 +26,11 @@ class AnimeOnDemandIE(InfoExtractor): 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', }, 'playlist_mincount': 4, - } + }, { + # Film wording is used instead of Episode + 'url': 'https://www.anime-on-demand.de/anime/39', + 'only_matching': True, + }] def _login(self): (username, password) = self._get_login_info() From 85e8f26b827e77cbed6a83268787d450ab2bea3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 00:02:34 +0600 Subject: [PATCH 19/42] [animeondemand] Improve extraction --- youtube_dl/extractor/animeondemand.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 6cb3a84f9..3631c2451 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -30,6 +30,10 @@ class AnimeOnDemandIE(InfoExtractor): # Film wording is used instead of Episode 'url': 'https://www.anime-on-demand.de/anime/39', 'only_matching': True, + }, { + # Episodes without titles + 'url': 'https://www.anime-on-demand.de/anime/162', + 'only_matching': True, }] def _login(self): @@ -95,14 +99,22 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage): - m = re.search( - r'class="episodebox-title"[^>]+title="(?:Episode|Film)\s*(?P<number>\d+)\s*-\s*(?P<title>.+?)"', episode_html) - if not m: + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage)): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title="(.+?)"', + r'class="episodebox-title"[^>]+>(.+?)<'), + webpage, 'episodebox title', default=None) + if not episodebox_title: continue - episode_number = 
int(m.group('number')) - episode_title = m.group('title') + episode_number = int(self._search_regex( + r'^(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(?P<title>.+?)', + episodebox_title, 'episode title', default=None) + video_id = 'episode-%d' % episode_number common_info = { From 0d0e282912a7ade43a148518c742557c310a41a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 00:12:34 +0600 Subject: [PATCH 20/42] [animeondemand] Fix typo and improve --- youtube_dl/extractor/animeondemand.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 3631c2451..0158407f6 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -100,19 +100,19 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage)): + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title="(.+?)"', - r'class="episodebox-title"[^>]+>(.+?)<'), - webpage, 'episodebox title', default=None) + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') if not episodebox_title: continue episode_number = int(self._search_regex( - r'^(?:Episode|Film)\s*(\d+)', + r'(?:Episode|Film)\s*(\d+)', episodebox_title, 'episode number', default=num)) episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(?P<title>.+?)', + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', episodebox_title, 'episode title', default=None) video_id = 'episode-%d' % episode_number From 
57f7e3c62df187457a057be88fca43136f4c507f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 02:51:38 +0600 Subject: [PATCH 21/42] [compat] Add compat_xpath --- youtube_dl/compat.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 74702786a..dbb91a6ef 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -256,6 +256,16 @@ else: el.text = el.text.decode('utf-8') return doc +if sys.version_info < (2, 7): + # Here comes the crazy part: In 2.6, if the xpath is a unicode, + # .//node does not match if a node is a direct child of . ! + def compat_xpath(xpath): + if isinstance(xpath, compat_str): + xpath = xpath.encode('ascii') + return xpath +else: + compat_xpath = lambda xpath: xpath + try: from urllib.parse import parse_qs as compat_parse_qs except ImportError: # Python 2 @@ -585,6 +595,7 @@ __all__ = [ 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', + 'compat_xpath', 'shlex_quote', 'subprocess_check_output', 'workaround_optparse_bug9161', From 810c10baa1e0177a6a0ef39496f7e972db02d806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 02:52:23 +0600 Subject: [PATCH 22/42] [utils] Use compat_xpath --- youtube_dl/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8ec1bd469..ef6e7c7cb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -50,6 +50,7 @@ from .compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xpath, shlex_quote, ) @@ -165,12 +166,7 @@ if sys.version_info >= (2, 7): return node.find(expr) else: def find_xpath_attr(node, xpath, key, val=None): - # Here comes the crazy part: In 2.6, if the xpath is a unicode, - # .//node does not match if a node is a direct child of . ! 
- if isinstance(xpath, compat_str): - xpath = xpath.encode('ascii') - - for f in node.findall(xpath): + for f in node.findall(compat_xpath(xpath)): if key not in f.attrib: continue if val is None or f.attrib.get(key) == val: @@ -195,9 +191,7 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - if sys.version_info < (2, 7): # Crazy 2.6 - xpath = xpath.encode('ascii') - return node.find(xpath) + return node.find(compat_xpath(xpath)) if isinstance(xpath, (str, compat_str)): n = _find_xpath(xpath) From e3d17b3c07c6d8bc7fd45af1e45523e8fde5fb58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 02:54:27 +0600 Subject: [PATCH 23/42] [noz] Fix extraction on python 2.6 by means of using compat_xpath --- youtube_dl/extractor/noz.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/noz.py b/youtube_dl/extractor/noz.py index 656443c49..c47a33d15 100644 --- a/youtube_dl/extractor/noz.py +++ b/youtube_dl/extractor/noz.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_xpath, +) from ..utils import ( int_or_none, find_xpath_attr, @@ -47,7 +50,7 @@ class NozIE(InfoExtractor): duration = int_or_none(xpath_text( doc, './/article/movie/file/duration')) formats = [] - for qnode in doc.findall('.//article/movie/file/qualities/qual'): + for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')): http_url_ele = find_xpath_attr( qnode, './html_urls/video_url', 'format', 'video/mp4') http_url = http_url_ele.text if http_url_ele is not None else None From 4c92fd2e835cde89866d3dfb1fc05d23196b19db Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 09:21:21 +0100 Subject: [PATCH 24/42] [theplatform] 
always force theplatform to return a smil for _extract_theplatform_smil --- youtube_dl/extractor/bravotv.py | 2 +- youtube_dl/extractor/cbsnews.py | 2 +- youtube_dl/extractor/cnet.py | 2 +- youtube_dl/extractor/nationalgeographic.py | 2 +- youtube_dl/extractor/theplatform.py | 10 ++++++---- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/bravotv.py b/youtube_dl/extractor/bravotv.py index 69d00b466..34d451f38 100644 --- a/youtube_dl/extractor/bravotv.py +++ b/youtube_dl/extractor/bravotv.py @@ -24,5 +24,5 @@ class BravoTVIE(InfoExtractor): account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid') release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid') return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/%s/%s?format=SMIL&mbr=true&switch=progressive' % (account_pid, release_pid), + 'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid), {'force_smil_url': True}), 'ThePlatform', release_pid) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 7319ee1b7..8ddcc5097 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -78,7 +78,7 @@ class CBSNewsIE(ThePlatformIE): pid = item.get('media' + format_id) if not pid: continue - release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid + release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid) formats.extend(tp_formats) subtitles = self._merge_subtitles(subtitles, tp_subtitles) diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py index 3cf0bf95b..c154b3e19 100644 --- a/youtube_dl/extractor/cnet.py +++ b/youtube_dl/extractor/cnet.py @@ -60,7 +60,7 @@ class CNETIE(ThePlatformIE): for (fkey, vid) in vdata['files'].items(): if fkey == 
'hls_phone' and 'hls_tablet' in vdata['files']: continue - release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid + release_url = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' % vid if fkey == 'hds': release_url += '&manifest=f4m' tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 6fc9e7b05..7ce8d9b18 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -48,7 +48,7 @@ class NationalGeographicIE(InfoExtractor): theplatform_id = url_basename(content.attrib.get('url')) return self.url_result(smuggle_url( - 'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id, + 'http://link.theplatform.com/s/ngs/%s?formats=MPEG4&manifest=f4m' % theplatform_id, # For some reason, the normal links don't work and we must force # the use of f4m {'force_smil_url': True})) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index ffe7c57ad..a148f78ce 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -22,6 +22,7 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, + update_url_query, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -30,6 +31,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): + smil_url = update_url_query(smil_url, {'format': 'SMIL'}) meta = self._download_xml(smil_url, video_id, note=note) error_element = find_xpath_attr( meta, _x('.//smil:ref'), 'src', @@ -213,7 +215,7 @@ class ThePlatformIE(ThePlatformBaseIE): webpage, 'smil url', group='url') path = self._search_regex( r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path') - 
smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL' + smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4' elif mobj.group('config'): config_url = url + '&form=json' config_url = config_url.replace('swf/', 'config/') @@ -223,9 +225,9 @@ class ThePlatformIE(ThePlatformBaseIE): release_url = config['releaseUrl'] else: release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path - smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m' + smil_url = release_url + '&formats=MPEG4&manifest=f4m' else: - smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path sig = smuggled_data.get('sig') if sig: @@ -280,7 +282,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): first_video_id = None duration = None for item in entry['media$content']: - smil_url = item['plfile$url'] + '&format=SMIL&mbr=true' + smil_url = item['plfile$url'] + '&mbr=true' cur_video_id = ThePlatformIE._match_id(smil_url) if first_video_id is None: first_video_id = cur_video_id From 87c03c6bd22e99d6410c907128ab872e79df1560 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 09:43:28 +0100 Subject: [PATCH 25/42] [theplatform] remove unnecessary import --- youtube_dl/extractor/theplatform.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index a148f78ce..2230dfe02 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -22,7 +22,6 @@ from ..utils import ( xpath_with_ns, mimetype2ext, find_xpath_attr, - update_url_query, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -31,8 +30,7 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): - smil_url = 
update_url_query(smil_url, {'format': 'SMIL'}) - meta = self._download_xml(smil_url, video_id, note=note) + meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) error_element = find_xpath_attr( meta, _x('.//smil:ref'), 'src', 'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') From 0d33166ec586b9c75e20835adca927e923cb36e3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 18 Mar 2016 11:43:48 +0100 Subject: [PATCH 26/42] release 2016.03.18 --- docs/supportedsites.md | 4 ++++ youtube_dl/version.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a6dcc2576..3415efc45 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -81,6 +81,7 @@ - **BokeCC** - **Bpb**: Bundeszentrale für politische Bildung - **BR**: Bayerischer Rundfunk Mediathek + - **BravoTV** - **Break** - **brightcove:legacy** - **brightcove:new** @@ -499,6 +500,7 @@ - **Restudy** - **ReverbNation** - **Revision3** + - **RICE** - **RingTV** - **RottenTomatoes** - **Roxwel** @@ -617,6 +619,7 @@ - **ThePlatform** - **ThePlatformFeed** - **TheSixtyOne** + - **TheStar** - **ThisAmericanLife** - **ThisAV** - **THVideo** @@ -650,6 +653,7 @@ - **tv.dfb.de** - **TV2** - **TV2Article** + - **TV3** - **TV4**: tv4.se and tv4play.se - **TVC** - **TVCArticle** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 9216fa547..6b2c5fac9 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.03.14' +__version__ = '2016.03.18' From 61870915323abd126f5440282b1fd5734ee1ce6f Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 11:50:04 +0100 Subject: [PATCH 27/42] [once] check http formats availability --- youtube_dl/extractor/once.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/once.py 
b/youtube_dl/extractor/once.py index 403f8c0af..080045d4c 100644 --- a/youtube_dl/extractor/once.py +++ b/youtube_dl/extractor/once.py @@ -14,13 +14,12 @@ class OnceIE(InfoExtractor): def _extract_once_formats(self, url): domain_id, application_id, media_item_id = re.match( OnceIE._VALID_URL, url).groups() - adaptive_formats = self._extract_m3u8_formats( + formats = self._extract_m3u8_formats( self.ADAPTIVE_URL_TEMPLATE % ( domain_id, application_id, media_item_id), media_item_id, 'mp4', m3u8_id='hls', fatal=False) - formats = [] - formats.extend(adaptive_formats) - for adaptive_format in adaptive_formats: + progressive_formats = [] + for adaptive_format in formats: rendition_id = self._search_regex( r'/now/media/playlist/[^/]+/[^/]+/([^/]+)', adaptive_format['url'], 'redition id', default=None) @@ -33,5 +32,7 @@ class OnceIE(InfoExtractor): 'hls', 'http'), 'protocol': 'http', }) - formats.append(progressive_format) + progressive_formats.append(progressive_format) + self._check_formats(progressive_formats, media_item_id) + formats.extend(progressive_formats) return formats From cae21032ab38f404a9959e6b28984b960e579fb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:08:25 +0600 Subject: [PATCH 28/42] [theplatform] Improve geo restriction detection --- youtube_dl/extractor/theplatform.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 2230dfe02..863914299 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -31,10 +31,9 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns}) class ThePlatformBaseIE(OnceIE): def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'): meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'}) - error_element = find_xpath_attr( - meta, _x('.//smil:ref'), 'src', - 
'http://link.theplatform.com/s/errorFiles/Unavailable.mp4') - if error_element is not None: + error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src') + if error_element is not None and error_element.attrib['src'].startswith( + 'http://link.theplatform.com/s/errorFiles/Unavailable.'): raise ExtractorError(error_element.attrib['abstract'], expected=True) smil_formats = self._parse_smil_formats( From 263eff9537c73caa9bff42b1e675043eaa124f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:50:10 +0600 Subject: [PATCH 29/42] [extractor/generic] Properly extract format id from Content-Type Fixes extraction for cases like: audio/x-mpegURL; charset=utf-8 --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8121f04a5..b75db1252 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1249,7 +1249,7 @@ class GenericIE(InfoExtractor): # Check for direct link to a video content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type) + m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: upload_date = unified_strdate( head_response.headers.get('Last-Modified')) From 955737b2d40c0ce947c13659a27aae0c41077c65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:50:44 +0600 Subject: [PATCH 30/42] [extractor/generic] Force Content-Type to lowecase --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b75db1252..cce7799e2 100644 --- a/youtube_dl/extractor/generic.py +++ 
b/youtube_dl/extractor/generic.py @@ -1248,7 +1248,7 @@ class GenericIE(InfoExtractor): } # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') + content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: upload_date = unified_strdate( From 20938f768b16c945c6041ba3c0a7ae1a4e790881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 21:54:33 +0600 Subject: [PATCH 31/42] [extractor/generic] Add another test for generic m3u8 --- youtube_dl/extractor/generic.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index cce7799e2..62b51e84e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -239,6 +239,20 @@ class GenericIE(InfoExtractor): 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 303dcdb99505b29ef4c499cc395ab9ec90c07ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:41:16 +0600 Subject: [PATCH 32/42] [extractor/generic] Simplify upload_date extraction --- youtube_dl/extractor/generic.py | 
7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 62b51e84e..a2e7ba5ad 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1259,14 +1259,13 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) } # Check for direct link to a video content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) format_id = m.group('format_id') if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') @@ -1281,7 +1280,6 @@ class GenericIE(InfoExtractor): info_dict.update({ 'direct': True, 'formats': formats, - 'upload_date': upload_date, }) return info_dict @@ -1309,12 +1307,9 @@ class GenericIE(InfoExtractor): if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, }) return info_dict From de6c51e88eb61b49a95ccfcfa82547c2172eb52b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:43:07 +0600 Subject: [PATCH 33/42] [extractor/generic] Fix direct link semantics --- youtube_dl/extractor/generic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a2e7ba5ad..5649e26da 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1277,10 +1277,8 @@ class 
GenericIE(InfoExtractor): 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - info_dict.update({ - 'direct': True, - 'formats': formats, - }) + info_dict['direct'] = True + info_dict['formats'] = formats return info_dict if not self._downloader.params.get('test', False) and not is_intentional: From 5940862d5a75ae45a640e0ce3104dd18c9864e26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:45:28 +0600 Subject: [PATCH 34/42] [extractor/generic] Detect m3u playlists served without proper Content-Type --- youtube_dl/extractor/generic.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5649e26da..24d43a247 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1299,9 +1299,15 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith('#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! 
- first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') From edd9b71c2cca7e5a0df8799710d9ad410ec77d29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 22:49:11 +0600 Subject: [PATCH 35/42] [extractor/generic] Add a test for m3u playlist served without proper Content-Type --- youtube_dl/extractor/generic.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 24d43a247..f28a65d9b 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -253,6 +253,21 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', From 19e2617a6fb614a84340757dacb2ea918c097a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 18 Mar 2016 23:42:15 +0600 Subject: [PATCH 36/42] [commonprotocols] Add generic support for rtmp URLs (Closes #8488) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/commonprotocols.py | 36 +++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 youtube_dl/extractor/commonprotocols.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 725ebec04..acc0b03bd 100644 --- a/youtube_dl/extractor/__init__.py +++ 
b/youtube_dl/extractor/__init__.py @@ -136,6 +136,7 @@ from .collegerama import CollegeRamaIE from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import RtmpIE from .condenast import CondeNastIE from .cracked import CrackedIE from .crackle import CrackleIE diff --git a/youtube_dl/extractor/commonprotocols.py b/youtube_dl/extractor/commonprotocols.py new file mode 100644 index 000000000..5d130a170 --- /dev/null +++ b/youtube_dl/extractor/commonprotocols.py @@ -0,0 +1,36 @@ +from __future__ import unicode_literals + +import os + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote, + compat_urlparse, +) +from ..utils import url_basename + + +class RtmpIE(InfoExtractor): + IE_DESC = False # Do not list + _VALID_URL = r'(?i)rtmp[est]?://.+' + + _TESTS = [{ + 'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4', + 'only_matching': True, + }, { + 'url': 'rtmp://edge.live.hitbox.tv/live/dimak', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + return { + 'id': video_id, + 'title': title, + 'formats': [{ + 'url': url, + 'ext': 'flv', + 'format_id': compat_urlparse.urlparse(url).scheme, + }], + } From d5aacf9a90e0855976401b6085ac56b66ca09d12 Mon Sep 17 00:00:00 2001 From: John Peel <john@dgby.org> Date: Fri, 18 Mar 2016 00:33:03 -0700 Subject: [PATCH 37/42] Added format_id to the filters on -f.
--- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8c651cd52..93b6ca54d 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -905,7 +905,7 @@ class YoutubeDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?P<key>ext|acodec|vcodec|container|protocol) + \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id) \s*(?P<op>%s)(?P<none_inclusive>\s*\?)? \s*(?P<value>[a-zA-Z0-9._-]+) \s*$ From 4c3b16d5d1bf4806693d2895928ac1b03585b2b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 00:04:26 +0600 Subject: [PATCH 38/42] [test_YoutubeDL] Add test for format_id format selection --- test/test_YoutubeDL.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index efbee3b71..ca25025e2 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -222,6 +222,11 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'dash-video-low') + ydl = YDL({'format': 'bestvideo[format_id^=dash][format_id$=low]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'dash-video-low') + formats = [ {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL}, ] From 09fc33198a4cfc93a98ce1ba7d51d41c487e5f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 18 Mar 2016 19:18:55 +0100 Subject: [PATCH 39/42] utils: lookup_unit_table: Use a stricter regex In parse_count multiple units start with the same letter, so it would match different units depending on the order they were sorted when iterating over them. 
--- youtube_dl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index ef6e7c7cb..bad1c4ea8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1346,7 +1346,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s) + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)$' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.') From 4cd70099ea79a4a82b26694937ca46d31f7436ca Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 18 Mar 2016 21:17:45 +0100 Subject: [PATCH 40/42] [hbo] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/hbo.py | 122 +++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+) create mode 100644 youtube_dl/extractor/hbo.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index acc0b03bd..529051a93 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -284,6 +284,7 @@ from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE from .hark import HarkIE +from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py new file mode 100644 index 000000000..dad0f3994 --- /dev/null +++ b/youtube_dl/extractor/hbo.py @@ -0,0 +1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + xpath_text, + xpath_element, + int_or_none, + parse_duration, +) + + +class HBOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)' + _TEST = { + 'url': 
'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839', + 'md5': '1c33253f0c7782142c993c0ba62a8753', + 'info_dict': { + 'id': '1437839', + 'ext': 'mp4', + 'title': 'Ep. 64 Clip: Encryption', + } + } + _FORMATS_INFO = { + '1920': { + 'width': 1280, + 'height': 720, + }, + '640': { + 'width': 768, + 'height': 432, + }, + 'highwifi': { + 'width': 640, + 'height': 360, + }, + 'high3g': { + 'width': 640, + 'height': 360, + }, + 'medwifi': { + 'width': 400, + 'height': 224, + }, + 'med3g': { + 'width': 400, + 'height': 224, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_xml( + 'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id) + title = xpath_text(video_data, 'title', 'title', True) + + formats = [] + for source in xpath_element(video_data, 'videos', 'sources', True): + if source.tag == 'size': + path = xpath_text(source, './/path') + if not path: + continue + width = source.attrib.get('width') + format_info = self._FORMATS_INFO.get(width, {}) + height = format_info.get('height') + fmt = { + 'url': path, + 'format_id': 'http%s' % ('-%dp' % height if height else ''), + 'width': format_info.get('width'), + 'height': height, + } + rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': fmt['format_id'].replace('http', 'rtmp'), + }) + formats.append(fmt) + else: + video_url = source.text + if not video_url: + continue + if source.tag == 'tarball': + formats.extend(self._extract_m3u8_formats( + video_url.replace('.tar', '/base_index_w8.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + format_info = self._FORMATS_INFO.get(source.tag, {}) + formats.append({ + 'format_id': 'http-%s' % source.tag, + 'url': video_url, + 'width': format_info.get('width'), + 
'height': format_info.get('height'), + }) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + thumbnails = [] + card_sizes = xpath_element(video_data, 'titleCardSizes') + if card_sizes is not None: + for size in card_sizes: + path = xpath_text(size, 'path') + if not path: + continue + width = int_or_none(size.get('width')) + thumbnails.append({ + 'id': width, + 'url': path, + 'width': width, + }) + + return { + 'id': video_id, + 'title': title, + 'duration': parse_duration(xpath_element(video_data, 'duration/tv14')), + 'formats': formats, + 'thumbnails': thumbnails, + } From 0d769bcb781b46a00ddf958d6ea945560f2d6cd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 19 Mar 2016 05:43:43 +0600 Subject: [PATCH 41/42] [extractor/generic] Fix missing byte literal prefix --- youtube_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f28a65d9b..26de27a7e 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1317,7 +1317,7 @@ class GenericIE(InfoExtractor): first_bytes = full_response.read(512) # Is it an M3U playlist? 
- if first_bytes.startswith('#EXTM3U'): + if first_bytes.startswith(b'#EXTM3U'): info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') return info_dict From 782b1b5bd1cdaaead6865dee5d300486e7dd8348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 19 Mar 2016 11:42:35 +0100 Subject: [PATCH 42/42] [utils] lookup_unit_table: Match word boundary instead of end of string --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9a3a8ddff..325b870cc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -702,6 +702,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_count('1.000'), 1000) self.assertEqual(parse_count('1.1k'), 1100) self.assertEqual(parse_count('1.1kk'), 1100000) + self.assertEqual(parse_count('1.1kk '), 1100000) + self.assertEqual(parse_count('1.1kk views'), 1100000) def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index bad1c4ea8..067b8a184 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1346,7 +1346,7 @@ def format_bytes(bytes): def lookup_unit_table(unit_table, s): units_re = '|'.join(re.escape(u) for u in unit_table) m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)$' % units_re, s) + r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) if not m: return None num_str = m.group('num').replace(',', '.')