From bffb245a4882b10b5e66015fa89ef1cadf974415 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 9 Apr 2016 10:47:46 +0100 Subject: [PATCH 001/347] [aol] add support for videos with vidible IDs(closes #9124) --- youtube_dl/extractor/aol.py | 78 +++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 95a99c6b0..b729157d2 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,11 +1,17 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P[0-9]+)(?:$|\?)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P[^/?-]+)' _TESTS = [{ 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', @@ -14,13 +20,79 @@ class AolIE(InfoExtractor): 'id': '518167793', 'ext': 'mp4', 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + 'description': 'A major phone scam has cost thousands of taxpayers more than $1 million, with less than a month until income tax returns are due to the IRS.', + 'timestamp': 1395405060, + 'upload_date': '20140321', + 'uploader': 'Newsy Studio', }, - 'add_ie': ['FiveMin'], + 'params': { + # m3u8 download + 'skip_download': True, + } + }, { + 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183', + 'info_dict': { + 'id': '5707d6b8e4b090497b04f706', + 'ext': 'mp4', + 'title': 'Netflix is Raising Rates', + 'description': 'Netflix is rewarding millions of it’s long-standing members with an increase in cost. 
Veuer’s Carly Figueroa has more.', + 'upload_date': '20160408', + 'timestamp': 1460123280, + 'uploader': 'Veuer', + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result('5min:%s' % video_id) + + response = self._download_json( + 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, + video_id)['response'] + if response['statusText'] != 'Ok': + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusText']), expected=True) + + video_data = response['data'] + formats = [] + m3u8_url = video_data.get('videoMasterPlaylist') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + for rendition in video_data.get('renditions', []): + video_url = rendition.get('url') + if not video_url: + continue + ext = rendition.get('format') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + f = { + 'url': video_url, + 'format_id': rendition.get('quality'), + } + mobj = re.search(r'(\d+)x(\d+)', video_url) + if mobj: + f.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + }) + formats.append(f) + self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': video_data['title'], + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('publishDate')), + 'view_count': int_or_none(video_data.get('views')), + 'description': video_data.get('description'), + 'uploader': video_data.get('videoOwner'), + 'formats': formats, + } class AolFeaturesIE(InfoExtractor): From cacd9966624883523b264fa9ac48138074597730 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 19:27:54 +0800 Subject: [PATCH 002/347] [utils] Don't touch URLs if not necessary Fix test_Generic_15 (Google redirect) --- 
youtube_dl/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e53962c9..999dfabb5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1792,6 +1792,8 @@ def urlencode_postdata(*args, **kargs): def update_url_query(url, query): + if not query: + return url parsed_url = compat_urlparse.urlparse(url) qs = compat_parse_qs(parsed_url.query) qs.update(query) From 92c7f3157aad87096aa1fdd1a4daed3bdf262178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 9 Apr 2016 17:32:23 +0600 Subject: [PATCH 003/347] [aol] Add coding cookie --- youtube_dl/extractor/aol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index b729157d2..d4801a25b 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re From ab481b48e536dd2e03d6022abb7f4d1593294721 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 20:12:11 +0800 Subject: [PATCH 004/347] [funnyordie] Relax M3U8 URL matching Also, m3u8_url extraction should be fatal as all formats depends directly or indirectly on it. 
This change fixes test_Generic_26 and TestFunnyOrDieSubtitles --- youtube_dl/extractor/funnyordie.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4c4a87e2a..8c5ffc9e8 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -46,8 +46,8 @@ class FunnyOrDieIE(InfoExtractor): links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0) m3u8_url = self._search_regex( - r']+src=(["\'])(?P.+?/master\.m3u8)\1', - webpage, 'm3u8 url', default=None, group='url') + r']+src=(["\'])(?P.+?/master\.m3u8[^"\']*)\1', + webpage, 'm3u8 url', group='url') formats = [] From bfe96d7bea7c5227456bf1aecca51907c8f30c51 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Fri, 9 Oct 2015 18:38:11 +0200 Subject: [PATCH 005/347] [presstv] Added extractor PressTV. Fixes #7060 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/presstv.py | 80 ++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 youtube_dl/extractor/presstv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de29c7956..c2fa83918 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -583,6 +583,7 @@ from .pornhub import ( from .pornotube import PornotubeIE from .pornovoisines import PornoVoisinesIE from .pornoxo import PornoXOIE +from .presstv import PressTVIE from .primesharetv import PrimeShareTVIE from .promptfile import PromptFileIE from .prosiebensat1 import ProSiebenSat1IE diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py new file mode 100644 index 000000000..724d8b1c4 --- /dev/null +++ b/youtube_dl/extractor/presstv.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..utils import str_to_int + + +class PressTVIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?presstv\.ir/Video/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' + + _TEST = { + 'url': 'http://www.presstv.ir/Video/2015/10/04/431915/Max-Igan-Press-TV-Face-to-Face', + 'md5': 'e95736ac75088b5f1e5bbb68f248f90d', + 'info_dict': { + 'id': '431915', + 'ext': 'mp4', + 'title': 'Press TV’s full interview with Max Igan', + 'upload_date': '20151004', + 'thumbnail': 'http://217.218.67.233/photo/20151004/d5c333ad-98f9-4bd3-bc3e-a1ad6a192803.jpg', + 'description': ('Watch Press TV’s full interview with Max Igan, a radio talk show host and political ' + 'commentator.\nThe interview, conducted on Press TV’s Face ' + 'to Face program, was aired on October 3, 2015.') + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # extract video URL from webpage + video_url = self._html_search_regex(r'', webpage, + 'Video URL') + + # build list of available formats + # specified in http://www.presstv.ir/Scripts/playback.js + base_url = 'http://192.99.219.222:82/presstv' + formats = [ + { + 'url': base_url + video_url, + 'format': '1080p mp4', + 'format_id': '1080p' + }, { + 'url': base_url + video_url.replace(".mp4", "_low800.mp4"), + 'format': '720p mp4', + 'format_id': '720p' + }, { + 'url': base_url + video_url.replace(".mp4", "_low400.mp4"), + 'format': '360p mp4', + 'format_id': '360p' + }, { + 'url': base_url + video_url.replace(".mp4", "_low200.mp4"), + 'format': '180p mp4', + 'format_id': '180p' + } + ] + formats.reverse() + + # extract video metadata + title = self._html_search_meta('title', webpage, 'Title', True) + title = title.partition(' - ')[2] + + description = self._html_search_regex(r'
(.*?)
', webpage, + 'Description', flags=re.DOTALL) + + thumbnail = self._html_search_meta('og:image', webpage, 'Thumbnail', True) + + year = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload year', group='y')) + month = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload month', group='m')) + day = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload day', group='d')) + upload_date = '%04d%02d%02d' % (year, month, day) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'description': description + } From c05025fdd79993314e20a6074aed084889199e50 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 21:46:51 +0800 Subject: [PATCH 006/347] [internetvideoarchive] Fix extraction and support json URLs --- youtube_dl/extractor/internetvideoarchive.py | 118 +++++++++---------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index e60145b3d..45add007f 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -1,93 +1,91 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urlparse, - compat_urllib_parse_urlencode, ) from ..utils import ( - xpath_with_ns, + determine_ext, + int_or_none, + xpath_text, ) class InternetVideoArchiveIE(InfoExtractor): - _VALID_URL = r'https?://video\.internetvideoarchive\.net/flash/players/.*?\?.*?publishedid.*?' + _VALID_URL = r'https?://video\.internetvideoarchive\.net/(?:player|flash/players)/.*?\?.*?publishedid.*?' 
_TEST = { - 'url': 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?customerid=69249&publishedid=452693&playerid=247', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?customerid=69249&publishedid=194487&reporttag=vdbetatitle&playerid=641&autolist=0&domain=www.videodetective.com&maxrate=high&minrate=low&socialplayer=false', 'info_dict': { - 'id': '452693', + 'id': '194487', 'ext': 'mp4', - 'title': 'SKYFALL', - 'description': 'In SKYFALL, Bond\'s loyalty to M is tested as her past comes back to haunt her. As MI6 comes under attack, 007 must track down and destroy the threat, no matter how personal the cost.', - 'duration': 152, + 'title': 'KICK-ASS 2', + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @staticmethod - def _build_url(query): - return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query + def _build_json_url(query): + return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query @staticmethod - def _clean_query(query): - NEEDED_ARGS = ['publishedid', 'customerid'] - query_dic = compat_urlparse.parse_qs(query) - cleaned_dic = dict((k, v[0]) for (k, v) in query_dic.items() if k in NEEDED_ARGS) - # Other player ids return m3u8 urls - cleaned_dic['playerid'] = '247' - cleaned_dic['videokbrate'] = '100000' - return compat_urllib_parse_urlencode(cleaned_dic) + def _build_xml_url(query): + return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' 
+ query def _real_extract(self, url): query = compat_urlparse.urlparse(url).query - query_dic = compat_urlparse.parse_qs(query) + query_dic = compat_parse_qs(query) video_id = query_dic['publishedid'][0] - url = self._build_url(query) - flashconfiguration = self._download_xml(url, video_id, - 'Downloading flash configuration') - file_url = flashconfiguration.find('file').text - file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx') - # Replace some of the parameters in the query to get the best quality - # and http links (no m3u8 manifests) - file_url = re.sub(r'(?<=\?)(.+)$', - lambda m: self._clean_query(m.group()), - file_url) - info = self._download_xml(file_url, video_id, - 'Downloading video info') - item = info.find('channel/item') + if '/player/' in url: + configuration = self._download_json(url, video_id) - def _bp(p): - return xpath_with_ns( - p, - { - 'media': 'http://search.yahoo.com/mrss/', - 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats', - } - ) - formats = [] - for content in item.findall(_bp('media:group/media:content')): - attr = content.attrib - f_url = attr['url'] - width = int(attr['width']) - bitrate = int(attr['bitrate']) - format_id = '%d-%dk' % (width, bitrate) - formats.append({ - 'format_id': format_id, - 'url': f_url, - 'width': width, - 'tbr': bitrate, - }) + # There are multiple videos in the playlist whlie only the first one + # matches the video played in browsers + video_info = configuration['playlist'][0] - self._sort_formats(formats) + formats = [] + for source in video_info['sources']: + file_url = source['file'] + if determine_ext(file_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, ext='mp4', m3u8_id='hls')) + else: + a_format = { + 'url': file_url, + } + + if source.get('label') and source['label'][-4:] == ' kbs': + tbr = int_or_none(source['label'][:-4]) + a_format.update({ + 'tbr': tbr, + 'format_id': 'http-%d' % tbr, + }) + formats.append(a_format) + + 
self._sort_formats(formats) + + title = video_info['title'] + description = video_info.get('description') + thumbnail = video_info.get('image') + else: + configuration = self._download_xml(url, video_id) + formats = [{ + 'url': xpath_text(configuration, './file', 'file URL', fatal=True), + }] + thumbnail = xpath_text(configuration, './image', 'thumbnail') + title = 'InternetVideoArchive video %s' % video_id + description = None return { 'id': video_id, - 'title': item.find('title').text, + 'title': title, 'formats': formats, - 'thumbnail': item.find(_bp('media:thumbnail')).attrib['url'], - 'description': item.find('description').text, - 'duration': int(attr['duration']), + 'thumbnail': thumbnail, + 'description': description, } From dae2a058de81e42d73bdbe0041a598262703c352 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 21:47:12 +0800 Subject: [PATCH 007/347] [rottentomatoes] Adapt to InternetVideoArchiveIE --- youtube_dl/extractor/rottentomatoes.py | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index e8bb20a08..f9cd48790 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,11 +1,11 @@ from __future__ import unicode_literals -from .videodetective import VideoDetectiveIE +from .common import InfoExtractor +from ..compat import compat_urlparse +from .internetvideoarchive import InternetVideoArchiveIE -# It just uses the same method as videodetective.com, -# the internetvideoarchive.com is extracted from the og:video property -class RottenTomatoesIE(VideoDetectiveIE): +class RottenTomatoesIE(InfoExtractor): _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P\d+)' _TEST = { @@ -13,7 +13,19 @@ class RottenTomatoesIE(VideoDetectiveIE): 'info_dict': { 'id': '613340', 'ext': 'mp4', - 'title': 'TOY STORY 3', - 'description': 'From the creators of the beloved TOY 
STORY films, comes a story that will reunite the gang in a whole new way.', + 'title': 'Toy Story 3', }, } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + og_video = self._og_search_video_url(webpage) + query = compat_urlparse.urlparse(og_video).query + + return { + '_type': 'url_transparent', + 'url': InternetVideoArchiveIE._build_xml_url(query), + 'ie_key': InternetVideoArchiveIE.ie_key(), + 'title': self._og_search_title(webpage), + } From c991106706c05401090bcba79e65feae5c7e3fda Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 21:47:35 +0800 Subject: [PATCH 008/347] [videodetective] Adapt to InternetVideoArchiveIE --- youtube_dl/extractor/videodetective.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 0ffc7ff7d..2ed5d9643 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -14,8 +14,11 @@ class VideoDetectiveIE(InfoExtractor): 'id': '194487', 'ext': 'mp4', 'title': 'KICK-ASS 2', - 'description': 'md5:65ba37ad619165afac7d432eaded6013', - 'duration': 138, + 'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } @@ -24,4 +27,4 @@ class VideoDetectiveIE(InfoExtractor): webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage) query = compat_urlparse.urlparse(og_video).query - return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key()) + return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key()) From 6c4c7539f222cd9e80dfae0b1c9dabbd45d1b3dc Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 9 Apr 2016 22:04:48 +0800 Subject: [PATCH 009/347] [test/helper] Check got values to be strings for md5: fields Seen in PBSIE tests --- 
test/helper.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/helper.py b/test/helper.py index f2d878212..b8e22c5cb 100644 --- a/test/helper.py +++ b/test/helper.py @@ -143,6 +143,9 @@ def expect_value(self, got, expected, field): expect_value(self, item_got, item_expected, field) else: if isinstance(expected, compat_str) and expected.startswith('md5:'): + self.assertTrue( + isinstance(got, compat_str), + 'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got))) got = 'md5:' + md5(got) elif isinstance(expected, compat_str) and expected.startswith('mincount:'): self.assertTrue( From 95153a960d098d75e6100e38e77fdaa32f5267a2 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Sat, 9 Apr 2016 16:14:05 +0200 Subject: [PATCH 010/347] [presstv] updated extractor and tests to work with current PressTV website --- youtube_dl/extractor/presstv.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 724d8b1c4..9af6780c1 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -7,20 +7,20 @@ from ..utils import str_to_int class PressTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?presstv\.ir/Video/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' _TEST = { - 'url': 'http://www.presstv.ir/Video/2015/10/04/431915/Max-Igan-Press-TV-Face-to-Face', - 'md5': 'e95736ac75088b5f1e5bbb68f248f90d', + 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', + 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', 'info_dict': { - 'id': '431915', + 'id': '459911', 'ext': 'mp4', - 'title': 'Press TV’s full interview with Max Igan', - 'upload_date': '20151004', - 'thumbnail': 'http://217.218.67.233/photo/20151004/d5c333ad-98f9-4bd3-bc3e-a1ad6a192803.jpg', - 
'description': ('Watch Press TV’s full interview with Max Igan, a radio talk show host and political ' - 'commentator.\nThe interview, conducted on Press TV’s Face ' - 'to Face program, was aired on October 3, 2015.') + 'title': 'Organic mattresses used to clean waste water', + 'upload_date': '20160409', + 'thumbnail': 'http://media.presstv.com/photo/20160409/41719129-76fa-4372-a09d-bf348278eb5d.jpg', + 'description': ('A trial program at an Australian sewerage treatment facility hopes to change ' + 'the way waste water is treated by using plant mattresses to reduce chemical ' + 'and electricity use.') } } @@ -58,12 +58,10 @@ class PressTVIE(InfoExtractor): # extract video metadata title = self._html_search_meta('title', webpage, 'Title', True) - title = title.partition(' - ')[2] - - description = self._html_search_regex(r'
(.*?)
', webpage, - 'Description', flags=re.DOTALL) + title = title.partition('-')[2].strip() thumbnail = self._html_search_meta('og:image', webpage, 'Thumbnail', True) + description = self._html_search_meta('og:description', webpage, 'Description', True) year = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload year', group='y')) month = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload month', group='m')) From eb9c3edd5ec970abb349bd4c71040b75e9d19e0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 9 Apr 2016 22:40:05 +0200 Subject: [PATCH 011/347] [test/utils] Add test for date_from_str --- test/test_utils.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index a35debfe1..0f36bb9f0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ from youtube_dl.utils import ( args_to_str, encode_base_n, clean_html, + date_from_str, DateRange, detect_exe_version, determine_ext, @@ -234,6 +235,13 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + def test_date_from_str(self): + self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) + self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week')) + self.assertEqual(date_from_str('now+14day'), date_from_str('now+2week')) + self.assertEqual(date_from_str('now+365day'), date_from_str('now+1year')) + self.assertEqual(date_from_str('now+30day'), date_from_str('now+1month')) + def test_daterange(self): _20century = DateRange("19000101", "20000101") self.assertFalse("17890714" in _20century) From 61dd350a04a77abe86e46cfe8b7603514e8f2ca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 03:02:35 +0600 Subject: [PATCH 012/347] [1tv] Fix extraction (Closes #9103) --- youtube_dl/extractor/firsttv.py | 145 ++++++++++++++++++++++---------- 1 file changed, 100 
insertions(+), 45 deletions(-) diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 98b165143..88bca1007 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -2,78 +2,133 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_xpath +from ..utils import ( + int_or_none, + qualities, + unified_strdate, + xpath_attr, + xpath_element, + xpath_text, + xpath_with_ns, +) class FirstTVIE(InfoExtractor): IE_NAME = '1tv' IE_DESC = 'Первый канал' - _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P.+)' + _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P\d+)' _TESTS = [{ - 'url': 'http://www.1tv.ru/videoarchive/73390', - 'md5': '777f525feeec4806130f4f764bc18a4f', - 'info_dict': { - 'id': '73390', - 'ext': 'mp4', - 'title': 'Олимпийские канатные дороги', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', - 'duration': 149, - 'like_count': int, - 'dislike_count': int, - }, - 'skip': 'Only works from Russia', - }, { + # single format via video_materials.json API 'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930', - 'md5': 'a1b6b60d530ebcf8daacf4565762bbaf', + 'md5': '82a2777648acae812d58b3f5bd42882b', 'info_dict': { 'id': '35930', 'ext': 'mp4', - 'title': 'Наедине со всеми. Людмила Сенчина', - 'description': 'md5:89553aed1d641416001fe8d450f06cb9', + 'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015', + 'description': 'md5:357933adeede13b202c7c21f91b871b2', 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20150212', 'duration': 2694, }, - 'skip': 'Only works from Russia', + }, { + # multiple formats via video_materials.json API + 'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641', + 'info_dict': { + 'id': '113641', + 'ext': 'mp4', + 'title': 'Весенняя аллергия. Доброе утро. 
Фрагмент выпуска от 07.04.2016', + 'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20160407', + 'duration': 179, + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API + 'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038', + 'md5': '519d306c5b5669761fd8906c39dbee23', + 'info_dict': { + 'id': '47038', + 'ext': 'mp4', + 'title': '"Побег". Второй сезон. 3 серия', + 'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b', + 'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$', + 'upload_date': '20120516', + 'duration': 3080, + }, + }, { + 'url': 'http://www.1tv.ru/videoarchive/9967', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id, 'Downloading page') + # Videos with multiple formats only available via this API + video = self._download_json( + 'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id, + video_id, fatal=False) - video_url = self._html_search_regex( - r'''(?s)(?:jwplayer\('flashvideoportal_1'\)\.setup\({|var\s+playlistObj\s*=).*?'file'\s*:\s*'([^']+)'.*?}\);''', - webpage, 'video URL') + description, thumbnail, upload_date, duration = [None] * 4 - title = self._html_search_regex( - [r'
\s*

([^<]*)', - r"'title'\s*:\s*'([^']+)'"], webpage, 'title') - description = self._html_search_regex( - r'
\s*
 
\s*

([^<]*)

', - webpage, 'description', default=None) or self._html_search_meta( + if video: + item = video[0] + title = item['title'] + quality = qualities(('ld', 'sd', 'hd', )) + formats = [{ + 'url': f['src'], + 'format_id': f.get('name'), + 'quality': quality(f.get('name')), + } for f in item['mbr'] if f.get('src')] + thumbnail = item.get('poster') + else: + # Some videos are not available via video_materials.json + video = self._download_xml( + 'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id, + video_id) + + NS_MAP = { + 'media': 'http://search.yahoo.com/mrss/', + } + + item = xpath_element(video, './channel/item', fatal=True) + title = xpath_text(item, './title', fatal=True) + formats = [{ + 'url': content.attrib['url'], + } for content in item.findall( + compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')] + thumbnail = xpath_attr( + item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url') + + self._sort_formats(formats) + + webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False) + if webpage: + title = self._html_search_regex( + (r'
\s*

([^<]*)', + r"'title'\s*:\s*'([^']+)'"), + webpage, 'title', default=None) or title + description = self._html_search_regex( + r'
\s*
 
\s*

([^<]*)

', + webpage, 'description', default=None) or self._html_search_meta( 'description', webpage, 'description') - - thumbnail = self._og_search_thumbnail(webpage) - duration = self._og_search_property( - 'video:duration', webpage, - 'video duration', fatal=False) - - like_count = self._html_search_regex( - r'title="Понравилось".*?/> \[(\d+)\]', - webpage, 'like count', default=None) - dislike_count = self._html_search_regex( - r'title="Не понравилось".*?/> \[(\d+)\]', - webpage, 'dislike count', default=None) + thumbnail = thumbnail or self._og_search_thumbnail(webpage) + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'video duration', fatal=False)) + upload_date = unified_strdate(self._html_search_meta( + 'ya:ovs:upload_date', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'url': video_url, 'thumbnail': thumbnail, 'title': title, 'description': description, + 'upload_date': upload_date, 'duration': int_or_none(duration), - 'like_count': int_or_none(like_count), - 'dislike_count': int_or_none(dislike_count), + 'formats': formats } From 6a801f44704c3df49563852108c104c43a0551cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 9 Apr 2016 23:18:41 +0200 Subject: [PATCH 013/347] [test/InfoExtractors] add test for _download_json --- test/test_InfoExtractor.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 938466a80..6404ac89f 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -11,6 +11,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor +from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError class TestIE(InfoExtractor): @@ -66,5 +67,14 @@ class TestInfoExtractor(unittest.TestCase): 
self.assertEqual(ie._html_search_meta('e', html), '5') self.assertEqual(ie._html_search_meta('f', html), '6') + def test_download_json(self): + uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') + self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) + uri = encode_data_uri(b'callback({"foo": "blah"})', 'application/javascript') + self.assertEqual(self.ie._download_json(uri, None, transform_source=strip_jsonp), {'foo': 'blah'}) + uri = encode_data_uri(b'{"foo": invalid}', 'application/json') + self.assertRaises(ExtractorError, self.ie._download_json, uri, None) + self.assertEqual(self.ie._download_json(uri, None, fatal=False), None) + if __name__ == '__main__': unittest.main() From 49caf3307f1ae713acaeed651984a6338293b8d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 17:10:27 +0600 Subject: [PATCH 014/347] [extractor/common] Remove irrelevant comment --- youtube_dl/extractor/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 17d00721c..5269059d0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -376,7 +376,6 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) - # data, headers and query params will be ignored for `Request` objects if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) From a1fa60a9340f61a8455a0cd85c18f63d9bdfe681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 18:43:40 +0600 Subject: [PATCH 015/347] [cliprs] Add extractor (Closes #9099) --- youtube_dl/extractor/cliprs.py | 90 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 7 +++ 2 files changed, 97 insertions(+) create mode 100644 youtube_dl/extractor/cliprs.py diff --git a/youtube_dl/extractor/cliprs.py 
b/youtube_dl/extractor/cliprs.py new file mode 100644 index 000000000..4f9320ea5 --- /dev/null +++ b/youtube_dl/extractor/cliprs.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, +) + + +class ClipRsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' + _TEST = { + 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', + 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'info_dict': { + 'id': '1488842.1399140381', + 'ext': 'mp4', + 'title': 'PREMIJERA Frajle predstavljaju novi spot za pesmu Moli me, moli', + 'description': 'md5:56ce2c3b4ab31c5a2e0b17cb9a453026', + 'duration': 229, + 'timestamp': 1459850243, + 'upload_date': '20160405', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + r'id=(["\'])mvp:(?P.+?)\1', webpage, 'mvp id', group='id') + + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + if not f.get('url'): + continue + formats.append({ + 'url': f['url'], + 'format_id': format_id, + 'height': 
int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de29c7956..aefc4df01 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -123,6 +123,7 @@ from .chirbit import ( ) from .cinchcast import CinchcastIE from .cinemassacre import CinemassacreIE +from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE @@ -939,6 +940,12 @@ from .xhamster import ( XHamsterIE, XHamsterEmbedIE, ) +from .xiami import ( + XiamiIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE From f44c2768421bc3b0ead3ccf86b5e499d498674c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 19:21:58 +0600 Subject: [PATCH 016/347] [extractor/extractors] Remove non-existant imports --- youtube_dl/extractor/extractors.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index aefc4df01..c1a13c982 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -940,12 +940,6 @@ from .xhamster import ( XHamsterIE, XHamsterEmbedIE, ) -from 
.xiami import ( - XiamiIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE From de728757ad7218ce175649ec0d3f0b5723f2c580 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Sun, 10 Apr 2016 16:36:44 +0200 Subject: [PATCH 017/347] [presstv] Refactored extractor. --- youtube_dl/extractor/presstv.py | 52 +++++++++++++++------------------ 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 9af6780c1..755e32528 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -17,10 +17,8 @@ class PressTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Organic mattresses used to clean waste water', 'upload_date': '20160409', - 'thumbnail': 'http://media.presstv.com/photo/20160409/41719129-76fa-4372-a09d-bf348278eb5d.jpg', - 'description': ('A trial program at an Australian sewerage treatment facility hopes to change ' - 'the way waste water is treated by using plant mattresses to reduce chemical ' - 'and electricity use.') + 'thumbnail': 're:^https?://.*\.jpg', + 'description': 'md5:20002e654bbafb6908395a5c0cfcd125' } } @@ -35,38 +33,34 @@ class PressTVIE(InfoExtractor): # build list of available formats # specified in http://www.presstv.ir/Scripts/playback.js base_url = 'http://192.99.219.222:82/presstv' - formats = [ - { - 'url': base_url + video_url, - 'format': '1080p mp4', - 'format_id': '1080p' - }, { - 'url': base_url + video_url.replace(".mp4", "_low800.mp4"), - 'format': '720p mp4', - 'format_id': '720p' - }, { - 'url': base_url + video_url.replace(".mp4", "_low400.mp4"), - 'format': '360p mp4', - 'format_id': '360p' - }, { - 'url': base_url + video_url.replace(".mp4", "_low200.mp4"), - 'format': '180p mp4', - 'format_id': '180p' - } + _formats = [ + ("180p", "_low200.mp4"), + ("360p", "_low400.mp4"), + ("720p", "_low800.mp4"), + ("1080p", ".mp4") ] - formats.reverse() + + 
formats = [] + for fmt in _formats: + format_id, extension = fmt + formats.append({ + 'url': base_url + video_url[:-4] + extension, + 'format_id': format_id + }) # extract video metadata title = self._html_search_meta('title', webpage, 'Title', True) title = title.partition('-')[2].strip() - thumbnail = self._html_search_meta('og:image', webpage, 'Thumbnail', True) - description = self._html_search_meta('og:description', webpage, 'Description', True) + thumbnail = self._og_search_thumbnail(webpage) + description = self._og_search_description(webpage) - year = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload year', group='y')) - month = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload month', group='m')) - day = str_to_int(self._search_regex(PressTVIE._VALID_URL, url, 'Upload day', group='d')) - upload_date = '%04d%02d%02d' % (year, month, day) + match = re.match(PressTVIE._VALID_URL, url) + upload_date = '%04d%02d%02d' % ( + str_to_int(match.group('y')), + str_to_int(match.group('m')), + str_to_int(match.group('d')) + ) return { 'id': video_id, From 443285aabef470f546f0b01b8e8194ca988bb315 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:15:11 +0600 Subject: [PATCH 018/347] [ebaumsworlds] Update _VALID_URL (Closes #9135) --- youtube_dl/extractor/ebaumsworld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py index b6bfd2b2d..c97682cd3 100644 --- a/youtube_dl/extractor/ebaumsworld.py +++ b/youtube_dl/extractor/ebaumsworld.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class EbaumsWorldIE(InfoExtractor): - _VALID_URL = r'https?://www\.ebaumsworld\.com/video/watch/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?ebaumsworld\.com/videos/[^/]+/(?P\d+)' _TEST = { - 'url': 'http://www.ebaumsworld.com/video/watch/83367677/', + 'url': 
'http://www.ebaumsworld.com/videos/a-giant-python-opens-the-door/83367677/', 'info_dict': { 'id': '83367677', 'ext': 'mp4', From 66fa49586879418e357337ff82794fe851e71e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:37:14 +0600 Subject: [PATCH 019/347] [screencastomatic] Fix extraction (Closes #9136) --- youtube_dl/extractor/screencastomatic.py | 35 ++++++++---------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index 05337421c..c08c89d94 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -1,15 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - js_to_json, -) +from .jwplatform import JWPlatformBaseIE +from ..utils import js_to_json -class ScreencastOMaticIE(InfoExtractor): +class ScreencastOMaticIE(JWPlatformBaseIE): _VALID_URL = r'https?://screencast-o-matic\.com/watch/(?P[0-9a-zA-Z]+)' _TEST = { 'url': 'http://screencast-o-matic.com/watch/c2lD3BeOPl', @@ -27,23 +23,14 @@ class ScreencastOMaticIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - setup_js = self._search_regex( - r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", - webpage, 'setup code') - data = self._parse_json(setup_js, video_id, transform_source=js_to_json) - try: - video_data = next( - m for m in data['modes'] if m.get('type') == 'html5') - except StopIteration: - raise ExtractorError('Could not find any video entries!') - video_url = compat_urlparse.urljoin(url, video_data['config']['file']) - thumbnail = data.get('image') + jwplayer_data = self._parse_json( + self._search_regex( + r"(?s)jwplayer\('mp4Player'\).setup\((\{.*?\})\);", webpage, 'setup code'), + video_id, transform_source=js_to_json) - return { - 'id': 
video_id, + info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) + info_dict.update({ 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), - 'url': video_url, - 'ext': 'mp4', - 'thumbnail': thumbnail, - } + }) + return info_dict From a6d6722c8fc2174ce72ed462e649d397d1448a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:47:38 +0600 Subject: [PATCH 020/347] [jwplatform:base] Extract duration --- youtube_dl/extractor/jwplatform.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 6770685d7..01601c59e 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + float_or_none, + int_or_none, +) class JWPlatformBaseIE(InfoExtractor): @@ -41,6 +44,7 @@ class JWPlatformBaseIE(InfoExtractor): 'description': video_data.get('description'), 'thumbnail': self._proto_relative_url(video_data.get('image')), 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration')), 'subtitles': subtitles, 'formats': formats, } From d7eb052fa2ab26839b050a7c3fa3f8874d508a02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:48:04 +0600 Subject: [PATCH 021/347] [screencastomatic] Add duration to test --- youtube_dl/extractor/screencastomatic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/screencastomatic.py b/youtube_dl/extractor/screencastomatic.py index c08c89d94..7a88a42cd 100644 --- a/youtube_dl/extractor/screencastomatic.py +++ b/youtube_dl/extractor/screencastomatic.py @@ -16,6 +16,7 @@ class ScreencastOMaticIE(JWPlatformBaseIE): 'title': 'Welcome to 3-4 Philosophy @ DECV!', 
'thumbnail': 're:^https?://.*\.jpg$', 'description': 'as the title says! also: some general info re 1) VCE philosophy and 2) distance learning.', + 'duration': 369.163, } } From 7ebc36900d15888321a45f04113eeda169469004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 22:55:07 +0600 Subject: [PATCH 022/347] [jwplatform:base] Improve subtitles extraction --- youtube_dl/extractor/jwplatform.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index 01601c59e..8a5e562db 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -13,10 +13,6 @@ from ..utils import ( class JWPlatformBaseIE(InfoExtractor): def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True): video_data = jwplayer_data['playlist'][0] - subtitles = {} - for track in video_data['tracks']: - if track['kind'] == 'captions': - subtitles[track['label']] = [{'url': self._proto_relative_url(track['file'])}] formats = [] for source in video_data['sources']: @@ -38,6 +34,15 @@ class JWPlatformBaseIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('file') and track.get('kind') == 'captions': + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track['file']) + }) + return { 'id': video_id, 'title': video_data['title'] if require_title else video_data.get('title'), From 4a121d29bb0700beb19e8b6edb5d479e9fe7ac1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 10 Apr 2016 23:45:17 +0600 Subject: [PATCH 023/347] [glide] Fix extraction (Closes #9141) --- youtube_dl/extractor/glide.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 
9561ed5fb..0ab23f766 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -23,8 +23,9 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( r'(.*?)', webpage, 'title') - video_url = self.http_scheme() + self._search_regex( - r'', webpage, 'video URL') + video_url = self._proto_relative_url(self._search_regex( + r']+src=(["\'])(?P.+?)\1', + webpage, 'video URL', group='url'), self.http_scheme()) thumbnail_url = self._search_regex( r' Date: Sun, 10 Apr 2016 23:56:23 +0600 Subject: [PATCH 024/347] [glide] Improve extraction and extract upload info --- youtube_dl/extractor/glide.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 0ab23f766..62ff84835 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import unified_strdate class GlideIE(InfoExtractor): @@ -15,27 +16,38 @@ class GlideIE(InfoExtractor): 'ext': 'mp4', 'title': 'Damon Timm\'s Glide message', 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', + 'uploader': 'Damon Timm', + 'upload_date': '20140919', } } def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex( - r'(.*?)', webpage, 'title') + r'(.+?)', webpage, 'title') video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', - webpage, 'video URL', group='url'), self.http_scheme()) - thumbnail_url = self._search_regex( - r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', + webpage, 'thumbnail url', default=None, + group='url')) or self._og_search_thumbnail(webpage) + uploader = self._search_regex( + r']+class=["\']info-name["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) + upload_date = 
unified_strdate(self._search_regex( + r']+class="info-date"[^>]*>([^<]+)', + webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, } From 452908b257da1a5b228a2c0522c89fff87296622 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 11 Apr 2016 00:06:05 +0600 Subject: [PATCH 025/347] [telebruxelles] Fix extraction (Closes #9142) --- youtube_dl/extractor/telebruxelles.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py index a3d05f97d..eefecc490 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/youtube_dl/extractor/telebruxelles.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?telebruxelles\.be/(news|sport|dernier-jt)/?(?P[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt)/?(?P[^/#?]+)' _TESTS = [{ 'url': 'http://www.telebruxelles.be/news/auditions-devant-parlement-francken-galant-tres-attendus/', 'md5': '59439e568c9ee42fb77588b2096b214f', @@ -39,18 +41,18 @@ class TeleBruxellesIE(InfoExtractor): webpage = self._download_webpage(url, display_id) article_id = self._html_search_regex( - r"
(.*?)

', webpage, 'title') - description = self._og_search_description(webpage) + description = self._og_search_description(webpage, default=None) rtmp_url = self._html_search_regex( - r"file: \"(rtmp://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/vod/mp4:\" \+ \"\w+\" \+ \".mp4)\"", + r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"', webpage, 'RTMP url') - rtmp_url = rtmp_url.replace("\" + \"", "") + rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) return { - 'id': article_id, + 'id': article_id or display_id, 'display_id': display_id, 'title': title, 'description': description, From dfbc7f7f3f44ff7f9ed2beff76dc37edbb66af8d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 11 Apr 2016 16:14:07 +0800 Subject: [PATCH 026/347] [presstv] Improve and simplify --- youtube_dl/extractor/presstv.py | 48 +++++++++++++++++---------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/presstv.py b/youtube_dl/extractor/presstv.py index 755e32528..2da93ed34 100644 --- a/youtube_dl/extractor/presstv.py +++ b/youtube_dl/extractor/presstv.py @@ -1,19 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import remove_start class PressTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?presstv\.ir/[^/]+/(?P\d+)/(?P\d+)/(?P\d+)/(?P\d+)/(?P[^/]+)?' 
_TEST = { 'url': 'http://www.presstv.ir/Detail/2016/04/09/459911/Australian-sewerage-treatment-facility-/', 'md5': '5d7e3195a447cb13e9267e931d8dd5a5', 'info_dict': { 'id': '459911', + 'display_id': 'Australian-sewerage-treatment-facility-', 'ext': 'mp4', 'title': 'Organic mattresses used to clean waste water', 'upload_date': '20160409', @@ -23,47 +25,47 @@ class PressTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) # extract video URL from webpage - video_url = self._html_search_regex(r'', webpage, - 'Video URL') + video_url = self._hidden_inputs(webpage)['inpPlayback'] # build list of available formats # specified in http://www.presstv.ir/Scripts/playback.js base_url = 'http://192.99.219.222:82/presstv' _formats = [ - ("180p", "_low200.mp4"), - ("360p", "_low400.mp4"), - ("720p", "_low800.mp4"), - ("1080p", ".mp4") + (180, '_low200.mp4'), + (360, '_low400.mp4'), + (720, '_low800.mp4'), + (1080, '.mp4') ] - formats = [] - for fmt in _formats: - format_id, extension = fmt - formats.append({ - 'url': base_url + video_url[:-4] + extension, - 'format_id': format_id - }) + formats = [{ + 'url': base_url + video_url[:-4] + extension, + 'format_id': '%dp' % height, + 'height': height, + } for height, extension in _formats] # extract video metadata - title = self._html_search_meta('title', webpage, 'Title', True) - title = title.partition('-')[2].strip() + title = remove_start( + self._html_search_meta('title', webpage, fatal=True), 'PressTV-') thumbnail = self._og_search_thumbnail(webpage) description = self._og_search_description(webpage) - match = re.match(PressTVIE._VALID_URL, url) upload_date = '%04d%02d%02d' % ( - str_to_int(match.group('y')), - str_to_int(match.group('m')), - str_to_int(match.group('d')) + 
int(mobj.group('y')), + int(mobj.group('m')), + int(mobj.group('d')), ) return { 'id': video_id, + 'display_id': display_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, From 134c207e3faf1ad38a23e31d5067eafe0ef8e92a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 11 Apr 2016 19:17:11 +0800 Subject: [PATCH 027/347] [arte.tv:embed] Extended support (#2620) --- youtube_dl/extractor/arte.py | 2 +- youtube_dl/extractor/generic.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index ae0f27dcb..f042d9163 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -337,7 +337,7 @@ class ArteTVEmbedIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:embed' _VALID_URL = r'''(?x) http://www\.arte\.tv - /playerv2/embed\.php\?json_url= + /(?:playerv2/embed|arte_vp/index)\.php\?json_url= (?P http://arte\.tv/papi/tvguide/videos/stream/player/ (?P[^/]+)/(?P[^/]+)[^&]* diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 589d1e152..2aadd6a12 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1128,6 +1128,18 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # Another form of arte.tv embed + { + 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', + 'md5': '850bfe45417ddf221288c88a0cffe2e2', + 'info_dict': { + 'id': '030273-562_PLUS7-F', + 'ext': 'mp4', + 'title': 'ARTE Reportage - Nulle part, en France', + 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', + 'upload_date': '20160409', + }, + }, ] def report_following_redirect(self, new_url): @@ -1702,7 +1714,7 @@ class GenericIE(InfoExtractor): # Look for embedded arte.tv player mobj = re.search( - r'', webpage, 'JS code') + decoded = self.openload_decode(code) + video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', self.openload_decode(code), 'video URL') + 
r'return\s+"(https?://[^"]+)"', decoded, 'video URL') title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) + ext = mimetype2ext(self._search_regex( + r'window\.vt\s*=\s*(["\'])(?P.+?)\1', decoded, + 'mimetype', default=None, group='mimetype')) or determine_ext( + video_url, 'mp4') + return { 'id': video_id, 'title': title, + 'ext': ext, 'thumbnail': self._og_search_thumbnail(webpage), 'url': video_url, } From e9063b5de9a1118842185768f5e615b76ec8692c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 25 Apr 2016 00:22:55 +0600 Subject: [PATCH 160/347] [openload] Add test --- youtube_dl/extractor/openload.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 697f312c3..9704f2e9d 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -31,6 +31,11 @@ class OpenloadIE(InfoExtractor): }, { 'url': 'https://openload.io/f/ZAn6oz-VZGE/', 'only_matching': True, + }, { + # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout + # for title and ext + 'url': 'https://openload.co/embed/Sxz5sADo82g/', + 'only_matching': True, }] @staticmethod From c83a352227401d7ca7eac045b58043ed576c0cdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 25 Apr 2016 00:26:06 +0600 Subject: [PATCH 161/347] [openload] Make thumbnail optional --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 9704f2e9d..456561bcc 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -122,6 +122,6 @@ class OpenloadIE(InfoExtractor): 'id': video_id, 'title': title, 'ext': ext, - 'thumbnail': self._og_search_thumbnail(webpage), + 
'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, } From fb72ec58ae2612590d661c9943fe6b2fa0864401 Mon Sep 17 00:00:00 2001 From: remitamine Date: Thu, 30 Jul 2015 17:34:38 +0100 Subject: [PATCH 162/347] [extractor/common] do not process f4m manifest that contain akamai playerVerificationChallenge --- youtube_dl/extractor/common.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index dc5080504..e3d1dd076 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -856,6 +856,13 @@ class InfoExtractor(object): # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source) + # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy + akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') + if akamai_pv is not None and ';' in akamai_pv.text: + playerVerificationChallenge = akamai_pv.text.split(';')[0] + if playerVerificationChallenge.strip() != '': + return [] + formats = [] manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') From abc1723edd03d38b256e012d465e3343064f5682 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 25 Apr 2016 22:24:40 +0800 Subject: [PATCH 163/347] [unistra] Sort formats Originally URLs are passed to set() and not sorted, so the result is not deterministic, causing occasional FAILs on Travis CI. 
--- youtube_dl/extractor/unistra.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/unistra.py b/youtube_dl/extractor/unistra.py index 66d9f1bf3..a724cdbef 100644 --- a/youtube_dl/extractor/unistra.py +++ b/youtube_dl/extractor/unistra.py @@ -49,6 +49,7 @@ class UnistraIE(InfoExtractor): 'format_id': format_id, 'quality': quality(format_id) }) + self._sort_formats(formats) title = self._html_search_regex( r'UTV - (.*?)</', webpage, 'title') From 2beff95da5fb28440d26a3dee5de575c792d133c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 22:26:19 +0800 Subject: [PATCH 164/347] [nrk] Comment out unstable MD5 checksums Both are Akamai f4f fragments. --- youtube_dl/extractor/nrk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 9df200822..51dfc27ac 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -23,7 +23,7 @@ class NRKIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'bccd850baebefe23b56d708a113229c2', + # MD5 is unstable 'info_dict': { 'id': '150533', 'ext': 'flv', @@ -34,7 +34,7 @@ class NRKIE(InfoExtractor): }, { 'url': 'http://www.nrk.no/video/PS*154915', - 'md5': '0b1493ba1aae7d9579a5ad5531bc395a', + # MD5 is unstable 'info_dict': { 'id': '154915', 'ext': 'flv', From 6bdc2d5358c2843e3be4d073b2005e5196519664 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 22:27:25 +0800 Subject: [PATCH 165/347] [mitele] Comment out unstable MD5 Also Akamai f4f fragments --- youtube_dl/extractor/mitele.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 7b4581dc5..3589c223d 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -15,9 +15,9 @@ class MiTeleIE(InfoExtractor): IE_DESC = 'mitele.es' _VALID_URL = 
r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' - _TESTS = [{ + _TEST = { 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', - 'md5': '0ff1a13aebb35d9bc14081ff633dd324', + # MD5 is unstable 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', @@ -27,7 +27,7 @@ class MiTeleIE(InfoExtractor): 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - }] + } def _real_extract(self, url): display_id = self._match_id(url) From 4645432d7a92bfb950571dde5dd690110e0f2284 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 22:48:17 +0800 Subject: [PATCH 166/347] [eagleplatform] Checking direct HTTP links Sometimes they fail with 404 --- youtube_dl/extractor/eagleplatform.py | 7 +++++-- youtube_dl/extractor/generic.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 0f8c73fd7..113a4966f 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -23,7 +23,7 @@ class EaglePlatformIE(InfoExtractor): _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', - 'md5': '881ee8460e1b7735a8be938e2ffb362b', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -109,8 +109,11 @@ class EaglePlatformIE(InfoExtractor): mobj = re.search('/([^/]+)/index\.m3u8', m3u8_format['url']) if mobj: http_format = m3u8_format.copy() + video_url = mp4_url.replace(mp4_url_basename, mobj.group(1)) + if not self._is_valid_url(video_url, video_id): + continue http_format.update({ - 'url': mp4_url.replace(mp4_url_basename, mobj.group(1)), + 'url': video_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) diff --git 
a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c63bdbd08..a95501d86 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -887,6 +887,7 @@ class GenericIE(InfoExtractor): # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -901,6 +902,7 @@ class GenericIE(InfoExtractor): # ClipYou (Eagle.Platform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '12820', 'ext': 'mp4', From ad58942d57996f7f43601f22c85b8c6a9afe1b09 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 23:35:05 +0800 Subject: [PATCH 167/347] [muzu] Remove extractor MUZU is shutting down in October 2015. [1] [1] http://www.musicbusinessworldwide.com/youtube-rival-muzu-is-heading-into-liquidation/ --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/muzu.py | 63 ------------------------------ 2 files changed, 64 deletions(-) delete mode 100644 youtube_dl/extractor/muzu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6de3438fc..8b215c5ab 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,7 +439,6 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .muzu import MuzuTVIE from .mwave import MwaveIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE diff --git a/youtube_dl/extractor/muzu.py b/youtube_dl/extractor/muzu.py deleted file mode 100644 index cbc800481..000000000 --- a/youtube_dl/extractor/muzu.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import 
compat_urllib_parse_urlencode - - -class MuzuTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.muzu\.tv/(.+?)/(.+?)/(?P<id>\d+)' - IE_NAME = 'muzu.tv' - - _TEST = { - 'url': 'http://www.muzu.tv/defected/marcashken-featuring-sos-cat-walk-original-mix-music-video/1981454/', - 'md5': '98f8b2c7bc50578d6a0364fff2bfb000', - 'info_dict': { - 'id': '1981454', - 'ext': 'mp4', - 'title': 'Cat Walk (Original Mix)', - 'description': 'md5:90e868994de201b2570e4e5854e19420', - 'uploader': 'MarcAshken featuring SOS', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - info_data = compat_urllib_parse_urlencode({ - 'format': 'json', - 'url': url, - }) - info = self._download_json( - 'http://www.muzu.tv/api/oembed/?%s' % info_data, - video_id, 'Downloading video info') - - player_info = self._download_json( - 'http://player.muzu.tv/player/playerInit?ai=%s' % video_id, - video_id, 'Downloading player info') - video_info = player_info['videos'][0] - for quality in ['1080', '720', '480', '360']: - if video_info.get('v%s' % quality): - break - - data = compat_urllib_parse_urlencode({ - 'ai': video_id, - # Even if each time you watch a video the hash changes, - # it seems to work for different videos, and it will work - # even if you use any non empty string as a hash - 'viewhash': 'VBNff6djeV4HV5TRPW5kOHub2k', - 'device': 'web', - 'qv': quality, - }) - video_url_info = self._download_json( - 'http://player.muzu.tv/player/requestVideo?%s' % data, - video_id, 'Downloading video url') - video_url = video_url_info['url'] - - return { - 'id': video_id, - 'title': info['title'], - 'url': video_url, - 'thumbnail': info['thumbnail_url'], - 'description': info['description'], - 'uploader': info['author_name'], - } From e3de3d6f2f9c82683e76b6bc12697aa7264372ca Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Mon, 25 Apr 2016 23:49:12 +0800 Subject: [PATCH 168/347] [normalboots] Fix extraction Now it's using ScreenwaveMedia --- 
youtube_dl/extractor/normalboots.py | 18 +++++++++--------- youtube_dl/extractor/screenwavemedia.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/normalboots.py b/youtube_dl/extractor/normalboots.py index 77e091072..af44c3bb5 100644 --- a/youtube_dl/extractor/normalboots.py +++ b/youtube_dl/extractor/normalboots.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .screenwavemedia import ScreenwaveMediaIE from ..utils import ( unified_strdate, @@ -12,7 +13,6 @@ class NormalbootsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?normalboots\.com/video/(?P<id>[0-9a-z-]*)/?$' _TEST = { 'url': 'http://normalboots.com/video/home-alone-games-jontron/', - 'md5': '8bf6de238915dd501105b44ef5f1e0f6', 'info_dict': { 'id': 'home-alone-games-jontron', 'ext': 'mp4', @@ -22,9 +22,10 @@ class NormalbootsIE(InfoExtractor): 'upload_date': '20140125', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, + 'add_ie': ['ScreenwaveMedia'], } def _real_extract(self, url): @@ -38,16 +39,15 @@ class NormalbootsIE(InfoExtractor): r'<span style="text-transform:uppercase; font-size:inherit;">[A-Za-z]+, (?P<date>.*)</span>', webpage, 'date', fatal=False)) - player_url = self._html_search_regex( - r'<iframe\swidth="[0-9]+"\sheight="[0-9]+"\ssrc="(?P<url>[\S]+)"', - webpage, 'player url') - player_page = self._download_webpage(player_url, video_id) - video_url = self._html_search_regex( - r"file:\s'(?P<file>[^']+\.mp4)'", player_page, 'file') + screenwavemedia_url = self._html_search_regex( + ScreenwaveMediaIE.EMBED_PATTERN, webpage, 'screenwave URL', + group='url') return { + '_type': 'url_transparent', 'id': video_id, - 'url': video_url, + 'url': screenwavemedia_url, + 'ie_key': ScreenwaveMediaIE.ie_key(), 'title': self._og_search_title(webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), diff --git 
a/youtube_dl/extractor/screenwavemedia.py b/youtube_dl/extractor/screenwavemedia.py index 44b0bbee6..40333c825 100644 --- a/youtube_dl/extractor/screenwavemedia.py +++ b/youtube_dl/extractor/screenwavemedia.py @@ -12,7 +12,7 @@ from ..utils import ( class ScreenwaveMediaIE(InfoExtractor): - _VALID_URL = r'https?://player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = r'(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=(?P<id>[A-Za-z0-9-]+)' EMBED_PATTERN = r'src=(["\'])(?P<url>(?:https?:)?//player\d?\.screenwavemedia\.com/(?:play/)?[a-zA-Z]+\.php\?.*\bid=.+?)\1' _TESTS = [{ 'url': 'http://player.screenwavemedia.com/play/play.php?playerdiv=videoarea&companiondiv=squareAd&id=Cinemassacre-19911', From 749b0046a8664d023ff622dd38844f5c8632f3f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Apr 2016 22:05:47 +0600 Subject: [PATCH 169/347] [ok] Allow embeds without title (Closes #9303) --- youtube_dl/extractor/odnoklassniki.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index f9e064a60..cd614f427 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -60,6 +60,22 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader': 'Алина П', 'age_limit': 0, }, + }, { + # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field) + 'url': 'http://ok.ru/video/62036049272859-0', + 'info_dict': { + 'id': '62036049272859-0', + 'ext': 'mp4', + 'title': 'МУЗЫКА ДОЖДЯ .', + 'description': 'md5:6f1867132bd96e33bf53eda1091e8ed0', + 'upload_date': '20120106', + 'uploader_id': '473534735899', + 'uploader': 'МARINA D', + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -106,7 
+122,14 @@ class OdnoklassnikiIE(InfoExtractor): video_id, 'Downloading metadata JSON') movie = metadata['movie'] - title = movie['title'] + + # Some embedded videos may not contain title in movie dict (e.g. + # http://ok.ru/video/62036049272859-0) thus we allow missing title + # here and it's going to be extracted later by an extractor that + # will process the actual embed. + provider = metadata.get('provider') + title = movie['title'] if provider == 'UPLOADED_ODKL' else movie.get('title') + thumbnail = movie.get('poster') duration = int_or_none(movie.get('duration')) @@ -137,7 +160,7 @@ class OdnoklassnikiIE(InfoExtractor): 'age_limit': age_limit, } - if metadata.get('provider') == 'USER_YOUTUBE': + if provider == 'USER_YOUTUBE': info.update({ '_type': 'url_transparent', 'url': movie['contentId'], From c9fd5306709d0c03487a3b0163b7a33cab6774aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 25 Apr 2016 22:15:15 +0600 Subject: [PATCH 170/347] [ok] Extract start time --- youtube_dl/extractor/odnoklassniki.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index cd614f427..986708e75 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, unified_strdate, @@ -32,7 +36,7 @@ class OdnoklassnikiIE(InfoExtractor): 'skip': 'Video has been blocked', }, { # metadataUrl - 'url': 'http://ok.ru/video/63567059965189-0', + 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', 'md5': '9676cf86eff5391d35dea675d224e131', 'info_dict': { 'id': '63567059965189-0', @@ -44,6 +48,7 @@ class 
OdnoklassnikiIE(InfoExtractor): 'uploader': '☭ Андрей Мещанинов ☭', 'like_count': int, 'age_limit': 0, + 'start_time': 5, }, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) @@ -94,6 +99,9 @@ class OdnoklassnikiIE(InfoExtractor): }] def _real_extract(self, url): + start_time = int_or_none(compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) + video_id = self._match_id(url) webpage = self._download_webpage( @@ -158,6 +166,7 @@ class OdnoklassnikiIE(InfoExtractor): 'uploader_id': uploader_id, 'like_count': like_count, 'age_limit': age_limit, + 'start_time': start_time, } if provider == 'USER_YOUTUBE': From f1f879098a38c786d78927df8915b547f7ac3569 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Tue, 26 Apr 2016 13:39:53 +0100 Subject: [PATCH 171/347] [viewster] extract more metadata for http formats --- youtube_dl/extractor/viewster.py | 37 ++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 7839225d4..6edc2c44e 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -118,6 +118,7 @@ class ViewsterIE(InfoExtractor): formats = [] manifest_url = None + m3u8_formats = [] for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): media = self._download_json( 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' @@ -154,18 +155,32 @@ class ViewsterIE(InfoExtractor): 'qualities', default=None) if not qualities: continue - qualities = qualities.strip(',').split(',') - http_template = re.sub(QUALITIES_RE, r'%s', qualities_basename) + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) http_url_basename = url_basename(video_url) - for q in qualities: - tbr = int_or_none(self._search_regex( - r'(\d+)k', q, 'bitrate', 
default=None)) - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http' + ('-%d' % tbr if tbr else ''), - 'tbr': tbr, - }) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): self.raise_geo_restricted() From 175c2e9ec326f9ef820413837608eb4f5c8c5961 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Tue, 26 Apr 2016 22:29:29 +0600 Subject: [PATCH 172/347] [youtube:search_url] Reimplement in terms of youtube:playlistbase --- youtube_dl/extractor/youtube.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 44f98d294..b7c3cb63f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2139,10 +2139,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(InfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' + _VIDEO_RE = 
r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -2157,32 +2158,8 @@ class YoutubeSearchURLIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) - webpage = self._download_webpage(url, query) - result_code = self._search_regex( - r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML') - - part_codes = re.findall( - r'(?s)<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*>(.*?)</h3>', result_code) - entries = [] - for part_code in part_codes: - part_title = self._html_search_regex( - [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False) - part_url_snippet = self._html_search_regex( - r'(?s)href="([^"]+)"', part_code, 'item URL') - part_url = compat_urlparse.urljoin( - 'https://www.youtube.com/', part_url_snippet) - entries.append({ - '_type': 'url', - 'url': part_url, - 'title': part_title, - }) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': query, - } + return self.playlist_result(self._process_page(webpage), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From 7464360379a1a3fc6ba3228f54dd4853df349142 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Wed, 27 Apr 2016 00:16:48 +0600 Subject: [PATCH 173/347] [README.md] Add FAQ entry on output template conflicts --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index e062444b3..fb57b0323 100644 --- a/README.md +++ b/README.md @@ -697,6 +697,10 @@ YouTube changed their playlist format in March 2014 and later on, so you'll need If you have installed youtube-dl with a package manager, pip, setup.py or a tarball, please use that to update. Note that Ubuntu packages do not seem to get updated anymore. 
Since we are not affiliated with Ubuntu, there is little we can do. Feel free to [report bugs](https://bugs.launchpad.net/ubuntu/+source/youtube-dl/+filebug) to the [Ubuntu packaging guys](mailto:ubuntu-motu@lists.ubuntu.com?subject=outdated%20version%20of%20youtube-dl) - all they have to do is update the package to a somewhat recent version. See above for a way to update. +### I'm getting an error when trying to use output template: `error: using output template conflicts with using title, video ID or auto number` + +Make sure you are not using `-o` with any of these options `-t`, `--title`, `--id`, `-A` or `--auto-number` set in command line or in a configuration file. Remove the latter if any. + ### Do I always have to pass `-citw`? By default, youtube-dl intends to have the best options (incidentally, if you have a convincing case that these should be different, [please file an issue where you explain that](https://yt-dl.org/bug)). Therefore, it is unnecessary and sometimes harmful to copy long option strings from webpages. In particular, the only option out of `-citw` that is regularly useful is `-i`. From 046ea04a7d8601a85007430a7a3da3ce236549f7 Mon Sep 17 00:00:00 2001 From: Sergey M <dstftw@gmail.com> Date: Wed, 27 Apr 2016 00:22:08 +0600 Subject: [PATCH 174/347] [README.md] Mention mpv --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fb57b0323..ecf737047 100644 --- a/README.md +++ b/README.md @@ -721,7 +721,7 @@ Videos or video formats streamed via RTMP protocol can only be downloaded when [ ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org) or [mplayer](http://www.mplayerhq.hu/). 
### I extracted a video URL with `-g`, but it does not play on another machine / in my webbrowser. From a7e03861e8d0ce18ad698e0e38ffac40a09cef8b Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 13:52:04 +0800 Subject: [PATCH 175/347] [scivee] Skip the test Not accessible from either Travis CI or my machine. Closes #9315 --- youtube_dl/extractor/scivee.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py index 3bf93c870..b1ca12fde 100644 --- a/youtube_dl/extractor/scivee.py +++ b/youtube_dl/extractor/scivee.py @@ -18,6 +18,7 @@ class SciVeeIE(InfoExtractor): 'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting', 'description': 'md5:81f1710638e11a481358fab1b11059d7', }, + 'skip': 'Not accessible from Travis CI server', } def _real_extract(self, url): From 2ac2cbc0a351785e0c6d034bd1bab77973ec7a41 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 13:55:32 +0800 Subject: [PATCH 176/347] [malemotion] Remove the extractor Announcement from their homepage: ``` MaleMotion is closed After another system crash, I'm forced to close the site This week all content will be erased Don't forget to cancel your subscription if any ! ``` Closes #9311. 
--- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/malemotion.py | 46 ------------------------------ 2 files changed, 47 deletions(-) delete mode 100644 youtube_dl/extractor/malemotion.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b215c5ab..00f8a7a85 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -400,7 +400,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .makerschannel import MakersChannelIE from .makertv import MakerTVIE -from .malemotion import MalemotionIE from .matchtv import MatchTVIE from .mdr import MDRIE from .metacafe import MetacafeIE diff --git a/youtube_dl/extractor/malemotion.py b/youtube_dl/extractor/malemotion.py deleted file mode 100644 index 92511a671..000000000 --- a/youtube_dl/extractor/malemotion.py +++ /dev/null @@ -1,46 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class MalemotionIE(InfoExtractor): - _VALID_URL = r'https?://malemotion\.com/video/(.+?)\.(?P<id>.+?)(#|$)' - _TEST = { - 'url': 'http://malemotion.com/video/bete-de-concours.ltc', - 'md5': '3013e53a0afbde2878bc39998c33e8a5', - 'info_dict': { - 'id': 'ltc', - 'ext': 'mp4', - 'title': 'Bête de Concours', - 'age_limit': 18, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = compat_urllib_parse_unquote(self._search_regex( - r'<source type="video/mp4" src="(.+?)"', webpage, 'video URL')) - video_title = self._html_search_regex( - r'<title>(.*?)</title', webpage, 'title') - video_thumbnail = self._search_regex( - r'<video .+?poster="(.+?)"', webpage, 'thumbnail', fatal=False) - - formats = [{ - 'url': video_url, - 'ext': 'mp4', - 'format_id': 'mp4', - 'preference': 1, - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': 
formats, - 'title': video_title, - 'thumbnail': video_thumbnail, - 'age_limit': 18, - } From 5b5d7cc11e3037408aeedf8d6dc57ac228b02496 Mon Sep 17 00:00:00 2001 From: Peter Rowlands <peter@pmrowla.com> Date: Wed, 27 Apr 2016 15:57:17 +0900 Subject: [PATCH 177/347] [mwave] Add Mwave Meet & Greet extractor --- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/mwave.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8b215c5ab..9d1992721 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -439,7 +439,7 @@ from .mtv import ( ) from .muenchentv import MuenchenTVIE from .musicplayon import MusicPlayOnIE -from .mwave import MwaveIE +from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import MyviIE diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 5c3c8d464..6485c6928 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -56,3 +56,26 @@ class MwaveIE(InfoExtractor): 'view_count': int_or_none(vod_info.get('hit')), 'formats': formats, } + + +class MwaveMeetGreetIE(InfoExtractor): + _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://mwave.interest.me/meetgreet/view/256', + 'info_dict': { + 'id': '173294', + 'ext': 'flv', + 'title': '[MEET&GREET] Park BoRam', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Mwave', + 'duration': 3634, + 'view_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + clip_id = self._html_search_regex(r'<iframe src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)', webpage, 'clip ID') + clip_url = 
'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id={0}'.format(clip_id) + return self.url_result(clip_url, 'Mwave', clip_id) From dcf094d62699f8ad06ceaf3fba55e453980fac91 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Wed, 27 Apr 2016 18:08:23 +0800 Subject: [PATCH 178/347] [theplatform] Fix for Python 3.2 test_AENetworks{,_1} fails as in Python < 3.3, binascii.a2b_* functions accepts only bytes-like objects --- youtube_dl/extractor/theplatform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 8272dd969..a25417f94 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -159,11 +159,11 @@ class ThePlatformIE(ThePlatformBaseIE): def str_to_hex(str): return binascii.b2a_hex(str.encode('ascii')).decode('ascii') - def hex_to_str(hex): - return binascii.a2b_hex(hex) + def hex_to_bytes(hex): + return binascii.a2b_hex(hex.encode('ascii')) relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1) - clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path)) + clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path)) checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest() sig = flags + expiration_date + checksum + str_to_hex(sig_secret) return '%s&sig=%s' % (url, sig) From 3cc8649c9d42bab8c7b665115ebdc569bf44a762 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 02:58:11 +0800 Subject: [PATCH 179/347] [20min] Detect embedded YouTube videos Fixes #9331 --- youtube_dl/extractor/twentymin.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index ca7d953b8..b721ecb0a 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ 
-32,7 +32,22 @@ class TwentyMinutenIE(InfoExtractor): 'title': '«Wir müssen mutig nach vorne schauen»', 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. Das Land müsse aber seine Hausaufgaben machen.', 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' - } + }, + 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.', + }, { + # YouTube embed + 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', + 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f', + 'info_dict': { + 'id': 'ivM7A7SpDOs', + 'ext': 'mp4', + 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016', + 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a', + 'upload_date': '20160424', + 'uploader': 'RTVCM Castilla-La Mancha', + 'uploader_id': 'RTVCM', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, @@ -48,6 +63,12 @@ class TwentyMinutenIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + youtube_url = self._html_search_regex( + r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', + webpage, 'YouTube embed URL', default=None) + if youtube_url is not None: + return self.url_result(youtube_url, 'Youtube') + title = self._html_search_regex( r'<h1>.*?<span>(.+?)</span></h1>', webpage, 'title', default=None) From 52af8f222bc4f067b4c5e7a977a64345d35ae4fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 04:01:21 +0600 Subject: [PATCH 180/347] [cwtv] Relax _VALID_URL (Closes #9327) --- youtube_dl/extractor/cwtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index f5cefd966..ebd14cb16 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py 
@@ -9,7 +9,7 @@ from ..utils import ( class CWTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' _TESTS = [{ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', 'info_dict': { @@ -48,6 +48,9 @@ class CWTVIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', + 'only_matching': True, }] def _real_extract(self, url): From 618c71dc64086f751b6ae87d5f32687e02a54e58 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 15:00:02 +0800 Subject: [PATCH 181/347] [cloudy] New domain name for the test_cloudy_1 I'm not sure whether videoraj.ch still works or not, so keep it.
--- youtube_dl/extractor/cloudy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9e267e6c0..9a28ef354 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -19,7 +19,7 @@ from ..utils import ( class CloudyIE(InfoExtractor): _IE_DESC = 'cloudy.ec and videoraj.ch' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.ch)/ + https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' @@ -37,7 +37,7 @@ class CloudyIE(InfoExtractor): } }, { - 'url': 'http://www.videoraj.ch/v/47f399fd8bb60', + 'url': 'http://www.videoraj.to/v/47f399fd8bb60', 'md5': '7d0f8799d91efd4eda26587421c3c3b0', 'info_dict': { 'id': '47f399fd8bb60', From a5941305b6ba0921ea4f34641dd9095372dd1c1d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 16:03:08 +0800 Subject: [PATCH 182/347] [mwave] Coding style --- youtube_dl/extractor/mwave.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/mwave.py b/youtube_dl/extractor/mwave.py index 6485c6928..a103e0323 100644 --- a/youtube_dl/extractor/mwave.py +++ b/youtube_dl/extractor/mwave.py @@ -10,6 +10,7 @@ from ..utils import ( class MwaveIE(InfoExtractor): _VALID_URL = r'https?://mwave\.interest\.me/mnettv/videodetail\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)' + _URL_TEMPLATE = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=%s' _TEST = { 'url': 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id=168859', # md5 is unstable @@ -59,7 +60,7 @@ class MwaveIE(InfoExtractor): class MwaveMeetGreetIE(InfoExtractor): - _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>[0-9]+)' + _VALID_URL = r'https?://mwave\.interest\.me/meetgreet/view/(?P<id>\d+)' _TEST = { 'url': 'http://mwave.interest.me/meetgreet/view/256', 
'info_dict': { @@ -76,6 +77,8 @@ class MwaveMeetGreetIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - clip_id = self._html_search_regex(r'<iframe src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(?P<id>[0-9]+)', webpage, 'clip ID') - clip_url = 'http://mwave.interest.me/mnettv/videodetail.m?searchVideoDetailVO.clip_id={0}'.format(clip_id) + clip_id = self._html_search_regex( + r'<iframe[^>]+src="/mnettv/ifr_clip\.m\?searchVideoDetailVO\.clip_id=(\d+)', + webpage, 'clip ID') + clip_url = MwaveIE._URL_TEMPLATE % clip_id return self.url_result(clip_url, 'Mwave', clip_id) From 7f776fa4b510b7973e08f06de556fa39cb5946e5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 17:08:41 +0800 Subject: [PATCH 183/347] [yandexmusic] Skip tests as Travis CI blocked --- youtube_dl/extractor/yandexmusic.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 7a90cc60c..0d32a612f 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -22,6 +22,12 @@ class YandexMusicBaseIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) + def _download_webpage(self, *args, **kwargs): + webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) + if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' 
in webpage: + raise ExtractorError('Blocked by YandexMusic', expected=True) + return webpage + def _download_json(self, *args, **kwargs): response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs) self._handle_error(response) @@ -47,7 +53,8 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'album_artist': 'Carlo Ambrosio', 'artist': 'Carlo Ambrosio & Fabio Di Bari, Carlo Ambrosio', 'release_year': '2009', - } + }, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _get_track_url(self, storage_dir, track_id): @@ -139,6 +146,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE): 'title': 'Carlo Ambrosio - Gypsy Soul (2009)', }, 'playlist_count': 50, + 'skip': 'Travis CI servers blocked by YandexMusic', } def _real_extract(self, url): @@ -171,6 +179,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9', }, 'playlist_count': 6, + 'skip': 'Travis CI servers blocked by YandexMusic', }, { # playlist exceeding the limit of 150 tracks shipped with webpage (see # https://github.com/rg3/youtube-dl/issues/6666) @@ -180,6 +189,7 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'title': 'Музыка 90-х', }, 'playlist_count': 310, + 'skip': 'Travis CI servers blocked by YandexMusic', }] def _real_extract(self, url): From 0cbcbdd89dbc3573ecfcf68496c54bd84804967d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 28 Apr 2016 17:51:20 +0800 Subject: [PATCH 184/347] [nuvid] Fix extraction Closes #7620 --- youtube_dl/extractor/nuvid.py | 44 +++++++++++++++++------------------ 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index 9fa7cefad..ab6bfcd7f 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -5,8 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( parse_duration, - sanitized_Request, - unified_strdate, ) @@ -20,7 +18,6 @@ class 
NuvidIE(InfoExtractor): 'ext': 'mp4', 'title': 'Horny babes show their awesome bodeis and', 'duration': 129, - 'upload_date': '20140508', 'age_limit': 18, } } @@ -28,28 +25,31 @@ class NuvidIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - formats = [] + page_url = 'http://m.nuvid.com/video/%s' % video_id + webpage = self._download_webpage( + page_url, video_id, 'Downloading video page') + # When dwnld_speed exists and has a value larger than the MP4 file's + # bitrate, Nuvid returns the MP4 URL + # It's unit is 100bytes/millisecond, see mobile-nuvid-min.js for the algorithm + self._set_cookie('nuvid.com', 'dwnld_speed', '10.0') + mp4_webpage = self._download_webpage( + page_url, video_id, 'Downloading video page for MP4 format') - for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]: - request = sanitized_Request( - 'http://m.nuvid.com/play/%s' % video_id) - request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed) - webpage = self._download_webpage( - request, video_id, 'Downloading %s page' % format_id) - video_url = self._html_search_regex( - r'<a\s+href="([^"]+)"\s+class="b_link">', webpage, '%s video URL' % format_id, fatal=False) - if not video_url: - continue + html5_video_re = r'(?s)<(?:video|audio)[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', + video_url = self._html_search_regex(html5_video_re, webpage, video_id) + mp4_video_url = self._html_search_regex(html5_video_re, mp4_webpage, video_id) + formats = [{ + 'url': video_url, + }] + if mp4_video_url != video_url: formats.append({ - 'url': video_url, - 'format_id': format_id, + 'url': mp4_video_url, }) - webpage = self._download_webpage( - 'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page') title = self._html_search_regex( [r'<span title="([^"]+)">', - r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>'], webpage, 'title').strip() + r'<div class="thumb-holder video">\s*<h5[^>]*>([^<]+)</h5>', + 
r'<span[^>]+class="title_thumb">([^<]+)</span>'], webpage, 'title').strip() thumbnails = [ { 'url': thumb_url, @@ -57,9 +57,8 @@ class NuvidIE(InfoExtractor): ] thumbnail = thumbnails[0]['url'] if thumbnails else None duration = parse_duration(self._html_search_regex( - r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', webpage, 'duration', fatal=False)) - upload_date = unified_strdate(self._html_search_regex( - r'<i class="fa fa-user"></i>\s*(\d{4}-\d{2}-\d{2})', webpage, 'upload date', fatal=False)) + [r'<i class="fa fa-clock-o"></i>\s*(\d{2}:\d{2})', + r'<span[^>]+class="view_time">([^<]+)</span>'], webpage, 'duration', fatal=False)) return { 'id': video_id, @@ -67,7 +66,6 @@ class NuvidIE(InfoExtractor): 'thumbnails': thumbnails, 'thumbnail': thumbnail, 'duration': duration, - 'upload_date': upload_date, 'age_limit': 18, 'formats': formats, } From eebe6b382eb6bd9e8118b616f3dde48c294e3b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 21:37:34 +0600 Subject: [PATCH 185/347] [yandexmusic] Improve error handling --- youtube_dl/extractor/yandexmusic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 0d32a612f..b0e68a087 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -18,9 +18,10 @@ from ..utils import ( class YandexMusicBaseIE(InfoExtractor): @staticmethod def _handle_error(response): - error = response.get('error') - if error: - raise ExtractorError(error, expected=True) + if isinstance(response, dict): + error = response.get('error') + if error: + raise ExtractorError(error, expected=True) def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) From 4b537629143c8f51c5814c650227971c438b12e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 28 Apr 2016 
21:45:33 +0600 Subject: [PATCH 186/347] [yandexmusic] Clarify blockage --- youtube_dl/extractor/yandexmusic.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index b0e68a087..a33fe3d83 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -26,7 +26,11 @@ class YandexMusicBaseIE(InfoExtractor): def _download_webpage(self, *args, **kwargs): webpage = super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: - raise ExtractorError('Blocked by YandexMusic', expected=True) + raise ExtractorError( + 'YandexMusic asks you to solve a CAPTCHA: go to ' + 'https://music.yandex.ru/ and solve it, then export ' + 'cookies and pass cookie file to youtube-dl with --cookies', + expected=True) return webpage def _download_json(self, *args, **kwargs): From 0ba9e3ca2233d018d695bac4eebe0e34043a7ec9 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 28 Apr 2016 17:44:33 +0100 Subject: [PATCH 187/347] [viewster] extract formats for videos with multiple audios/subtitles --- youtube_dl/extractor/viewster.py | 147 +++++++++++++++++-------------- 1 file changed, 80 insertions(+), 67 deletions(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 6edc2c44e..1813b81d6 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -78,11 +78,11 @@ class ViewsterIE(InfoExtractor): _ACCEPT_HEADER = 'application/json, text/javascript, */*; q=0.01' - def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True): + def _download_json(self, url, video_id, note='Downloading JSON metadata', fatal=True, query={}): request = sanitized_Request(url) request.add_header('Accept', self._ACCEPT_HEADER) request.add_header('Auth-token', self._AUTH_TOKEN) - 
return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal) + return super(ViewsterIE, self)._download_json(request, video_id, note, fatal=fatal, query=query) def _real_extract(self, url): video_id = self._match_id(url) @@ -117,72 +117,85 @@ class ViewsterIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) formats = [] - manifest_url = None - m3u8_formats = [] - for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): - media = self._download_json( - 'https://public-api.viewster.com/movies/%s/video?mediaType=%s' - % (entry_id, compat_urllib_parse.quote(media_type)), - video_id, 'Downloading %s JSON' % media_type, fatal=False) - if not media: - continue - video_url = media.get('Uri') - if not video_url: - continue - ext = determine_ext(video_url) - if ext == 'f4m': - manifest_url = video_url - video_url += '&' if '?' in video_url else '?' - video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds')) - elif ext == 'm3u8': - manifest_url = video_url - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False) # m3u8 sometimes fail - if m3u8_formats: - formats.extend(m3u8_formats) - else: - qualities_basename = self._search_regex( - '/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if not qualities_basename: - continue - QUALITIES_RE = r'((,\d+k)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if not qualities: - continue - qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) - http_url_basename = url_basename(video_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - 
m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': video_url.replace(http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for language_set in info.get('LanguageSets', []): + manifest_url = None + m3u8_formats = [] + audio = language_set.get('Audio') or '' + subtitle = language_set.get('Subtitle') or '' + base_format_id = audio + if subtitle: + base_format_id += '-%s' % subtitle - if not formats and not info.get('LanguageSets') and not info.get('VODSettings'): + def concat(suffix, sep='-'): + return (base_format_id + '%s%s' % (sep, suffix)) if base_format_id else suffix + + for media_type in ('application/f4m+xml', 'application/x-mpegURL', 'video/mp4'): + media = self._download_json( + 'https://public-api.viewster.com/movies/%s/video' % entry_id, + video_id, 'Downloading %s JSON' % concat(media_type, ' '), fatal=False, query={ + 'mediaType': media_type, + 'language': audio, + 'subtitle': subtitle, + }) + if not media: + continue + video_url = media.get('Uri') + if not video_url: + continue + ext = determine_ext(video_url) + if ext == 'f4m': + manifest_url = video_url + video_url += '&' if '?' in video_url else '?' 
+ video_url += 'hdcore=3.2.0&plugin=flowplayer-3.2.0.1' + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=concat('hds'))) + elif ext == 'm3u8': + manifest_url = video_url + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id=concat('hls'), + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) + else: + qualities_basename = self._search_regex( + '/([^/]+)\.csmil/', + manifest_url, 'qualities basename', default=None) + if not qualities_basename: + continue + QUALITIES_RE = r'((,\d+k)+,?)' + qualities = self._search_regex( + QUALITIES_RE, qualities_basename, + 'qualities', default=None) + if not qualities: + continue + qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(','))) + qualities.sort() + http_template = re.sub(QUALITIES_RE, r'%dk', qualities_basename) + http_url_basename = url_basename(video_url) + if m3u8_formats: + self._sort_formats(m3u8_formats) + m3u8_formats = list(filter( + lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + m3u8_formats)) + if len(qualities) == len(m3u8_formats): + for q, m3u8_format in zip(qualities, m3u8_formats): + f = m3u8_format.copy() + f.update({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'format_id': f['format_id'].replace('hls', 'http'), + 'protocol': 'http', + }) + formats.append(f) + else: + for q in qualities: + formats.append({ + 'url': video_url.replace(http_url_basename, http_template % q), + 'ext': 'mp4', + 'format_id': 'http-%d' % q, + 'tbr': q, + }) + + if not formats and not info.get('VODSettings'): self.raise_geo_restricted() self._sort_formats(formats) From e757fb3d053a195da4084c08a59a7b17b08ba598 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 28 Apr 2016 18:42:20 +0100 Subject: [PATCH 188/347] [crunchyroll] improve extraction - extract more metadata(series, episode, episode_number) - reduce duplicate requests for extracting formats 
- remove duplicate formats --- youtube_dl/extractor/crunchyroll.py | 31 +++++++++++++++++------------ 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 8ae3f2890..dd753c7c3 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -11,7 +11,6 @@ from math import pow, sqrt, floor from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, - compat_urllib_parse_unquote, compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, @@ -306,28 +305,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', webpage, 'video_uploader', fatal=False) - playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url')) - playerdata_req = sanitized_Request(playerdata_url) - playerdata_req.data = urlencode_postdata({'current_page': webpage_url}) - playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info') - - stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id') - video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False) - formats = [] - for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage): + video_encode_ids = [] + for fmt in re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage): stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s' - % (stream_id, stream_format, stream_quality), + % (video_id, stream_format, stream_quality), compat_urllib_parse_urlencode({'current_page': url}).encode('utf-8')) 
streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded') streamdata = self._download_xml( streamdata_req, video_id, note='Downloading media info for %s' % video_format) stream_info = streamdata.find('./{default}preload/stream_info') + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) video_url = xpath_text(stream_info, './host') video_play_path = xpath_text(stream_info, './file') if not video_url or not video_play_path: @@ -360,15 +355,25 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text }) formats.append(format_info) + metadata = self._download_xml( + 'http://www.crunchyroll.com/xml', video_id, + note='Downloading media info', query={ + 'req': 'RpcApiVideoPlayer_GetMediaMetadata', + 'media_id': video_id, + }) + subtitles = self.extract_subtitles(video_id, webpage) return { 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': video_thumbnail, + 'thumbnail': xpath_text(metadata, 'episode_image_url'), 'uploader': video_uploader, 'upload_date': video_upload_date, + 'series': xpath_text(metadata, 'series_title'), + 'episode': xpath_text(metadata, 'episode_title'), + 'episode_number': int_or_none(xpath_text(metadata, 'episode_number')), 'subtitles': subtitles, 'formats': formats, } From 497971cd4a8407651debfb2fd4b10fc4009b0f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 01:28:07 +0600 Subject: [PATCH 189/347] [yandexmusic] Clarify blockage even more --- youtube_dl/extractor/yandexmusic.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index a33fe3d83..ce3723b55 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -27,9 +27,12 @@ class YandexMusicBaseIE(InfoExtractor): webpage 
= super(YandexMusicBaseIE, self)._download_webpage(*args, **kwargs) if 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.' in webpage: raise ExtractorError( - 'YandexMusic asks you to solve a CAPTCHA: go to ' - 'https://music.yandex.ru/ and solve it, then export ' - 'cookies and pass cookie file to youtube-dl with --cookies', + 'YandexMusic has considered youtube-dl requests automated and ' + 'asks you to solve a CAPTCHA. You can either wait for some ' + 'time until unblocked and optionally use --sleep-interval ' + 'in future or alternatively you can go to https://music.yandex.ru/ ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies', expected=True) return webpage From 683d892bf9332df1a255c673bca56a8f5487292a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 01:30:53 +0600 Subject: [PATCH 190/347] [viewster] Remove unused import --- youtube_dl/extractor/viewster.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 1813b81d6..a93196a07 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, - compat_urllib_parse, compat_urllib_parse_unquote, ) from ..utils import ( From 72670c39decc296a3ee757301dc70389674d19c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 29 Apr 2016 04:46:23 +0600 Subject: [PATCH 191/347] [arte:+7] Fix typo in _VALID_URL --- youtube_dl/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index a9e3266dc..881cacfab 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -63,7 +63,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 
'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&+])' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P<id>[^/]+)/(?P<name>[^/?#&]+)' @classmethod def _extract_url_info(cls, url): From 31ff3c074eddf4078b6eb49281830875eb4e65a1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 13:36:52 +0800 Subject: [PATCH 192/347] [sexykarma] Remove the extractor Its domain name is on sale. Closes #9317 --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/sexykarma.py | 121 ----------------------------- 2 files changed, 122 deletions(-) delete mode 100644 youtube_dl/extractor/sexykarma.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88405f070..41ff1e7a5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,7 +657,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py deleted file mode 100644 index e33483674..000000000 --- a/youtube_dl/extractor/sexykarma.py +++ /dev/null @@ -1,121 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate, - parse_duration, - int_or_none, -) - - -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 
'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', - 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', - 'ext': 'mp4', - 'title': 'Taking a quick pee.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - webpage = self._download_webpage(url, display_id) - - video_url = self._html_search_regex( - r"url: escape\('([^']+)'\)", webpage, 'url') - - title = self._html_search_regex( - r'<h2 class="he2"><span>(.*?)</span>', - webpage, 'title') - thumbnail = self._html_search_regex( - r'<span 
id="container"><img\s+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) - - uploader = self._html_search_regex( - r'class="aupa">\s*(.*?)</a>', - webpage, 'uploader') - upload_date = unified_strdate(self._html_search_regex( - r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) - - duration = parse_duration(self._search_regex( - r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', - webpage, 'duration', fatal=False)) - - view_count = int_or_none(self._search_regex( - r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'view count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', - webpage, 'comment count', fatal=False)) - - categories = re.findall( - r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', - webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'categories': categories, - 'age_limit': 18, - } From f5535ed0e3537acee90820c98d6ca474d437d7d0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 14:24:07 +0800 Subject: [PATCH 193/347] [orf] Skip the expired test --- youtube_dl/extractor/orf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 66c75f8b3..4e3864f0d 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -185,6 +185,7 @@ class ORFFM4IE(InfoExtractor): 'timestamp': 1452456073, 'upload_date': '20160110', }, + 'skip': 'Live streams on FM4 got deleted soon', } def _real_extract(self, url): From 5819edef034819b76b8eec6a0cdf7b29cc9ddff3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 14:27:15 +0800 Subject: [PATCH 
194/347] [ooyala] Skip an invalid test Ooyala is used by lots of extractors and its correctness can be verified by these websites. --- youtube_dl/extractor/ooyala.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 16f040191..95e982897 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -96,6 +96,8 @@ class OoyalaIE(OoyalaBaseIE): 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? Apparently, smashed to bits.', 'duration': 853.386, }, + # The video in the original webpage now uses PlayWire + 'skip': 'Ooyala said: movie expired', }, { # Only available for ipad 'url': 'http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', From 1910077ed77a270fea8e368c3815b23cee254f85 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 17:59:23 +0800 Subject: [PATCH 195/347] Revert "[sexykarma] Remove the extractor" This reverts commit 31ff3c074eddf4078b6eb49281830875eb4e65a1. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/sexykarma.py | 121 +++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 youtube_dl/extractor/sexykarma.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 41ff1e7a5..88405f070 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,6 +657,7 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE +from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/sexykarma.py new file mode 100644 index 000000000..e33483674 --- /dev/null +++ b/youtube_dl/extractor/sexykarma.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + parse_duration, + int_or_none, +) + + +class SexyKarmaIE(InfoExtractor): + IE_DESC = 'Sexy Karma and Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TESTS = [{ + 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', + 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', + 'info_dict': { + 'id': 'yHI70cOyIHt', + 'display_id': 'taking-a-quick-pee', + 'ext': 'mp4', + 'title': 'Taking a quick pee.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'wildginger7', + 'upload_date': '20141008', + 'duration': 22, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', + 'md5': 'dd216c68d29b49b12842b9babe762a5d', + 'info_dict': { + 'id': '8Id6EZPbuHf', + 
'display_id': 'pot-pixie-tribute', + 'ext': 'mp4', + 'title': 'pot_pixie tribute', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'banffite', + 'upload_date': '20141013', + 'duration': 16, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', + 'md5': '9afb80675550406ed9a63ac2819ef69d', + 'info_dict': { + 'id': 'dW2mtctxJfs', + 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', + 'ext': 'mp4', + 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Don', + 'upload_date': '20140213', + 'duration': 83, + 'view_count': int, + 'comment_count': int, + 'categories': list, + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + + video_url = self._html_search_regex( + r"url: escape\('([^']+)'\)", webpage, 'url') + + title = self._html_search_regex( + r'<h2 class="he2"><span>(.*?)</span>', + webpage, 'title') + thumbnail = self._html_search_regex( + r'<span id="container"><img\s+src="([^"]+)"', + webpage, 'thumbnail', fatal=False) + + uploader = self._html_search_regex( + r'class="aupa">\s*(.*?)</a>', + webpage, 'uploader') + upload_date = unified_strdate(self._html_search_regex( + r'Added: <strong>(.+?)</strong>', webpage, 'upload date', fatal=False)) + + duration = parse_duration(self._search_regex( + r'<td>Time:\s*</td>\s*<td align="right"><span>\s*(.+?)\s*</span>', + webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + r'<td>Views:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'view count', fatal=False)) + comment_count = 
int_or_none(self._search_regex( + r'<td>Comments:\s*</td>\s*<td align="right"><span>\s*(\d+)\s*</span>', + webpage, 'comment count', fatal=False)) + + categories = re.findall( + r'<a href="[^"]+/search/video/desi"><span>([^<]+)</span></a>', + webpage) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'categories': categories, + 'age_limit': 18, + } From 14638e291511c3305b70dce64e9bd97686e9da93 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 29 Apr 2016 18:17:08 +0800 Subject: [PATCH 196/347] [sexykarma] Rename to WatchIndianPornIE and fix extraction --- youtube_dl/extractor/extractors.py | 2 +- .../{sexykarma.py => watchindianporn.py} | 63 +++++-------------- 2 files changed, 17 insertions(+), 48 deletions(-) rename youtube_dl/extractor/{sexykarma.py => watchindianporn.py} (54%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88405f070..3adcd41c4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -657,7 +657,6 @@ from .screenwavemedia import ScreenwaveMediaIE, TeamFourIE from .senateisvp import SenateISVPIE from .servingsys import ServingSysIE from .sexu import SexuIE -from .sexykarma import SexyKarmaIE from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE @@ -918,6 +917,7 @@ from .vulture import VultureIE from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE +from .watchindianporn import WatchIndianPornIE from .wdr import ( WDRIE, WDRMobileIE, diff --git a/youtube_dl/extractor/sexykarma.py b/youtube_dl/extractor/watchindianporn.py similarity index 54% rename from youtube_dl/extractor/sexykarma.py rename to youtube_dl/extractor/watchindianporn.py index 
e33483674..5d3b5bdb4 100644 --- a/youtube_dl/extractor/sexykarma.py +++ b/youtube_dl/extractor/watchindianporn.py @@ -11,61 +11,27 @@ from ..utils import ( ) -class SexyKarmaIE(InfoExtractor): - IE_DESC = 'Sexy Karma and Watch Indian Porn' - _VALID_URL = r'https?://(?:www\.)?(?:sexykarma\.com|watchindianporn\.net)/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' - _TESTS = [{ - 'url': 'http://www.sexykarma.com/gonewild/video/taking-a-quick-pee-yHI70cOyIHt.html', - 'md5': 'b9798e7d1ef1765116a8f516c8091dbd', +class WatchIndianPornIE(InfoExtractor): + IE_DESC = 'Watch Indian Porn' + _VALID_URL = r'https?://(?:www\.)?watchindianporn\.net/(?:[^/]+/)*video/(?P<display_id>[^/]+)-(?P<id>[a-zA-Z0-9]+)\.html' + _TEST = { + 'url': 'http://www.watchindianporn.net/video/hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera-RZa2avywNPa.html', + 'md5': '249589a164dde236ec65832bfce17440', 'info_dict': { - 'id': 'yHI70cOyIHt', - 'display_id': 'taking-a-quick-pee', + 'id': 'RZa2avywNPa', + 'display_id': 'hot-milf-from-kerala-shows-off-her-gorgeous-large-breasts-on-camera', 'ext': 'mp4', - 'title': 'Taking a quick pee.', + 'title': 'Hot milf from kerala shows off her gorgeous large breasts on camera', 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'wildginger7', - 'upload_date': '20141008', - 'duration': 22, + 'uploader': 'LoveJay', + 'upload_date': '20160428', + 'duration': 226, 'view_count': int, 'comment_count': int, 'categories': list, 'age_limit': 18, } - }, { - 'url': 'http://www.sexykarma.com/gonewild/video/pot-pixie-tribute-8Id6EZPbuHf.html', - 'md5': 'dd216c68d29b49b12842b9babe762a5d', - 'info_dict': { - 'id': '8Id6EZPbuHf', - 'display_id': 'pot-pixie-tribute', - 'ext': 'mp4', - 'title': 'pot_pixie tribute', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'banffite', - 'upload_date': '20141013', - 'duration': 16, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }, { - 'url': 
'http://www.watchindianporn.net/video/desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number-dW2mtctxJfs.html', - 'md5': '9afb80675550406ed9a63ac2819ef69d', - 'info_dict': { - 'id': 'dW2mtctxJfs', - 'display_id': 'desi-dancer-namrata-stripping-completely-nude-and-dancing-on-a-hot-number', - 'ext': 'mp4', - 'title': 'Desi dancer namrata stripping completely nude and dancing on a hot number', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Don', - 'upload_date': '20140213', - 'duration': 83, - 'view_count': int, - 'comment_count': int, - 'categories': list, - 'age_limit': 18, - } - }] + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -109,6 +75,9 @@ class SexyKarmaIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'url': video_url, + 'http_headers': { + 'Referer': url, + }, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, From 67167920db50e818c9fca20579c8a05eb2218f86 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 29 Apr 2016 11:14:42 +0100 Subject: [PATCH 197/347] [viewlift] replace SnagFilms extractors - add support for other sites that use the same logic - improve format extraction and sorting --- youtube_dl/extractor/extractors.py | 8 ++--- youtube_dl/extractor/generic.py | 10 +++--- .../extractor/{snagfilms.py => viewlift.py} | 35 +++++++++++++------ 3 files changed, 34 insertions(+), 19 deletions(-) rename youtube_dl/extractor/{snagfilms.py => viewlift.py} (81%) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3adcd41c4..b1b7f9b42 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -673,10 +673,6 @@ from .smotri import ( SmotriUserIE, SmotriBroadcastIE, ) -from .snagfilms import ( - SnagFilmsIE, - SnagFilmsEmbedIE, -) from .snotr import SnotrIE from .sohu import SohuIE from .soundcloud import ( @@ -879,6 +875,10 @@ from .vidme import ( ) from .vidzi import VidziIE from .vier 
import VierIE, VierVideosIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) from .viewster import ViewsterIE from .viidea import ViideaIE from .vimeo import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a95501d86..0f1eb7fa6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -51,7 +51,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE -from .snagfilms import SnagFilmsEmbedIE +from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE @@ -1924,10 +1924,10 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) - # Look for SnagFilms embeds - snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) - if snagfilms_url: - return self.url_result(snagfilms_url) + # Look for ViewLift embeds + viewlift_url = ViewLiftEmbedIE._extract_url(webpage) + if viewlift_url: + return self.url_result(viewlift_url) # Look for JWPlatform embeds jwplatform_url = JWPlatformIE._extract_url(webpage) diff --git a/youtube_dl/extractor/snagfilms.py b/youtube_dl/extractor/viewlift.py similarity index 81% rename from youtube_dl/extractor/snagfilms.py rename to youtube_dl/extractor/viewlift.py index 6977afb27..dd4a13a4a 100644 --- a/youtube_dl/extractor/snagfilms.py +++ b/youtube_dl/extractor/viewlift.py @@ -13,8 +13,12 @@ from ..utils import ( ) -class SnagFilmsEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' +class ViewLiftBaseIE(InfoExtractor): + _DOMAINS_REGEX = '(?:snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|monumentalsportsnetwork|vayafilm)\.com|kesari\.tv' + + +class ViewLiftEmbedIE(ViewLiftBaseIE): + _VALID_URL = 
r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -40,7 +44,7 @@ class SnagFilmsEmbedIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1', + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, webpage) if mobj: return mobj.group('url') @@ -55,6 +59,7 @@ class SnagFilmsEmbedIE(InfoExtractor): 'Film %s is not playable in your area.' % video_id, expected=True) formats = [] + has_bitrate = False for source in self._parse_json(js_to_json(self._search_regex( r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id): file_ = source.get('file') @@ -63,22 +68,25 @@ class SnagFilmsEmbedIE(InfoExtractor): type_ = source.get('type') ext = determine_ext(file_) format_id = source.get('label') or ext - if all(v == 'm3u8' for v in (type_, ext)): + if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: bitrate = int_or_none(self._search_regex( [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext], file_, 'bitrate', default=None)) + if not has_bitrate and bitrate: + has_bitrate = True height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ 'url': file_, - 'format_id': format_id, + 'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')), 'tbr': bitrate, 'height': height, }) - self._sort_formats(formats) + field_preference = None if has_bitrate else ('height', 'tbr', 'format_id') + self._sort_formats(formats, field_preference) title = self._search_regex( [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)'], @@ -91,8 
+99,8 @@ class SnagFilmsEmbedIE(InfoExtractor): } -class SnagFilmsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P[^?#]+)' +class ViewLiftIE(ViewLiftBaseIE): + _VALID_URL = r'https?://(?:www\.)?(?P%s)/(?:films/title|show|(?:news/)?videos?)/(?P[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX _TESTS = [{ 'url': 'http://www.snagfilms.com/films/title/lost_for_life', 'md5': '19844f897b35af219773fd63bdec2942', @@ -127,10 +135,16 @@ class SnagFilmsIE(InfoExtractor): # Film is not available. 'url': 'http://www.snagfilms.com/show/augie_alone/flirting', 'only_matching': True, + }, { + 'url': 'http://www.winnersview.com/videos/the-good-son', + 'only_matching': True, + }, { + 'url': 'http://www.kesari.tv/news/video/1461919076414', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + domain, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -170,7 +184,7 @@ class SnagFilmsIE(InfoExtractor): return { '_type': 'url_transparent', - 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id, + 'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id), 'id': film_id, 'display_id': display_id, 'title': title, @@ -178,4 +192,5 @@ class SnagFilmsIE(InfoExtractor): 'thumbnail': thumbnail, 'duration': duration, 'categories': categories, + 'ie_key': 'ViewLiftEmbed', } From 065216d94f59953a228d2683d3bafe4241fd1e29 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 11:46:42 +0100 Subject: [PATCH 198/347] [crunchyroll] reduce requests for formats extraction --- youtube_dl/extractor/crunchyroll.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index dd753c7c3..184ba6896 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -26,6 +26,7 @@ from ..utils import ( unified_strdate, 
urlencode_postdata, xpath_text, + extract_attributes, ) from ..aes import ( aes_cbc_decrypt, @@ -305,9 +306,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, 'video_uploader', fatal=False) - formats = [] + available_fmts = [] + for a, fmt in re.findall(r'(]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage) video_encode_ids = [] - for fmt in re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage): + formats = [] + for fmt in available_fmts: stream_quality, stream_format = self._FORMAT_IDS[fmt] video_format = fmt + 'p' streamdata_req = sanitized_Request( From b24d6336a797b99339c12a0aa1b431755e22e8cf Mon Sep 17 00:00:00 2001 From: Kagami Hiiragi Date: Tue, 26 Apr 2016 17:30:24 +0300 Subject: [PATCH 199/347] [vlive] Add support for live videos --- youtube_dl/extractor/common.py | 8 ++- youtube_dl/extractor/vlive.py | 98 ++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a285ee7d8..2763d2ffe 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1061,7 +1061,7 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True): + fatal=True, live=False): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -1139,7 +1139,11 @@ class InfoExtractor(object): if m3u8_id: format_id.append(m3u8_id) last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None - format_id.append(last_media_name if last_media_name else '%d' % (tbr 
if tbr else len(formats))) + # Bandwidth of live streams may differ over time thus making + # format_id unpredictable. So it's better to keep provided + # format_id intact. + if last_media_name and not live: + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index baf39bb2c..2151696ea 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -1,8 +1,11 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import division, unicode_literals +import re +import time from .common import InfoExtractor from ..utils import ( + ExtractorError, dict_get, float_or_none, int_or_none, @@ -31,16 +34,77 @@ class VLiveIE(InfoExtractor): webpage = self._download_webpage( 'http://www.vlive.tv/video/%s' % video_id, video_id) - long_video_id = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"([^"]+)"', - webpage, 'long video id') + # UTC+x - UTC+9 (KST) + tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone + tz_offset = -tz // 60 - 9 * 60 + self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset) - key = self._search_regex( - r'vlive\.tv\.video\.ajax\.request\.handler\.init\(\s*"[0-9]+"\s*,\s*"[^"]*"\s*,\s*"[^"]+"\s*,\s*"([^"]+)"', - webpage, 'key') + status_params = self._download_json( + 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, + video_id, 'Downloading JSON status', + headers={'Referer': url}) + status = status_params.get('status') + air_start = status_params.get('onAirStartAt', '') + is_live = status_params.get('isLive') + video_params = self._search_regex( + r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)', + webpage, 'video params') + live_params, long_video_id, key = re.split( + r'"\s*,\s*"', video_params)[1:4] + + if status == 
'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': + live_params = self._parse_json('"%s"' % live_params, video_id) + live_params = self._parse_json(live_params, video_id) + return self._live(video_id, webpage, live_params) + elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': + if long_video_id and key: + return self._replay(video_id, webpage, long_video_id, key) + elif is_live: + status = 'LIVE_END' + else: + status = 'COMING_SOON' + + if status == 'LIVE_END': + raise ExtractorError('Uploading for replay. Please wait...', + expected=True) + elif status == 'COMING_SOON': + raise ExtractorError('Coming soon! %s' % air_start, expected=True) + elif status == 'CANCELED': + raise ExtractorError('We are sorry, ' + 'but the live broadcast has been canceled.', + expected=True) + else: + raise ExtractorError('Unknown status %s' % status) + + def _get_common_fields(self, webpage): title = self._og_search_title(webpage) + creator = self._html_search_regex( + r']+class="info_area"[^>]*>\s*]*>([^<]+)', + webpage, 'creator', fatal=False) + thumbnail = self._og_search_thumbnail(webpage) + return { + 'title': title, + 'creator': creator, + 'thumbnail': thumbnail, + } + def _live(self, video_id, webpage, live_params): + formats = [] + for vid in live_params.get('resolutions', []): + formats.extend(self._extract_m3u8_formats( + vid['cdnUrl'], video_id, 'mp4', + m3u8_id=vid.get('name'), + fatal=False, live=True)) + self._sort_formats(formats) + + return dict(self._get_common_fields(webpage), + id=video_id, + formats=formats, + is_live=True, + ) + + def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s' % compat_urllib_parse_urlencode({ @@ -62,11 +126,6 @@ class VLiveIE(InfoExtractor): } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')] self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) - creator = self._html_search_regex( 
- r']+class="info_area"[^>]*>\s*]*>([^<]+)', - webpage, 'creator', fatal=False) - view_count = int_or_none(playinfo.get('meta', {}).get('count')) subtitles = {} @@ -77,12 +136,9 @@ class VLiveIE(InfoExtractor): 'ext': 'vtt', 'url': caption['source']}] - return { - 'id': video_id, - 'title': title, - 'creator': creator, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'formats': formats, - 'subtitles': subtitles, - } + return dict(self._get_common_fields(webpage), + id=video_id, + formats=formats, + view_count=view_count, + subtitles=subtitles, + ) From 9d186afac818645490122aa7457f247c31c601bf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 29 Apr 2016 19:29:00 +0800 Subject: [PATCH 200/347] [vlive] Coding style and PEP8 --- youtube_dl/extractor/vlive.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index 2151696ea..7f9e99ec2 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -3,10 +3,11 @@ from __future__ import division, unicode_literals import re import time + from .common import InfoExtractor from ..utils import ( - ExtractorError, dict_get, + ExtractorError, float_or_none, int_or_none, ) @@ -99,10 +100,9 @@ class VLiveIE(InfoExtractor): self._sort_formats(formats) return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - is_live=True, - ) + id=video_id, + formats=formats, + is_live=True) def _replay(self, video_id, webpage, long_video_id, key): playinfo = self._download_json( @@ -137,8 +137,7 @@ class VLiveIE(InfoExtractor): 'url': caption['source']}] return dict(self._get_common_fields(webpage), - id=video_id, - formats=formats, - view_count=view_count, - subtitles=subtitles, - ) + id=video_id, + formats=formats, + view_count=view_count, + subtitles=subtitles) From 6ff4469528d642bd678df9b1fa83545a0942e333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 
19:39:27 +0600 Subject: [PATCH 201/347] [crunchyroll] Relax fmt regex --- youtube_dl/extractor/crunchyroll.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 184ba6896..4a7664296 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -307,7 +307,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'video_uploader', fatal=False) available_fmts = [] - for a, fmt in re.findall(r'(]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?)', webpage): + for a, fmt in re.findall(r'(]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): attrs = extract_attributes(a) href = attrs.get('href') if href and '/freetrial' in href: From 8312b1a3d1dc07d80d33e31f9b2b6facf13fa744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:43:53 +0600 Subject: [PATCH 202/347] [crunchyroll] Add even more relaxed fmt fallback --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4a7664296..58960b2f8 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -314,7 +314,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text continue available_fmts.append(fmt) if not available_fmts: - available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage) + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break video_encode_ids = [] formats = [] for fmt in available_fmts: From 00a17a9e1234ecc868a15b5759472a0f9215f797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 29 Apr 2016 19:44:10 +0600 Subject: [PATCH 203/347] [crunchyroll] Sort formats --- 
youtube_dl/extractor/crunchyroll.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 58960b2f8..90a64303d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -367,6 +367,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) + self._sort_formats(formats) metadata = self._download_xml( 'http://www.crunchyroll.com/xml', video_id, From e9c6cdf4a103d1ebdb6927bdab429c370cbe66b2 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 29 Apr 2016 22:49:04 +0800 Subject: [PATCH 204/347] [common] Fix format_id construction for HLS --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 2763d2ffe..61a5d124c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1142,7 +1142,7 @@ class InfoExtractor(object): # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. 
- if last_media_name and not live: + if not live: format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), From cef3f3011f9d3a67de3ff064a5185a1a4bcf40e7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 30 Apr 2016 00:17:09 +0800 Subject: [PATCH 205/347] [funimation] Detect blocking and support CloudFlare cookies --- youtube_dl/extractor/funimation.py | 48 ++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 1eb528f31..0ad0d9b6a 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,6 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_HTTPError, + compat_urllib_parse_unquote_plus, +) from ..utils import ( clean_html, determine_ext, @@ -27,6 +31,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:1769f43cd5fc130ace8fd87232207892', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', }, { 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', 'info_dict': { @@ -37,6 +42,7 @@ class FunimationIE(InfoExtractor): 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': 're:https?://.*\.jpg', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }, { 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', 'info_dict': { @@ -47,8 +53,36 @@ class FunimationIE(InfoExtractor): 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': 're:https?://.*\.(?:jpg|png)', }, + 'skip': 'Access without user interaction is forbidden by CloudFlare', }] + _LOGIN_URL = 'http://www.funimation.com/login' + + def _download_webpage(self, *args, **kwargs): + try: + return super(FunimationIE, 
self)._download_webpage(*args, **kwargs) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + response = ee.cause.read() + if b'>Please complete the security check to access<' in response: + raise ExtractorError( + 'Access to funimation.com is blocked by CloudFlare. ' + 'Please browse to http://www.funimation.com/, solve ' + 'the reCAPTCHA, export browser cookies to a text file,' + ' and then try again with --cookies YOUR_COOKIE_FILE.', + expected=True) + raise + + def _extract_cloudflare_session_ua(self, url): + ci_session_cookie = self._get_cookies(url).get('ci_session') + if ci_session_cookie: + ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) + # ci_session is a string serialized by PHP function serialize() + # This case is simple enough to use regular expressions only + return self._search_regex( + r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', + default=None) + def _login(self): (username, password) = self._get_login_info() if username is None: @@ -57,8 +91,11 @@ class FunimationIE(InfoExtractor): 'email_field': username, 'password_field': password, }) - login_request = sanitized_Request('http://www.funimation.com/login', data, headers={ - 'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0', + user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) + if not user_agent: + user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' + login_request = sanitized_Request(self._LOGIN_URL, data, headers={ + 'User-Agent': user_agent, 'Content-Type': 'application/x-www-form-urlencoded' }) login_page = self._download_webpage( @@ -103,11 +140,16 @@ class FunimationIE(InfoExtractor): ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), ) + user_agent = self._extract_cloudflare_session_ua(url) + if user_agent: + 
USER_AGENTS = ((None, user_agent),) + for kind, user_agent in USER_AGENTS: request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage( - request, display_id, 'Downloading %s webpage' % kind) + request, display_id, + 'Downloading %s webpage' % kind if kind else 'Downloading webpage') playlist = self._parse_json( self._search_regex( From 65a3bfb379c9d5e53cac874af097d2071ee4ac4d Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 19:21:17 +0100 Subject: [PATCH 206/347] [dfb] extract m3u8 formats --- youtube_dl/extractor/dfb.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/dfb.py b/youtube_dl/extractor/dfb.py index cdfeccacb..a4d0448c2 100644 --- a/youtube_dl/extractor/dfb.py +++ b/youtube_dl/extractor/dfb.py @@ -12,39 +12,46 @@ class DFBIE(InfoExtractor): _TEST = { 'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/', - # The md5 is different each time + 'md5': 'ac0f98a52a330f700b4b3034ad240649', 'info_dict': { 'id': '11633', 'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'U 19-EM: Stimmen zum Spiel gegen Russland', 'upload_date': '20150714', }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) player_info = self._download_xml( 'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id, display_id) video_info = player_info.find('video') + stream_access_url = self._proto_relative_url(video_info.find('url').text.strip()) - f4m_info = self._download_xml( - self._proto_relative_url(video_info.find('url').text.strip()), display_id) - token_el = f4m_info.find('token') - manifest_url = token_el.attrib['url'] + '?' 
+ 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0' - formats = self._extract_f4m_formats(manifest_url, display_id) + formats = [] + # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats + for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'): + stream_access_info = self._download_xml(sa_url, display_id) + token_el = stream_access_info.find('token') + manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + if '.f4m' in manifest_url: + formats.extend(self._extract_f4m_formats( + manifest_url + '&hdcore=3.2.0', + display_id, f4m_id='hds', fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + manifest_url, display_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, 'title': video_info.find('title').text, - 'thumbnail': self._og_search_thumbnail(webpage), + 'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id, 'upload_date': unified_strdate(video_info.find('time_date').text), 'formats': formats, } From 5556047465e0601d2bdee0e5a436cee64b745851 Mon Sep 17 00:00:00 2001 From: Reino17 Date: Wed, 27 Apr 2016 13:11:38 +0200 Subject: [PATCH 207/347] [rtlnl] Update 720p PG_URL_TEMPLATE - Fixed the format_id for the 720p progressive videostream and added the video's resolution. - The adaptive videostreams have the m3u8-extension, so I removed the confusing mp4-extension in order to make a better distinction between the these and the progressive videostreams. 
--- youtube_dl/extractor/rtlnl.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 543d94417..e8b55ea25 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -94,19 +94,30 @@ class RtlNlIE(InfoExtractor): videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4') + formats = self._extract_m3u8_formats(m3u8_url, uuid) video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' formats.extend([ { - 'url': PG_URL_TEMPLATE % ('a2m', video_urlpart), - 'format_id': 'pg-sd', + 'url': PG_URL_TEMPLATE % ('a2t', video_urlpart), + 'format_id': 'a2t', + 'width': 512, + 'height': 288, }, { - 'url': PG_URL_TEMPLATE % ('a3m', video_urlpart), - 'format_id': 'pg-hd', + 'url': PG_URL_TEMPLATE % ('a3t', video_urlpart), + 'format_id': 'a3t', + 'width': 704, + 'height': 400, + 'quality': 0, + }, + { + 'url': PG_URL_TEMPLATE % ('nettv', video_urlpart), + 'format_id': 'nettv', + 'width': 1280, + 'height': 720, 'quality': 0, } ]) From 0571ffda7dd12fc1067c0344f3ce4ce47b39edb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:43:39 +0600 Subject: [PATCH 208/347] [rtlnl] Improve extraction (Closes #9329) * Make hls extraction non fatal and revert ext * Extract progressive formats' metadata from corresponding hls formats --- youtube_dl/extractor/rtlnl.py | 55 +++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e8b55ea25..c95bcf035 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -94,33 +94,44 @@ class RtlNlIE(InfoExtractor): videopath = material['videopath'] m3u8_url = meta.get('videohost', 
'http://manifest.us.rtl.nl') + videopath - formats = self._extract_m3u8_formats(m3u8_url, uuid) + formats = self._extract_m3u8_formats( + m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) video_urlpart = videopath.split('/adaptive/')[1][:-5] PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - formats.extend([ - { - 'url': PG_URL_TEMPLATE % ('a2t', video_urlpart), - 'format_id': 'a2t', - 'width': 512, - 'height': 288, - }, - { - 'url': PG_URL_TEMPLATE % ('a3t', video_urlpart), - 'format_id': 'a3t', - 'width': 704, - 'height': 400, - 'quality': 0, - }, - { - 'url': PG_URL_TEMPLATE % ('nettv', video_urlpart), - 'format_id': 'nettv', - 'width': 1280, - 'height': 720, - 'quality': 0, + PG_FORMATS = ( + ('a2t', 512, 288), + ('a3t', 704, 400), + ('nettv', 1280, 720), + ) + + def pg_format(format_id, width, height): + return { + 'url': PG_URL_TEMPLATE % (format_id, video_urlpart), + 'format_id': 'pg-%s' % format_id, + 'protocol': 'http', + 'width': width, + 'height': height, } - ]) + + if not formats: + formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS] + else: + pg_formats = [] + for format_id, width, height in PG_FORMATS: + try: + # Find hls format with the same width and height corresponding + # to progressive format and copy metadata from it. + f = next(f for f in formats + if f.get('width') == width and f.get('height') == height).copy() + f.update(pg_format(format_id, width, height)) + pg_formats.append(f) + except StopIteration: + # Missing hls format does mean that no progressive format with + # such width and height exists either. 
+ pass + formats.extend(pg_formats) self._sort_formats(formats) thumbnails = [] From cd63d091cecd8a85a2080035051205b00f3454d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:48:14 +0600 Subject: [PATCH 209/347] [rtlnl] Fix tests --- youtube_dl/extractor/rtlnl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index c95bcf035..e4411054a 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -39,7 +39,7 @@ class RtlNlIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1424039400, 'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$', 'upload_date': '20150215', 'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. 
Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.', } @@ -50,7 +50,7 @@ class RtlNlIE(InfoExtractor): 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', 'ext': 'mp4', 'title': 'RTL Nieuws - Meer beelden van overval juwelier', - 'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', + 'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$', 'timestamp': 1437233400, 'upload_date': '20150718', 'duration': 30.474, From 373e1230e4a3b934ddc59c212773d36a7e998dec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 01:50:26 +0600 Subject: [PATCH 210/347] [rtlnl] Clarify tests --- youtube_dl/extractor/rtlnl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index e4411054a..5e916c4ab 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -32,6 +32,7 @@ class RtlNlIE(InfoExtractor): 'duration': 576.880, }, }, { + # best format avaialble a3t 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false', 'md5': 'dea7474214af1271d91ef332fb8be7ea', 'info_dict': { @@ -45,6 +46,7 @@ class RtlNlIE(InfoExtractor): } }, { # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275) + # best format available nettv 'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false', 'info_dict': { 'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a', From ca278a182b9331201e058f9f4d46b3b6114a1518 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Apr 2016 02:07:29 +0600 Subject: [PATCH 211/347] [rtlnl] Replace test --- youtube_dl/extractor/rtlnl.py | 16 ++++++++-------- 
1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 5e916c4ab..8598b5840 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -20,16 +20,16 @@ class RtlNlIE(InfoExtractor): (?P[0-9a-f-]+)''' _TESTS = [{ - 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677', - 'md5': 'cc16baa36a6c169391f0764fa6b16654', + 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', + 'md5': '473d1946c1fdd050b2c0161a4b13c373', 'info_dict': { - 'id': '6e4203a6-0a5e-3596-8424-c599a59e0677', + 'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416', 'ext': 'mp4', - 'title': 'RTL Nieuws - Laat', - 'description': 'md5:6b61f66510c8889923b11f2778c72dc5', - 'timestamp': 1408051800, - 'upload_date': '20140814', - 'duration': 576.880, + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1461951000, + 'upload_date': '20160429', + 'duration': 1167.96, }, }, { # best format avaialble a3t From 69c4cde4ba6a4c7dfb8a46d1713cbb46d6f1d623 Mon Sep 17 00:00:00 2001 From: remitamine Date: Fri, 29 Apr 2016 21:35:09 +0100 Subject: [PATCH 212/347] [wsj] improve extraction --- youtube_dl/extractor/wsj.py | 95 +++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index 5a897371d..a83e68b17 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -4,16 +4,22 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( int_or_none, + float_or_none, unified_strdate, ) class WSJIE(InfoExtractor): - _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P[a-zA-Z0-9-]+)' + _VALID_URL = r'''(?x)https?:// + (?: + video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| + (?:www\.)?wsj\.com/video/[^/]+/ + ) + (?P[a-zA-Z0-9-]+)''' 
IE_DESC = 'Wall Street Journal' - _TEST = { + _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', - 'md5': '9747d7a6ebc2f4df64b981e1dde9efa9', + 'md5': 'e230a5bb249075e40793b655a54a02e4', 'info_dict': { 'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'ext': 'mp4', @@ -24,65 +30,60 @@ class WSJIE(InfoExtractor): 'duration': 90, 'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo', }, - } + }, { + 'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - bitrates = [128, 174, 264, 320, 464, 664, 1264] api_url = ( 'http://video-api.wsj.com/api-video/find_all_videos.asp?' - 'type=guid&count=1&query=%s&' - 'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,' - 'author,description,name,linkURL,videoStillURL,duration,videoURL,' - 'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,' - 'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,' - 'allthingsd-subsection,sm-section,sm-subsection,provider,' - 'formattedCreationDate,keywords,keywordsOmniture,column,editor,' - 'emailURL,emailPartnerID,showName,omnitureProgramName,' - 'omnitureVideoFormat,linkRelativeURL,touchCastID,' - 'omniturePublishDate,%s') % ( - video_id, ','.join('video%dkMP4Url' % br for br in bitrates)) + 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' + 'thumbnailList,author,description,name,duration,videoURL,' + 'titletag,formattedCreationDate,keywords,editor' % video_id) info = self._download_json(api_url, video_id)['items'][0] - - # Thumbnails are conveniently in the correct format already - thumbnails = info.get('thumbnailList') - creator = info.get('author') - uploader_id = info.get('editor') - categories = info.get('keywords') - duration = int_or_none(info.get('duration')) - upload_date = unified_strdate( - info.get('formattedCreationDate'), 
day_first=False) title = info.get('name', info.get('titletag')) - formats = [{ - 'format_id': 'f4m', - 'format_note': 'f4m (meta URL)', - 'url': info['videoURL'], - }] - if info.get('hls'): + formats = [] + + f4m_url = info.get('videoURL') + if f4m_url: + formats.extend(self._extract_f4m_formats( + f4m_url, video_id, f4m_id='hds', fatal=False)) + + m3u8_url = info.get('hls') + if m3u8_url: formats.extend(self._extract_m3u8_formats( info['hls'], video_id, ext='mp4', - preference=0, entry_protocol='m3u8_native')) - for br in bitrates: - field = 'video%dkMP4Url' % br - if info.get(field): - formats.append({ - 'format_id': 'mp4-%d' % br, - 'container': 'mp4', - 'tbr': br, - 'url': info[field], - }) + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + + for v in info.get('videoMP4List', []): + mp4_url = v.get('url') + if not mp4_url: + continue + tbr = int_or_none(v.get('bitrate')) + formats.append({ + 'url': mp4_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + 'width': int_or_none(v.get('width')), + 'height': int_or_none(v.get('height')), + 'fps': float_or_none(v.get('fps')), + }) self._sort_formats(formats) return { 'id': video_id, 'formats': formats, - 'thumbnails': thumbnails, - 'creator': creator, - 'uploader_id': uploader_id, - 'duration': duration, - 'upload_date': upload_date, + # Thumbnails are conveniently in the correct format already + 'thumbnails': info.get('thumbnailList'), + 'creator': info.get('author'), + 'uploader_id': info.get('editor'), + 'duration': int_or_none(info.get('duration')), + 'upload_date': unified_strdate(info.get( + 'formattedCreationDate'), day_first=False), 'title': title, - 'categories': categories, + 'categories': info.get('keywords'), } From cbc032c8b70a038a69259378c92b4ba97b42d491 Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 30 Apr 2016 01:24:36 +0100 Subject: [PATCH 213/347] [pbs] extract all http formats --- youtube_dl/extractor/pbs.py | 48 ++++++++++++++++++------------------- 1 file 
changed, 24 insertions(+), 24 deletions(-) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f43e3a146..38cdb9975 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -196,7 +196,7 @@ class PBSIE(InfoExtractor): _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', - 'md5': 'ce1888486f0908d555a8093cac9a7362', + 'md5': '173dc391afd361fa72eab5d3d918968d', 'info_dict': { 'id': '2365006249', 'ext': 'mp4', @@ -204,13 +204,10 @@ class PBSIE(InfoExtractor): 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', 'duration': 3190, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/', - 'md5': '143c98aa54a346738a3d78f54c925321', + 'md5': '6f722cb3c3982186d34b0f13374499c7', 'info_dict': { 'id': '2365297690', 'ext': 'mp4', @@ -218,9 +215,6 @@ class PBSIE(InfoExtractor): 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', 'duration': 5050, }, - 'params': { - 'skip_download': True, # requires ffmpeg - } }, { 'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/', @@ -244,9 +238,6 @@ class PBSIE(InfoExtractor): 'duration': 6559, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html', @@ -262,9 +253,6 @@ class PBSIE(InfoExtractor): 'upload_date': '20140122', 'age_limit': 10, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, }, { 'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/', @@ -290,6 +278,7 @@ class PBSIE(InfoExtractor): }, { 'url': 'http://www.pbs.org/video/2365245528/', + 'md5': '115223d41bd55cda8ae5cd5ed4e11497', 'info_dict': { 'id': '2365245528', 'display_id': '2365245528', @@ -299,15 +288,13 @@ class PBSIE(InfoExtractor): 'duration': 6851, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': 
True, # requires ffmpeg - }, }, { # Video embedded in iframe containing angle brackets as attribute's value (e.g. # "', webpage, 'embed url')) + + return { + '_type': 'url_transparent', + 'url': embed_url, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9d1422e5..14b4f245f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -75,6 +75,7 @@ from .bigflix import BigflixIE from .bild import BildIE from .bilibili import BiliBiliIE from .biobiochiletv import BioBioChileTVIE +from .biqle import BIQLEIE from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 67220f1b7..041d93629 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -26,12 +26,16 @@ class VKIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P-?\d+).*?\bid=(?P\d+)| + (?: + (?:m\.)?vk\.com/video_| + (?:www\.)?daxab.com/ + ) + ext\.php\?(?P.*?\boid=(?P-?\d+).*?\bid=(?P\d+).*)| (?: (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| - (?:www\.)?biqle\.ru/watch/ + (?:www\.)?daxab.com/embed/ ) - (?P[^s].*?)(?:\?(?:.*\blist=(?P[\da-f]+))?|%2F|$) + (?P-?\d+_\d+)(?:.*\blist=(?P[\da-f]+))? ) ''' _NETRC_MACHINE = 'vk' @@ -75,7 +79,8 @@ class VKIE(InfoExtractor): 'duration': 101, 'upload_date': '20120730', 'view_count': int, - } + }, + 'skip': 'This video has been removed from public access.', }, { # VIDEO NOW REMOVED @@ -142,7 +147,7 @@ class VKIE(InfoExtractor): 'id': 'V3K4mi0SYkc', 'ext': 'webm', 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate", - 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', + 'description': 'md5:d9903938abdc74c738af77f527ca0596', 'duration': 178, 'upload_date': '20130116', 'uploader': "Children's Joy Foundation", @@ -173,11 +178,6 @@ class VKIE(InfoExtractor): 'url': 'https://vk.com/video205387401_164765225', 'only_matching': True, }, - { - # vk wrapper - 'url': 'http://www.biqle.ru/watch/847655_160197695', - 'only_matching': True, - }, { # pladform embed 'url': 'https://vk.com/video-76116461_171554880', @@ -217,20 +217,22 @@ class VKIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - if not video_id: + info_url = url + if video_id: + info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id + # Some videos (removed?) can only be downloaded with list id specified + list_id = mobj.group('list_id') + if list_id: + info_url += '&list=%s' % list_id + else: + info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query') video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) - info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id - - # Some videos (removed?) can only be downloaded with list id specified - list_id = mobj.group('list_id') - if list_id: - info_url += '&list=%s' % list_id - info_page = self._download_webpage(info_url, video_id) error_message = self._html_search_regex( - r'(?s)]+class="video_layer_message"[^>]*>(.+?)
', + [r'(?s)]+class="video_layer_message"[^>]*>(.+?)

', + r'(?s)]+id="video_ext_msg"[^>]*>(.+?)'], info_page, 'error message', default=None) if error_message: raise ExtractorError(error_message, expected=True) @@ -305,17 +307,17 @@ class VKIE(InfoExtractor): view_count = None views = self._html_search_regex( r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', - info_page, 'view count', fatal=False) + info_page, 'view count', default=None) if views: view_count = str_to_int(self._search_regex( r'([\d,.]+)', views, 'view count', fatal=False)) formats = [] for k, v in data.items(): - if not k.startswith('url') and k != 'extra_data' or not v: + if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v: continue height = int_or_none(self._search_regex( - r'^url(\d+)', k, 'height', default=None)) + r'^(?:url|cache)(\d+)', k, 'height', default=None)) formats.append({ 'format_id': k, 'url': v, From abc97b5eda4ed4b36cec29e9966eb1bb7bcd97ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 May 2016 22:07:30 +0600 Subject: [PATCH 284/347] [utils] Allow empty attribute values in get_element_by_attribute (Closes #9415) --- youtube_dl/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index a5922b2b5..6e4573784 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -256,9 +256,9 @@ def get_element_by_attribute(attribute, value, html): m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s*> (?P.*?) 
From 25cb7a0eebae0093a81fa1c930480fafa13feb25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 May 2016 22:11:18 +0600 Subject: [PATCH 285/347] [youtube] Allow empty attribute values in description regex --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b7c3cb63f..f3f102c30 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1326,9 +1326,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if video_description: video_description = re.sub(r'''(?x) ]*> [^<]+\.{3}\s* From 3e80e6f40d6ef76142340a2292ef2445dc79594b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 May 2016 23:35:58 +0600 Subject: [PATCH 286/347] [vevo] Allow request to api.vevo.com to fail (Closes #9417) I don't know whether this it's tempopary or api has just gone --- youtube_dl/extractor/vevo.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c0ef08c02..30b3a9e7e 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -201,9 +201,10 @@ class VevoIE(VevoBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( - json_url, video_id, 'Downloading video info', 'Unable to download info') + json_url, video_id, 'Downloading video info', + 'Unable to download info', fatal=False) or {} video_info = response.get('video') or {} artist = None featured_artist = None @@ -212,7 +213,7 @@ class VevoIE(VevoBaseIE): formats = [] if not video_info: - if response.get('statusCode') != 909: + if response and response.get('statusCode') != 909: ytid = response.get('errorInfo', 
{}).get('ytid') if ytid: self.report_warning( From f745403b5b448c170710256a61b8505e09e77674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 6 May 2016 23:37:17 +0600 Subject: [PATCH 287/347] [vevo] Revert videoplayer.vevo.com to api.vevo.com --- youtube_dl/extractor/vevo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 30b3a9e7e..c0632cd6a 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -201,7 +201,7 @@ class VevoIE(VevoBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id + json_url = 'http://api.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id response = self._download_json( json_url, video_id, 'Downloading video info', 'Unable to download info', fatal=False) or {} From e2ee97dcd5c55e1c2aceae0d93fbfd64d0cc5ba3 Mon Sep 17 00:00:00 2001 From: inondle Date: Fri, 6 May 2016 12:05:37 -0700 Subject: [PATCH 288/347] [liveleak] Adds support for thumbnails, updates tests --- youtube_dl/extractor/liveleak.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 29fba5f30..ea0565ac0 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -17,7 +17,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'flv', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident' + 'title': 'Most unlucky car accident', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', @@ -28,6 +29,7 @@ class LiveLeakIE(InfoExtractor): 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in 
Edward Snowden.', 'uploader': 'ARD_Stinkt', 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', + 'thumbnail': 're:^https?://.*\.jpg$' } }, { 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', @@ -49,7 +51,8 @@ class LiveLeakIE(InfoExtractor): 'ext': 'mp4', 'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.', 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia' + 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', + 'thumbnail': 're:^https?://.*\.jpg$' } }] @@ -72,6 +75,7 @@ class LiveLeakIE(InfoExtractor): age_limit = int_or_none(self._search_regex( r'you confirm that you are ([0-9]+) years and over.', webpage, 'age limit', default=None)) + video_thumbnail = self._og_search_thumbnail(webpage) sources_raw = self._search_regex( r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) @@ -124,4 +128,5 @@ class LiveLeakIE(InfoExtractor): 'uploader': video_uploader, 'formats': formats, 'age_limit': age_limit, + 'thumbnail': video_thumbnail, } From 3fd6332c056115e5de37b0789d907e9344c2ff5c Mon Sep 17 00:00:00 2001 From: remitamine Date: Sat, 7 May 2016 15:12:20 +0100 Subject: [PATCH 289/347] [flickr] extract license field(closes #9425) --- youtube_dl/extractor/flickr.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 0a3de1498..73ae3adee 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -27,10 +27,24 @@ class FlickrIE(InfoExtractor): 'comment_count': int, 'view_count': int, 'tags': list, + 'license': 'Attribution-ShareAlike', } } - _API_BASE_URL = 'https://api.flickr.com/services/rest?' 
+ # https://help.yahoo.com/kb/flickr/SLN25525.html + _LICENSES = { + '0': 'All Rights Reserved', + '1': 'Attribution-NonCommercial-ShareAlike', + '2': 'Attribution-NonCommercial', + '3': 'Attribution-NonCommercial-NoDerivs', + '4': 'Attribution', + '5': 'Attribution-ShareAlike', + '6': 'Attribution-NoDerivs', + '7': 'No known copyright restrictions', + '8': 'United States government work', + '9': 'Public Domain Dedication (CC0)', + '10': 'Public Domain Work', + } def _call_api(self, method, video_id, api_key, note, secret=None): query = { @@ -87,7 +101,8 @@ class FlickrIE(InfoExtractor): 'uploader': owner.get('realname'), 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), - 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])] + 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], + 'license': self._LICENSES.get(video_info.get('license')), } else: raise ExtractorError('not a video', expected=True) From cb1fa5881315ed998a366f47511b7a4b4ea067b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 7 May 2016 20:15:40 +0600 Subject: [PATCH 290/347] [flickr] Extract uploader URL (Closes #9426) --- youtube_dl/extractor/flickr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index 73ae3adee..a8e1bf42a 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -24,6 +24,7 @@ class FlickrIE(InfoExtractor): 'upload_date': '20110423', 'uploader_id': '10922353@N03', 'uploader': 'Forest Wander', + 'uploader_url': 'https://www.flickr.com/photos/forestwander-nature-pictures/', 'comment_count': int, 'view_count': int, 'tags': list, @@ -89,6 +90,9 @@ class FlickrIE(InfoExtractor): self._sort_formats(formats) owner = video_info.get('owner', {}) + uploader_id = owner.get('nsid') + uploader_path = 
owner.get('path_alias') or uploader_id + uploader_url = 'https://www.flickr.com/photos/%s/' % uploader_path if uploader_path else None return { 'id': video_id, @@ -97,8 +101,9 @@ class FlickrIE(InfoExtractor): 'formats': formats, 'timestamp': int_or_none(video_info.get('dateuploaded')), 'duration': int_or_none(video_info.get('video', {}).get('duration')), - 'uploader_id': owner.get('nsid'), + 'uploader_id': uploader_id, 'uploader': owner.get('realname'), + 'uploader_url': uploader_url, 'comment_count': int_or_none(video_info.get('comments', {}).get('_content')), 'view_count': int_or_none(video_info.get('views')), 'tags': [tag.get('_content') for tag in video_info.get('tags', {}).get('tag', [])], From a0904c5d8024c12b7f95b1126a6b8152a4e1021f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 8 May 2016 00:56:31 +0800 Subject: [PATCH 291/347] [telegraaf] Fix extractor (closes #9318) --- youtube_dl/extractor/telegraaf.py | 58 +++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 6f8333cfc..9092e9b85 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -2,14 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + determine_ext, + remove_end, +) class TelegraafIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P\d+)/[^/]+\.html' _TEST = { 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html', - 'md5': '83245a9779bcc4a24454bfd53c65b6dc', 'info_dict': { 'id': '24353229', 'ext': 'mp4', @@ -18,18 +20,60 @@ class TelegraafIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 33, }, + 'params': { + # m3u8 download + 'skip_download': True, + }, } def _real_extract(self, url): - playlist_id = self._match_id(url) + video_id = self._match_id(url) - 
webpage = self._download_webpage(url, playlist_id) + webpage = self._download_webpage(url, video_id) + player_url = self._html_search_regex( + r']+src="([^"]+")', webpage, 'player URL') + player_page = self._download_webpage( + player_url, video_id, note='Download player webpage') playlist_url = self._search_regex( - r"iframe\.loadPlayer\('([^']+)'", webpage, 'player') + r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL') + playlist_data = self._download_json(playlist_url, video_id) + + item = playlist_data['items'][0] + formats = [] + locations = item['locations'] + for location in locations.get('adaptive', []): + manifest_url = location['src'] + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, ext='mp4', m3u8_id='hls')) + elif ext == 'mpd': + # TODO: Current DASH formats are broken - $Time$ pattern in + # not implemented yet + continue + else: + self.report_warning('Unknown adaptive format %s' % ext) + for location in locations.get('progressive', []): + formats.append({ + 'url': location['sources'][0]['src'], + 'width': location.get('width'), + 'height': location.get('height'), + 'format_id': 'http-%s' % location['label'], + }) + + self._sort_formats(formats) - entries = self._extract_xspf_playlist(playlist_url, playlist_id) title = remove_end(self._og_search_title(webpage), ' - VIDEO') description = self._og_search_description(webpage) + duration = item.get('duration') + thumbnail = item.get('poster') - return self.playlist_result(entries, playlist_id, title, description) + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'duration': duration, + 'thumbnail': thumbnail, + } From e2eca6f65e9969c31b3374bd3688321f3e471cd7 Mon Sep 17 00:00:00 2001 From: Kevin Deldycke Date: Sat, 7 May 2016 20:03:25 +0200 Subject: [PATCH 292/347] Expand user's home in batch file path. 
--- youtube_dl/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 737f6545d..7a0466077 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -86,7 +86,9 @@ def _real_main(argv=None): if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batchfd = io.open( + compat_expanduser(opts.batchfile), + 'r', encoding='utf-8', errors='ignore') batch_urls = read_batch_urls(batchfd) if opts.verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') From 00c21c225decf648199013f2fa3385a1332037bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 00:11:44 +0600 Subject: [PATCH 293/347] Credit @kdeldycke for #9430 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 814fe9ec3..5f668338b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -169,3 +169,4 @@ Viťas Strádal Kagami Hiiragi Philip Huppert blahgeek +Kevin Deldycke From 5c24873a9e6a47e58b10eb0c0825e165604796f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 02:04:34 +0600 Subject: [PATCH 294/347] Credit @inondle for #9400 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 5f668338b..bf860b7f7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -170,3 +170,4 @@ Kagami Hiiragi Philip Huppert blahgeek Kevin Deldycke +inondle From f5436c5d9e4e65790440ada40476712ff430651b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 02:29:26 +0600 Subject: [PATCH 295/347] [downloader/external] Add temp fix ffmpeg m3u8 downloads (Closes #9394) --- youtube_dl/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 8d642fc3e..45f49c350 100644 --- a/youtube_dl/downloader/external.py +++ 
b/youtube_dl/downloader/external.py @@ -224,7 +224,7 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] - if protocol == 'm3u8': + if protocol in ('m3u8', 'm3u8_native'): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: From 3e169233daf76cd7585ebac12504f8e624b7693b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 04:36:57 +0600 Subject: [PATCH 296/347] Expanduser for more options with input files --- youtube_dl/YoutubeDL.py | 1 + youtube_dl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2187dcc8f..a96482e68 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -2018,6 +2018,7 @@ class YoutubeDL(object): if opts_cookiefile is None: self.cookiejar = compat_cookiejar.CookieJar() else: + opts_cookiefile = compat_expanduser(opts_cookiefile) self.cookiejar = compat_cookiejar.MozillaCookieJar( opts_cookiefile) if os.access(opts_cookiefile, os.R_OK): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 7a0466077..cbd84c3af 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -406,7 +406,7 @@ def _real_main(argv=None): try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(opts.load_info_filename) + retcode = ydl.download_with_info_file(compat_expanduser(opts.load_info_filename)) else: retcode = ydl.download(all_urls) except MaxDownloadsReached: From 9c072d38c6b0361d91e92c50cd0c753dc8ce3101 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 06:52:42 +0600 Subject: [PATCH 297/347] [arte] Improve language preference (Closes #9401, closes #9162) --- youtube_dl/extractor/arte.py | 58 ++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 
881cacfab..e37fdae13 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -161,24 +161,53 @@ class ArteTVPlus7IE(InfoExtractor): 'es': 'E[ESP]', } + langcode = LANGS.get(lang, lang) + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - langcode = LANGS.get(lang, lang) - lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] - lang_pref = None - if versionCode: - matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] - lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) - source_pref = 0 - if versionCode is not None: - # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A|E)', versionCode): - source_pref -= 10 - # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A|E)-STM\1', versionCode): - source_pref -= 9 + l = re.escape(langcode) + + # Language preference from most to least priority + # Reference: section 5.6.3 of + # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf + PREFERENCES = ( + # original version in requested language, without subtitles + r'VO{0}$'.format(l), + # original version in requested language, with partial subtitles in requested language + r'VO{0}-ST{0}$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO{0}-STM{0}$'.format(l), + # non-original (dubbed) version in requested language, without subtitles + r'V{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language + r'V{0}-ST{0}$'.format(l), + # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language + r'V{0}-STM{0}$'.format(l), + # original version in requested language, with partial subtitles in 
different language + r'VO{0}-ST(?!{0}).+?$'.format(l), + # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language + r'VO{0}-STM(?!{0}).+?$'.format(l), + # original version in different language, with partial subtitles in requested language + r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language + r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), + # original version in different language, without subtitles + r'VO(?:(?!{0}))?$'.format(l), + # original version in different language, with partial subtitles in different language + r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), + # original version in different language, with subtitles for the deaf and hard-of-hearing in different language + r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), + ) + + for pref, p in enumerate(PREFERENCES): + if re.match(p, versionCode): + lang_pref = len(PREFERENCES) - pref + break + else: + lang_pref = -1 + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -188,7 +217,6 @@ class ArteTVPlus7IE(InfoExtractor): 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), 'quality': qfunc(f.get('quality')), - 'source_preference': source_pref, } if f.get('mediaType') == 'rtmp': From 3b01a9fbb63e33325fa979db8a846d3e655e79e6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 8 May 2016 14:34:38 +0800 Subject: [PATCH 298/347] [litv] Add new extractor LiTV is a streaming platform providing free and paid legal contents in Taiwan. 
--- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/litv.py | 137 +++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 youtube_dl/extractor/litv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 14b4f245f..7bacef184 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -384,6 +384,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .litv import LiTVIE from .liveleak import LiveLeakIE from .livestream import ( LivestreamIE, diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py new file mode 100644 index 000000000..3356d015d --- /dev/null +++ b/youtube_dl/extractor/litv.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + smuggle_url, + unsmuggle_url, +) + + +class LiTVIE(InfoExtractor): + _VALID_URL = r'https?://www\.litv\.tv/vod/[^/]+/content\.do\?.*?\bid=(?P[^&]+)' + + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + + _TESTS = [{ + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041606', + 'title': '花千骨', + }, + 'playlist_count': 50, + }, { + 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', + 'info_dict': { + 'id': 'VOD00041610', + 'ext': 'mp4', + 'title': '花千骨第1集', + 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'episode_number': 1, + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, # m3u8 download + }, + 'skip': 'Georestricted to Taiwan', + }] + + def _extract_playlist(self, season_list, video_id, vod_data, view_data, prompt=True): + episode_title = view_data['title'] + content_id = season_list['contentId'] + + if prompt: 
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (content_id, video_id)) + + all_episodes = [ + self.url_result(smuggle_url( + self._URL_TEMPLATE % (view_data['contentType'], episode['contentId']), + {'force_noplaylist': True})) # To prevent infinite recursion + for episode in season_list['episode']] + + return self.playlist_result(all_episodes, content_id, episode_title) + + def _real_extract(self, url): + url, data = unsmuggle_url(url, {}) + + video_id = self._match_id(url) + + noplaylist = self._downloader.params.get('noplaylist') + noplaylist_prompt = True + if 'force_noplaylist' in data: + noplaylist = data['force_noplaylist'] + noplaylist_prompt = False + + webpage = self._download_webpage(url, video_id) + + view_data = dict(map(lambda t: (t[0], t[2]), re.findall( + r'viewData\.([a-zA-Z]+)\s*=\s*(["\'])([^"\']+)\2', + webpage))) + + vod_data = self._parse_json(self._search_regex( + 'var\s+vod\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), + video_id) + + season_list = list(vod_data.get('seasonList', {}).values()) + if season_list: + if not noplaylist: + return self._extract_playlist( + season_list[0], video_id, vod_data, view_data, + prompt=noplaylist_prompt) + + if noplaylist_prompt: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + + # In browsers `getMainUrl` request is always issued. Usually this + # endpoint gives the same result as the data embedded in the webpage. 
+ # If georestricted, there are no embedded data, so an extra request is + # necessary to get the error code + video_data = self._parse_json(self._search_regex( + r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', + webpage, 'video data', default='{}'), video_id) + if not video_data: + payload = { + 'assetId': view_data['assetId'], + 'watchDevices': vod_data['watchDevices'], + 'contentType': view_data['contentType'], + } + video_data = self._download_json( + 'https://www.litv.tv/vod/getMainUrl', video_id, + data=json.dumps(payload).encode('utf-8'), + headers={'Content-Type': 'application/json'}) + + if not video_data.get('fullpath'): + error_msg = video_data.get('errorMessage') + if error_msg == 'vod.error.outsideregionerror': + self.raise_geo_restricted('This video is available in Taiwan only') + if error_msg: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_msg), expected=True) + raise ExtractorError('Unexpected result from %s' % self.IE_NAME) + + formats = self._extract_m3u8_formats( + video_data['fullpath'], video_id, ext='mp4', m3u8_id='hls') + for a_format in formats: + # LiTV HLS segments doesn't like compressions + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = True + + title = view_data['title'] + view_data.get('secondaryMark', '') + description = view_data.get('description') + thumbnail = view_data.get('imageFile') + categories = [item['name'] for item in vod_data.get('category', [])] + episode = int_or_none(view_data.get('episode')) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'categories': categories, + 'episode_number': episode, + } From f23a92a0cecac0d4db60e086e429793556347271 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 8 May 2016 20:02:54 +0600 Subject: [PATCH 299/347] [mva] Add extractor (Closes #6667) --- youtube_dl/extractor/extractors.py | 4 + .../extractor/microsoftvirtualacademy.py | 192 
++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 youtube_dl/extractor/microsoftvirtualacademy.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7bacef184..a0bb3d4c2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -409,6 +409,10 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) from .minhateca import MinhatecaIE from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py new file mode 100644 index 000000000..b7fea47ee --- /dev/null +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -0,0 +1,192 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_xpath, +) +from ..utils import ( + int_or_none, + parse_duration, + smuggle_url, + unsmuggle_url, + xpath_text, +) + + +class MicrosoftVirtualAcademyBaseIE(InfoExtractor): + def _extract_base_url(self, course_id, display_id): + return self._download_json( + 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id, + display_id, 'Downloading course base URL') + + def _extract_chapter_and_title(self, title): + if not title: + return None, None + m = re.search(r'(?P\d+)\s*\|\s*(?P.+)', title) + return (int(m.group('chapter')), m.group('title')) if m else (None, title) + + +class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva' + IE_DESC = 'Microsoft Virtual Academy videos' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + + _TESTS = [{ + 'url': 
'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', + 'md5': '7826c44fc31678b12ad8db11f6b5abb9', + 'info_dict': { + 'id': 'gfVXISmEB_6804984382', + 'ext': 'mp4', + 'title': 'Course Introduction', + 'formats': 'mincount:3', + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + } + }, { + 'url': 'mva:11788:gfVXISmEB_6804984382', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('course_id') + video_id = mobj.group('id') + + base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id) + + settings = self._download_xml( + '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id), + video_id, 'Downloading video settings XML') + + _, title = self._extract_chapter_and_title(xpath_text( + settings, './/Title', 'title', fatal=True)) + + formats = [] + + for sources in settings.findall(compat_xpath('.//MediaSources')): + if sources.get('videoType') == 'smoothstreaming': + continue + for source in sources.findall(compat_xpath('./MediaSource')): + video_url = source.text + if not video_url or not video_url.startswith('http'): + continue + video_mode = source.get('videoMode') + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', video_mode or '', 'height', default=None)) + codec = source.get('codec') + acodec, vcodec = [None] * 2 + if codec: + codecs = codec.split(',') + if len(codecs) == 2: + acodec, vcodec = codecs + elif len(codecs) == 1: + vcodec = codecs[0] + formats.append({ + 'url': video_url, + 'format_id': video_mode, + 'height': height, + 'acodec': acodec, + 'vcodec': vcodec, + }) + self._sort_formats(formats) + + subtitles = {} + for source in settings.findall(compat_xpath('.//MarkerResourceSource')): + subtitle_url = source.text + if not subtitle_url: + continue + subtitles.setdefault('en', []).append({ + 'url': '%s/%s' 
% (base_url, subtitle_url), + 'ext': source.get('type'), + }) + + return { + 'id': video_id, + 'title': title, + 'subtitles': subtitles, + 'formats': formats + } + + +class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): + IE_NAME = 'mva:course' + IE_DESC = 'Microsoft Virtual Academy courses' + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + + _TESTS = [{ + 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'info_dict': { + 'id': '11788', + 'title': 'Microsoft Azure Fundamentals: Virtual Machines', + }, + 'playlist_count': 36, + }, { + # with emphasized chapters + 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335', + 'info_dict': { + 'id': '16335', + 'title': 'Developing Windows 10 Games with Construct 2', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', + 'only_matching': True, + }, { + 'url': 'mva:course:11788', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MicrosoftVirtualAcademyIE.suitable(url) else super( + MicrosoftVirtualAcademyCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id = mobj.group('id') + display_id = mobj.group('display_id') + + base_url = self._extract_base_url(course_id, display_id) + + manifest = self._download_json( + '%s/imsmanifestlite.json' % base_url, + display_id, 'Downloading course manifest JSON')['manifest'] + + organization = manifest['organizations']['organization'][0] + + entries = [] + for chapter in organization['item']: + chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title')) + chapter_id = chapter.get('@identifier') + for item in 
chapter.get('item', []): + item_id = item.get('@identifier') + if not item_id: + continue + metadata = item.get('resource', {}).get('metadata') or {} + if metadata.get('learningresourcetype') != 'Video': + continue + _, title = self._extract_chapter_and_title(item.get('title')) + duration = parse_duration(metadata.get('duration')) + description = metadata.get('description') + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url( + 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}), + 'title': title, + 'description': description, + 'duration': duration, + 'chapter': chapter_title, + 'chapter_number': chapter_number, + 'chapter_id': chapter_id, + }) + + title = organization.get('title') or manifest.get('metadata', {}).get('title') + + return self.playlist_result(entries, course_id, title) From c52f4efaee2386a72c3f6b694fb4f4c3132ced55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sun, 8 May 2016 20:10:20 +0600 Subject: [PATCH 300/347] [mva] Improve _VALID_URLs --- youtube_dl/extractor/microsoftvirtualacademy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/microsoftvirtualacademy.py b/youtube_dl/extractor/microsoftvirtualacademy.py index b7fea47ee..afd3e98ec 100644 --- a/youtube_dl/extractor/microsoftvirtualacademy.py +++ b/youtube_dl/extractor/microsoftvirtualacademy.py @@ -31,7 +31,7 @@ class MicrosoftVirtualAcademyBaseIE(InfoExtractor): class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): IE_NAME = 'mva' IE_DESC = 'Microsoft Virtual Academy videos' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME _TESTS = [{ 'url': 
'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382', @@ -118,7 +118,7 @@ class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE): IE_NAME = 'mva:course' IE_DESC = 'Microsoft Virtual Academy courses' - _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME + _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME _TESTS = [{ 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788', From f1f6f5aa5e2a6d66fa54d35bf3e8b3626e85ee73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20=C4=8Cech?= <sleep_walker@suse.cz> Date: Sat, 7 May 2016 20:15:49 +0200 Subject: [PATCH 301/347] [ceskatelevize] Add support for live streams Live streams has no playlist title, use title of the stream containing TV channel name. Internal m3u8 handler doesn't seem to handle well continuous streams. Add test for live stream. Remove no longer reachable test. 
--- youtube_dl/extractor/ceskatelevize.py | 35 +++++++++++++++++++-------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 6652c8e42..b41888531 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -33,14 +33,13 @@ class CeskaTelevizeIE(InfoExtractor): 'skip_download': True, }, }, { - 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + # live stream + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'info_dict': { - 'id': '61924494876844374', + 'id': 402, 'ext': 'mp4', - 'title': 'První republika: Zpěvačka z Dupárny Bobina', - 'description': 'Sága mapující atmosféru první republiky od r. 1918 do r. 1945.', - 'thumbnail': 're:^https?://.*\.jpg', - 'duration': 88.4, + 'title': 're:ČT Sport.*', + 'is_live': True, }, 'params': { # m3u8 download @@ -118,19 +117,21 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage) - playlist_description = self._og_search_description(webpage) + playlist_title = self._og_search_title(webpage, default=None) + playlist_description = self._og_search_description(webpage, default=None) playlist = self._download_json(req, playlist_id)['playlist'] playlist_len = len(playlist) entries = [] for item in playlist: + is_live = item['type'] == 'LIVE' formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8' if is_live else 'm3u8_native', + fatal=False)) self._sort_formats(formats) item_id = item.get('id') or item['assetId'] @@ -145,14 +146,28 @@ class CeskaTelevizeIE(InfoExtractor): if subs: subtitles = self.extract_subtitles(episode_id, subs) + if 
playlist_len == 1: + if is_live: + # live streams has channel name in title + final_title = self._live_title(title) + elif playlist_title: + # title is always set (no KeyError caught) + # and gives good fallback + final_title = title + else: + final_title = playlist_title + else: + final_title = '%s (%s)' % (playlist_title, title) + entries.append({ 'id': item_id, - 'title': playlist_title if playlist_len == 1 else '%s (%s)' % (playlist_title, title), + 'title': final_title, 'description': playlist_description if playlist_len == 1 else None, 'thumbnail': thumbnail, 'duration': duration, 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, }) return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) From 3951e7eb9305448aab6395f4303ed7ab19248c52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:37:20 +0600 Subject: [PATCH 302/347] [ceskatelevize] Simplify, restore bonus video test and skip georestricted test (Closes #9431) --- youtube_dl/extractor/ceskatelevize.py | 29 ++++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index b41888531..5a58d1777 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -32,19 +32,34 @@ class CeskaTelevizeIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', + 'info_dict': { + 'id': '61924494877028507', + 'ext': 'mp4', + 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'description': 'English Subtittles', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 81.3, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { # live stream 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', 'info_dict': { 'id': 402, 'ext': 'mp4', - 
'title': 're:ČT Sport.*', + 'title': 're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, }, 'params': { # m3u8 download 'skip_download': True, }, + 'skip': 'Georestricted to Czech Republic', }, { # video with 18+ caution trailer 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', @@ -125,7 +140,7 @@ class CeskaTelevizeIE(InfoExtractor): entries = [] for item in playlist: - is_live = item['type'] == 'LIVE' + is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item['streamUrls'].items(): formats.extend(self._extract_m3u8_formats( @@ -147,15 +162,9 @@ class CeskaTelevizeIE(InfoExtractor): subtitles = self.extract_subtitles(episode_id, subs) if playlist_len == 1: + final_title = playlist_title or title if is_live: - # live streams has channel name in title - final_title = self._live_title(title) - elif playlist_title: - # title is always set (no KeyError caught) - # and gives good fallback - final_title = title - else: - final_title = playlist_title + final_title = self._live_title(final_title) else: final_title = '%s (%s)' % (playlist_title, title) From 965fefdcd879405c3e4b5604513719353ba8474a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:38:33 +0600 Subject: [PATCH 303/347] Credit @sleep-walker for #9431 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index bf860b7f7..5ca71ace7 100644 --- a/AUTHORS +++ b/AUTHORS @@ -171,3 +171,4 @@ Philip Huppert blahgeek Kevin Deldycke inondle +Tomáš Čech From c15c47d19bfeeacd42f44dd7736f175711a91346 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:45:03 +0600 Subject: [PATCH 304/347] [downloader/hls] Remove EXT-X-MEDIA-SEQUENCE from unsupported features for hlsnative --- youtube_dl/downloader/hls.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py 
b/youtube_dl/downloader/hls.py index d7b34bde3..dcedc9a64 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -23,7 +23,9 @@ class HlsFD(FragmentFD): UNSUPPORTED_FEATURES = ( r'#EXT-X-KEY:METHOD=(?!NONE)', # encrypted streams [1] r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] - r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + # Live streams heuristic does not always work (e.g. geo restricted to Germany + # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) + #r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 From 6104cc2985c36e996df1aae7cfcc686f3bae0b82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 20:55:37 +0600 Subject: [PATCH 305/347] [downloader/hls] Add event media playlists to unsupported features of hlsnative --- youtube_dl/downloader/hls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index dcedc9a64..a8279718b 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -26,9 +26,12 @@ class HlsFD(FragmentFD): # Live streams heuristic does not always work (e.g. geo restricted to Germany # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) #r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of + # event media playlists [4] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. 
https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 + # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 ) return all(not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) From fe40f9eef2483748ed83c9749f35220143d8cc9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 21:55:03 +0600 Subject: [PATCH 306/347] [compat] Add compat_setenv --- test/test_compat.py | 8 ++++++++ youtube_dl/compat.py | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/test/test_compat.py b/test/test_compat.py index 618668210..0d751a856 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -13,6 +13,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, + compat_setenv, compat_etree_fromstring, compat_expanduser, compat_shlex_split, @@ -31,6 +32,13 @@ class TestCompat(unittest.TestCase): else test_str.encode(get_filesystem_encoding())) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) + def test_compat_setenv(self): + test_var = 'YOUTUBE-DL-TEST' + test_str = 'тест' + compat_setenv(test_var, test_str) + compat_getenv(test_var) + self.assertEqual(compat_getenv(test_var), test_str) + def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 0b6c5ca7a..12b53cdc8 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -373,6 +373,9 @@ compat_os_name = os._name if os.name == 'java' else os.name if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser + + def compat_setenv(key, value, env=os.environ): + env[key] = value else: # Environment variables 
should be decoded with filesystem encoding. # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918) @@ -384,6 +387,12 @@ else: env = env.decode(get_filesystem_encoding()) return env + def compat_setenv(key, value, env=os.environ): + def encode(v): + from .utils import get_filesystem_encoding + return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v + env[encode(key)] = encode(value) + # HACK: The default implementations of os.path.expanduser from cpython do not decode # environment variables with filesystem encoding. We will work around this by # providing adjusted implementations. @@ -604,6 +613,7 @@ __all__ = [ 'compat_os_name', 'compat_parse_qs', 'compat_print', + 'compat_setenv', 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', From 129263875403841da485ac74b09960d862d23f63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 21:58:38 +0600 Subject: [PATCH 307/347] [test_compat] Use compat_setenv --- test/test_compat.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 0d751a856..afe6bd528 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -27,9 +27,7 @@ from youtube_dl.compat import ( class TestCompat(unittest.TestCase): def test_compat_getenv(self): test_str = 'тест' - os.environ['YOUTUBE-DL-TEST'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('YOUTUBE-DL-TEST', test_str) self.assertEqual(compat_getenv('YOUTUBE-DL-TEST'), test_str) def test_compat_setenv(self): @@ -42,11 +40,9 @@ class TestCompat(unittest.TestCase): def test_compat_expanduser(self): old_home = os.environ.get('HOME') test_str = 'C:\Documents and Settings\тест\Application Data' - os.environ['HOME'] = ( - test_str if sys.version_info >= (3, 0) - else test_str.encode(get_filesystem_encoding())) + compat_setenv('HOME', 
test_str) self.assertEqual(compat_expanduser('~'), test_str) - os.environ['HOME'] = old_home + compat_setenv('HOME', old_home) def test_all_present(self): import youtube_dl.compat From 20cfdcc910d0bc2ee4b0ee38bdf5e6ecb67e5731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:00:14 +0600 Subject: [PATCH 308/347] [test_compat] Avoid None values for compat_setenv --- test/test_compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index afe6bd528..b20814249 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -42,7 +42,7 @@ class TestCompat(unittest.TestCase): test_str = 'C:\Documents and Settings\тест\Application Data' compat_setenv('HOME', test_str) self.assertEqual(compat_expanduser('~'), test_str) - compat_setenv('HOME', old_home) + compat_setenv('HOME', old_home or '') def test_all_present(self): import youtube_dl.compat From e62d9c5caaa972ef4b1ed5d6ab5ee4a087a4ba95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:05:12 +0600 Subject: [PATCH 309/347] [downloader/external] Call ffmpeg with with HTTP_PROXY env variable set (#9437) --- youtube_dl/downloader/external.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index 45f49c350..3a73cee1c 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,6 +6,7 @@ import sys import re from .common import FileDownloader +from ..compat import compat_setenv from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, @@ -198,6 +199,18 @@ class FFmpegFD(ExternalFD): '-headers', ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] + env = None + proxy = self.params.get('proxy') + if proxy: + if not re.match(r'^[\da-zA-Z]+://', proxy): + proxy = 
'http://%s' % proxy + # Since December 2015 ffmpeg supports -http_proxy option (see + # http://git.videolan.org/?p=ffmpeg.git;a=commit;h=b4eb1f29ebddd60c41a2eb39f5af701e38e0d3fd) + # We could switch to the following code if we are able to detect version properly + # args += ['-http_proxy', proxy] + env = os.environ.copy() + compat_setenv('HTTP_PROXY', proxy, env=env) + protocol = info_dict.get('protocol') if protocol == 'rtmp': @@ -239,7 +252,7 @@ class FFmpegFD(ExternalFD): self._debug_cmd(args) - proc = subprocess.Popen(args, stdin=subprocess.PIPE) + proc = subprocess.Popen(args, stdin=subprocess.PIPE, env=env) try: retval = proc.wait() except KeyboardInterrupt: From fad7bbec3a1fb62964c8e6637dfd535fabe9c133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:15:55 +0600 Subject: [PATCH 310/347] [test_compat] Remove unused import --- test/test_compat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_compat.py b/test/test_compat.py index b20814249..9adf75763 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -10,7 +10,6 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.utils import get_filesystem_encoding from youtube_dl.compat import ( compat_getenv, compat_setenv, From 2937590e8b70384ef91bdadbb56a55897aab0837 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Mon, 9 May 2016 22:16:33 +0600 Subject: [PATCH 311/347] [downloader/hls] PEP 8 --- youtube_dl/downloader/hls.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a8279718b..62136ee54 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -25,7 +25,7 @@ class HlsFD(FragmentFD): r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2] # Live streams heuristic does not always work (e.g. 
geo restricted to Germany # http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0) - #r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] + # r'#EXT-X-MEDIA-SEQUENCE:(?!0$)', # live streams [3] r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of # event media playlists [4] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 From 4350b74545ea3d3ce072444655613bc6974d5516 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 23 Apr 2016 15:44:34 +0800 Subject: [PATCH 312/347] [socks] Add socks.py from @bluec0re's public domain implementation https://gist.github.com/bluec0re/cafd3764412967417fd3 --- youtube_dl/socks.py | 336 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 youtube_dl/socks.py diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py new file mode 100644 index 000000000..b0c36a189 --- /dev/null +++ b/youtube_dl/socks.py @@ -0,0 +1,336 @@ +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# For more information, please refer to <http://unlicense.org/> +# +# Example: +# import socks +# import ftplib +# import socket +# +# socks.patch_socket() +# +# f = ftplib.FTP('ftp.kernel.org') +# f.login() +# print f.retrlines('LIST') +# f.quit() +# +# s = socket.create_connection(('www.google.com', 80)) +# s.sendall('HEAD / HTTP/1.0\r\n\r\n') +# print s.recv(1024) +# s.close() +from __future__ import unicode_literals +import os +import struct +import socket +import time + +__author__ = 'Timo Schmid <coding@timoschmid.de>' + +_orig_socket = socket.socket + +try: + from collections import namedtuple +except ImportError: + from Collections import namedtuple + +try: + from urllib.parse import urlparse +except: + from urlparse import urlparse + +try: + from enum import Enum +except ImportError: + Enum = object + + +class ProxyError(IOError): pass +class Socks4Error(ProxyError): + CODES = { + 0x5B: 'request rejected or failed', + 0x5C: 'request rejected becasue SOCKS server cannot connect to identd on the client', + 0x5D: 'request rejected because the client program and identd report different user-ids' + } + def __init__(self, code=None, msg=None): + if code is not None and msg is None: + msg = self.CODES.get(code) + if msg is None: + msg = 'unknown error' + super(Socks4Error, self).__init__(code, msg) + +class Socks5Error(Socks4Error): + CODES = { + 0x01: 'general SOCKS server failure', + 0x02: 'connection not allowed by ruleset', + 0x03: 'Network unreachable', + 0x04: 'Host unreachable', + 0x05: 
'Connection refused', + 0x06: 'TTL expired', + 0x07: 'Command not supported', + 0x08: 'Address type not supported', + 0xFE: 'unknown username or invalid password', + 0xFF: 'all offered authentication methods were rejected' + } + +class ProxyType(Enum): + SOCKS4 = 0 + SOCKS4A = 1 + SOCKS5 = 2 + +Proxy = namedtuple('Proxy', ('type', 'host', 'port', 'username', 'password', 'remote_dns')) + +_default_proxy = None + +def setdefaultproxy(proxytype=None, addr=None, port=None, rdns=True, username=None, password=None, allow_env_override=True): + global _default_proxy + if allow_env_override: + all_proxy = os.environ.get('ALL_PROXY', os.environ.get('all_proxy')) + if all_proxy: + all_proxy = urlparse(all_proxy) + if all_proxy.scheme.startswith('socks'): + if all_proxy.scheme == 'socks' or all_proxy.scheme == 'socks4': + proxytype = ProxyType.SOCKS4 + elif all_proxy.scheme == 'socks4a': + proxytype = ProxyType.SOCKS4A + elif all_proxy.scheme == 'socks5': + proxytype = ProxyType.SOCKS5 + addr = all_proxy.hostname + port = all_proxy.port + username = all_proxy.username + password = all_proxy.password + + if proxytype is not None: + _default_proxy = Proxy(proxytype, addr, port, username, password, rdns) + + +def wrap_socket(sock): + return socksocket(_sock=sock._sock) + +def wrap_module(module): + if hasattr(module, 'socket'): + sock = module.socket + if isinstance(sock, socket.socket): + module.socket = sockssocket + elif hasattr(socket, 'socket'): + socket.socket = sockssocket + +def patch_socket(): + import sys + if 'socket' not in sys.modules: + import socket + sys.modules['socket'].socket = sockssocket + + +class sockssocket(socket.socket): + def __init__(self, *args, **kwargs): + self.__proxy = None + if 'proxy' in kwargs: + self.__proxy = kwargs['proxy'] + del kwargs['proxy'] + super(sockssocket, self).__init__(*args, **kwargs) + + @property + def _proxy(self): + if self.__proxy: + return self.__proxy + return _default_proxy + + @property + def _proxy_port(self): + if 
self._proxy: + if self._proxy.port: + return self._proxy.port + return 1080 + return None + + def setproxy(self, proxytype=None, addr=None, port=None, rdns=True, username=None, password=None): + if proxytype is None: + self.__proxy = None + else: + self.__proxy = Proxy(proxytype, addr, port, username, password, rdns) + + def recvall(self, cnt): + data = b'' + while len(data) < cnt: + cur = self.recv(cnt - len(data)) + if not cur: + raise IOError("{0} bytes missing".format(cnt-len(data))) + data += cur + return data + + def _setup_socks4(self, address, is_4a=False): + destaddr, port = address + + try: + ipaddr = socket.inet_aton(destaddr) + except socket.error: + if is_4a and self._proxy.remote_dns: + ipaddr = struct.pack('!BBBB', 0, 0, 0, 0xFF) + else: + ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) + + packet = struct.pack('!BBH', 0x4, 0x1, port) + ipaddr + if self._proxy.username: + username = self._proxy.username + if hasattr(username, 'encode'): + username = username.encode() + packet += struct.pack('!{0}s'.format(len(username)+1), username) + else: + packet += b'\x00' + + if is_4a and self._proxy.remote_dns: + if hasattr(destaddr, 'encode'): + destaddr = destaddr.encode() + packet += struct.pack('!{0}s'.format(len(destaddr)+1), destaddr) + + self.sendall(packet) + + packet = self.recvall(8) + nbyte, resp_code, dstport, dsthost = struct.unpack('!BBHI', packet) + + # check valid response + if nbyte != 0x00: + self.close() + raise ProxyError(0, "Invalid response from server. 
Expected {0:02x} got {1:02x}".format(0, nbyte)) + + # access granted + if resp_code != 0x5a: + self.close() + raise Socks4Error(resp_code) + + def _setup_socks5(self, address): + destaddr, port = address + + try: + ipaddr = socket.inet_aton(destaddr) + except socket.error: + if self._proxy.remote_dns: + ipaddr = None + else: + ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) + + auth_methods = 1 + if self._proxy.username and self._proxy.password: + # two auth methods available + auth_methods = 2 + packet = struct.pack('!BBB', 0x5, auth_methods, 0x00) # no auth + if self._proxy.username and self._proxy.password: + packet += struct.pack('!B', 0x02) # user/pass auth + + self.sendall(packet) + + packet = self.recvall(2) + version, method = struct.unpack('!BB', packet) + + # check valid response + if version != 0x05: + self.close() + raise ProxyError(0, "Invalid response from server. Expected {0:02x} got {1:02x}".format(5, version)) + + # no auth methods + if method == 0xFF: + self.close() + raise Socks5Error(method) + + # user/pass auth + if method == 0x01: + username = self._proxy.username + if hasattr(username, 'encode'): + username = username.encode() + password = self._proxy.password + if hasattr(password, 'encode'): + password = password.encode() + packet = struct.pack('!BB', 1, len(username)) + username + packet += struct.pack('!B', len(password)) + password + self.sendall(packet) + + packet = self.recvall(2) + version, status = struct.unpack('!BB', packet) + + if version != 0x01: + self.close() + raise ProxyError(0, "Invalid response from server. 
Expected {0:02x} got {1:02x}".format(1, version)) + + if status != 0x00: + self.close() + raise Socks5Error(1) + elif method == 0x00: # no auth + pass + + + packet = struct.pack('!BBB', 5, 1, 0) + if ipaddr is None: + if hasattr(destaddr, 'encode'): + destaddr = destaddr.encode() + packet += struct.pack('!BB', 3, len(destaddr)) + destaddr + else: + packet += struct.pack('!B', 1) + ipaddr + packet += struct.pack('!H', port) + + self.sendall(packet) + + packet = self.recvall(4) + version, status, _, atype = struct.unpack('!BBBB', packet) + + if version != 0x05: + self.close() + raise ProxyError(0, "Invalid response from server. Expected {0:02x} got {1:02x}".format(5, version)) + + if status != 0x00: + self.close() + raise Socks5Error(status) + + if atype == 0x01: + destaddr = self.recvall(4) + elif atype == 0x03: + alen = struct.unpack('!B', self.recv(1))[0] + destaddr = self.recvall(alen) + elif atype == 0x04: + destaddr = self.recvall(16) + destport = struct.unpack('!H', self.recvall(2))[0] + + def _make_proxy(self, connect_func, address): + if self._proxy.type == ProxyType.SOCKS4: + result = connect_func(self, (self._proxy.host, self._proxy_port)) + if result != 0 and result is not None: + return result + self._setup_socks4(address) + elif self._proxy.type == ProxyType.SOCKS4A: + result = connect_func(self, (self._proxy.host, self._proxy_port)) + if result != 0 and result is not None: + return result + self._setup_socks4(address, is_4a=True) + elif self._proxy.type == ProxyType.SOCKS5: + result = connect_func(self, (self._proxy.host, self._proxy_port)) + if result != 0 and result is not None: + return result + self._setup_socks5(address) + else: + return connect_func(self, address) + + def connect(self, address): + self._make_proxy(_orig_socket.connect, address) + + def connect_ex(self, address): + return self._make_proxy(_orig_socket.connect_ex, address) From dab0daeeb0929b9b560d2b9a5f39c1e2e6dfa449 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan 
<yan12125@gmail.com> Date: Sat, 23 Apr 2016 18:28:49 +0800 Subject: [PATCH 313/347] [utils,compat] Move struct_pack and struct_unpack to compat.py --- test/test_compat.py | 5 +++++ test/test_utils.py | 4 ---- youtube_dl/compat.py | 23 +++++++++++++++++++++++ youtube_dl/downloader/f4m.py | 4 ++-- youtube_dl/extractor/rtve.py | 4 +++- youtube_dl/swfinterp.py | 6 ++++-- youtube_dl/utils.py | 20 +------------------- 7 files changed, 38 insertions(+), 28 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index 9adf75763..dd62a5d6b 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -20,6 +20,7 @@ from youtube_dl.compat import ( compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, + struct_unpack, ) @@ -102,5 +103,9 @@ class TestCompat(unittest.TestCase): self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) + def test_struct_unpack(self): + self.assertEqual(struct_unpack('!B', b'\x00'), (0,)) + + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 00ada95ec..5702ffa97 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -55,7 +55,6 @@ from youtube_dl.utils import ( smuggle_url, str_to_int, strip_jsonp, - struct_unpack, timeconvert, unescapeHTML, unified_strdate, @@ -457,9 +456,6 @@ class TestUtil(unittest.TestCase): testPL(5, 2, (2, 99), [2, 3, 4]) testPL(5, 2, (20, 99), []) - def test_struct_unpack(self): - self.assertEqual(struct_unpack('!B', b'\x00'), (0,)) - def test_read_batch_urls(self): f = io.StringIO('''\xef\xbb\xbf foo bar\r diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 12b53cdc8..f697bee7e 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -11,6 +11,7 @@ import re import shlex import shutil import socket +import struct import subprocess import sys import itertools @@ -592,6 +593,26 @@ if sys.version_info >= (3, 0): else: from 
tokenize import generate_tokens as compat_tokenize_tokenize + +try: + struct.pack('!I', 0) +except TypeError: + # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument + # See https://bugs.python.org/issue19099 + def struct_pack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.pack(spec, *args) + + def struct_unpack(spec, *args): + if isinstance(spec, compat_str): + spec = spec.encode('ascii') + return struct.unpack(spec, *args) +else: + struct_pack = struct.pack + struct_unpack = struct.unpack + + __all__ = [ 'compat_HTMLParser', 'compat_HTTPError', @@ -634,6 +655,8 @@ __all__ = [ 'compat_xml_parse_error', 'compat_xpath', 'shlex_quote', + 'struct_pack', + 'struct_unpack', 'subprocess_check_output', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 664d87543..b282fe3d6 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,13 +12,13 @@ from ..compat import ( compat_urlparse, compat_urllib_error, compat_urllib_parse_urlparse, + struct_pack, + struct_unpack, ) from ..utils import ( encodeFilename, fix_xml_ampersands, sanitize_open, - struct_pack, - struct_unpack, xpath_text, ) diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 79af47715..f59040877 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -6,6 +6,9 @@ import re import time from .common import InfoExtractor +from ..compat import ( + struct_unpack, +) from ..utils import ( ExtractorError, float_or_none, @@ -13,7 +16,6 @@ from ..utils import ( remove_start, sanitized_Request, std_headers, - struct_unpack, ) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 06c1d6cc1..86b28716c 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -4,10 +4,12 @@ import collections import io import zlib -from .compat import compat_str +from .compat import ( + compat_str, + struct_unpack, +) 
from .utils import ( ExtractorError, - struct_unpack, ) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6e4573784..fa16a42ad 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -26,7 +26,6 @@ import platform import re import socket import ssl -import struct import subprocess import sys import tempfile @@ -53,6 +52,7 @@ from .compat import ( compat_urlparse, compat_xpath, shlex_quote, + struct_pack, ) @@ -1761,24 +1761,6 @@ def escape_url(url): fragment=escape_rfc3986(url_parsed.fragment) ).geturl() -try: - struct.pack('!I', 0) -except TypeError: - # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument - # See https://bugs.python.org/issue19099 - def struct_pack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.pack(spec, *args) - - def struct_unpack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.unpack(spec, *args) -else: - struct_pack = struct.pack - struct_unpack = struct.unpack - def read_batch_urls(batch_fd): def fixup(url): From 71aff18809a70b7fa32d8fd07f4fb2f64641aea5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 23 Apr 2016 21:30:06 +0800 Subject: [PATCH 314/347] [socks] Support SOCKS proxies --- youtube_dl/socks.py | 196 ++++++++++++-------------------------------- youtube_dl/utils.py | 63 +++++++++++++- 2 files changed, 115 insertions(+), 144 deletions(-) diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index b0c36a189..95795b5a9 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -1,77 +1,30 @@ -# This is free and unencumbered software released into the public domain. -# -# Anyone is free to copy, modify, publish, use, compile, sell, or -# distribute this software, either in source code form or as a compiled -# binary, for any purpose, commercial or non-commercial, and by any -# means. 
-# -# In jurisdictions that recognize copyright laws, the author or authors -# of this software dedicate any and all copyright interest in the -# software to the public domain. We make this dedication for the benefit -# of the public at large and to the detriment of our heirs and -# successors. We intend this dedication to be an overt act of -# relinquishment in perpetuity of all present and future rights to this -# software under copyright law. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. -# -# For more information, please refer to <http://unlicense.org/> -# -# Example: -# import socks -# import ftplib -# import socket -# -# socks.patch_socket() -# -# f = ftplib.FTP('ftp.kernel.org') -# f.login() -# print f.retrlines('LIST') -# f.quit() -# -# s = socket.create_connection(('www.google.com', 80)) -# s.sendall('HEAD / HTTP/1.0\r\n\r\n') -# print s.recv(1024) -# s.close() +# Public Domain SOCKS proxy protocol implementation +# Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3 + from __future__ import unicode_literals -import os -import struct + +import collections import socket -import time + +from .compat import ( + struct_pack, + struct_unpack, +) __author__ = 'Timo Schmid <coding@timoschmid.de>' -_orig_socket = socket.socket -try: - from collections import namedtuple -except ImportError: - from Collections import namedtuple - -try: - from urllib.parse import urlparse -except: - from urlparse import urlparse - -try: - from enum import Enum -except ImportError: - Enum = object +class ProxyError(IOError): + pass -class ProxyError(IOError): 
pass class Socks4Error(ProxyError): CODES = { 0x5B: 'request rejected or failed', 0x5C: 'request rejected becasue SOCKS server cannot connect to identd on the client', 0x5D: 'request rejected because the client program and identd report different user-ids' } + def __init__(self, code=None, msg=None): if code is not None and msg is None: msg = self.CODES.get(code) @@ -79,6 +32,7 @@ class Socks4Error(ProxyError): msg = 'unknown error' super(Socks4Error, self).__init__(code, msg) + class Socks5Error(Socks4Error): CODES = { 0x01: 'general SOCKS server failure', @@ -93,68 +47,19 @@ class Socks5Error(Socks4Error): 0xFF: 'all offered authentication methods were rejected' } -class ProxyType(Enum): - SOCKS4 = 0 + +class ProxyType(object): + SOCKS4 = 0 SOCKS4A = 1 - SOCKS5 = 2 + SOCKS5 = 2 -Proxy = namedtuple('Proxy', ('type', 'host', 'port', 'username', 'password', 'remote_dns')) - -_default_proxy = None - -def setdefaultproxy(proxytype=None, addr=None, port=None, rdns=True, username=None, password=None, allow_env_override=True): - global _default_proxy - if allow_env_override: - all_proxy = os.environ.get('ALL_PROXY', os.environ.get('all_proxy')) - if all_proxy: - all_proxy = urlparse(all_proxy) - if all_proxy.scheme.startswith('socks'): - if all_proxy.scheme == 'socks' or all_proxy.scheme == 'socks4': - proxytype = ProxyType.SOCKS4 - elif all_proxy.scheme == 'socks4a': - proxytype = ProxyType.SOCKS4A - elif all_proxy.scheme == 'socks5': - proxytype = ProxyType.SOCKS5 - addr = all_proxy.hostname - port = all_proxy.port - username = all_proxy.username - password = all_proxy.password - - if proxytype is not None: - _default_proxy = Proxy(proxytype, addr, port, username, password, rdns) - - -def wrap_socket(sock): - return socksocket(_sock=sock._sock) - -def wrap_module(module): - if hasattr(module, 'socket'): - sock = module.socket - if isinstance(sock, socket.socket): - module.socket = sockssocket - elif hasattr(socket, 'socket'): - socket.socket = sockssocket - -def 
patch_socket(): - import sys - if 'socket' not in sys.modules: - import socket - sys.modules['socket'].socket = sockssocket +Proxy = collections.namedtuple('Proxy', ('type', 'host', 'port', 'username', 'password', 'remote_dns')) class sockssocket(socket.socket): - def __init__(self, *args, **kwargs): - self.__proxy = None - if 'proxy' in kwargs: - self.__proxy = kwargs['proxy'] - del kwargs['proxy'] - super(sockssocket, self).__init__(*args, **kwargs) - @property def _proxy(self): - if self.__proxy: - return self.__proxy - return _default_proxy + return self.__proxy @property def _proxy_port(self): @@ -175,7 +80,7 @@ class sockssocket(socket.socket): while len(data) < cnt: cur = self.recv(cnt - len(data)) if not cur: - raise IOError("{0} bytes missing".format(cnt-len(data))) + raise IOError('{0} bytes missing'.format(cnt - len(data))) data += cur return data @@ -186,39 +91,42 @@ class sockssocket(socket.socket): ipaddr = socket.inet_aton(destaddr) except socket.error: if is_4a and self._proxy.remote_dns: - ipaddr = struct.pack('!BBBB', 0, 0, 0, 0xFF) + ipaddr = struct_pack('!BBBB', 0, 0, 0, 0xFF) else: ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) - packet = struct.pack('!BBH', 0x4, 0x1, port) + ipaddr + packet = struct_pack('!BBH', 0x4, 0x1, port) + ipaddr if self._proxy.username: username = self._proxy.username if hasattr(username, 'encode'): username = username.encode() - packet += struct.pack('!{0}s'.format(len(username)+1), username) + packet += struct_pack('!{0}s'.format(len(username) + 1), username) else: packet += b'\x00' if is_4a and self._proxy.remote_dns: if hasattr(destaddr, 'encode'): destaddr = destaddr.encode() - packet += struct.pack('!{0}s'.format(len(destaddr)+1), destaddr) + packet += struct_pack('!{0}s'.format(len(destaddr) + 1), destaddr) self.sendall(packet) packet = self.recvall(8) - nbyte, resp_code, dstport, dsthost = struct.unpack('!BBHI', packet) + nbyte, resp_code, dstport, dsthost = struct_unpack('!BBHI', packet) # check 
valid response if nbyte != 0x00: self.close() - raise ProxyError(0, "Invalid response from server. Expected {0:02x} got {1:02x}".format(0, nbyte)) + raise ProxyError( + 0, 'Invalid response from server. Expected {0:02x} got {1:02x}'.format(0, nbyte)) # access granted if resp_code != 0x5a: self.close() raise Socks4Error(resp_code) + return (dsthost, dstport) + def _setup_socks5(self, address): destaddr, port = address @@ -234,19 +142,20 @@ class sockssocket(socket.socket): if self._proxy.username and self._proxy.password: # two auth methods available auth_methods = 2 - packet = struct.pack('!BBB', 0x5, auth_methods, 0x00) # no auth + packet = struct_pack('!BBB', 0x5, auth_methods, 0x00) # no auth if self._proxy.username and self._proxy.password: - packet += struct.pack('!B', 0x02) # user/pass auth + packet += struct_pack('!B', 0x02) # user/pass auth self.sendall(packet) packet = self.recvall(2) - version, method = struct.unpack('!BB', packet) + version, method = struct_unpack('!BB', packet) # check valid response if version != 0x05: self.close() - raise ProxyError(0, "Invalid response from server. Expected {0:02x} got {1:02x}".format(5, version)) + raise ProxyError( + 0, 'Invalid response from server. Expected {0:02x} got {1:02x}'.format(5, version)) # no auth methods if method == 0xFF: @@ -261,41 +170,42 @@ class sockssocket(socket.socket): password = self._proxy.password if hasattr(password, 'encode'): password = password.encode() - packet = struct.pack('!BB', 1, len(username)) + username - packet += struct.pack('!B', len(password)) + password + packet = struct_pack('!BB', 1, len(username)) + username + packet += struct_pack('!B', len(password)) + password self.sendall(packet) packet = self.recvall(2) - version, status = struct.unpack('!BB', packet) + version, status = struct_unpack('!BB', packet) if version != 0x01: self.close() - raise ProxyError(0, "Invalid response from server. 
Expected {0:02x} got {1:02x}".format(1, version)) + raise ProxyError( + 0, 'Invalid response from server. Expected {0:02x} got {1:02x}'.format(1, version)) if status != 0x00: self.close() raise Socks5Error(1) - elif method == 0x00: # no auth + elif method == 0x00: # no auth pass - - packet = struct.pack('!BBB', 5, 1, 0) + packet = struct_pack('!BBB', 5, 1, 0) if ipaddr is None: if hasattr(destaddr, 'encode'): destaddr = destaddr.encode() - packet += struct.pack('!BB', 3, len(destaddr)) + destaddr + packet += struct_pack('!BB', 3, len(destaddr)) + destaddr else: - packet += struct.pack('!B', 1) + ipaddr - packet += struct.pack('!H', port) + packet += struct_pack('!B', 1) + ipaddr + packet += struct_pack('!H', port) self.sendall(packet) packet = self.recvall(4) - version, status, _, atype = struct.unpack('!BBBB', packet) + version, status, _, atype = struct_unpack('!BBBB', packet) if version != 0x05: self.close() - raise ProxyError(0, "Invalid response from server. Expected {0:02x} got {1:02x}".format(5, version)) + raise ProxyError( + 0, 'Invalid response from server. 
Expected {0:02x} got {1:02x}'.format(5, version)) if status != 0x00: self.close() @@ -304,11 +214,13 @@ class sockssocket(socket.socket): if atype == 0x01: destaddr = self.recvall(4) elif atype == 0x03: - alen = struct.unpack('!B', self.recv(1))[0] + alen = struct_unpack('!B', self.recv(1))[0] destaddr = self.recvall(alen) elif atype == 0x04: destaddr = self.recvall(16) - destport = struct.unpack('!H', self.recvall(2))[0] + destport = struct_unpack('!H', self.recvall(2))[0] + + return (destaddr, destport) def _make_proxy(self, connect_func, address): if self._proxy.type == ProxyType.SOCKS4: @@ -330,7 +242,7 @@ class sockssocket(socket.socket): return connect_func(self, address) def connect(self, address): - self._make_proxy(_orig_socket.connect, address) + self._make_proxy(socket.socket.connect, address) def connect_ex(self, address): - return self._make_proxy(_orig_socket.connect_ex, address) + return self._make_proxy(socket.socket.connect_ex, address) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index fa16a42ad..b2e4a2dfb 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -55,6 +55,11 @@ from .compat import ( struct_pack, ) +from .socks import ( + ProxyType, + sockssocket, +) + # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -752,8 +757,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): self._params = params def http_open(self, req): + conn_class = compat_http_client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + return self.do_open(functools.partial( - _create_http_connection, self, compat_http_client.HTTPConnection, False), + _create_http_connection, self, conn_class, False), req) @staticmethod @@ -849,6 +861,41 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): https_response = http_response +def make_socks_conn_class(base_class, 
socks_proxy): + assert issubclass(base_class, ( + compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + + url_components = compat_urlparse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + + proxy_args = ( + socks_type, + url_components.hostname, url_components.port or 1080, + True, # Remote DNS + url_components.username, url_components.password + ) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(*proxy_args) + if type(self.timeout) in (int, float): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, compat_http_client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): def __init__(self, params, https_conn_class=None, *args, **kwargs): compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) @@ -857,12 +904,20 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): def https_open(self, req): kwargs = {} + conn_class = self._https_conn_class + if hasattr(self, '_context'): # python > 2.6 kwargs['context'] = self._context if hasattr(self, '_check_hostname'): # python 3.x kwargs['check_hostname'] = self._check_hostname + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + return self.do_open(functools.partial( - _create_http_connection, self, self._https_conn_class, True), + _create_http_connection, self, conn_class, True), req, **kwargs) @@ -2683,6 +2738,10 @@ class 
PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy + if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks5'): + req.add_header('Ytdl-socks-proxy', proxy) + # youtube-dl's http/https handlers do wrapping the socket with socks + return None return compat_urllib_request.ProxyHandler.proxy_open( self, req, proxy, type) From 72f3289ac48d8dbfe1ee3fd2d82a23f1bff045df Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sat, 23 Apr 2016 21:30:44 +0800 Subject: [PATCH 315/347] [test/test_socks] Add tests for SOCKS proxies --- .gitignore | 1 + Makefile | 2 +- test/helper.py | 5 ++++ test/test_socks.py | 71 ++++++++++++++++++++++++++++++++++++++++++++++ tox.ini | 1 + 5 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 test/test_socks.py diff --git a/.gitignore b/.gitignore index 72c10425d..0e7128551 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ updates_key.pem *.part *.swp test/testdata +test/local_parameters.json .tox youtube-dl.zsh .idea diff --git a/Makefile b/Makefile index c9ce216d1..5d7cd5a7e 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ test: ot: offlinetest offlinetest: codetest - $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py + $(PYTHON) -m nose --verbose test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py --exclude test_socks.py tar: youtube-dl.tar.gz diff --git a/test/helper.py b/test/helper.py index b8e22c5cb..dfee217a9 100644 --- a/test/helper.py +++ b/test/helper.py @@ -24,8 +24,13 @@ from youtube_dl.utils import ( def get_params(override=None): PARAMETERS_FILE = 
os.path.join(os.path.dirname(os.path.abspath(__file__)), "parameters.json") + LOCAL_PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "local_parameters.json") with io.open(PARAMETERS_FILE, encoding='utf-8') as pf: parameters = json.load(pf) + if os.path.exists(LOCAL_PARAMETERS_FILE): + with io.open(LOCAL_PARAMETERS_FILE, encoding='utf-8') as pf: + parameters.update(json.load(pf)) if override: parameters.update(override) return parameters diff --git a/test/test_socks.py b/test/test_socks.py new file mode 100644 index 000000000..92574c6fd --- /dev/null +++ b/test/test_socks.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import (FakeYDL, get_params) +from youtube_dl.compat import compat_urllib_request + + +class TestSocks(unittest.TestCase): + @staticmethod + def _check_params(attrs): + params = get_params() + for attr in attrs: + if attr not in params: + print('Missing %s. Skipping.' 
% attr) + return + return params + + def test_proxy_http(self): + params = self._check_params(['primary_proxy', 'primary_server_ip']) + if params is None: + return + ydl = FakeYDL({ + 'proxy': params['primary_proxy'] + }) + self.assertEqual( + ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8'), + params['primary_server_ip']) + + def test_proxy_https(self): + params = self._check_params(['primary_proxy', 'primary_server_ip']) + if params is None: + return + ydl = FakeYDL({ + 'proxy': params['primary_proxy'] + }) + self.assertEqual( + ydl.urlopen('https://yt-dl.org/ip').read().decode('utf-8'), + params['primary_server_ip']) + + def test_secondary_proxy_http(self): + params = self._check_params(['secondary_proxy', 'secondary_server_ip']) + if params is None: + return + ydl = FakeYDL() + req = compat_urllib_request.Request('http://yt-dl.org/ip') + req.add_header('Ytdl-request-proxy', params['secondary_proxy']) + self.assertEqual( + ydl.urlopen(req).read().decode('utf-8'), + params['secondary_server_ip']) + + def test_secondary_proxy_https(self): + params = self._check_params(['secondary_proxy', 'secondary_server_ip']) + if params is None: + return + ydl = FakeYDL() + req = compat_urllib_request.Request('https://yt-dl.org/ip') + req.add_header('Ytdl-request-proxy', params['secondary_proxy']) + self.assertEqual( + ydl.urlopen(req).read().decode('utf-8'), + params['secondary_server_ip']) + + +if __name__ == '__main__': + unittest.main() diff --git a/tox.ini b/tox.ini index 2d7134005..9c4e4a3d1 100644 --- a/tox.ini +++ b/tox.ini @@ -9,5 +9,6 @@ passenv = HOME defaultargs = test --exclude test_download.py --exclude test_age_restriction.py --exclude test_subtitles.py --exclude test_write_annotations.py --exclude test_youtube_lists.py --exclude test_iqiyi_sdk_interpreter.py + --exclude test_socks.py commands = nosetests --verbose {posargs:{[testenv]defaultargs}} # --with-coverage --cover-package=youtube_dl --cover-html # test.test_download:TestDownload.test_NowVideo 
From 9e9cd7248d387954d1009087ac300ee3ff6a9766 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 May 2016 15:11:05 +0800 Subject: [PATCH 316/347] [socks] Eliminate magic constants and improve --- youtube_dl/socks.py | 289 ++++++++++++++++++++++++-------------------- 1 file changed, 157 insertions(+), 132 deletions(-) diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index 95795b5a9..0e3dd7893 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -3,37 +3,87 @@ from __future__ import unicode_literals +# References: +# SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol +# SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol +# SOCKS5 protocol https://tools.ietf.org/html/rfc1928 +# SOCKS5 username/password authentication https://tools.ietf.org/html/rfc1929 + import collections import socket from .compat import ( + compat_ord, struct_pack, struct_unpack, ) __author__ = 'Timo Schmid <coding@timoschmid.de>' +SOCKS4_VERSION = 4 +SOCKS4_REPLY_VERSION = 0x00 +# Excerpt from SOCKS4A protocol: +# if the client cannot resolve the destination host's domain name to find its +# IP address, it should set the first three bytes of DSTIP to NULL and the last +# byte to a non-zero value. 
+SOCKS4_DEFAULT_DSTIP = struct_pack('!BBBB', 0, 0, 0, 0xFF) + +SOCKS5_VERSION = 5 +SOCKS5_USER_AUTH_VERSION = 0x01 +SOCKS5_USER_AUTH_SUCCESS = 0x00 + + +class Socks4Command(object): + CMD_CONNECT = 0x01 + CMD_BIND = 0x02 + + +class Socks5Command(Socks4Command): + CMD_UDP_ASSOCIATE = 0x03 + + +class Socks5Auth(object): + AUTH_NONE = 0x00 + AUTH_GSSAPI = 0x01 + AUTH_USER_PASS = 0x02 + AUTH_NO_ACCEPTABLE = 0xFF # For server response + + +class Socks5AddressType(object): + ATYP_IPV4 = 0x01 + ATYP_DOMAINNAME = 0x03 + ATYP_IPV6 = 0x04 + class ProxyError(IOError): - pass - - -class Socks4Error(ProxyError): - CODES = { - 0x5B: 'request rejected or failed', - 0x5C: 'request rejected becasue SOCKS server cannot connect to identd on the client', - 0x5D: 'request rejected because the client program and identd report different user-ids' - } + ERR_SUCCESS = 0x00 def __init__(self, code=None, msg=None): if code is not None and msg is None: - msg = self.CODES.get(code) - if msg is None: - msg = 'unknown error' - super(Socks4Error, self).__init__(code, msg) + msg = self.CODES.get(code) and 'unknown error' + super(ProxyError, self).__init__(code, msg) -class Socks5Error(Socks4Error): +class InvalidVersionError(ProxyError): + def __init__(self, expected_version, got_version): + msg = ('Invalid response version from server. 
Expected {0:02x} got ' + '{1:02x}'.format(expected_version, got_version)) + super(InvalidVersionError, self).__init__(0, msg) + + +class Socks4Error(ProxyError): + ERR_SUCCESS = 90 + + CODES = { + 91: 'request rejected or failed', + 92: 'request rejected becasue SOCKS server cannot connect to identd on the client', + 93: 'request rejected because the client program and identd report different user-ids' + } + + +class Socks5Error(ProxyError): + ERR_GENERAL_FAILURE = 0x01 + CODES = { 0x01: 'general SOCKS server failure', 0x02: 'connection not allowed by ruleset', @@ -53,27 +103,19 @@ class ProxyType(object): SOCKS4A = 1 SOCKS5 = 2 -Proxy = collections.namedtuple('Proxy', ('type', 'host', 'port', 'username', 'password', 'remote_dns')) +Proxy = collections.namedtuple('Proxy', ( + 'type', 'host', 'port', 'username', 'password', 'remote_dns')) class sockssocket(socket.socket): - @property - def _proxy(self): - return self.__proxy + def __init__(self, *args, **kwargs): + self._proxy = None + super(sockssocket, self).__init__(*args, **kwargs) - @property - def _proxy_port(self): - if self._proxy: - if self._proxy.port: - return self._proxy.port - return 1080 - return None + def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None): + assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5) - def setproxy(self, proxytype=None, addr=None, port=None, rdns=True, username=None, password=None): - if proxytype is None: - self.__proxy = None - else: - self.__proxy = Proxy(proxytype, addr, port, username, password, rdns) + self._proxy = Proxy(proxytype, addr, port, username, password, rdns) def recvall(self, cnt): data = b'' @@ -84,163 +126,146 @@ class sockssocket(socket.socket): data += cur return data + def _recv_bytes(self, cnt): + data = self.recvall(cnt) + return struct_unpack('!{0}B'.format(cnt), data) + + @staticmethod + def _len_and_data(data): + return struct_pack('!B', len(data)) + data + + def _check_response_version(self, 
expected_version, got_version): + if got_version != expected_version: + self.close() + raise InvalidVersionError(expected_version, got_version) + + def _resolve_address(self, destaddr, default, use_remote_dns): + try: + return socket.inet_aton(destaddr) + except socket.error: + if use_remote_dns and self._proxy.remote_dns: + return default + else: + return socket.inet_aton(socket.gethostbyname(destaddr)) + def _setup_socks4(self, address, is_4a=False): destaddr, port = address - try: - ipaddr = socket.inet_aton(destaddr) - except socket.error: - if is_4a and self._proxy.remote_dns: - ipaddr = struct_pack('!BBBB', 0, 0, 0, 0xFF) - else: - ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) + ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) - packet = struct_pack('!BBH', 0x4, 0x1, port) + ipaddr - if self._proxy.username: - username = self._proxy.username - if hasattr(username, 'encode'): - username = username.encode() - packet += struct_pack('!{0}s'.format(len(username) + 1), username) - else: - packet += b'\x00' + packet = struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr + + username = (self._proxy.username or '').encode('utf-8') + packet += username + b'\x00' if is_4a and self._proxy.remote_dns: - if hasattr(destaddr, 'encode'): - destaddr = destaddr.encode() - packet += struct_pack('!{0}s'.format(len(destaddr) + 1), destaddr) + packet += destaddr.encode('utf-8') + b'\x00' self.sendall(packet) - packet = self.recvall(8) - nbyte, resp_code, dstport, dsthost = struct_unpack('!BBHI', packet) + version, resp_code, dstport, dsthost = struct_unpack('!BBHI', self.recvall(8)) - # check valid response - if nbyte != 0x00: - self.close() - raise ProxyError( - 0, 'Invalid response from server. 
Expected {0:02x} got {1:02x}'.format(0, nbyte)) + self._check_response_version(SOCKS4_REPLY_VERSION, version) - # access granted - if resp_code != 0x5a: + if resp_code != Socks4Error.ERR_SUCCESS: self.close() raise Socks4Error(resp_code) return (dsthost, dstport) - def _setup_socks5(self, address): - destaddr, port = address + def _setup_socks4a(self, address): + self._setup_socks4(address, is_4a=True) - try: - ipaddr = socket.inet_aton(destaddr) - except socket.error: - if self._proxy.remote_dns: - ipaddr = None - else: - ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) + def _socks5_auth(self): + packet = struct_pack('!B', SOCKS5_VERSION) - auth_methods = 1 + auth_methods = [Socks5Auth.AUTH_NONE] if self._proxy.username and self._proxy.password: - # two auth methods available - auth_methods = 2 - packet = struct_pack('!BBB', 0x5, auth_methods, 0x00) # no auth - if self._proxy.username and self._proxy.password: - packet += struct_pack('!B', 0x02) # user/pass auth + auth_methods.append(Socks5Auth.AUTH_USER_PASS) + + packet += struct_pack('!B', len(auth_methods)) + packet += struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) self.sendall(packet) - packet = self.recvall(2) - version, method = struct_unpack('!BB', packet) + version, method = self._recv_bytes(2) - # check valid response - if version != 0x05: - self.close() - raise ProxyError( - 0, 'Invalid response from server. 
Expected {0:02x} got {1:02x}'.format(5, version)) + self._check_response_version(SOCKS5_VERSION, version) - # no auth methods - if method == 0xFF: + if method == Socks5Auth.AUTH_NO_ACCEPTABLE: self.close() raise Socks5Error(method) - # user/pass auth - if method == 0x01: - username = self._proxy.username - if hasattr(username, 'encode'): - username = username.encode() - password = self._proxy.password - if hasattr(password, 'encode'): - password = password.encode() - packet = struct_pack('!BB', 1, len(username)) + username - packet += struct_pack('!B', len(password)) + password + if method == Socks5Auth.AUTH_USER_PASS: + username = self._proxy.username.encode('utf-8') + password = self._proxy.password.encode('utf-8') + packet = struct_pack('!B', SOCKS5_USER_AUTH_VERSION) + packet += self._len_and_data(username) + self._len_and_data(password) self.sendall(packet) - packet = self.recvall(2) - version, status = struct_unpack('!BB', packet) + version, status = self._recv_bytes(2) - if version != 0x01: - self.close() - raise ProxyError( - 0, 'Invalid response from server. 
Expected {0:02x} got {1:02x}'.format(1, version)) + self._check_response_version(SOCKS5_USER_AUTH_VERSION, version) - if status != 0x00: + if status != SOCKS5_USER_AUTH_SUCCESS: self.close() - raise Socks5Error(1) - elif method == 0x00: # no auth + raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE) + elif method == Socks5Auth.AUTH_NONE: pass - packet = struct_pack('!BBB', 5, 1, 0) + def _setup_socks5(self, address): + destaddr, port = address + + ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) + + self._socks5_auth() + + reserved = 0 + packet = struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) if ipaddr is None: - if hasattr(destaddr, 'encode'): - destaddr = destaddr.encode() - packet += struct_pack('!BB', 3, len(destaddr)) + destaddr + destaddr = destaddr.encode('utf-8') + packet += struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) + packet += self._len_and_data(destaddr) else: - packet += struct_pack('!B', 1) + ipaddr + packet += struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr packet += struct_pack('!H', port) self.sendall(packet) - packet = self.recvall(4) - version, status, _, atype = struct_unpack('!BBBB', packet) + version, status, reserved, atype = self._recv_bytes(4) - if version != 0x05: - self.close() - raise ProxyError( - 0, 'Invalid response from server. 
Expected {0:02x} got {1:02x}'.format(5, version)) + self._check_response_version(SOCKS5_VERSION, version) - if status != 0x00: + if status != Socks5Error.ERR_SUCCESS: self.close() raise Socks5Error(status) - if atype == 0x01: + if atype == Socks5AddressType.ATYP_IPV4: destaddr = self.recvall(4) - elif atype == 0x03: - alen = struct_unpack('!B', self.recv(1))[0] + elif atype == Socks5AddressType.ATYP_DOMAINNAME: + alen = compat_ord(self.recv(1)) destaddr = self.recvall(alen) - elif atype == 0x04: + elif atype == Socks5AddressType.ATYP_IPV6: destaddr = self.recvall(16) destport = struct_unpack('!H', self.recvall(2))[0] return (destaddr, destport) def _make_proxy(self, connect_func, address): - if self._proxy.type == ProxyType.SOCKS4: - result = connect_func(self, (self._proxy.host, self._proxy_port)) - if result != 0 and result is not None: - return result - self._setup_socks4(address) - elif self._proxy.type == ProxyType.SOCKS4A: - result = connect_func(self, (self._proxy.host, self._proxy_port)) - if result != 0 and result is not None: - return result - self._setup_socks4(address, is_4a=True) - elif self._proxy.type == ProxyType.SOCKS5: - result = connect_func(self, (self._proxy.host, self._proxy_port)) - if result != 0 and result is not None: - return result - self._setup_socks5(address) - else: + if not self._proxy: return connect_func(self, address) + result = connect_func(self, (self._proxy.host, self._proxy.port)) + if result != 0 and result is not None: + return result + setup_funcs = { + ProxyType.SOCKS4: self._setup_socks4, + ProxyType.SOCKS4A: self._setup_socks4a, + ProxyType.SOCKS5: self._setup_socks5, + } + setup_funcs[self._proxy.type](address) + return result + def connect(self, address): self._make_proxy(socket.socket.connect, address) From 51fb4995a5242c0edca09167cf8c4b050cf5a186 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 May 2016 15:15:32 +0800 Subject: [PATCH 317/347] [utils] Register SOCKS protocols in urllib 
and support SOCKS4A --- youtube_dl/YoutubeDL.py | 3 +++ youtube_dl/utils.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a96482e68..34eeb77c5 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -64,6 +64,7 @@ from .utils import ( PostProcessingError, preferredencoding, prepend_extension, + register_socks_protocols, render_table, replace_extension, SameFileError, @@ -361,6 +362,8 @@ class YoutubeDL(object): for ph in self.params.get('progress_hooks', []): self.add_progress_hook(ph) + register_socks_protocols() + def warn_if_short_id(self, argv): # short YouTube ID starting with dash? idxs = [ diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index b2e4a2dfb..c9702fd93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -61,6 +61,13 @@ from .socks import ( ) +def register_socks_protocols(): + # "Register" SOCKS protocols + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in compat_urlparse.uses_netloc: + compat_urlparse.uses_netloc.append(scheme) + + # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -870,6 +877,8 @@ def make_socks_conn_class(base_class, socks_proxy): socks_type = ProxyType.SOCKS5 elif url_components.scheme.lower() in ('socks', 'socks4'): socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A proxy_args = ( socks_type, @@ -2738,7 +2747,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks5'): + if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): req.add_header('Ytdl-socks-proxy', proxy) # youtube-dl's http/https handlers do wrapping the socket with socks return None From d5ae6bb50124f8320f2b492380480038c487a6d2 Mon Sep 17 
00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 May 2016 15:37:30 +0800 Subject: [PATCH 318/347] [utils] Add rationale for register_socks_protocols --- youtube_dl/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index c9702fd93..dc73f3407 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -63,6 +63,8 @@ from .socks import ( def register_socks_protocols(): # "Register" SOCKS protocols + # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904 + # URLs with protocols not in urlparse.uses_netloc are not handled correctly for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): if scheme not in compat_urlparse.uses_netloc: compat_urlparse.uses_netloc.append(scheme) From edaa23f822a1e4a62771422fb598c7bd8ae0a152 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 May 2016 16:50:16 +0800 Subject: [PATCH 319/347] [compat] Rename struct_(un)pack to compat_struct_(un)pack --- test/test_compat.py | 4 ++-- youtube_dl/compat.py | 12 ++++++------ youtube_dl/downloader/f4m.py | 14 +++++++------- youtube_dl/extractor/rtve.py | 4 ++-- youtube_dl/socks.py | 32 ++++++++++++++++---------------- youtube_dl/swfinterp.py | 14 +++++++------- youtube_dl/utils.py | 4 ++-- 7 files changed, 42 insertions(+), 42 deletions(-) diff --git a/test/test_compat.py b/test/test_compat.py index dd62a5d6b..539b30540 100644 --- a/test/test_compat.py +++ b/test/test_compat.py @@ -17,10 +17,10 @@ from youtube_dl.compat import ( compat_expanduser, compat_shlex_split, compat_str, + compat_struct_unpack, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_urllib_parse_urlencode, - struct_unpack, ) @@ -104,7 +104,7 @@ class TestCompat(unittest.TestCase): self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) def test_struct_unpack(self): - self.assertEqual(struct_unpack('!B', b'\x00'), (0,)) + self.assertEqual(compat_struct_unpack('!B', b'\x00'), 
(0,)) if __name__ == '__main__': diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index f697bee7e..e48c761a6 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -599,18 +599,18 @@ try: except TypeError: # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument # See https://bugs.python.org/issue19099 - def struct_pack(spec, *args): + def compat_struct_pack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') return struct.pack(spec, *args) - def struct_unpack(spec, *args): + def compat_struct_unpack(spec, *args): if isinstance(spec, compat_str): spec = spec.encode('ascii') return struct.unpack(spec, *args) else: - struct_pack = struct.pack - struct_unpack = struct.unpack + compat_struct_pack = struct.pack + compat_struct_unpack = struct.unpack __all__ = [ @@ -638,6 +638,8 @@ __all__ = [ 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', + 'compat_struct_pack', + 'compat_struct_unpack', 'compat_subprocess_get_DEVNULL', 'compat_tokenize_tokenize', 'compat_urllib_error', @@ -655,8 +657,6 @@ __all__ = [ 'compat_xml_parse_error', 'compat_xpath', 'shlex_quote', - 'struct_pack', - 'struct_unpack', 'subprocess_check_output', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index b282fe3d6..3d9337afa 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,8 +12,8 @@ from ..compat import ( compat_urlparse, compat_urllib_error, compat_urllib_parse_urlparse, - struct_pack, - struct_unpack, + compat_struct_pack, + compat_struct_unpack, ) from ..utils import ( encodeFilename, @@ -31,13 +31,13 @@ class FlvReader(io.BytesIO): # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return struct_unpack('!Q', self.read(8))[0] + return compat_struct_unpack('!Q', self.read(8))[0] def read_unsigned_int(self): - return struct_unpack('!I', self.read(4))[0] + return compat_struct_unpack('!I', 
self.read(4))[0] def read_unsigned_char(self): - return struct_unpack('!B', self.read(1))[0] + return compat_struct_unpack('!B', self.read(1))[0] def read_string(self): res = b'' @@ -194,11 +194,11 @@ def build_fragments_list(boot_info): def write_unsigned_int(stream, val): - stream.write(struct_pack('!I', val)) + stream.write(compat_struct_pack('!I', val)) def write_unsigned_int_24(stream, val): - stream.write(struct_pack('!I', val)[1:]) + stream.write(compat_struct_pack('!I', val)[1:]) def write_flv_header(stream): diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index f59040877..edd0d108e 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -7,7 +7,7 @@ import time from .common import InfoExtractor from ..compat import ( - struct_unpack, + compat_struct_unpack, ) from ..utils import ( ExtractorError, @@ -23,7 +23,7 @@ def _decrypt_url(png): encrypted_data = base64.b64decode(png.encode('utf-8')) text_index = encrypted_data.find(b'tEXt') text_chunk = encrypted_data[text_index - 4:] - length = struct_unpack('!I', text_chunk[:4])[0] + length = compat_struct_unpack('!I', text_chunk[:4])[0] # Use bytearray to get integers when iterating in both python 2.x and 3.x data = bytearray(text_chunk[8:8 + length]) data = [chr(b) for b in data if b != 0] diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index 0e3dd7893..a5b27fea7 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -14,8 +14,8 @@ import socket from .compat import ( compat_ord, - struct_pack, - struct_unpack, + compat_struct_pack, + compat_struct_unpack, ) __author__ = 'Timo Schmid <coding@timoschmid.de>' @@ -26,7 +26,7 @@ SOCKS4_REPLY_VERSION = 0x00 # if the client cannot resolve the destination host's domain name to find its # IP address, it should set the first three bytes of DSTIP to NULL and the last # byte to a non-zero value. 
-SOCKS4_DEFAULT_DSTIP = struct_pack('!BBBB', 0, 0, 0, 0xFF) +SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF) SOCKS5_VERSION = 5 SOCKS5_USER_AUTH_VERSION = 0x01 @@ -128,11 +128,11 @@ class sockssocket(socket.socket): def _recv_bytes(self, cnt): data = self.recvall(cnt) - return struct_unpack('!{0}B'.format(cnt), data) + return compat_struct_unpack('!{0}B'.format(cnt), data) @staticmethod def _len_and_data(data): - return struct_pack('!B', len(data)) + data + return compat_struct_pack('!B', len(data)) + data def _check_response_version(self, expected_version, got_version): if got_version != expected_version: @@ -153,7 +153,7 @@ class sockssocket(socket.socket): ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) - packet = struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr + packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr username = (self._proxy.username or '').encode('utf-8') packet += username + b'\x00' @@ -163,7 +163,7 @@ class sockssocket(socket.socket): self.sendall(packet) - version, resp_code, dstport, dsthost = struct_unpack('!BBHI', self.recvall(8)) + version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8)) self._check_response_version(SOCKS4_REPLY_VERSION, version) @@ -177,14 +177,14 @@ class sockssocket(socket.socket): self._setup_socks4(address, is_4a=True) def _socks5_auth(self): - packet = struct_pack('!B', SOCKS5_VERSION) + packet = compat_struct_pack('!B', SOCKS5_VERSION) auth_methods = [Socks5Auth.AUTH_NONE] if self._proxy.username and self._proxy.password: auth_methods.append(Socks5Auth.AUTH_USER_PASS) - packet += struct_pack('!B', len(auth_methods)) - packet += struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) + packet += compat_struct_pack('!B', len(auth_methods)) + packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) self.sendall(packet) @@ -199,7 +199,7 @@ 
class sockssocket(socket.socket): if method == Socks5Auth.AUTH_USER_PASS: username = self._proxy.username.encode('utf-8') password = self._proxy.password.encode('utf-8') - packet = struct_pack('!B', SOCKS5_USER_AUTH_VERSION) + packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION) packet += self._len_and_data(username) + self._len_and_data(password) self.sendall(packet) @@ -221,14 +221,14 @@ class sockssocket(socket.socket): self._socks5_auth() reserved = 0 - packet = struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) + packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) if ipaddr is None: destaddr = destaddr.encode('utf-8') - packet += struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) + packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) packet += self._len_and_data(destaddr) else: - packet += struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr - packet += struct_pack('!H', port) + packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr + packet += compat_struct_pack('!H', port) self.sendall(packet) @@ -247,7 +247,7 @@ class sockssocket(socket.socket): destaddr = self.recvall(alen) elif atype == Socks5AddressType.ATYP_IPV6: destaddr = self.recvall(16) - destport = struct_unpack('!H', self.recvall(2))[0] + destport = compat_struct_unpack('!H', self.recvall(2))[0] return (destaddr, destport) diff --git a/youtube_dl/swfinterp.py b/youtube_dl/swfinterp.py index 86b28716c..7cf490aa4 100644 --- a/youtube_dl/swfinterp.py +++ b/youtube_dl/swfinterp.py @@ -6,7 +6,7 @@ import zlib from .compat import ( compat_str, - struct_unpack, + compat_struct_unpack, ) from .utils import ( ExtractorError, @@ -25,17 +25,17 @@ def _extract_tags(file_contents): file_contents[:1]) # Determine number of bits in framesize rectangle - framesize_nbits = struct_unpack('!B', content[:1])[0] >> 3 + framesize_nbits = compat_struct_unpack('!B', content[:1])[0] >> 3 framesize_len = (5 + 4 * 
framesize_nbits + 7) // 8 pos = framesize_len + 2 + 2 while pos < len(content): - header16 = struct_unpack('<H', content[pos:pos + 2])[0] + header16 = compat_struct_unpack('<H', content[pos:pos + 2])[0] pos += 2 tag_code = header16 >> 6 tag_len = header16 & 0x3f if tag_len == 0x3f: - tag_len = struct_unpack('<I', content[pos:pos + 4])[0] + tag_len = compat_struct_unpack('<I', content[pos:pos + 4])[0] pos += 4 assert pos + tag_len <= len(content), \ ('Tag %d ends at %d+%d - that\'s longer than the file (%d)' @@ -103,7 +103,7 @@ def _read_int(reader): for _ in range(5): buf = reader.read(1) assert len(buf) == 1 - b = struct_unpack('<B', buf)[0] + b = compat_struct_unpack('<B', buf)[0] res = res | ((b & 0x7f) << shift) if b & 0x80 == 0: break @@ -129,7 +129,7 @@ def _s24(reader): bs = reader.read(3) assert len(bs) == 3 last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00' - return struct_unpack('<i', bs + last_byte)[0] + return compat_struct_unpack('<i', bs + last_byte)[0] def _read_string(reader): @@ -148,7 +148,7 @@ def _read_bytes(count, reader): def _read_byte(reader): resb = _read_bytes(1, reader=reader) - res = struct_unpack('<B', resb)[0] + res = compat_struct_unpack('<B', resb)[0] return res diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index dc73f3407..dbac38b55 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -44,6 +44,7 @@ from .compat import ( compat_parse_qs, compat_socket_create_connection, compat_str, + compat_struct_pack, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlencode, @@ -52,7 +53,6 @@ from .compat import ( compat_urlparse, compat_xpath, shlex_quote, - struct_pack, ) from .socks import ( @@ -1259,7 +1259,7 @@ def bytes_to_intlist(bs): def intlist_to_bytes(xs): if not xs: return b'' - return struct_pack('%dB' % len(xs), *xs) + return compat_struct_pack('%dB' % len(xs), *xs) # Cross-platform file locking From e21f17fc86aab0ac7f1f4cee28f64e7b9b954f71 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan 
<yan12125@gmail.com> Date: Thu, 5 May 2016 17:09:13 +0800 Subject: [PATCH 320/347] [test/test_socks] Test with local SOCKS servers --- .gitignore | 1 + .travis.yml | 3 +++ devscripts/install_srelay.sh | 8 +++++++ test/test_socks.py | 42 +++++++++++++++++++++++++++++++++--- 4 files changed, 51 insertions(+), 3 deletions(-) create mode 100755 devscripts/install_srelay.sh diff --git a/.gitignore b/.gitignore index 0e7128551..d5f216b5f 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,4 @@ test/local_parameters.json youtube-dl.zsh .idea .idea/* +tmp/ diff --git a/.travis.yml b/.travis.yml index cc21fae8f..998995845 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,6 +7,9 @@ python: - "3.4" - "3.5" sudo: false +install: + - bash ./devscripts/install_srelay.sh + - export PATH=$PATH:$(pwd)/tmp/srelay-0.4.8b6 script: nosetests test --verbose notifications: email: diff --git a/devscripts/install_srelay.sh b/devscripts/install_srelay.sh new file mode 100755 index 000000000..33ce8a3f7 --- /dev/null +++ b/devscripts/install_srelay.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +mkdir -p tmp && cd tmp +wget -N http://downloads.sourceforge.net/project/socks-relay/socks-relay/srelay-0.4.8/srelay-0.4.8b6.tar.gz +tar zxvf srelay-0.4.8b6.tar.gz +cd srelay-0.4.8b6 +./configure +make diff --git a/test/test_socks.py b/test/test_socks.py index 92574c6fd..dc9b8d276 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -8,11 +8,20 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import (FakeYDL, get_params) -from youtube_dl.compat import compat_urllib_request +import random +import subprocess + +from test.helper import ( + FakeYDL, + get_params, +) +from youtube_dl.compat import ( + compat_str, + compat_urllib_request, +) -class TestSocks(unittest.TestCase): +class TestMultipleSocks(unittest.TestCase): @staticmethod def _check_params(attrs): params = get_params() @@ -67,5 +76,32 @@ class 
TestSocks(unittest.TestCase): params['secondary_server_ip']) +class TestSocks(unittest.TestCase): + def setUp(self): + self.port = random.randint(49152, 65535) + self.server_process = subprocess.Popen([ + 'srelay', '-f', '-i', '127.0.0.1:%d' % self.port], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + def tearDown(self): + self.server_process.terminate() + self.server_process.communicate() + + def _get_ip(self, protocol): + ydl = FakeYDL({ + 'proxy': '%s://127.0.0.1:%d' % (protocol, self.port), + }) + return ydl.urlopen('http://yt-dl.org/ip').read().decode('utf-8') + + def test_socks4(self): + self.assertTrue(isinstance(self._get_ip('socks4'), compat_str)) + + def test_socks4a(self): + self.assertTrue(isinstance(self._get_ip('socks4a'), compat_str)) + + def test_socks5(self): + self.assertTrue(isinstance(self._get_ip('socks5'), compat_str)) + + if __name__ == '__main__': unittest.main() From fa5cb8d0212918657cb58b4d5791ed3de831bd74 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 8 May 2016 15:14:56 +0800 Subject: [PATCH 321/347] [socks] Remove a superfluous clause --- youtube_dl/socks.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index a5b27fea7..fd49d7435 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -210,8 +210,6 @@ class sockssocket(socket.socket): if status != SOCKS5_USER_AUTH_SUCCESS: self.close() raise Socks5Error(Socks5Error.ERR_GENERAL_FAILURE) - elif method == Socks5Auth.AUTH_NONE: - pass def _setup_socks5(self, address): destaddr, port = address From 6ddb4888d2610df3bbb5024440caddde50fe9ad8 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 8 May 2016 15:15:58 +0800 Subject: [PATCH 322/347] [options] Update --proxy description for SOCKS proxies --- youtube_dl/options.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 
d1f8d1331..38efd292d 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -188,7 +188,10 @@ def parseOpts(overrideArguments=None): network.add_option( '--proxy', dest='proxy', default=None, metavar='URL', - help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') + help='Use the specified HTTP/HTTPS/SOCKS proxy. To enable experimental ' + 'SOCKS proxy, specify a proper scheme. For example ' + 'socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") ' + 'for direct connection') network.add_option( '--socket-timeout', dest='socket_timeout', type=float, default=None, metavar='SECONDS', From c2876afafef392220cdb2baebace1d6d533f8d63 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Sun, 8 May 2016 15:16:32 +0800 Subject: [PATCH 323/347] [test/test_socks] Use a different port range Seems on Travis CI, ports in the original range are often used. --- test/test_socks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_socks.py b/test/test_socks.py index dc9b8d276..d07003ceb 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -78,7 +78,7 @@ class TestMultipleSocks(unittest.TestCase): class TestSocks(unittest.TestCase): def setUp(self): - self.port = random.randint(49152, 65535) + self.port = random.randint(20000, 30000) self.server_process = subprocess.Popen([ 'srelay', '-f', '-i', '127.0.0.1:%d' % self.port], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) From 28b4f73620c82e7007b3154e4d5f437cf6fb2608 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Tue, 10 May 2016 09:08:08 +0200 Subject: [PATCH 324/347] release 2016.05.10 --- .github/ISSUE_TEMPLATE.md | 6 +++--- README.md | 8 +++++--- docs/supportedsites.md | 11 +++++++++-- youtube_dl/version.py | 2 +- 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index a26ff1de4..1fb878b59 100644 
--- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.01** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.10*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.10** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.05.01 +[debug] youtube-dl version 2016.05.10 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/README.md b/README.md index 50acb26a0..4ef6b6d5a 100644 --- a/README.md +++ b/README.md @@ -85,9 +85,11 @@ which means you can modify it, redistribute it or use it however you like. --no-color Do not emit color codes in output ## Network Options: - --proxy URL Use the specified HTTP/HTTPS proxy. 
Pass in - an empty string (--proxy "") for direct - connection + --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. + To enable experimental SOCKS proxy, specify + a proper scheme. For example + socks5://127.0.0.1:1080/. Pass in an empty + string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds --source-address IP Client-side IP address to bind to (experimental) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9fb43671f..de84e5c84 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -77,6 +77,7 @@ - **Bild**: Bild.de - **BiliBili** - **BioBioChileTV** + - **BIQLE** - **BleacherReport** - **BleacherReportCMS** - **blinkx** @@ -145,6 +146,7 @@ - **culturebox.francetvinfo.fr** - **CultureUnplugged** - **CWTV** + - **DailyMail** - **dailymotion** - **dailymotion:playlist** - **dailymotion:user** @@ -325,6 +327,7 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LiTV** - **LiveLeak** - **livestream** - **livestream:original** @@ -374,6 +377,8 @@ - **mtvservices:embedded** - **MuenchenTV**: münchen.tv - **MusicPlayOn** + - **mva**: Microsoft Virtual Academy videos + - **mva:course**: Microsoft Virtual Academy courses - **Mwave** - **MwaveMeetGreet** - **MySpace** @@ -463,7 +468,8 @@ - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE 
Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), 
Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **People** - - **Periscope**: Periscope + - **periscope**: Periscope + - **periscope:user**: Periscope user videos - **PhilharmonieDeParis**: Philharmonie de Paris - **phoenix.de** - **Photobucket** @@ -700,6 +706,7 @@ - **Vessel** - **Vesti**: Вести.Ru - **Vevo** + - **VevoPlaylist** - **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet - **vh1.com** - **Vice** @@ -772,7 +779,7 @@ - **WSJ**: Wall Street Journal - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me + - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To - **XHamster** - **XHamsterEmbed** - **xiami:album**: 虾米音乐 - 专辑 diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 551160897..45e40c0d1 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.05.01' +__version__ = '2016.05.10' From 702ccf2dc08603fed98d2672f86af1a0e300d83e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 10 May 2016 15:58:25 +0800 Subject: [PATCH 325/347] [compat] Rename shlex_quote and remove unused subprocess_check_output --- youtube_dl/compat.py | 19 +++---------------- youtube_dl/postprocessor/execafterdownload.py | 4 ++-- youtube_dl/utils.py | 4 ++-- 3 files changed, 7 insertions(+), 20 
deletions(-) diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index e48c761a6..1392361a1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -341,9 +341,9 @@ except ImportError: # Python 2 return parsed_result try: - from shlex import quote as shlex_quote + from shlex import quote as compat_shlex_quote except ImportError: # Python < 3.3 - def shlex_quote(s): + def compat_shlex_quote(s): if re.match(r'^[-_\w./]+$', s): return s else: @@ -466,18 +466,6 @@ else: print(s) -try: - subprocess_check_output = subprocess.check_output -except AttributeError: - def subprocess_check_output(*args, **kwargs): - assert 'input' not in kwargs - p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) - output, _ = p.communicate() - ret = p.poll() - if ret: - raise subprocess.CalledProcessError(ret, p.args, output=output) - return output - if sys.version_info < (3, 0) and sys.platform == 'win32': def compat_getpass(prompt, *args, **kwargs): if isinstance(prompt, compat_str): @@ -635,6 +623,7 @@ __all__ = [ 'compat_parse_qs', 'compat_print', 'compat_setenv', + 'compat_shlex_quote', 'compat_shlex_split', 'compat_socket_create_connection', 'compat_str', @@ -656,7 +645,5 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', - 'shlex_quote', - 'subprocess_check_output', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/postprocessor/execafterdownload.py b/youtube_dl/postprocessor/execafterdownload.py index 74f66d669..90630c2d7 100644 --- a/youtube_dl/postprocessor/execafterdownload.py +++ b/youtube_dl/postprocessor/execafterdownload.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import subprocess from .common import PostProcessor -from ..compat import shlex_quote +from ..compat import compat_shlex_quote from ..utils import PostProcessingError @@ -17,7 +17,7 @@ class ExecAfterDownloadPP(PostProcessor): if '{}' not in cmd: cmd += ' {}' - cmd = cmd.replace('{}', shlex_quote(information['filepath'])) + cmd = 
cmd.replace('{}', compat_shlex_quote(information['filepath'])) self._downloader.to_screen('[exec] Executing command: %s' % cmd) retCode = subprocess.call(cmd, shell=True) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index dbac38b55..e8b09e9db 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -42,6 +42,7 @@ from .compat import ( compat_http_client, compat_kwargs, compat_parse_qs, + compat_shlex_quote, compat_socket_create_connection, compat_str, compat_struct_pack, @@ -52,7 +53,6 @@ from .compat import ( compat_urllib_request, compat_urlparse, compat_xpath, - shlex_quote, ) from .socks import ( @@ -1977,7 +1977,7 @@ def ytdl_is_updateable(): def args_to_str(args): # Get a short string representation for a subprocess command - return ' '.join(shlex_quote(a) for a in args) + return ' '.join(compat_shlex_quote(a) for a in args) def error_to_compat_str(err): From e73b9c65e279f283b28d14be5b7173eae46d4364 Mon Sep 17 00:00:00 2001 From: teemuy <z0rs4m37tAlL> Date: Wed, 11 May 2016 18:10:30 +0300 Subject: [PATCH 326/347] Bugfix: Allow colons in custom HTTP header values. 
--- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index cbd84c3af..740a1904b 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -69,7 +69,7 @@ def _real_main(argv=None): for h in opts.headers: if h.find(':', 1) < 0: parser.error('wrong header formatting, it should be key:value, not "%s"' % h) - key, value = h.split(':', 2) + key, value = h.split(':', 1) if opts.verbose: write_string('[debug] Adding header from command line option %s:%s\n' % (key, value)) std_headers[key] = value From e0741fd4496c85ef447e72df935cb6edd1af53ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Wed, 11 May 2016 22:03:30 +0600 Subject: [PATCH 327/347] [__init__] Simplify colon presence check --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 740a1904b..5df965191 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -67,7 +67,7 @@ def _real_main(argv=None): # Custom HTTP headers if opts.headers is not None: for h in opts.headers: - if h.find(':', 1) < 0: + if ':' not in h: parser.error('wrong header formatting, it should be key:value, not "%s"' % h) key, value = h.split(':', 1) if opts.verbose: From 4540515cb3daa0716fa94e54cacb566ef1461ab3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 12 May 2016 18:48:27 +0800 Subject: [PATCH 328/347] [iqiyi] Fix 1080P extraction (closes #9446) --- youtube_dl/extractor/iqiyi.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ffb8008ce..ddcb3c916 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -505,7 +505,10 @@ class IqiyiIE(InfoExtractor): 'enc': md5_text(enc_key + tail), 'qyid': _uuid, 'tn': random.random(), - 'um': 0, + # In iQiyi's flash player, um 
is set to 1 if there's a logged user + # Some 1080P formats are only available with a logged user. + # Here force um=1 to trick the iQiyi server + 'um': 1, 'authkey': md5_text(md5_text('') + tail), 'k_tag': 1, } From 778a1ccca7d6cce06faf17867f20b87883d84e98 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 12 May 2016 19:48:48 +0800 Subject: [PATCH 329/347] =?UTF-8?q?[utils]=20Add=20=C5=92=20and=20=C5=93?= =?UTF-8?q?=20found=20in=20French=20to=20ACCENT=5FCHARS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #9463 --- test/test_utils.py | 4 ++-- youtube_dl/utils.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 5702ffa97..ca254779f 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -155,8 +155,8 @@ class TestUtil(unittest.TestCase): self.assertTrue(sanitize_filename(':', restricted=True) != '') self.assertEqual(sanitize_filename( - 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True), - 'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy') + 'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ', restricted=True), + 'AAAAAAAECEEEEIIIIDNOOOOOOOEUUUUYPssaaaaaaaeceeeeiiiionoooooooeuuuuypy') def test_sanitize_ids(self): self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index e8b09e9db..6592c8ec2 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -104,9 +104,9 @@ KNOWN_EXTENSIONS = ( 'f4f', 'f4m', 'm3u8', 'smil') # needed for sanitizing filenames in restricted mode -ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ', + 
itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy'))) def preferredencoding(): From 7e8ddca1bb10068356d1ec43cf66e7627b76fce7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 12 May 2016 19:56:58 +0800 Subject: [PATCH 330/347] [vevo] Delay the georestriction check to prevent false alerts Fixes #9408 --- youtube_dl/extractor/vevo.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c0632cd6a..388b4debe 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -213,19 +213,17 @@ class VevoIE(VevoBaseIE): formats = [] if not video_info: - if response and response.get('statusCode') != 909: + try: + self._initialize_api(video_id) + except ExtractorError: ytid = response.get('errorInfo', {}).get('ytid') if ytid: self.report_warning( 'Video is geoblocked, trying with the YouTube video %s' % ytid) return self.url_result(ytid, 'Youtube', ytid) - if 'statusMessage' in response: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, response['statusMessage']), expected=True) - raise ExtractorError('Unable to extract videos') + raise - self._initialize_api(video_id) video_info = self._call_api( 'video/%s' % video_id, video_id, 'Downloading api video info', 'Failed to download video info') From 1b405bb47d91119cc612a90d26f27f2b93f7c7b4 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Tue, 3 May 2016 18:06:50 +0800 Subject: [PATCH 331/347] [downloader/f4m] Tolerate truncated segments when testing Replaces #9216 Fixes #9214 and test_Bloomberg partially --- youtube_dl/downloader/f4m.py | 42 +++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 3d9337afa..314def4cb 100644 --- a/youtube_dl/downloader/f4m.py +++ 
b/youtube_dl/downloader/f4m.py @@ -23,26 +23,38 @@ from ..utils import ( ) +class DataTruncatedError(Exception): + pass + + class FlvReader(io.BytesIO): """ Reader for Flv files The file format is documented in https://www.adobe.com/devnet/f4v.html """ + def read_bytes(self, n): + data = self.read(n) + if len(data) < n: + raise DataTruncatedError( + 'FlvReader error: need %d bytes while only %d bytes got' % ( + n, len(data))) + return data + # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return compat_struct_unpack('!Q', self.read(8))[0] + return compat_struct_unpack('!Q', self.read_bytes(8))[0] def read_unsigned_int(self): - return compat_struct_unpack('!I', self.read(4))[0] + return compat_struct_unpack('!I', self.read_bytes(4))[0] def read_unsigned_char(self): - return compat_struct_unpack('!B', self.read(1))[0] + return compat_struct_unpack('!B', self.read_bytes(1))[0] def read_string(self): res = b'' while True: - char = self.read(1) + char = self.read_bytes(1) if char == b'\x00': break res += char @@ -53,18 +65,18 @@ class FlvReader(io.BytesIO): Read a box and return the info as a tuple: (box_size, box_type, box_data) """ real_size = size = self.read_unsigned_int() - box_type = self.read(4) + box_type = self.read_bytes(4) header_end = 8 if size == 1: real_size = self.read_unsigned_long_long() header_end = 16 - return real_size, box_type, self.read(real_size - header_end) + return real_size, box_type, self.read_bytes(real_size - header_end) def read_asrt(self): # version self.read_unsigned_char() # flags - self.read(3) + self.read_bytes(3) quality_entry_count = self.read_unsigned_char() # QualityEntryCount for i in range(quality_entry_count): @@ -85,7 +97,7 @@ class FlvReader(io.BytesIO): # version self.read_unsigned_char() # flags - self.read(3) + self.read_bytes(3) # time scale self.read_unsigned_int() @@ -119,7 +131,7 @@ class FlvReader(io.BytesIO): # version self.read_unsigned_char() # flags - self.read(3) + 
self.read_bytes(3) self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved @@ -374,7 +386,17 @@ class F4mFD(FragmentFD): down.close() reader = FlvReader(down_data) while True: - _, box_type, box_data = reader.read_box_info() + try: + _, box_type, box_data = reader.read_box_info() + except DataTruncatedError: + if test: + # In tests, segments may be truncated, and thus + # FlvReader may not be able to parse the whole + # chunk. If so, write the segment as is + # See https://github.com/rg3/youtube-dl/issues/9214 + dest_stream.write(down_data) + break + raise if box_type == b'mdat': dest_stream.write(box_data) break From a3fa6024d676ec20a06fe618f5c3d6e064f49336 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 12 May 2016 20:05:43 +0800 Subject: [PATCH 332/347] [bloomberg] Fix test_Bloomberg In this test case, sometimes HLS is the best format while sometimes HDS is. To prevent occasional test failures, force HDS to be the best format. In the past, testing against HDS formats caused the same error as #9214, which was fixed when #9377 landed. 
--- youtube_dl/extractor/bloomberg.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 13343bc25..bd538be50 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -17,6 +17,9 @@ class BloombergIE(InfoExtractor): 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', 'description': 'md5:a8ba0302912d03d246979735c17d2761', }, + 'params': { + 'format': 'best[format_id^=hds]', + }, }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, From 7581bfc958c8de77adbf8a502564d2263d17479d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 12 May 2016 18:57:53 +0800 Subject: [PATCH 333/347] [utils] Unquote credentials passed to SOCKS proxies Fixes #9450 --- youtube_dl/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6592c8ec2..d6f94f8cd 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -50,6 +50,7 @@ from .compat import ( compat_urllib_parse, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, + compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, compat_xpath, @@ -886,7 +887,8 @@ def make_socks_conn_class(base_class, socks_proxy): socks_type, url_components.hostname, url_components.port or 1080, True, # Remote DNS - url_components.username, url_components.password + compat_urllib_parse_unquote_plus(url_components.username), + compat_urllib_parse_unquote_plus(url_components.password), ) class SocksConnection(base_class): From 0db3a66162cf1059dbfccd60db350596f7c5b469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 12 May 2016 23:57:52 +0600 Subject: [PATCH 334/347] [twitch] Skip dead tests --- youtube_dl/extractor/twitch.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 36ee1adff..68f50487b 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -171,6 +171,7 @@ class TwitchVideoIE(TwitchItemBaseIE): 'title': 'Worlds Semifinals - Star Horn Royal Club vs. OMG', }, 'playlist_mincount': 12, + 'skip': 'HTTP Error 404: Not Found', } @@ -187,6 +188,7 @@ class TwitchChapterIE(TwitchItemBaseIE): 'title': 'ACRL Off Season - Sports Cars @ Nordschleife', }, 'playlist_mincount': 3, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361', 'only_matching': True, @@ -368,6 +370,7 @@ class TwitchBookmarksIE(TwitchPlaylistBaseIE): 'title': 'Ognos', }, 'playlist_mincount': 3, + 'skip': 'HTTP Error 404: Not Found', } def _extract_playlist_page(self, response): From 0df79d552a6d528ac5bb1a9cce99199aafe79144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 13 May 2016 00:14:30 +0600 Subject: [PATCH 335/347] [twitch:bookmarks] Remove extractor Bookmarks no longer available --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/twitch.py | 26 -------------------------- 2 files changed, 27 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index a0bb3d4c2..f2bd4fe97 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -834,7 +834,6 @@ from .twitch import ( TwitchVodIE, TwitchProfileIE, TwitchPastBroadcastsIE, - TwitchBookmarksIE, TwitchStreamIE, ) from .twitter import ( diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 68f50487b..f7b98e190 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -357,32 +357,6 @@ class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE): } -class TwitchBookmarksIE(TwitchPlaylistBaseIE): - IE_NAME = 'twitch:bookmarks' - _VALID_URL = r'%s/(?P<id>[^/]+)/profile/bookmarks/?(?:\#.*)?$' % 
TwitchBaseIE._VALID_URL_BASE - _PLAYLIST_URL = '%s/api/bookmark/?user=%%s&offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE - _PLAYLIST_TYPE = 'bookmarks' - - _TEST = { - 'url': 'http://www.twitch.tv/ognos/profile/bookmarks', - 'info_dict': { - 'id': 'ognos', - 'title': 'Ognos', - }, - 'playlist_mincount': 3, - 'skip': 'HTTP Error 404: Not Found', - } - - def _extract_playlist_page(self, response): - entries = [] - for bookmark in response.get('bookmarks', []): - video = bookmark.get('video') - if not video: - continue - entries.append(video['url']) - return entries - - class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' _VALID_URL = r'%s/(?P<id>[^/#?]+)/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE From d8d540cf0d11dbf7b3d9de611470fc7114c8d1ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 13 May 2016 02:07:12 +0600 Subject: [PATCH 336/347] [nrk] Rework extractor (Closes #9470) --- youtube_dl/extractor/nrk.py | 435 ++++++++++++++++-------------------- 1 file changed, 196 insertions(+), 239 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 51dfc27ac..f0fbdd8be 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -4,91 +4,224 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_unquote, -) +from ..compat import compat_urllib_parse_unquote from ..utils import ( - determine_ext, ExtractorError, - float_or_none, + int_or_none, + parse_age_limit, parse_duration, - unified_strdate, ) -class NRKIE(InfoExtractor): - _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' - - _TESTS = [ - { - 'url': 'http://www.nrk.no/video/PS*150533', - # MD5 is unstable - 'info_dict': { - 'id': '150533', - 'ext': 'flv', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, - } - }, - { - 
'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, - ] +class NRKBaseIE(InfoExtractor): + def _extract_formats(self, manifest_url, video_id, fatal=True): + return self._extract_f4m_formats( + manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', + video_id, f4m_id='hds', fatal=fatal) def _real_extract(self, url): video_id = self._match_id(url) data = self._download_json( - 'http://v8.psapi.nrk.no/mediaelement/%s' % video_id, - video_id, 'Downloading media JSON') + 'http://%s/mediaelement/%s' % (self._API_HOST, video_id), + video_id, 'Downloading mediaelement JSON') - media_url = data.get('mediaUrl') + title = data.get('fullTitle') or data.get('mainTitle') or data['title'] + video_id = data.get('id') or video_id - if not media_url: - if data['usageRights']['isGeoBlocked']: + entries = [] + + media_assets = data.get('mediaAssets') + if media_assets and isinstance(media_assets, list): + def video_id_and_title(idx): + return ((video_id, title) if len(media_assets) == 1 + else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) + for num, asset in enumerate(media_assets, 1): + asset_url = asset.get('url') + if not asset_url: + continue + formats = self._extract_formats(asset_url, video_id, fatal=False) + if not formats: + continue + self._sort_formats(formats) + entry_id, entry_title = video_id_and_title(num) + duration = parse_duration(asset.get('duration')) + subtitles = {} + for subtitle in ('webVtt', 'timedText'): + subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) + if subtitle_url: + subtitles.setdefault('no', []).append({'url': subtitle_url}) + entries.append({ + 'id': asset.get('carrierId') or entry_id, + 'title': entry_title, + 'duration': duration, + 'subtitles': subtitles, + 'formats': formats, + }) + + if not entries: + media_url = 
data.get('mediaUrl') + if media_url: + formats = self._extract_formats(media_url, video_id) + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': title, + 'duration': duration, + 'formats': formats, + }] + + if not entries: + if data.get('usageRights', {}).get('isGeoBlocked'): raise ExtractorError( 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', expected=True) - if determine_ext(media_url) == 'f4m': - formats = self._extract_f4m_formats( - media_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id, f4m_id='hds') - self._sort_formats(formats) - else: - formats = [{ - 'url': media_url, - 'ext': 'flv', - }] - - duration = parse_duration(data.get('duration')) + conviva = data.get('convivaStatistics') or {} + series = conviva.get('seriesName') or data.get('seriesTitle') + episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + thumbnails = None images = data.get('images') - if images: - thumbnails = images['webImages'] - thumbnails.sort(key=lambda image: image['pixelWidth']) - thumbnail = thumbnails[-1]['imageUrl'] - else: - thumbnail = None + if images and isinstance(images, dict): + web_images = images.get('webImages') + if isinstance(web_images, list): + thumbnails = [{ + 'url': image['imageUrl'], + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in web_images if image.get('imageUrl')] - return { - 'id': video_id, - 'title': data['title'], - 'description': data['description'], - 'duration': duration, - 'thumbnail': thumbnail, - 'formats': formats, + description = data.get('description') + + common_info = { + 'description': description, + 'series': series, + 'episode': episode, + 'age_limit': parse_age_limit(data.get('legalAge')), + 'thumbnails': thumbnails, } + vcodec = 'none' if data.get('mediaType') == 'Audio' else None + + # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is 
merged + + for entry in entries: + entry.update(common_info) + for f in entry['formats']: + f['vcodec'] = vcodec + + return self.playlist_result(entries, video_id, title, description) + + +class NRKIE(NRKBaseIE): + _VALID_URL = r'(?:nrk:|https?://(?:www\.)?nrk\.no/video/PS\*)(?P<id>\d+)' + _API_HOST = 'v8.psapi.nrk.no' + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + # MD5 is unstable + 'info_dict': { + 'id': '150533', + 'ext': 'flv', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 263, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'flv', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }] + + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' 
+ _API_HOST = 'psapi-we.nrk.no' + + _TESTS = [{ + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'info_dict': { + 'id': 'MUHH48000314', + 'ext': 'mp4', + 'title': '20 spørsmål', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'upload_date': '20140523', + 'duration': 1741.52, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'mdfp15000514', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', + 'upload_date': '20140524', + 'duration': 4605.08, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + 'skip': 'Only works from Norway', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [{ + 'md5': '9480285eff92d64f06e02a5367970a7a', + 'info_dict': { + 'id': 'MSPO40010515-part1', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }, { + 'md5': 'adbd1dbd813edaf532b0a253780719c2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + }, + }], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', + 'description': 
'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'upload_date': '20150106', + 'duration': 6947.52, + }, + 'skip': 'Only works from Norway', + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }] + class NRKPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nrk\.no/(?!video|skole)(?:[^/]+/)+(?P<id>[^/]+)' @@ -159,179 +292,3 @@ class NRKSkoleIE(InfoExtractor): nrk_id = self._search_regex(r'data-nrk-id=["\'](\d+)', webpage, 'nrk id') return self.url_result('nrk:%s' % nrk_id) - - -class NRKTVIE(InfoExtractor): - IE_DESC = 'NRK TV and NRK Radio' - _VALID_URL = r'(?P<baseurl>https?://(?:tv|radio)\.nrk(?:super)?\.no/)(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})(?:/\d{2}-\d{2}-\d{4})?(?:#del=(?P<part_id>\d+))?' - - _TESTS = [ - { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'info_dict': { - 'id': 'MUHH48000314', - 'ext': 'mp4', - 'title': '20 spørsmål', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'upload_date': '20140523', - 'duration': 1741.52, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'mdfp15000514', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', - 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', - 'upload_date': '20140524', - 'duration': 4605.08, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'md5': 'adbd1dbd813edaf532b0a253780719c2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - }, - 'skip': 'Only works from Norway', - }, - { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 
'playlist': [ - { - 'md5': '9480285eff92d64f06e02a5367970a7a', - 'info_dict': { - 'id': 'MSPO40010515-part1', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - }, - }, - { - 'md5': 'adbd1dbd813edaf532b0a253780719c2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - }, - }, - ], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', - 'duration': 6947.5199999999995, - }, - 'skip': 'Only works from Norway', - }, - { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - } - ] - - def _extract_f4m(self, manifest_url, video_id): - return self._extract_f4m_formats( - manifest_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', video_id, f4m_id='hds') - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - part_id = mobj.group('part_id') - base_url = mobj.group('baseurl') - - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta( - 'title', webpage, 'title') - description = self._html_search_meta( - 'description', webpage, 'description') - - thumbnail = self._html_search_regex( - r'data-posterimage="([^"]+)"', - webpage, 'thumbnail', fatal=False) - upload_date = unified_strdate(self._html_search_meta( - 'rightsfrom', webpage, 'upload date', fatal=False)) - duration = float_or_none(self._html_search_regex( - r'data-duration="([^"]+)"', - webpage, 'duration', fatal=False)) - - # playlist - parts = re.findall( - r'<a href="#del=(\d+)"[^>]+data-argument="([^"]+)">([^<]+)</a>', webpage) - if parts: - entries 
= [] - for current_part_id, stream_url, part_title in parts: - if part_id and current_part_id != part_id: - continue - video_part_id = '%s-part%s' % (video_id, current_part_id) - formats = self._extract_f4m(stream_url, video_part_id) - entries.append({ - 'id': video_part_id, - 'title': part_title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'formats': formats, - }) - if part_id: - if entries: - return entries[0] - else: - playlist = self.playlist_result(entries, video_id, title, description) - playlist.update({ - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - }) - return playlist - - formats = [] - - f4m_url = re.search(r'data-media="([^"]+)"', webpage) - if f4m_url: - formats.extend(self._extract_f4m(f4m_url.group(1), video_id)) - - m3u8_url = re.search(r'data-hls-media="([^"]+)"', webpage) - if m3u8_url: - formats.extend(self._extract_m3u8_formats(m3u8_url.group(1), video_id, 'mp4', m3u8_id='hls')) - self._sort_formats(formats) - - subtitles_url = self._html_search_regex( - r'data-subtitlesurl\s*=\s*(["\'])(?P<url>.+?)\1', - webpage, 'subtitle URL', default=None, group='url') - subtitles = {} - if subtitles_url: - subtitles['no'] = [{ - 'ext': 'ttml', - 'url': compat_urlparse.urljoin(base_url, subtitles_url), - }] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } From b9e7bc55da1c1275737b356efadc06435b8bfa2c Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 12 May 2016 22:45:54 +0100 Subject: [PATCH 337/347] [mgtv] extract http formats --- youtube_dl/extractor/mgtv.py | 43 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index a14d176a5..9fbc74f5d 100644 --- a/youtube_dl/extractor/mgtv.py +++ 
b/youtube_dl/extractor/mgtv.py @@ -11,7 +11,7 @@ class MGTVIE(InfoExtractor): _TEST = { 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', - 'md5': '', + 'md5': '1bdadcf760a0b90946ca68ee9a2db41a', 'info_dict': { 'id': '3116640', 'ext': 'mp4', @@ -20,15 +20,6 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - 'skip_download': True, # m3u8 download - }, - } - - _FORMAT_MAP = { - '标清': ('Standard', 0), - '高清': ('High', 1), - '超清': ('SuperHigh', 2), } def _real_extract(self, url): @@ -40,17 +31,27 @@ class MGTVIE(InfoExtractor): formats = [] for idx, stream in enumerate(api_data['stream']): - format_name = stream.get('name') - format_id, preference = self._FORMAT_MAP.get(format_name, (None, None)) - format_info = self._download_json( - stream['url'], video_id, - note='Download video info for format %s' % format_id or '#%d' % idx) - formats.append({ - 'format_id': format_id, - 'url': format_info['info'], - 'ext': 'mp4', # These are m3u8 playlists - 'preference': preference, - }) + stream_url = stream.get('url') + if not stream_url: + continue + tbr = int_or_none(self._search_regex( + r'(\d+)\.mp4', stream_url, 'tbr', default=None)) + + def extract_format(stream_url, format_id, idx, query={}): + format_info = self._download_json( + stream_url, video_id, + note='Download video info for format %s' % format_id or '#%d' % idx, query=query) + return { + 'format_id': format_id, + 'url': format_info['info'], + 'ext': 'mp4', + 'tbr': tbr, + } + + formats.append(extract_format( + stream_url, 'hls-%d' % tbr if tbr else None, idx * 2)) + formats.append(extract_format(stream_url.replace( + '/playlist.m3u8', ''), 'http-%d' % tbr if tbr else None, idx * 2 + 1, {'pno': 1031})) self._sort_formats(formats) return { From 99d79b8692ae8981aff91cf5b1475516b60eb765 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 13 May 2016 05:21:45 +0100 Subject: [PATCH 338/347] [ustudio] add support ustudio app/embed 
urls --- youtube_dl/extractor/ustudio.py | 66 +++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/ustudio.py b/youtube_dl/extractor/ustudio.py index cafc082b6..3484a2046 100644 --- a/youtube_dl/extractor/ustudio.py +++ b/youtube_dl/extractor/ustudio.py @@ -6,10 +6,12 @@ from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + unescapeHTML, ) class UstudioIE(InfoExtractor): + IE_NAME = 'ustudio' _VALID_URL = r'https?://(?:(?:www|v1)\.)?ustudio\.com/video/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)' _TEST = { 'url': 'http://ustudio.com/video/Uxu2my9bgSph/san_francisco_golden_gate_bridge', @@ -27,9 +29,7 @@ class UstudioIE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() config = self._download_xml( 'http://v1.ustudio.com/embed/%s/ustudio/config.xml' % video_id, @@ -37,7 +37,7 @@ class UstudioIE(InfoExtractor): def extract(kind): return [{ - 'url': item.attrib['url'], + 'url': unescapeHTML(item.attrib['url']), 'width': int_or_none(item.get('width')), 'height': int_or_none(item.get('height')), } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] @@ -65,3 +65,61 @@ class UstudioIE(InfoExtractor): 'uploader': uploader, 'formats': formats, } + + +class UstudioEmbedIE(InfoExtractor): + IE_NAME = 'ustudio:embed' + _VALID_URL = r'https?://(?:(?:app|embed)\.)?ustudio\.com/embed/(?P<uid>[^/]+)/(?P<id>[^/]+)' + _TEST = { + 'url': 'http://app.ustudio.com/embed/DeN7VdYRDKhP/Uw7G1kMCe65T', + 'md5': '47c0be52a09b23a7f40de9469cec58f4', + 'info_dict': { + 'id': 'Uw7G1kMCe65T', + 'ext': 'mp4', + 'title': '5 Things IT Should Know About Video', + 'description': 'md5:93d32650884b500115e158c5677d25ad', + 'uploader_id': 'DeN7VdYRDKhP', + } + } + + def _real_extract(self, url): + uploader_id, video_id 
= re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://app.ustudio.com/embed/%s/%s/config.json' % (uploader_id, video_id), + video_id)['videos'][0] + title = video_data['name'] + + formats = [] + for ext, qualities in video_data.get('transcodes', {}).items(): + for quality in qualities: + quality_url = quality.get('url') + if not quality_url: + continue + height = int_or_none(quality.get('height')) + formats.append({ + 'format_id': '%s-%dp' % (ext, height) if height else ext, + 'url': quality_url, + 'width': int_or_none(quality.get('width')), + 'height': height, + }) + self._sort_formats(formats) + + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': uploader_id, + 'tags': video_data.get('keywords'), + 'thumbnails': thumbnails, + 'formats': formats, + } From cdf32ff15d6fc9d1902bfb3ed10a582070d20cd9 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 13 May 2016 05:25:32 +0100 Subject: [PATCH 339/347] [extractors] add import for UstudioEmbedIE --- youtube_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index f2bd4fe97..50d2204f2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -851,7 +851,10 @@ from .unistra import UnistraIE from .urort import UrortIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import UstudioIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) from .varzesh3 import Varzesh3IE from .vbox7 import Vbox7IE from .veehd import VeeHDIE From 18cf6381f6b140431f3a747fc2d222be08ab2e23 Mon Sep 17 
00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 13 May 2016 08:05:28 +0100 Subject: [PATCH 340/347] [nrk] extract m3u8 formats --- youtube_dl/extractor/nrk.py | 39 +++++++++++++++---------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index f0fbdd8be..7532f40c1 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -15,9 +15,14 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): def _extract_formats(self, manifest_url, video_id, fatal=True): - return self._extract_f4m_formats( + formats = [] + formats.extend(self._extract_f4m_formats( manifest_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', - video_id, f4m_id='hds', fatal=fatal) + video_id, f4m_id='hds', fatal=fatal)) + formats.extend(self._extract_m3u8_formats(manifest_url.replace( + 'akamaihd.net/z/', 'akamaihd.net/i/').replace('/manifest.f4m', '/master.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=fatal)) + return formats def _real_extract(self, url): video_id = self._match_id(url) @@ -121,10 +126,10 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - # MD5 is unstable + 'md5': '2f7f6eeb2aacdd99885f355428715cfa', 'info_dict': { 'id': '150533', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', 'duration': 263, @@ -150,32 +155,24 @@ class NRKTVIE(NRKBaseIE): _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '4e9ca6629f09e588ed240fb11619922a', 'info_dict': { - 'id': 'MUHH48000314', + 'id': 'MUHH48000314AA', 'ext': 'mp4', - 'title': '20 spørsmål', + 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'upload_date': '20140523', 'duration': 1741.52, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 
'https://tv.nrk.no/program/mdfp15000514', + 'md5': '43d0be26663d380603a9cf0c24366531', 'info_dict': { - 'id': 'mdfp15000514', + 'id': 'MDFP15000514CA', 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting', - 'description': 'md5:654c12511f035aed1e42bdf5db3b206a', - 'upload_date': '20140524', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605.08, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', @@ -185,7 +182,6 @@ class NRKTVIE(NRKBaseIE): 'ext': 'flv', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', }, 'skip': 'Only works from Norway', }, { @@ -197,7 +193,6 @@ class NRKTVIE(NRKBaseIE): 'ext': 'flv', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 1:2)', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', }, }, { 'md5': 'adbd1dbd813edaf532b0a253780719c2', @@ -206,14 +201,12 @@ class NRKTVIE(NRKBaseIE): 'ext': 'flv', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', }, }], 'info_dict': { 'id': 'MSPO40010515', 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn', 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - 'upload_date': '20150106', 'duration': 6947.52, }, 'skip': 'Only works from Norway', From ad55e101651edc732acac22cfb25d276d6c8bdca Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 13 May 2016 08:35:38 +0100 Subject: [PATCH 341/347] [brightcove] change the protocol for m3u8 formats to m3u8_native --- youtube_dl/extractor/brightcove.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff 
--git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f0781fc27..fc7fc5b16 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -307,9 +307,10 @@ class BrightcoveLegacyIE(InfoExtractor): playlist_title=playlist_info['mediaCollectionDTO']['displayName']) def _extract_video_info(self, video_info): + video_id = compat_str(video_info['id']) publisher_id = video_info.get('publisherId') info = { - 'id': compat_str(video_info['id']), + 'id': video_id, 'title': video_info['displayName'].strip(), 'description': video_info.get('shortDescription'), 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), @@ -331,7 +332,8 @@ class BrightcoveLegacyIE(InfoExtractor): url_comp = compat_urllib_parse_urlparse(url) if url_comp.path.endswith('.m3u8'): formats.extend( - self._extract_m3u8_formats(url, info['id'], 'mp4')) + self._extract_m3u8_formats( + url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) continue elif 'akamaihd.net' in url_comp.netloc: # This type of renditions are served through @@ -365,7 +367,7 @@ class BrightcoveLegacyIE(InfoExtractor): a_format.update({ 'format_id': 'hls%s' % ('-%s' % tbr if tbr else ''), 'ext': 'mp4', - 'protocol': 'm3u8', + 'protocol': 'm3u8_native', }) formats.append(a_format) @@ -395,7 +397,7 @@ class BrightcoveLegacyIE(InfoExtractor): return ad_info if 'url' not in info and not info.get('formats'): - raise ExtractorError('Unable to extract video url for %s' % info['id']) + raise ExtractorError('Unable to extract video url for %s' % video_id) return info @@ -527,7 +529,7 @@ class BrightcoveNewIE(InfoExtractor): if not src: continue formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', m3u8_id='hls', fatal=False)) + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif source_type == 'application/dash+xml': if not src: continue From cc1028aa6d27aeec39617d1ff8d2edcf1ee989d7 Mon Sep 17 00:00:00 2001 From: Yen 
Chi Hsuan <yan12125@gmail.com> Date: Fri, 13 May 2016 18:11:08 +0800 Subject: [PATCH 342/347] [openload] Fix extraction (closes #9472) --- youtube_dl/extractor/openload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 456561bcc..5049b870e 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -100,7 +100,7 @@ class OpenloadIE(InfoExtractor): raise ExtractorError('File not found', expected=True) code = self._search_regex( - r'<video[^>]+>\s*<script[^>]+>([^<]+)</script>', + r'</video>\s*</div>\s*<script[^>]+>([^<]+)</script>', webpage, 'JS code') decoded = self.openload_decode(code) From f196508f7b872963d13bcff94c0105d743322f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 13 May 2016 22:19:00 +0600 Subject: [PATCH 343/347] [imdb] Relax _VALID_URL (Closes #9481) --- youtube_dl/extractor/imdb.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 8bed8ccd0..203156229 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -12,9 +12,9 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', 'info_dict': { 'id': '2524815897', @@ -22,7 +22,10 @@ class ImdbIE(InfoExtractor): 'title': 'Ice Age: Continental Drift Trailer (No. 
2) - IMDb', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } - } + }, { + 'url': 'http://www.imdb.com/video/_/vi2524815897', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From 96c2e3e909171d103beafd1fd88e9d6e215681c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 13 May 2016 23:25:05 +0600 Subject: [PATCH 344/347] [imdb] Improve extraction --- youtube_dl/extractor/imdb.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 203156229..3a2b7cec5 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -1,10 +1,10 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( + mimetype2ext, qualities, ) @@ -51,13 +51,27 @@ class ImdbIE(InfoExtractor): json_data = self._search_regex( r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>', format_page, 'json data', flags=re.DOTALL) - info = json.loads(json_data) - format_info = info['videoPlayerObject']['video'] - f_id = format_info['ffname'] + info = self._parse_json(json_data, video_id, fatal=False) + if not info: + continue + format_info = info.get('videoPlayerObject', {}).get('video', {}) + if not format_info: + continue + video_info_list = format_info.get('videoInfoList') + if not video_info_list or not isinstance(video_info_list, list): + continue + video_info = video_info_list[0] + if not video_info or not isinstance(video_info, dict): + continue + video_url = video_info.get('videoUrl') + if not video_url: + continue + format_id = format_info.get('ffname') formats.append({ - 'format_id': f_id, - 'url': format_info['videoInfoList'][0]['videoUrl'], - 'quality': quality(f_id), + 'format_id': format_id, + 'url': video_url, + 'ext': mimetype2ext(video_info.get('videoMimeType')), + 'quality': quality(format_id), }) 
self._sort_formats(formats) From 0730be9022b415738e917c4cf72c2347ff0008e0 Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Fri, 13 May 2016 20:24:36 +0100 Subject: [PATCH 345/347] [sina] fix extraction(fixes #1146) --- youtube_dl/extractor/sina.py | 124 ++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 40 deletions(-) diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index d03f1b1d4..8fc66732a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -4,28 +4,35 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode -from ..utils import sanitized_Request +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, + update_url_query, + qualities, + get_element_by_attribute, + clean_html, +) class SinaIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(.*?\.)?video\.sina\.com\.cn/ - ( - (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=)|b/)(?P<id>\d+?)($|&|\-)))) - | + _VALID_URL = r'''(?x)https?://(?:.*?\.)?video\.sina\.com\.cn/ + (?: + (?:view/|.*\#)(?P<video_id>\d+)| + .+?/(?P<pseudo_id>[^/?#]+)(?:\.s?html)| # This is used by external sites like Weibo - (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf) + api/sinawebApi/outplay.php/(?P<token>.+?)\.swf ) ''' _TESTS = [ { - 'url': 'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', - 'md5': 'd65dd22ddcf44e38ce2bf58a10c3e71f', + 'url': 'http://video.sina.com.cn/news/spj/topvideoes20160504/?opsubject_id=top1#250576622', + 'md5': 'd38433e2fc886007729735650ae4b3e9', 'info_dict': { - 'id': '110028898', - 'ext': 'flv', - 'title': '《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + 'id': '250576622', + 'ext': 'mp4', + 'title': '现场:克鲁兹宣布退选 特朗普将稳获提名', } }, { @@ -35,37 +42,74 @@ class SinaIE(InfoExtractor): 'ext': 'flv', 'title': '军方提高对朝情报监视级别', }, + 'skip': 'the page does not exist or has been deleted', + }, + { + 'url': 
'http://video.sina.com.cn/view/250587748.html', + 'md5': '3d1807a25c775092aab3bc157fff49b4', + 'info_dict': { + 'id': '250587748', + 'ext': 'mp4', + 'title': '瞬间泪目:8年前汶川地震珍贵视频首曝光', + }, }, ] - def _extract_video(self, video_id): - data = compat_urllib_parse_urlencode({'vid': video_id}) - url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data, - video_id, 'Downloading video url') - image_page = self._download_webpage( - 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, - video_id, 'Downloading thumbnail info') - - return {'id': video_id, - 'url': url_doc.find('./durl/url').text, - 'ext': 'flv', - 'title': url_doc.find('./vname').text, - 'thumbnail': image_page.split('=')[1], - } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - if mobj.group('token') is not None: - # The video id is in the redirected url - self.to_screen('Getting video id') - request = sanitized_Request(url) - request.get_method = lambda: 'HEAD' - (_, urlh) = self._download_webpage_handle(request, 'NA', False) - return self._real_extract(urlh.geturl()) - elif video_id is None: - pseudo_id = mobj.group('pseudo_id') - webpage = self._download_webpage(url, pseudo_id) - video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, 'video id') - return self._extract_video(video_id) + video_id = mobj.group('video_id') + if not video_id: + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen('Getting video id') + request = HEADRequest(url) + (_, urlh) = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + else: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + error = get_element_by_attribute('class', 'errtitle', webpage) + if error: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, clean_html(error)), expected=True) + video_id = self._search_regex( + r"video_id\s*:\s*'(\d+)'", 
webpage, 'video id') + + video_data = self._download_json( + 'http://s.video.sina.com.cn/video/h5play', + video_id, query={'video_id': video_id}) + if video_data['code'] != 1: + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, video_data['message']), expected=True) + else: + video_data = video_data['data'] + title = video_data['title'] + description = video_data.get('description') + if description: + description = description.strip() + + preference = qualities(['cif', 'sd', 'hd', 'fhd', 'ffd']) + formats = [] + for quality_id, quality in video_data.get('videos', {}).get('mp4', {}).items(): + file_api = quality.get('file_api') + file_id = quality.get('file_id') + if not file_api or not file_id: + continue + formats.append({ + 'format_id': quality_id, + 'url': update_url_query(file_api, {'vid': file_id}), + 'preference': preference(quality_id), + 'ext': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(video_data.get('length')), + 'timestamp': int_or_none(video_data.get('create_time')), + 'formats': formats, + } From 134c6ea856be472f253bffbe99b88546fe417806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 May 2016 04:46:38 +0600 Subject: [PATCH 346/347] [YoutubeDL] Sanitize url for url and url_transparent extraction results --- youtube_dl/YoutubeDL.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 34eeb77c5..03a6a1890 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -720,6 +720,7 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') if result_type in ('url', 'url_transparent'): + ie_result['url'] = sanitize_url(ie_result['url']) extract_flat = self.params.get('extract_flat', False) if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or extract_flat is True): From 
b5abf8614898cc728488d7ecc7a55a4c5c92758f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Sat, 14 May 2016 04:53:14 +0600 Subject: [PATCH 347/347] [cinemassacre] Remove extractor (Closes #9457) It now uses jwplatform --- youtube_dl/extractor/cinemassacre.py | 119 --------------------------- youtube_dl/extractor/extractors.py | 1 - 2 files changed, 120 deletions(-) delete mode 100644 youtube_dl/extractor/cinemassacre.py diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py deleted file mode 100644 index 042c4f2f1..000000000 --- a/youtube_dl/extractor/cinemassacre.py +++ /dev/null @@ -1,119 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ExtractorError -from .screenwavemedia import ScreenwaveMediaIE - - -class CinemassacreIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?cinemassacre\.com/(?P<date_y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' - _TESTS = [ - { - 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - 'md5': 'fde81fbafaee331785f58cd6c0d46190', - 'info_dict': { - 'id': 'Cinemassacre-19911', - 'ext': 'mp4', - 'upload_date': '20121110', - 'title': '“Angry Video Game Nerd: The Movie” – Trailer', - 'description': 'md5:fb87405fcb42a331742a0dce2708560b', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - 'md5': 'd72f10cd39eac4215048f62ab477a511', - 'info_dict': { - 'id': 'Cinemassacre-521be8ef82b16', - 'ext': 'mp4', - 'upload_date': '20131002', - 'title': 'The Mummy’s Hand (1940)', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, - { - # Youtube embedded video - 'url': 'http://cinemassacre.com/2006/12/07/chronologically-confused-about-bad-movie-and-video-game-sequel-titles/', - 'md5': 'ec9838a5520ef5409b3e4e42fcb0a3b9', - 
'info_dict': { - 'id': 'OEVzPCY2T-g', - 'ext': 'webm', - 'title': 'AVGN: Chronologically Confused about Bad Movie and Video Game Sequel Titles', - 'upload_date': '20061207', - 'uploader': 'Cinemassacre', - 'uploader_id': 'JamesNintendoNerd', - 'description': 'md5:784734696c2b8b7f4b8625cc799e07f6', - } - }, - { - # Youtube embedded video - 'url': 'http://cinemassacre.com/2006/09/01/mckids/', - 'md5': '7393c4e0f54602ad110c793eb7a6513a', - 'info_dict': { - 'id': 'FnxsNhuikpo', - 'ext': 'webm', - 'upload_date': '20060901', - 'uploader': 'Cinemassacre Extra', - 'description': 'md5:de9b751efa9e45fbaafd9c8a1123ed53', - 'uploader_id': 'Cinemassacre', - 'title': 'AVGN: McKids', - } - }, - { - 'url': 'http://cinemassacre.com/2015/05/25/mario-kart-64-nintendo-64-james-mike-mondays/', - 'md5': '1376908e49572389e7b06251a53cdd08', - 'info_dict': { - 'id': 'Cinemassacre-555779690c440', - 'ext': 'mp4', - 'description': 'Let’s Play Mario Kart 64 !! Mario Kart 64 is a classic go-kart racing game released for the Nintendo 64 (N64). 
Today James & Mike do 4 player Battle Mode with Kyle and Bootsy!', - 'title': 'Mario Kart 64 (Nintendo 64) James & Mike Mondays', - 'upload_date': '20150525', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - } - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_date = mobj.group('date_y') + mobj.group('date_m') + mobj.group('date_d') - - webpage = self._download_webpage(url, display_id) - - playerdata_url = self._search_regex( - [ - ScreenwaveMediaIE.EMBED_PATTERN, - r'<iframe[^>]+src="(?P<url>(?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - ], - webpage, 'player data URL', default=None, group='url') - if not playerdata_url: - raise ExtractorError('Unable to find player data') - - video_title = self._html_search_regex( - r'<title>(?P<title>.+?)\|', webpage, 'title') - video_description = self._html_search_regex( - r'<div class="entry-content">(?P<description>.+?)</div>', - webpage, 'description', flags=re.DOTALL, fatal=False) - video_thumbnail = self._og_search_thumbnail(webpage) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'title': video_title, - 'description': video_description, - 'upload_date': video_date, - 'thumbnail': video_thumbnail, - 'url': playerdata_url, - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 50d2204f2..b6f4ccc5d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -124,7 +124,6 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cinemassacre import CinemassacreIE from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE