From 7be15d40976bf40f44bc47301d4e839a1e171e52 Mon Sep 17 00:00:00 2001 From: PeterDing Date: Fri, 29 Jul 2016 23:21:50 +0800 Subject: [PATCH 01/92] [bilibili] Support episodes [extractor/bilibili] add md5 for testing [extractor/bilibili] remove unnecessary headers [extractor/bilibili] correct _TESTS; find thumbnail for episode [extractor/bilibili] [Fix] restore removed tests --- youtube_dl/extractor/bilibili.py | 40 ++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index a332fbb69..35313c62b 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -12,9 +12,13 @@ from ..utils import ( unified_timestamp, ) +HEADERS = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', +} + class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P\d+)' + _VALID_URL = r'https?://(www.|bangumi.|)bilibili\.(?:tv|com)/(video/av|anime/v/)(?P\d+)' _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -77,6 +81,17 @@ class BiliBiliIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['upload time'], + }, { + 'url': 'http://bangumi.bilibili.com/anime/v/40068', + 'md5': '08d539a0884f3deb7b698fb13ba69696', + 'info_dict': { + 'id': '40068', + 'ext': 'mp4', + 'duration': 1402.357, + 'title': '混沌武士 : 第7集 四面楚歌 A Risky Racket', + 'description': "故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子”无幻,说以50个丸子帮她搞定这群人,风觉得他莫名其妙,也就没多搭理他。而在这时,风因为一个意外而将茶水泼在了恶霸头领——龙次郎身上。愤怒的恶霸们欲将风的手指砍掉,风在无奈中大喊道:“丸子100个!”……   另一方面,龙次郎的父亲也就是当地的代官,依仗自己有着雄厚的保镖实力,在当地欺压穷人,当看到一穷人无法交齐足够的钱过桥时,欲下令将其杀死,武士仁看不惯这一幕,于是走上前,与代官的保镖交手了……   酒馆内,因为风答应给无幻100个团子,无幻将恶霸们打败了,就在这时,仁进来了。好战的无幻立刻向仁发了战书,最后两败俱伤,被代官抓入牢房,预计第二天斩首……   得知该状况的风,为报救命之恩,来到了刑场,利用烟花救出了无幻和仁。而风则以救命恩人的身份,命令二人和她一起去寻找带着向日葵香味的武士……(by百科)", + 'thumbnail': 're:^http?://.+\.jpg', + }, }] _APP_KEY = '6f90a59ac58a4123' @@ 
-84,13 +99,20 @@ class BiliBiliIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - cid = compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters'))['cid'][0] + _is_episode = 'anime/v' in url + if not _is_episode: + cid = compat_parse_qs(self._search_regex( + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + webpage, 'player parameters'))['cid'][0] + else: + url_t = 'http://bangumi.bilibili.com/web_api/get_source' + js = self._download_json(url_t, video_id, + data='episode_id=%s' % video_id, + headers=HEADERS) + cid = js['result']['cid'] payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() @@ -125,6 +147,10 @@ class BiliBiliIE(InfoExtractor): description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( r']+datetime="([^"]+)"', webpage, 'upload time', fatal=False)) + if _is_episode: + thumbnail = self._html_search_meta('og:image', webpage) + else: + thumbnail = self._html_search_meta('thumbnailUrl', webpage) # TODO 'view_count' requires deobfuscating Javascript info = { @@ -132,7 +158,7 @@ class BiliBiliIE(InfoExtractor): 'title': title, 'description': description, 'timestamp': timestamp, - 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), + 'thumbnail': thumbnail, 'duration': float_or_none(video_info.get('timelength'), scale=1000), } From 2896dd73bc2c9844175258086c0300395722e5c9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Sep 2016 08:00:13 +0100 Subject: [PATCH 02/92] [cbs] extract once formats(closes #10515) --- youtube_dl/extractor/cbs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cbs.py 
b/youtube_dl/extractor/cbs.py index c72ed2dbb..3f4dea40c 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -51,7 +51,7 @@ class CBSIE(CBSBaseIE): path = 'dJ5BDC/media/guid/2198311517/' + guid smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid) - for r in ('HLS&formats=M3U', 'RTMP', 'WIFI', '3G'): + for r in ('OnceURL&formats=M3U', 'HLS&formats=M3U', 'RTMP', 'WIFI', '3G'): try: tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' % r.split('&')[0]) formats.extend(tp_formats) From 165c54e97d10705614934d5b1d86d90c06951b7c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 1 Sep 2016 16:28:03 +0800 Subject: [PATCH 03/92] =?UTF-8?q?[southpark.cc.com:espa=C3=B1ol]=20Skip=20?= =?UTF-8?q?geo-restricted=20=5FTESTS?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Breaks https://travis-ci.org/rg3/youtube-dl/jobs/156728175 --- youtube_dl/extractor/southpark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/southpark.py b/youtube_dl/extractor/southpark.py index a147f7db1..e2a9e45ac 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -35,6 +35,7 @@ class SouthParkEsIE(SouthParkIE): 'description': 'Cartman Consigue Una Sonda Anal', }, 'playlist_count': 4, + 'skip': 'Geo-restricted', }] From 746a695b362cb602625ed7357294bb18de133883 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 1 Sep 2016 16:42:35 +0800 Subject: [PATCH 04/92] [myvidster] Update _TESTS (closes #10473) --- youtube_dl/extractor/myvidster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/myvidster.py b/youtube_dl/extractor/myvidster.py index 731c24542..2117d302d 100644 --- a/youtube_dl/extractor/myvidster.py +++ b/youtube_dl/extractor/myvidster.py @@ -13,7 +13,7 @@ class 
MyVidsterIE(InfoExtractor): 'id': '3685814', 'title': 'md5:7d8427d6d02c4fbcef50fe269980c749', 'upload_date': '20141027', - 'uploader_id': 'utkualp', + 'uploader': 'utkualp', 'ext': 'mp4', 'age_limit': 18, }, From 05d4612947d6dbfaedb8f2a00daa5f29d85f73df Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 1 Sep 2016 16:58:16 +0800 Subject: [PATCH 05/92] [movingimage] Adapt to the new domain name and fix extraction Closes #10466 --- ChangeLog | 6 +++++ youtube_dl/extractor/extractors.py | 2 +- .../extractor/{ssa.py => movingimage.py} | 26 +++++++------------ 3 files changed, 17 insertions(+), 17 deletions(-) rename youtube_dl/extractor/{ssa.py => movingimage.py} (65%) diff --git a/ChangeLog b/ChangeLog index 0f8076d96..877e8112e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors +* [movingimage] Fix for the new site name (#10466) + + version 2016.08.31 Extractors diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 21efa96b2..8d0688f53 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -486,6 +486,7 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .movingimage import MovingImageIE from .msn import MSNIE from .mtv import ( MTVIE, @@ -806,7 +807,6 @@ from .srgssr import ( SRGSSRPlayIE, ) from .srmediathek import SRMediathekIE -from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE diff --git a/youtube_dl/extractor/ssa.py b/youtube_dl/extractor/movingimage.py similarity index 65% rename from youtube_dl/extractor/ssa.py rename to youtube_dl/extractor/movingimage.py index 54d1843f2..bb789c32e 100644 --- a/youtube_dl/extractor/ssa.py +++ b/youtube_dl/extractor/movingimage.py @@ -7,22 +7,19 @@ from ..utils import ( ) -class SSAIE(InfoExtractor): - _VALID_URL = 
r'https?://ssa\.nls\.uk/film/(?P\d+)' +class MovingImageIE(InfoExtractor): + _VALID_URL = r'https?://movingimage\.nls\.uk/film/(?P\d+)' _TEST = { - 'url': 'http://ssa.nls.uk/film/3561', + 'url': 'http://movingimage.nls.uk/film/3561', + 'md5': '4caa05c2b38453e6f862197571a7be2f', 'info_dict': { 'id': '3561', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'SHETLAND WOOL', 'description': 'md5:c5afca6871ad59b4271e7704fe50ab04', 'duration': 900, 'thumbnail': 're:^https?://.*\.jpg$', }, - 'params': { - # rtmp download - 'skip_download': True, - }, } def _real_extract(self, url): @@ -30,10 +27,9 @@ class SSAIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - streamer = self._search_regex( - r"'streamer'\s*,\S*'(rtmp[^']+)'", webpage, 'streamer') - play_path = self._search_regex( - r"'file'\s*,\s*'([^']+)'", webpage, 'file').rpartition('.')[0] + formats = self._extract_m3u8_formats( + self._html_search_regex(r'file\s*:\s*"([^"]+)"', webpage, 'm3u8 manifest URL'), + video_id, ext='mp4', entry_protocol='m3u8_native') def search_field(field_name, fatal=False): return self._search_regex( @@ -44,13 +40,11 @@ class SSAIE(InfoExtractor): description = unescapeHTML(search_field('Description')) duration = parse_duration(search_field('Running time')) thumbnail = self._search_regex( - r"'image'\s*,\s*'([^']+)'", webpage, 'thumbnails', fatal=False) + r"image\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False) return { 'id': video_id, - 'url': streamer, - 'play_path': play_path, - 'ext': 'flv', + 'formats': formats, 'title': title, 'description': description, 'duration': duration, From 4c8ab6fd715249290feab89bbc86eb803b459993 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 1 Sep 2016 17:04:41 +0800 Subject: [PATCH 06/92] [thvideo] Remove extractor. Website down. Closes #10464 According to a screenshot in http://tieba.baidu.com/p/4691302183, thvideo.tv is shut down "temporarily". I see no clues that it will be up again, so I remove it here. 
--- ChangeLog | 1 + youtube_dl/extractor/extractors.py | 4 -- youtube_dl/extractor/thvideo.py | 84 ------------------------------ 3 files changed, 1 insertion(+), 88 deletions(-) delete mode 100644 youtube_dl/extractor/thvideo.py diff --git a/ChangeLog b/ChangeLog index 877e8112e..2e75c003d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +- [thvideo] Remove extractor (#10464) * [movingimage] Fix for the new site name (#10466) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8d0688f53..459d776b3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -869,10 +869,6 @@ from .tnaflix import ( MovieFapIE, ) from .toggle import ToggleIE -from .thvideo import ( - THVideoIE, - THVideoPlaylistIE -) from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE diff --git a/youtube_dl/extractor/thvideo.py b/youtube_dl/extractor/thvideo.py deleted file mode 100644 index 406f4a826..000000000 --- a/youtube_dl/extractor/thvideo.py +++ /dev/null @@ -1,84 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - unified_strdate -) - - -class THVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?thvideo\.tv/(?:v/th|mobile\.php\?cid=)(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/v/th1987/', - 'md5': 'fa107b1f73817e325e9433505a70db50', - 'info_dict': { - 'id': '1987', - 'ext': 'mp4', - 'title': '【动画】秘封活动记录 ~ The Sealed Esoteric History.分镜稿预览', - 'display_id': 'th1987', - 'thumbnail': 'http://thvideo.tv/uploadfile/2014/0722/20140722013459856.jpg', - 'description': '社团京都幻想剧团的第一个东方二次同人动画作品「秘封活动记录 ~ The Sealed Esoteric History.」 本视频是该动画第一期的分镜草稿...', - 'upload_date': '20140722' - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # extract download link from mobile player page - webpage_player = 
self._download_webpage( - 'http://thvideo.tv/mobile.php?cid=%s-0' % (video_id), - video_id, note='Downloading video source page') - video_url = self._html_search_regex( - r'', webpage, - 'upload date', fatal=False)) - - return { - 'id': video_id, - 'ext': 'mp4', - 'url': video_url, - 'title': title, - 'display_id': display_id, - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date - } - - -class THVideoPlaylistIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?thvideo\.tv/mylist(?P[0-9]+)' - _TEST = { - 'url': 'http://thvideo.tv/mylist2', - 'info_dict': { - 'id': '2', - 'title': '幻想万華鏡', - }, - 'playlist_mincount': 23, - } - - def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - list_title = self._html_search_regex( - r'

(.*?)\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/video/2', + 'md5': 'a0074c190e6cddaf86900b28d3e9ee7a', + 'info_dict': { + 'id': '2', + 'ext': 'mp4', + 'title': 'How Did You Develop The Internet?', + 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'timestamp': 1448388615, + 'upload_date': '20151124', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('media/' + video_id, video_id) + return self._extract_media_info(media) + + +class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): + IE_NAME = 'curiositystream:collection' + _VALID_URL = r'https?://app\.curiositystream\.com/collection/(?P\d+)' + _TEST = { + 'url': 'https://app.curiositystream.com/collection/2', + 'info_dict': { + 'id': '2', + 'title': 'Curious Minds: The Internet', + 'description': 'How is the internet shaping our lives in the 21st Century?', + }, + 'playlist_mincount': 17, + } + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api( + 'collections/' + collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + entries.append(self._extract_media_info(media)) + return self.playlist_result( + entries, collection_id, + collection.get('title'), collection.get('description')) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 459d776b3..0c2436b67 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -194,6 +194,10 @@ from .ctsnews import CtsNewsIE from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionIE, +) from .cwtv import CWTVIE from .dailymail import DailyMailIE from .dailymotion import ( From 9250181f37cf0289c02d18ab91203c6181f9cc71 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 21:37:25 +0700 Subject: [PATCH 08/92] [extractor/common] Restore NAME usage from EXT-X-MEDIA tag for formats codes in _extract_m3u8_formats (Closes #10522) --- youtube_dl/extractor/common.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index da0af29ec..36d43fd50 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1202,6 +1202,7 @@ class InfoExtractor(object): 'preference': preference, }] last_info = None + last_media = None for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_info = parse_m3u8_attributes(line) @@ -1224,6 +1225,10 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }) + else: + # When there is no URI in EXT-X-MEDIA let this tag's + # data be used by regular URI lines below + last_media = media elif line.startswith('#') or not line.strip(): continue else: @@ -1234,13 +1239,14 @@ class InfoExtractor(object): format_id = [] if m3u8_id: format_id.append(m3u8_id) + last_media_name = last_media.get('NAME') if last_media else None + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF it still sometimes may be present + stream_name = last_info.get('NAME') or last_media_name # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. 
if not live: - # Despite specification does not mention NAME attribute for - # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), From e816c9d158629ef054c1cc77eecf83043d06fe8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 22:18:16 +0700 Subject: [PATCH 09/92] [extractor/common] Simplify _extract_m3u8_formats --- youtube_dl/extractor/common.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 36d43fd50..a9c7a8d16 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1201,8 +1201,8 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] - last_info = None - last_media = None + last_info = {} + last_media = {} for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_info = parse_m3u8_attributes(line) @@ -1232,17 +1232,13 @@ class InfoExtractor(object): elif line.startswith('#') or not line.strip(): continue else: - if last_info is None: - formats.append({'url': format_url(line)}) - continue tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media else None # Despite specification does not mention NAME attribute for # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') or last_media_name + stream_name = last_info.get('NAME') or last_media.get('NAME') # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. 
@@ -1275,6 +1271,7 @@ class InfoExtractor(object): f.update(parse_codecs(last_info.get('CODECS'))) formats.append(f) last_info = {} + last_media = {} return formats @staticmethod From f6af0f888b03e8c072b86c04492cc84c966c9f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 23:15:01 +0700 Subject: [PATCH 10/92] [youporn] Fix categories and tags extraction (Closes #10521) --- youtube_dl/extractor/youporn.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 0df2d76ee..0265a64a7 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -35,7 +35,7 @@ class YouPornIE(InfoExtractor): 'age_limit': 18, }, }, { - # Anonymous User uploader + # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', 'info_dict': { 'id': '561726', @@ -44,7 +44,7 @@ class YouPornIE(InfoExtractor): 'title': 'Big Tits Awesome Brunette On amazing webcam show', 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Anonymous User', + 'uploader': 'Unknown', 'upload_date': '20111125', 'average_rating': int, 'view_count': int, @@ -140,17 +140,17 @@ class YouPornIE(InfoExtractor): r'>All [Cc]omments? 
\(([\d,.]+)\)', webpage, 'comment count', fatal=False)) - def extract_tag_box(title): - tag_box = self._search_regex( - (r']+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?\s*' - ']+class=["\']tagBoxContent["\']>(.+?)') % re.escape(title), - webpage, '%s tag box' % title, default=None) + def extract_tag_box(regex, title): + tag_box = self._search_regex(regex, webpage, title, default=None) if not tag_box: return [] return re.findall(r']+href=[^>]+>([^<]+)', tag_box) - categories = extract_tag_box('Category') - tags = extract_tag_box('Tags') + categories = extract_tag_box( + r'(?s)Categories:.*?]+>(.+?)', 'categories') + tags = extract_tag_box( + r'(?s)Tags:.*?\s*]+class=["\']tagBoxContent["\'][^>]*>(.+?)', + 'tags') return { 'id': video_id, From 8fb6af6bba201c9f750aadb7b092704195c7f8e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 23:32:28 +0700 Subject: [PATCH 11/92] [exfm] Remove extractor (Closes #10482) --- youtube_dl/extractor/exfm.py | 58 ------------------------------ youtube_dl/extractor/extractors.py | 1 - 2 files changed, 59 deletions(-) delete mode 100644 youtube_dl/extractor/exfm.py diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py deleted file mode 100644 index 09ed4f2b5..000000000 --- a/youtube_dl/extractor/exfm.py +++ /dev/null @@ -1,58 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class ExfmIE(InfoExtractor): - IE_NAME = 'exfm' - IE_DESC = 'ex.fm' - _VALID_URL = r'https?://(?:www\.)?ex\.fm/song/(?P[^/]+)' - _SOUNDCLOUD_URL = r'http://(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream' - _TESTS = [ - { - 'url': 'http://ex.fm/song/eh359', - 'md5': 'e45513df5631e6d760970b14cc0c11e7', - 'info_dict': { - 'id': '44216187', - 'ext': 'mp3', - 'title': 'Test House "Love Is Not Enough" (Extended Mix) DeadJournalist Exclusive', - 'uploader': 'deadjournalist', - 'upload_date': '20120424', - 'description': 'Test House \"Love Is Not Enough\" 
(Extended Mix) DeadJournalist Exclusive', - }, - 'note': 'Soundcloud song', - 'skip': 'The site is down too often', - }, - { - 'url': 'http://ex.fm/song/wddt8', - 'md5': '966bd70741ac5b8570d8e45bfaed3643', - 'info_dict': { - 'id': 'wddt8', - 'ext': 'mp3', - 'title': 'Safe and Sound', - 'uploader': 'Capital Cities', - }, - 'skip': 'The site is down too often', - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - song_id = mobj.group('id') - info_url = 'http://ex.fm/api/v3/song/%s' % song_id - info = self._download_json(info_url, song_id)['song'] - song_url = info['url'] - if re.match(self._SOUNDCLOUD_URL, song_url) is not None: - self.to_screen('Soundcloud song detected') - return self.url_result(song_url.replace('/stream', ''), 'Soundcloud') - return { - 'id': song_id, - 'url': song_url, - 'ext': 'mp3', - 'title': info['title'], - 'thumbnail': info['image']['large'], - 'uploader': info['artist'], - 'view_count': info['loved_count'], - } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0c2436b67..7b59d5db2 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -261,7 +261,6 @@ from .espn import ESPNIE from .esri import EsriVideoIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE -from .exfm import ExfmIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE From af95ee94b4554449db175ae44060a66c89bd96ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 23:38:49 +0700 Subject: [PATCH 12/92] [glide] Fix extraction (Closes #10478) --- youtube_dl/extractor/glide.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 62ff84835..50f698803 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -14,10 +14,8 @@ class 
GlideIE(InfoExtractor): 'info_dict': { 'id': 'UZF8zlmuQbe4mr+7dCiQ0w==', 'ext': 'mp4', - 'title': 'Damon Timm\'s Glide message', + 'title': "Damon's Glide message", 'thumbnail': 're:^https?://.*?\.cloudfront\.net/.*\.jpg$', - 'uploader': 'Damon Timm', - 'upload_date': '20140919', } } @@ -27,7 +25,8 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'(.+?)', webpage, 'title') + r'(.+?)', webpage, + 'title', default=None) or self._og_search_title(webpage) video_url = self._proto_relative_url(self._search_regex( r']+src=(["\'])(?P.+?)\1', webpage, 'video URL', default=None, @@ -36,18 +35,10 @@ class GlideIE(InfoExtractor): r']+id=["\']video-thumbnail["\'][^>]+src=(["\'])(?P.+?)\1', webpage, 'thumbnail url', default=None, group='url')) or self._og_search_thumbnail(webpage) - uploader = self._search_regex( - r']+class=["\']info-name["\'][^>]*>([^<]+)', - webpage, 'uploader', fatal=False) - upload_date = unified_strdate(self._search_regex( - r']+class="info-date"[^>]*>([^<]+)', - webpage, 'upload date', fatal=False)) return { 'id': video_id, 'title': title, 'url': video_url, 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, } From 8276d3b87a54f43ca2f47b7709a6557ea979327c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 23:46:15 +0700 Subject: [PATCH 13/92] [thestar] Fix extraction (Closes #10465) --- youtube_dl/extractor/thestar.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/thestar.py b/youtube_dl/extractor/thestar.py index ba1380abc..c3f118894 100644 --- a/youtube_dl/extractor/thestar.py +++ b/youtube_dl/extractor/thestar.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .brightcove import BrightcoveLegacyIE -from ..compat import compat_parse_qs class TheStarIE(InfoExtractor): @@ -30,6 +28,9 @@ class TheStarIE(InfoExtractor): def 
_real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + brightcove_id = self._search_regex( + r'mainartBrightcoveVideoId["\']?\s*:\s*["\']?(\d+)', + webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'BrightcoveNew', brightcove_id) From f97ec8bcb95b45d9a657392cd24eabfadb4053e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 1 Sep 2016 23:46:58 +0700 Subject: [PATCH 14/92] [glide] Remove unused import --- youtube_dl/extractor/glide.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/glide.py b/youtube_dl/extractor/glide.py index 50f698803..f0d951396 100644 --- a/youtube_dl/extractor/glide.py +++ b/youtube_dl/extractor/glide.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate class GlideIE(InfoExtractor): From 4191779dcda8a80faf6e53579e011b63ee5c3878 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Sep 2016 19:07:41 +0100 Subject: [PATCH 15/92] [nytimes] improve extraction --- youtube_dl/extractor/nytimes.py | 93 +++++++++++++++++++++++---------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/youtube_dl/extractor/nytimes.py b/youtube_dl/extractor/nytimes.py index 681683e86..142c34256 100644 --- a/youtube_dl/extractor/nytimes.py +++ b/youtube_dl/extractor/nytimes.py @@ -1,26 +1,37 @@ from __future__ import unicode_literals +import hmac +import hashlib +import base64 + from .common import InfoExtractor from ..utils import ( float_or_none, int_or_none, parse_iso8601, + mimetype2ext, + determine_ext, ) class NYTimesBaseIE(InfoExtractor): + _SECRET = 
b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v' + def _extract_video_from_id(self, video_id): - video_data = self._download_json( - 'http://www.nytimes.com/svc/video/api/v2/video/%s' % video_id, - video_id, 'Downloading video JSON') + # Authorization generation algorithm is reverse engineered from `signer` in + # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js + path = '/svc/video/api/v3/video/' + video_id + hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest() + video_data = self._download_json('http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={ + 'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(), + 'X-NYTV': 'vhs', + }, fatal=False) + if not video_data: + video_data = self._download_json( + 'http://www.nytimes.com/svc/video/api/v2/video/' + video_id, + video_id, 'Downloading video JSON') title = video_data['headline'] - description = video_data.get('summary') - duration = float_or_none(video_data.get('duration'), 1000) - - uploader = video_data.get('byline') - publication_date = video_data.get('publication_date') - timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None def get_file_size(file_size): if isinstance(file_size, int): @@ -28,35 +39,59 @@ class NYTimesBaseIE(InfoExtractor): elif isinstance(file_size, dict): return int(file_size.get('value', 0)) else: - return 0 + return None - formats = [ - { - 'url': video['url'], - 'format_id': video.get('type'), - 'vcodec': video.get('video_codec'), - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - 'filesize': get_file_size(video.get('fileSize')), - } for video in video_data['renditions'] if video.get('url') - ] + urls = [] + formats = [] + for video in video_data.get('renditions', []): + video_url = video.get('url') + format_id = video.get('type') + if not video_url or format_id == 'thumbs' or video_url in urls: + continue + urls.append(video_url) + ext = 
mimetype2ext(video.get('mimetype')) or determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id or 'hls', fatal=False)) + elif ext == 'mpd': + continue + # formats.extend(self._extract_mpd_formats( + # video_url, video_id, format_id or 'dash', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'vcodec': video.get('videoencoding') or video.get('video_codec'), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': get_file_size(video.get('file_size') or video.get('fileSize')), + 'tbr': int_or_none(video.get('bitrate'), 1000), + 'ext': ext, + }) self._sort_formats(formats) - thumbnails = [ - { - 'url': 'http://www.nytimes.com/%s' % image['url'], + thumbnails = [] + for image in video_data.get('images', []): + image_url = image.get('url') + if not image_url: + continue + thumbnails.append({ + 'url': 'http://www.nytimes.com/' + image_url, 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data.get('images', []) if image.get('url') - ] + }) + + publication_date = video_data.get('publication_date') + timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None return { 'id': video_id, 'title': title, - 'description': description, + 'description': video_data.get('summary'), 'timestamp': timestamp, - 'uploader': uploader, - 'duration': duration, + 'uploader': video_data.get('byline'), + 'duration': float_or_none(video_data.get('duration'), 1000), 'formats': formats, 'thumbnails': thumbnails, } @@ -67,7 +102,7 @@ class NYTimesIE(NYTimesBaseIE): _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', - 'md5': '18a525a510f942ada2720db5f31644c0', + 'md5': 'd665342765db043f7e225cff19df0f2d', 'info_dict': { 'id': '100000002847155', 'ext': 
'mov', From b207d5ebd4eab80e07673aba9696d240d1009bcf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Sep 2016 19:46:58 +0100 Subject: [PATCH 16/92] [curiositystream] don't cache auth token --- youtube_dl/extractor/curiositystream.py | 28 +++++++++---------------- 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 7105e3c4c..e3c99468c 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -33,24 +33,16 @@ class CuriosityStreamBaseIE(InfoExtractor): return result['data'] def _real_initialize(self): - if not self._auth_token: - user = self._downloader.cache.load('curiositystream', 'user') or {} - self._auth_token = user.get('auth_token') - if not self._auth_token: - (email, password) = self._get_login_info() - if email is None: - return - result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ - 'email': email, - 'password': password, - })) - self._handle_errors(result) - self._auth_token = result['message']['auth_token'] - self._downloader.cache.store( - 'curiositystream', 'user', { - 'auth_token': self._auth_token, - }) + (email, password) = self._get_login_info() + if email is None: + return + result = self._download_json( + self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'email': email, + 'password': password, + })) + self._handle_errors(result) + self._auth_token = result['message']['auth_token'] def _extract_media_info(self, media): video_id = compat_str(media['id']) From 6150502e4709b6b2ebc226c9c38fa346b9358699 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 1 Sep 2016 22:14:40 +0100 Subject: [PATCH 17/92] [adobepass] check for authz_token expiration(#10527) --- youtube_dl/extractor/adobepass.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 
9e3a3e362..68ec37e00 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -37,6 +37,10 @@ class AdobePassIE(InfoExtractor): return self._search_regex( '<%s>(.+?)' % (tag, tag), xml_str, tag) + def is_expired(token, date_ele): + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) + return token_expires and token_expires <= int(time.time()) + mvpd_headers = { 'ap_42': 'anonymous', 'ap_11': 'Linux i686', @@ -47,11 +51,8 @@ class AdobePassIE(InfoExtractor): guid = xml_text(resource, 'guid') requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} authn_token = requestor_info.get('authn_token') - if authn_token: - token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(authn_token, 'simpleTokenExpires'))) - if token_expires and token_expires <= int(time.time()): - authn_token = None - requestor_info = {} + if authn_token and is_expired(authn_token, 'simpleTokenExpires'): + authn_token = None if not authn_token: # TODO add support for other TV Providers mso_id = 'DTV' @@ -98,6 +99,8 @@ class AdobePassIE(InfoExtractor): self._downloader.cache.store('mvpd', requestor_id, requestor_info) authz_token = requestor_info.get(guid) + if authz_token and is_expired(authz_token, 'simpleTokenTTL'): + authz_token = None if not authz_token: authorize = self._download_webpage( self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, From 2c3e0af93e00d7e2e20283be12541aaebabfa1bf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 2 Sep 2016 09:53:04 +0100 Subject: [PATCH 18/92] [go] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/go.py | 101 +++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 youtube_dl/extractor/go.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7b59d5db2..2bcd5a0cd 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ 
-318,6 +318,7 @@ from .globo import ( GloboIE, GloboArticleIE, ) +from .go import GoIE from .godtube import GodTubeIE from .godtv import GodTVIE from .golem import GolemIE diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py new file mode 100644 index 000000000..6a437c54d --- /dev/null +++ b/youtube_dl/extractor/go.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, + parse_age_limit, +) + + +class GoIE(InfoExtractor): + _BRANDS = { + 'abc': '001', + 'freeform': '002', + 'watchdisneychannel': '004', + 'watchdisneyjunior': '008', + 'watchdisneyxd': '009', + } + _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/.*?vdka(?P\w+)' % '|'.join(_BRANDS.keys()) + _TESTS = [{ + 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', + 'info_dict': { + 'id': '0_g86w5onx', + 'ext': 'mp4', + 'title': 'Sneak Peek: Language Arts', + 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', + 'only_matching': True, + }] + + def _real_extract(self, url): + sub_domain, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (self._BRANDS[sub_domain], video_id), + video_id)['video'][0] + title = video_data['title'] + + formats = [] + for asset in video_data.get('assets', {}).get('asset', []): + asset_url = asset.get('value') + if not asset_url: + continue + format_id = asset.get('format') + ext = determine_ext(asset_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) + else: + formats.append({ + 'format_id': format_id, + 'url': asset_url, + 'ext': ext, + }) + 
self._sort_formats(formats) + + subtitles = {} + for cc in video_data.get('closedcaption', {}).get('src', []): + cc_url = cc.get('value') + if not cc_url: + continue + ext = determine_ext(cc_url) + if ext == 'xml': + ext = 'ttml' + subtitles.setdefault(cc.get('lang'), []).append({ + 'url': cc_url, + 'ext': ext, + }) + + thumbnails = [] + for thumbnail in video_data.get('thumbnails', {}).get('thumbnail', []): + thumbnail_url = thumbnail.get('value') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('longdescription') or video_data.get('description'), + 'duration': int_or_none(video_data.get('duration', {}).get('value'), 1000), + 'age_limit': parse_age_limit(video_data.get('tvrating', {}).get('rating')), + 'episode_number': int_or_none(video_data.get('episodenumber')), + 'series': video_data.get('show', {}).get('title'), + 'season_number': int_or_none(video_data.get('season', {}).get('num')), + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } From 349fc5c705d6b81ae53d698972f40b1125bee13e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Sep 2016 21:13:50 +0700 Subject: [PATCH 19/92] [facebook:plugins:video] Add extractor (Closes #10530) --- youtube_dl/extractor/extractors.py | 5 ++++- youtube_dl/extractor/facebook.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2bcd5a0cd..bc616223e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -264,7 +264,10 @@ from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE -from .facebook import FacebookIE +from 
.facebook import ( + FacebookIE, + FacebookPluginsVideoIE, +) from .faz import FazIE from .fc2 import FC2IE from .fczenit import FczenitIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 228b0b6d7..3a220e995 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -351,3 +351,32 @@ class FacebookIE(InfoExtractor): self._VIDEO_PAGE_TEMPLATE % video_id, video_id, fatal_if_no_video=True) return info_dict + + +class FacebookPluginsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)' + + _TESTS = [{ + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', + 'md5': '5954e92cdfe51fe5782ae9bda7058a07', + 'info_dict': { + 'id': '10154383743583686', + 'ext': 'mp4', + 'title': 'What to do during the haze?', + 'uploader': 'Gov.sg', + 'upload_date': '20160826', + 'timestamp': 1472184808, + }, + 'add_ie': [FacebookIE.ie_key()], + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + compat_urllib_parse_unquote(self._match_id(url)), + FacebookIE.ie_key()) From 5e9e3d0f6bf2055c557f360758d6d7eb146edcba Mon Sep 17 00:00:00 2001 From: Sebastian Blunt Date: Fri, 2 Sep 2016 14:48:56 +0200 Subject: [PATCH 20/92] [drtv] Add support for dr.dk/nyheder It's the same video player, the only difference is that the video player is loaded differently, and certain metadata (title and description) is not available under dr.dk/mu, so make it by default get that from some of the html meta tags. 
Skip the dr.dk/tv test dr.dk/tv videos are only available for between 7 and 90 days due to Danish law, and in certain cases may be readded. Skip this test as it is no longer available. --- youtube_dl/extractor/drtv.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 2d74ff855..e210cb610 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -5,13 +5,14 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, parse_iso8601, + remove_end, ) class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' - _TEST = { + _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/panisk-paske/panisk-paske-5', 'md5': 'dc515a9ab50577fa14cc4e4b0265168f', 'info_dict': { @@ -23,7 +24,20 @@ class DRTVIE(InfoExtractor): 'upload_date': '20150322', 'duration': 1455, }, - } + 'skip': 'Video is no longer available', + }, { + 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', + 'md5': '2ada5074f9e79afc0d324a8e9784d850', + 'info_dict': { + 'id': 'christiania-pusher-street-ryddes-drdkrjpo', + 'ext': 'mp4', + 'title': 'LIVE Christianias rydning af Pusher Street er i gang', + 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'timestamp': 1472800279, + 'upload_date': '20160902', + 'duration': 131.4, + } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +49,8 @@ class DRTVIE(InfoExtractor): 'Video %s is not available' % video_id, expected=True) video_id = self._search_regex( - r'data-(?:material-identifier|episode-slug)="([^"]+)"', + (r'data-(?:material-identifier|episode-slug)="([^"]+)"', + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), webpage, 'video id') 
programcard = self._download_json( @@ -43,8 +58,9 @@ class DRTVIE(InfoExtractor): video_id, 'Downloading video JSON') data = programcard['Data'][0] - title = data['Title'] - description = data['Description'] + title = remove_end(self._og_search_title(webpage), ' | TV | DR') or data['Title'] + description = self._og_search_description(webpage) or data['Description'] + timestamp = parse_iso8601(data['CreatedTime']) thumbnail = None From 6562d34a8cbdb93de77a8042f7409ebe31e3e3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Sep 2016 22:57:48 +0700 Subject: [PATCH 21/92] [utils] Improve mimetype2ext --- test/test_utils.py | 9 +++++++++ youtube_dl/utils.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index d16ea7f77..405c5d351 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -39,6 +39,7 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + mimetype2ext, ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, @@ -625,6 +626,14 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' 
in limit_length('foo bar baz asd', 12)) + def test_mimetype2ext(self): + self.assertEqual(mimetype2ext(None), None) + self.assertEqual(mimetype2ext('video/x-flv'), 'flv') + self.assertEqual(mimetype2ext('application/x-mpegURL'), 'm3u8') + self.assertEqual(mimetype2ext('text/vtt'), 'vtt') + self.assertEqual(mimetype2ext('text/vtt;charset=utf-8'), 'vtt') + self.assertEqual(mimetype2ext('text/html; charset=utf-8'), 'html') + def test_parse_codecs(self): self.assertEqual(parse_codecs(''), {}) self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1091f17f3..904f23fd7 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2148,7 +2148,7 @@ def mimetype2ext(mt): return ext _, _, res = mt.rpartition('/') - res = res.lower() + res = res.split(';')[0].strip().lower() return { '3gpp': '3gp', From 6066d03db02b9c545435b2b8faffe2e0f6c66702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Sep 2016 23:02:15 +0700 Subject: [PATCH 22/92] [drtv] Modernize and make more robust --- youtube_dl/extractor/drtv.py | 53 ++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e210cb610..7122449a3 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -4,6 +4,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, + float_or_none, + mimetype2ext, parse_iso8601, remove_end, ) @@ -58,10 +61,12 @@ class DRTVIE(InfoExtractor): video_id, 'Downloading video JSON') data = programcard['Data'][0] - title = remove_end(self._og_search_title(webpage), ' | TV | DR') or data['Title'] - description = self._og_search_description(webpage) or data['Description'] + title = remove_end(self._og_search_title( + webpage, default=None), ' | TV | DR') or data['Title'] + description = self._og_search_description( 
+ webpage, default=None) or data.get('Description') - timestamp = parse_iso8601(data['CreatedTime']) + timestamp = parse_iso8601(data.get('CreatedTime')) thumbnail = None duration = None @@ -72,16 +77,18 @@ class DRTVIE(InfoExtractor): subtitles = {} for asset in data['Assets']: - if asset['Kind'] == 'Image': - thumbnail = asset['Uri'] - elif asset['Kind'] == 'VideoResource': - duration = asset['DurationInMilliseconds'] / 1000.0 - restricted_to_denmark = asset['RestrictedToDenmark'] - spoken_subtitles = asset['Target'] == 'SpokenSubtitles' - for link in asset['Links']: - uri = link['Uri'] - target = link['Target'] - format_id = target + if asset.get('Kind') == 'Image': + thumbnail = asset.get('Uri') + elif asset.get('Kind') == 'VideoResource': + duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) + restricted_to_denmark = asset.get('RestrictedToDenmark') + spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + for link in asset.get('Links', []): + uri = link.get('Uri') + if not uri: + continue + target = link.get('Target') + format_id = target or '' preference = None if spoken_subtitles: preference = -1 @@ -92,8 +99,8 @@ class DRTVIE(InfoExtractor): video_id, preference, f4m_id=format_id)) elif target == 'HLS': formats.extend(self._extract_m3u8_formats( - uri, video_id, 'mp4', preference=preference, - m3u8_id=format_id)) + uri, video_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, m3u8_id=format_id)) else: bitrate = link.get('Bitrate') if bitrate: @@ -101,7 +108,7 @@ class DRTVIE(InfoExtractor): formats.append({ 'url': uri, 'format_id': format_id, - 'tbr': bitrate, + 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), }) subtitles_list = asset.get('SubtitlesList') @@ -110,12 +117,18 @@ class DRTVIE(InfoExtractor): 'Danish': 'da', } for subs in subtitles_list: - lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] + if not subs.get('Uri'): + continue + lang = 
subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': subs['Uri'], + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) if not formats and restricted_to_denmark: - raise ExtractorError( - 'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + self.raise_geo_restricted( + 'Unfortunately, DR is not allowed to show this program outside Denmark.', + expected=True) self._sort_formats(formats) From dacb3a864a8c89edb312cd28c3de1605a5467d0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Sep 2016 23:43:20 +0700 Subject: [PATCH 23/92] [youtube:playlist] Fallback to video extraction for video/playlist URLs when playlist is broken (Closes #10537) --- youtube_dl/extractor/youtube.py | 56 +++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d5d5b7334..ea98fbf69 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1841,6 +1841,28 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincout': 21, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [YoutubeIE.ie_key()], }] def _real_initialize(self): @@ 
-1901,9 +1923,20 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): playlist_title = self._html_search_regex( r'(?s)

<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>

', - page, 'title') + page, 'title', default=None) - return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) + has_videos = True + + if not playlist_title: + try: + # Some playlist URLs don't actually serve a playlist (e.g. + # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) + next(self._entries(page, playlist_id)) + except StopIteration: + has_videos = False + + return has_videos, self.playlist_result( + self._entries(page, playlist_id), playlist_id, playlist_title) def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL @@ -1912,9 +1945,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) + return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + return video_id, None + return None, None def _real_extract(self, url): # Extract playlist id @@ -1923,7 +1958,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) - video = self._check_download_just_video(url, playlist_id) + video_id, video = self._check_download_just_video(url, playlist_id) if video: return video @@ -1931,7 +1966,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - return self._extract_playlist(playlist_id) + has_videos, playlist = self._extract_playlist(playlist_id) + if has_videos or not video_id: + return playlist + + # Some playlist URLs don't actually serve a playlist (see + # https://github.com/rg3/youtube-dl/issues/10537). 
+ # Fallback to plain video extraction if there is a video id + # along with playlist id. + return self.url_result(video_id, 'Youtube', video_id=video_id) class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @@ -2312,7 +2355,8 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE): video = self._check_download_just_video(url, 'WL') if video: return video - return self._extract_playlist('WL') + _, playlist = self._extract_playlist('WL') + return playlist class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): From c2b2c7e1386056698ee1b0de5427ea90abf8e9c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Sep 2016 23:50:42 +0700 Subject: [PATCH 24/92] [utils] Add quicktime to mimetype2ext --- youtube_dl/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 904f23fd7..ed199c4ad 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2168,6 +2168,7 @@ def mimetype2ext(mt): 'f4m+xml': 'f4m', 'hds+xml': 'f4m', 'vnd.ms-sstr+xml': 'ism', + 'quicktime': 'mov', }.get(res, res) From 3fcce30289a475901728af7a8dbe85304105b8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 2 Sep 2016 23:53:17 +0700 Subject: [PATCH 25/92] [drtv] Update tests --- youtube_dl/extractor/drtv.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 7122449a3..88d096b30 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -16,21 +16,23 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' _TESTS = [{ - 'url': 'https://www.dr.dk/tv/se/boern/ultra/panisk-paske/panisk-paske-5', - 'md5': 'dc515a9ab50577fa14cc4e4b0265168f', + 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', + 'md5': '25e659cccc9a2ed956110a299fdf5983', 'info_dict': { - 'id': 'panisk-paske-5', + 'id': 
'klassen-darlig-taber-10', 'ext': 'mp4', - 'title': 'Panisk Påske (5)', - 'description': 'md5:ca14173c5ab24cd26b0fcc074dff391c', - 'timestamp': 1426984612, - 'upload_date': '20150322', - 'duration': 1455, + 'title': 'Klassen - Dårlig taber (10)', + 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', + 'timestamp': 1471991907, + 'upload_date': '20160823', + 'duration': 606.84, + }, + 'params': { + 'skip_download': True, }, - 'skip': 'Video is no longer available', }, { 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'md5': '2ada5074f9e79afc0d324a8e9784d850', + 'md5': '2c37175c718155930f939ef59952474a', 'info_dict': { 'id': 'christiania-pusher-street-ryddes-drdkrjpo', 'ext': 'mp4', @@ -39,7 +41,7 @@ class DRTVIE(InfoExtractor): 'timestamp': 1472800279, 'upload_date': '20160902', 'duration': 131.4, - } + }, }] def _real_extract(self, url): From 6496ccb41398971373a2f7162a0684dd12f0b56e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 3 Sep 2016 01:17:15 +0700 Subject: [PATCH 26/92] [youtube] Add support for rental videos' previews (Closes #10532) --- youtube_dl/extractor/youtube.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ea98fbf69..4c8edef8d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -844,6 +844,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059) 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', 'only_matching': True, + }, + { + # Rental video preview + 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', + 'info_dict': { + 'id': 'uGpuVWrhIzE', + 'ext': 'mp4', + 'title': 'Piku - Trailer', + 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', + 'upload_date': '20150811', + 'uploader': 'FlixMatrix', + 'uploader_id': 'FlixMatrixKaravan', + 'uploader_url': 
're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', + 'license': 'Standard YouTube License', + }, + 'params': { + 'skip_download': True, + }, } ] @@ -1254,6 +1272,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/rg3/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): From 3a7d35b982fac19ca47b87358001379fafbd5731 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 3 Sep 2016 01:42:33 +0700 Subject: [PATCH 27/92] Credit @C4K3 for #10536 --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index b9a602c12..c4bef040a 100644 --- a/AUTHORS +++ b/AUTHORS @@ -182,3 +182,4 @@ Rob van Bekkum Petr Zvoníček Pratyush Singh Aleksander Nitecki +Sebastian Blunt From 4b3a6076586a38450fa9633480d175a13e33dac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 3 Sep 2016 01:45:17 +0700 Subject: [PATCH 28/92] [ChangeLog] Actualize --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 2e75c003d..eb05fe77e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,32 @@ version +Core +* Restore usage of NAME attribute from EXT-X-MEDIA tag for formats codes in + _extract_m3u8_formats (#10522) +* Handle semicolon in mimetype2ext + + Extractors ++ [youtube] Add support for rental videos' previews (#10532) +* [youtube:playlist] Fallback to video extraction for video/playlist URLs when + no playlist is actually served 
(#10537) ++ [drtv] Add support for dr.dk/nyheder (#10536) ++ [facebook:plugins:video] Add extractor (#10530) ++ [go] Add extractor for *.go.com sites +* [adobepass] Check for authz_token expiration (#10527) +* [nytimes] improve extraction +* [thestar] Fix extraction (#10465) +* [glide] Fix extraction (#10478) +- [exfm] Remove extractor (#10482) +* [youporn] Fix categories and tags extraction (#10521) ++ [curiositystream] Add extractor for app.curiositystream.com - [thvideo] Remove extractor (#10464) * [movingimage] Fix for the new site name (#10466) ++ [cbs] Add support for once formats (#10515) +* [limelight] Skip ism snd duplicate manifests ++ [porncom] Extract categories and tags (#10510) ++ [facebook] Extract timestamp (#10508) ++ [yahoo] Extract more formats version 2016.08.31 From 86c3bbbcede6efa175f5a93e02511fe32585521f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 3 Sep 2016 01:46:41 +0700 Subject: [PATCH 29/92] release 2016.09.03 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 9 +++++---- youtube_dl/version.py | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 2caca5115..fc18e733b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.03*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. 
Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.03** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.31 +[debug] youtube-dl version 2016.09.03 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index eb05fe77e..68dbeb696 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.09.03 Core * Restore usage of NAME attribute from EXT-X-MEDIA tag for formats codes in diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 42bf291e2..015332bca 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -171,6 +171,8 @@ - **CTVNews** - **culturebox.francetvinfo.fr** - **CultureUnplugged** + - **curiositystream** + - **curiositystream:collection** - **CWTV** - **DailyMail** - **dailymotion** @@ -223,11 +225,11 @@ - **EsriVideo** - **Europa** - **EveryonesMixtape** - - **exfm**: ex.fm - **ExpoTV** - **ExtremeTube** - **EyedoTV** - **facebook** + - **FacebookPluginsVideo** - **faz.net** - **fc2** - **Fczenit** @@ -271,6 +273,7 @@ - **Glide**: Glide mobile video messages (glide.me) - **Globo** - **GloboArticle** + - **Go** - **GodTube** - **GodTV** - **Golem** @@ -406,6 +409,7 @@ - **MovieClips** - **MovieFap** - **Moviezine** + - **MovingImage** - **MPORA** - **MSN** - **mtg**: MTG services @@ -659,7 +663,6 @@ - **sr:mediathek**: 
Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **Stitcher** @@ -702,8 +705,6 @@ - **TheStar** - **ThisAmericanLife** - **ThisAV** - - **THVideo** - - **THVideoPlaylist** - **tinypic**: tinypic.com videos - **tlc.de** - **TMZ** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fe442dd88..5be8c0122 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.08.31' +__version__ = '2016.09.03' From dedb1770295d214225a3a31b5f99da877cf01eee Mon Sep 17 00:00:00 2001 From: Christian Pointner Date: Sat, 3 Sep 2016 01:50:26 +0200 Subject: [PATCH 30/92] Fix parsing of HTML5 media elements This fixes an error in _parse_html5_media_entries in case an audio or video tag directly uses a src attribute instead of <source> elements in its body. --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a9c7a8d16..a82968162 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1749,7 +1749,7 @@ class InfoExtractor(object): media_attributes = extract_attributes(media_tag) src = media_attributes.get('src') if src: - _, formats = _media_formats(src) + _, formats = _media_formats(src, media_type) media_info['formats'].extend(formats) media_info['thumbnail'] = media_attributes.get('poster') if media_content: From cf0efe96366259a5f0f07ae79280bfa17dc6f6e7 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 3 Sep 2016 17:25:03 +0800 Subject: [PATCH 31/92] [fc2:embed] New extractor for Flash player URLs Closes #10512 --- ChangeLog | 6 ++++ youtube_dl/extractor/extractors.py | 5 ++- youtube_dl/extractor/fc2.py | 58 ++++++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog
index 68dbeb696..065fc83a8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors ++ [fc2] Recognize Flash player URLs (#10512) + + version 2016.09.03 Core diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index bc616223e..d851e5f36 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -269,7 +269,10 @@ from .facebook import ( FacebookPluginsVideoIE, ) from .faz import FazIE -from .fc2 import FC2IE +from .fc2 import ( + FC2IE, + FC2EmbedIE, +) from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index c7d69ff1f..b9e58d4df 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -1,10 +1,12 @@ -#! -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import hashlib +import re from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_request, compat_urlparse, ) @@ -16,7 +18,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' + _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -75,12 +77,17 @@ class FC2IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) self._login() - webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear - self._login() + webpage = None + if not url.startswith('fc2:'): + webpage = self._download_webpage(url, video_id) + self._downloader.cookiejar.clear_session_cookies() # must clear + self._login() - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) + title = 'FC2 video %s' % video_id + thumbnail = None + if webpage is not None: + title = self._og_search_title(webpage) + thumbnail = 
self._og_search_thumbnail(webpage) refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() @@ -113,3 +120,40 @@ class FC2IE(InfoExtractor): 'ext': 'flv', 'thumbnail': thumbnail, } + + +class FC2EmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P.+)' + IE_NAME = 'fc2:embed' + + _TEST = { + 'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】', + 'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a', + 'info_dict': { + 'id': '201403223kCqB3Ez', + 'ext': 'flv', + 'title': 'プリズン・ブレイク S1-01 マイケル 【吹替】', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_parse_qs(mobj.group('query')) + + video_id = query['i'][-1] + title = query.get('tl', ['FC2 video %s' % video_id])[0] + + sj = query.get('sj', [None])[0] + thumbnail = None + if sj: + # See thumbnailImagePath() in ServerConst.as of flv2.swf + thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( + sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) + + return { + '_type': 'url_transparent', + 'url': 'fc2:%s' % video_id, + 'title': title, + 'thumbnail': thumbnail, + } From cdc783510bb575b2318b1d7d42fb98f0c0f0df18 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 3 Sep 2016 18:16:19 +0800 Subject: [PATCH 32/92] [foxnews:insider] Add new extractor Closes #10445 --- ChangeLog | 1 + youtube_dl/extractor/extractors.py | 5 +++- youtube_dl/extractor/foxnews.py | 48 +++++++++++++++++++++++++++++- 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index 065fc83a8..199983674 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ 
version Extractors ++ [foxnews] Add support for FoxNews Insider (#10445) + [fc2] Recognize Flash player URLs (#10512) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d851e5f36..8c6ee0503 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -287,7 +287,10 @@ from .formula1 import Formula1IE from .fourtube import FourTubeIE from .fox import FOXIE from .foxgay import FoxgayIE -from .foxnews import FoxNewsIE +from .foxnews import ( + FoxNewsIE, + FoxNewsInsiderIE, +) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index b04da2415..5c7acd795 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -3,11 +3,12 @@ from __future__ import unicode_literals import re from .amp import AMPIE +from .common import InfoExtractor class FoxNewsIE(AMPIE): IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://(?Pvideo\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -49,6 +50,11 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'only_matching': True, + }, ] def _real_extract(self, url): 
@@ -58,3 +64,43 @@ class FoxNewsIE(AMPIE): 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) info['id'] = video_id return info + + +class FoxNewsInsiderIE(InfoExtractor): + _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P[a-z-]+)' + IE_NAME = 'foxnews:insider' + + _TEST = { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'md5': 'a10c755e582d28120c62749b4feb4c0c', + 'info_dict': { + 'id': '5099377331001', + 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', + 'ext': 'mp4', + 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', + 'description': 'Is campus censorship getting out of control?', + 'timestamp': 1472168725, + 'upload_date': '20160825', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'add_ie': [FoxNewsIE.ie_key()], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + '_type': 'url_transparent', + 'ie_key': FoxNewsIE.ie_key(), + 'url': embed_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } From ed2bfe93aaa11f49f7b2b92b581abb6aa385dfbf Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 3 Sep 2016 18:22:00 +0800 Subject: [PATCH 33/92] [fc2:embed] Add ie_key --- youtube_dl/extractor/fc2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index b9e58d4df..c032d4d02 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -153,6 +153,7 @@ class FC2EmbedIE(InfoExtractor): return { '_type': 'url_transparent', + 'ie_key': FC2IE.ie_key(), 'url': 'fc2:%s' % video_id, 'title': title, 'thumbnail': thumbnail, From 
45aab4d30b7c3fc03c9be9680550cba88bd85b5c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 3 Sep 2016 18:37:36 +0800 Subject: [PATCH 34/92] [youjizz] Fix extraction. The site has moved to HTML5 Closes #10437 --- ChangeLog | 1 + youtube_dl/extractor/youjizz.py | 43 +++++++-------------------------- 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index 199983674..2809e55d7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [youjizz] Fix extraction (#10437) + [foxnews] Add support for FoxNews Insider (#10445) + [fc2] Recognize Flash player URLs (#10512) diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 31e2f9263..b50f34e9b 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,21 +1,16 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class YouJizzIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P[0-9]+)\.html(?:$|[?#])' _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '07e15fa469ba384c7693fd246905547c', + 'md5': '78fc1901148284c69af12640e01c6310', 'info_dict': { 'id': '2189178', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, } @@ -27,38 +22,18 @@ class YouJizzIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') age_limit = self._rta_search(webpage) video_title = self._html_search_regex( r'\s*(.*)\s*', webpage, 'title') - embed_page_url = self._search_regex( - r'(https?://www.youjizz.com/videos/embed/[0-9]+)', - webpage, 'embed page') - webpage = self._download_webpage( - embed_page_url, video_id, note='downloading embed page') + info_dict = 
self._parse_html5_media_entries(url, webpage, video_id)[0] - # Get the video URL - m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P.+?)"\);', webpage) - if m_playlist is not None: - playlist_url = m_playlist.group('playlist') - playlist_page = self._download_webpage(playlist_url, video_id, - 'Downloading playlist page') - m_levels = list(re.finditer(r'[^"]+)"\)\);', - webpage, 'video URL') - - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, 'age_limit': age_limit, - } + }) + + return info_dict From 9603b6601208333bc49e0c69199f0e652a7aaea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Aug 2016 04:52:18 +0700 Subject: [PATCH 35/92] Introduce --skip-unavailable-fragments --- youtube_dl/__init__.py | 1 + youtube_dl/downloader/fragment.py | 10 ++++++++-- youtube_dl/options.py | 10 +++++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a9730292c..42128272a 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -318,6 +318,7 @@ def _real_main(argv=None): 'nooverwrites': opts.nooverwrites, 'retries': opts.retries, 'fragment_retries': opts.fragment_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index ba903ae10..b4a798f8f 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -22,14 +22,20 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) + fragment_retries: Number of times to retry a fragment for HTTP error (DASH + and hlsnative only) + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative 
only) """ def report_retry_fragment(self, fragment_name, count, retries): self.to_screen( - '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' + '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...' % (fragment_name, count, self.format_retries(retries))) + def report_skip_fragment(self, fragment_name): + self.to_screen('[download] Skipping fragment %s...' % fragment_name) + def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) self._start_frag_download(ctx) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5d62deef4..56f312f57 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -423,7 +423,15 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') + help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)') + downloader.add_option( + '--skip-unavailable-fragments', + action='store_true', dest='skip_unavailable_fragments', default=True, + help='Skip unavailable fragments (DASH and hlsnative only)') + general.add_option( + '--abort-on-unavailable-fragment', + action='store_false', dest='skip_unavailable_fragments', + help='Abort downloading when some fragment is not available') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', From 25afc2a7830e281e849609202b4f70728664bdb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Aug 2016 04:55:55 +0700 Subject: [PATCH 36/92] [downloader/dash:hls] Respect --fragment-retries and --skip-unavailable-fragments (Closes #10165, closes #10448) --- youtube_dl/downloader/dash.py | 12 +++++----- youtube_dl/downloader/hls.py | 41 +++++++++++++++++++++++++++++------ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git 
a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8bbab9dbc..cbcee324d 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -38,6 +38,7 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) def append_url_to_file(target_url, tmp_filename, segment_name): target_filename = '%s-%s' % (tmp_filename, segment_name) @@ -52,19 +53,20 @@ class DashSegmentsFD(FragmentFD): down.close() segments_filenames.append(target_sanitized) break - except (compat_urllib_error.HTTPError, ) as err: + except compat_urllib_error.HTTPError: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately # retried with the same request data this usually succeeds (1-2 attemps # is usually enough) thus allowing to download the whole file successfully. - # So, we will retry all fragments that fail with 404 HTTP error for now. - if err.code != 404: - raise - # Retry fragment + # To be future-proof we will retry all fragments that fail with any + # HTTP error. 
count += 1 if count <= fragment_retries: self.report_retry_fragment(segment_name, count, fragment_retries) if count > fragment_retries: + if skip_unavailable_fragments: + self.report_skip_fragment(segment_name) + return self.report_error('giving up after %s fragment retries' % fragment_retries) return False diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index baaff44d5..7412620a5 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from .fragment import FragmentFD from .external import FFmpegFD from ..compat import ( + compat_urllib_error, compat_urlparse, compat_struct_pack, ) @@ -83,6 +84,10 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + test = self.params.get('test', False) + extra_query = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') if extra_param_to_segment_url: @@ -99,15 +104,37 @@ class HlsFD(FragmentFD): line if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, line)) - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + frag_name = 'Frag%d' % i + frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name) if extra_query: frag_url = update_url_query(frag_url, extra_query) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + break + except compat_urllib_error.HTTPError: + # Unavailable (possibly temporary) fragments may be served. + # First we try to retry then either skip or abort. + # See https://github.com/rg3/youtube-dl/issues/10165, + # https://github.com/rg3/youtube-dl/issues/10448). 
+ count += 1 + if count <= fragment_retries: + self.report_retry_fragment(frag_name, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + i += 1 + media_sequence += 1 + self.report_skip_fragment(frag_name) + continue + self.report_error( + 'giving up after %s fragment retries' % fragment_retries) return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - frag_content = down.read() - down.close() if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) frag_content = AES.new( @@ -115,7 +142,7 @@ class HlsFD(FragmentFD): ctx['dest_stream'].write(frag_content) frags_filenames.append(frag_sanitized) # We only download the first fragment during the test - if self.params.get('test', False): + if test: break i += 1 media_sequence += 1 From 2e99cd30c3108fd8da6a9f9fadfa89852c8d8826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 27 Aug 2016 04:57:59 +0700 Subject: [PATCH 37/92] [downloader/dash:hls] Report exact fragment error on retry --- youtube_dl/downloader/dash.py | 4 ++-- youtube_dl/downloader/fragment.py | 5 +++-- youtube_dl/downloader/hls.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index cbcee324d..e087cf142 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -53,7 +53,7 @@ class DashSegmentsFD(FragmentFD): down.close() segments_filenames.append(target_sanitized) break - except compat_urllib_error.HTTPError: + except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately # retried with the same request data this usually succeeds (1-2 attemps @@ -62,7 +62,7 @@ class DashSegmentsFD(FragmentFD): # HTTP error. 
count += 1 if count <= fragment_retries: - self.report_retry_fragment(segment_name, count, fragment_retries) + self.report_retry_fragment(err, segment_name, count, fragment_retries) if count > fragment_retries: if skip_unavailable_fragments: self.report_skip_fragment(segment_name) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index b4a798f8f..84aacf7db 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -6,6 +6,7 @@ import time from .common import FileDownloader from .http import HttpFD from ..utils import ( + error_to_compat_str, encodeFilename, sanitize_open, ) @@ -28,10 +29,10 @@ class FragmentFD(FileDownloader): Skip unavailable fragments (DASH and hlsnative only) """ - def report_retry_fragment(self, fragment_name, count, retries): + def report_retry_fragment(self, err, fragment_name, count, retries): self.to_screen( '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...' - % (fragment_name, count, self.format_retries(retries))) + % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries))) def report_skip_fragment(self, fragment_name): self.to_screen('[download] Skipping fragment %s...' % fragment_name) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 7412620a5..5d70abf62 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -118,14 +118,14 @@ class HlsFD(FragmentFD): frag_content = down.read() down.close() break - except compat_urllib_error.HTTPError: + except compat_urllib_error.HTTPError as err: # Unavailable (possibly temporary) fragments may be served. # First we try to retry then either skip or abort. # See https://github.com/rg3/youtube-dl/issues/10165, # https://github.com/rg3/youtube-dl/issues/10448). 
count += 1 if count <= fragment_retries: - self.report_retry_fragment(frag_name, count, fragment_retries) + self.report_retry_fragment(err, frag_name, count, fragment_retries) if count > fragment_retries: if skip_unavailable_fragments: i += 1 From 4a69fa04e0074a3d5938ffb03decff9cc33f5d3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 30 Aug 2016 22:28:14 +0700 Subject: [PATCH 38/92] [downloader/dash] Abort download immediately after giving up on some fragment --- youtube_dl/downloader/dash.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index e087cf142..efeae02a3 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -66,14 +66,17 @@ class DashSegmentsFD(FragmentFD): if count > fragment_retries: if skip_unavailable_fragments: self.report_skip_fragment(segment_name) - return + return True self.report_error('giving up after %s fragment retries' % fragment_retries) return False + return True if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') + if not append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init'): + return False for i, segment_url in enumerate(segment_urls): - append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) + if not append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i): + return False self._finish_frag_download(ctx) From 7e5dc339de14547aa7b489e88b4c456ec613ba9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 00:29:01 +0700 Subject: [PATCH 39/92] [youtube:watchlater] Fix extraction (Closes #10544) --- youtube_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4c8edef8d..0bc85af74 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2376,7 +2376,7 @@ class 
YoutubeWatchLaterIE(YoutubePlaylistIE): }] def _real_extract(self, url): - video = self._check_download_just_video(url, 'WL') + _, video = self._check_download_just_video(url, 'WL') if video: return video _, playlist = self._extract_playlist('WL') From 091624f9da491ef3a98e63367bf4ffd9836dafde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 03:39:13 +0700 Subject: [PATCH 40/92] [vimple] Extend _VALID_URL (Closes #10547) --- youtube_dl/extractor/vimple.py | 35 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index 92321d66e..7fd9b777b 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -28,23 +28,24 @@ class SprutoBaseIE(InfoExtractor): class VimpleIE(SprutoBaseIE): IE_DESC = 'Vimple - one-click video hosting' - _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P[\da-f-]{32,36})' - _TESTS = [ - { - 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', - 'md5': '2e750a330ed211d3fd41821c6ad9a279', - 'info_dict': { - 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', - 'ext': 'mp4', - 'title': 'Sunset', - 'duration': 20, - 'thumbnail': 're:https?://.*?\.jpg', - }, - }, { - 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', - 'only_matching': True, - } - ] + _VALID_URL = r'https?://(?:player\.vimple\.(?:ru|co)/iframe|vimple\.(?:ru|co))/(?P[\da-f-]{32,36})' + _TESTS = [{ + 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', + 'md5': '2e750a330ed211d3fd41821c6ad9a279', + 'info_dict': { + 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', + 'ext': 'mp4', + 'title': 'Sunset', + 'duration': 20, + 'thumbnail': 're:https?://.*?\.jpg', + }, + }, { + 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', + 'only_matching': True, + }, { + 'url': 'http://vimple.co/04506a053f124483b8fb05ed73899f19', + 'only_matching': True, + }] def 
_real_extract(self, url): video_id = self._match_id(url) From 37c7490ac62d4aacbf9103bf6760d20f21984a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 04:59:46 +0700 Subject: [PATCH 41/92] [espn] Extend _VALID_URL (Closes #10549) --- youtube_dl/extractor/espn.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 66c08bec4..6d10f8e68 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -5,7 +5,7 @@ from ..utils import remove_end class ESPNIE(InfoExtractor): - _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P[^/]+)' + _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P[^/]+)' _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', 'md5': '60e5d097a523e767d06479335d1bdc58', @@ -47,6 +47,9 @@ class ESPNIE(InfoExtractor): }, { 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', 'only_matching': True, + }, { + 'url': 'http://www.espn.com/video/clip?id=10365079', + 'only_matching': True, }] def _real_extract(self, url): From 622638512b8241c39837b634e75c44cf9105a299 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 4 Sep 2016 16:25:59 +0800 Subject: [PATCH 42/92] [rottentomatoes] Fix extraction Closes #10467 --- ChangeLog | 1 + youtube_dl/extractor/rottentomatoes.py | 30 +++++++++++++++++++------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2809e55d7..e6a2d24e1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [rottentomatoes] Fix extraction (#10467) * [youjizz] Fix extraction (#10437) + [foxnews] Add support for FoxNews Insider (#10445) + [fc2] Recognize Flash player URLs (#10512) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index f9cd48790..df39ed3f2 100644 --- 
a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,8 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse -from .internetvideoarchive import InternetVideoArchiveIE +from ..utils import js_to_json class RottenTomatoesIE(InfoExtractor): @@ -11,21 +10,36 @@ class RottenTomatoesIE(InfoExtractor): _TEST = { 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { - 'id': '613340', + 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', + 'thumbnail': 're:^https?://.*\.jpg$', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - og_video = self._og_search_video_url(webpage) - query = compat_urlparse.urlparse(og_video).query + + params = self._parse_json( + self._search_regex(r'(?s)RTVideo\(({.+?})\);', webpage, 'player parameters'), + video_id, transform_source=lambda s: js_to_json(s.replace('window.location.href', '""'))) + + formats = [] + if params.get('urlHLS'): + formats.extend(self._extract_m3u8_formats( + params['urlHLS'], video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + if params.get('urlMP4'): + formats.append({ + 'url': params['urlMP4'], + 'format_id': 'mp4', + }) + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': InternetVideoArchiveIE._build_xml_url(query), - 'ie_key': InternetVideoArchiveIE.ie_key(), + 'id': video_id, 'title': self._og_search_title(webpage), + 'formats': formats, + 'thumbnail': params.get('thumbnailImg'), } From b29cd56591f1ef001d9f30bdff87789815f1fa0c Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 4 Sep 2016 17:01:39 +0800 Subject: [PATCH 43/92] [pornovoisines] Fix extraction (closes #10469) --- ChangeLog | 1 + youtube_dl/extractor/pornovoisines.py | 80 +++++++++++++++------------ 2 files changed, 47 insertions(+), 34 deletions(-) diff --git a/ChangeLog b/ChangeLog index 
e6a2d24e1..616b55803 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [pornvoisines] Fix extraction (#10469) * [rottentomatoes] Fix extraction (#10467) * [youjizz] Fix extraction (#10437) + [foxnews] Add support for FoxNews Insider (#10445) diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 6b51e5c54..58f557e39 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import random from .common import InfoExtractor from ..utils import ( @@ -13,61 +12,69 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P\d+)/(?P[^/]+)' - - _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ - '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' - - _SERVER_NUMBERS = (1, 2) + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P\d+)/(?P[^/.]+)' _TEST = { - 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', - 'md5': '5ac670803bc12e9e7f9f662ce64cf1d1', + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', 'info_dict': { - 'id': '1285', + 'id': '919', 'display_id': 'recherche-appartement', 'ext': 'mp4', 'title': 'Recherche appartement', - 'description': 'md5:819ea0b785e2a04667a1a01cdc89594e', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140925', 'duration': 120, 'view_count': int, 'average_rating': float, - 'categories': ['Débutantes', 'Scénario', 'Sodomie'], + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, } } - @classmethod - def build_video_url(cls, num): - return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num) - def 
_real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + webpage = self._download_webpage(url, video_id) - video_url = self.build_video_url(video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) - title = self._html_search_regex( - r'

(.+?)

', webpage, 'title', flags=re.DOTALL) - description = self._html_search_regex( - r'
(.+?)
', - webpage, 'description', fatal=False, flags=re.DOTALL) - - thumbnail = self._search_regex( - r'
\s*]+class=([\'"])thumb\1[^>]*src=([\'"])(?P[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') upload_date = unified_strdate(self._search_regex( - r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False)) - duration = int_or_none(self._search_regex( - 'Durée (\d+)', webpage, 'duration', fatal=False)) + r'Le\s*([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') view_count = int_or_none(self._search_regex( r'(\d+) vues', webpage, 'view count', fatal=False)) average_rating = self._search_regex( @@ -75,15 +82,19 @@ class PornoVoisinesIE(InfoExtractor): if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) - categories = self._html_search_meta( - 'keywords', webpage, 'categories', fatal=False) + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*(.+?)', webpage, 'categories', fatal=False) if categories: categories = [category.strip() for category in categories.split(',')] + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, @@ -93,4 +104,5 @@ class PornoVoisinesIE(InfoExtractor): 'average_rating': average_rating, 'categories': categories, 'age_limit': 18, + 'subtitles': subtitles, } From 919cf1a62f022c61cfa65498e8c1b1cc0d21046e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sat, 3 Sep 2016 23:00:52 +0800 Subject: [PATCH 44/92] [downloader/dash] Abort if the first segment fails Closes #10497, Closes #10542 --- ChangeLog | 4 ++++ youtube_dl/downloader/dash.py | 20 +++++++++++++------- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index 616b55803..1d277b562 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,9 @@ version +Core +* If the first segment of DASH fails, abort the whole download process to + 
prevent throttling (#10497) + Extractors * [pornvoisines] Fix extraction (#10469) * [rottentomatoes] Fix extraction (#10467) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index efeae02a3..41fc9cfc2 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -40,7 +40,8 @@ class DashSegmentsFD(FragmentFD): fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - def append_url_to_file(target_url, tmp_filename, segment_name): + def process_segment(segment, tmp_filename, fatal): + target_url, segment_name = segment target_filename = '%s-%s' % (tmp_filename, segment_name) count = 0 while count <= fragment_retries: @@ -64,18 +65,23 @@ class DashSegmentsFD(FragmentFD): if count <= fragment_retries: self.report_retry_fragment(err, segment_name, count, fragment_retries) if count > fragment_retries: - if skip_unavailable_fragments: + if not fatal: self.report_skip_fragment(segment_name) return True self.report_error('giving up after %s fragment retries' % fragment_retries) return False return True - if initialization_url: - if not append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init'): - return False - for i, segment_url in enumerate(segment_urls): - if not append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i): + segments_to_download = [(initialization_url, 'Init')] if initialization_url else [] + segments_to_download.extend([ + (segment_url, 'Seg%d' % i) + for i, segment_url in enumerate(segment_urls)]) + + for i, segment in enumerate(segments_to_download): + # In DASH, the first segment contains necessary headers to + # generate a valid MP4 file, so always abort for the first segment + fatal = i == 0 or not skip_unavailable_fragments + if not process_segment(segment, ctx['tmpfilename'], fatal): return False self._finish_frag_download(ctx) From 0def758782c273e0a1c9984f895638845796715b Mon Sep 17 00:00:00 2001 From: 
Remita Amine Date: Sun, 4 Sep 2016 11:42:15 +0100 Subject: [PATCH 45/92] [internetvideoarchive] extract all formats --- youtube_dl/extractor/common.py | 14 +++++++------- youtube_dl/extractor/internetvideoarchive.py | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a82968162..6edd5a769 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1163,13 +1163,6 @@ class InfoExtractor(object): m3u8_id=None, note=None, errnote=None, fatal=True, live=False): - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] - - format_url = lambda u: ( - u - if re.match(r'^https?://', u) - else compat_urlparse.urljoin(m3u8_url, u)) - res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1180,6 +1173,13 @@ class InfoExtractor(object): m3u8_doc, urlh = res m3u8_url = urlh.geturl() + formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + + format_url = lambda u: ( + u + if re.match(r'^https?://', u) + else compat_urlparse.urljoin(m3u8_url, u)) + # We should try extracting formats only from master playlists [1], i.e. # playlists that describe available qualities. 
On the other hand media # playlists [2] should be returned as is since they contain just the media diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 45add007f..76cc5ec3e 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -48,13 +48,23 @@ class InternetVideoArchiveIE(InfoExtractor): # There are multiple videos in the playlist whlie only the first one # matches the video played in browsers video_info = configuration['playlist'][0] + title = video_info['title'] formats = [] for source in video_info['sources']: file_url = source['file'] if determine_ext(file_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, ext='mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + file_url = m3u8_formats[0]['url'] + formats.extend(self._extract_f4m_formats( + file_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + file_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) else: a_format = { 'url': file_url, @@ -70,7 +80,6 @@ class InternetVideoArchiveIE(InfoExtractor): self._sort_formats(formats) - title = video_info['title'] description = video_info.get('description') thumbnail = video_info.get('image') else: From 100bd86a68b5ee84669d162c9bcda31616f6596a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Sep 2016 11:44:13 +0100 Subject: [PATCH 46/92] [rottentomatoes] delegate extraction to InternetVideoArchiveIE --- youtube_dl/extractor/rottentomatoes.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index df39ed3f2..23abf7a27 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ 
b/youtube_dl/extractor/rottentomatoes.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import js_to_json +from .internetvideoarchive import InternetVideoArchiveIE class RottenTomatoesIE(InfoExtractor): @@ -13,6 +13,7 @@ class RottenTomatoesIE(InfoExtractor): 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', 'thumbnail': 're:^https?://.*\.jpg$', }, } @@ -20,26 +21,12 @@ class RottenTomatoesIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - params = self._parse_json( - self._search_regex(r'(?s)RTVideo\(({.+?})\);', webpage, 'player parameters'), - video_id, transform_source=lambda s: js_to_json(s.replace('window.location.href', '""'))) - - formats = [] - if params.get('urlHLS'): - formats.extend(self._extract_m3u8_formats( - params['urlHLS'], video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - if params.get('urlMP4'): - formats.append({ - 'url': params['urlMP4'], - 'format_id': 'mp4', - }) - self._sort_formats(formats) + iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') return { + '_type': 'url_transparent', + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, + 'ie_key': InternetVideoArchiveIE.ie_key(), 'id': video_id, 'title': self._og_search_title(webpage), - 'formats': formats, - 'thumbnail': params.get('thumbnailImg'), } From feaa5ad787cdc28e4b6979f1c7798134b1bee723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 20:12:34 +0700 Subject: [PATCH 47/92] [youtube:playlist] Extend _VALID_URL --- youtube_dl/extractor/youtube.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 
deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 0bc85af74..8fc26bd02 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -264,7 +264,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?&list=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE (?(1).+)? # if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' @@ -1778,11 +1778,14 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? - youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) + \? (?:.*?[&;])*? 
(?:p|a|list)= + | p/ + )| + youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} @@ -1887,6 +1890,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'skip_download': True, }, 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, }] def _real_initialize(self): From 433af6ad3002424ecb316e23946722d54010dbe1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 4 Sep 2016 14:18:41 +0100 Subject: [PATCH 48/92] [theplatform] fix player regex(closes #10546) --- youtube_dl/extractor/theplatform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 23067e8c6..6febf805b 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -96,7 +96,7 @@ class ThePlatformBaseIE(OnceIE): class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P[^/]+)/ - (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?:(?:[^/]+/)+select/)?(?Pmedia/(?:guid/\d+/)?)?|(?P(?:[^/\?]+/(?:swf|config)|onsite)/select/))? 
|theplatform:)(?P[^/\?&]+)''' _TESTS = [{ @@ -116,6 +116,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): # rtmp download 'skip_download': True, }, + 'skip': '404 Not Found', }, { # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/ 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT', From d9606d9b6cb44ee7600abf63333db4b88532a391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 20:51:48 +0700 Subject: [PATCH 49/92] release 2016.09.04 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 7 ++++++- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index fc18e733b..1ddb3ef85 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.03*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.03** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.04** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.09.03 +[debug] youtube-dl version 2016.09.04 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 1d277b562..a26f5d4aa 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.09.04 Core * If the first segment of DASH fails, abort the whole download process to diff --git a/README.md b/README.md index 87465aa5e..207b633db 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,8 @@ which means you can modify it, redistribute it or use it however you like. --mark-watched Mark videos watched (YouTube only) --no-mark-watched Do not mark videos watched (YouTube only) --no-color Do not emit color codes in output + --abort-on-unavailable-fragment Abort downloading when some fragment is not + available ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. @@ -173,7 +175,10 @@ which means you can modify it, redistribute it or use it however you like. -R, --retries RETRIES Number of retries (default is 10), or "infinite". 
--fragment-retries RETRIES Number of retries for a fragment (default - is 10), or "infinite" (DASH only) + is 10), or "infinite" (DASH and hlsnative + only) + --skip-unavailable-fragments Skip unavailable fragments (DASH and + hlsnative only) --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 015332bca..9e21016f7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -232,6 +232,7 @@ - **FacebookPluginsVideo** - **faz.net** - **fc2** + - **fc2:embed** - **Fczenit** - **features.aol.com** - **fernsehkritik.tv** @@ -245,6 +246,7 @@ - **FOX** - **Foxgay** - **FoxNews**: Fox News and Fox Business Video + - **foxnews:insider** - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 5be8c0122..3d12a47e8 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.09.03' +__version__ = '2016.09.04' From 8112bfeabae792754f51e0c012ed34c4dc521bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 20:57:18 +0700 Subject: [PATCH 50/92] [ChangeLog] Actualize --- ChangeLog | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index a26f5d4aa..a542496a3 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,12 +1,26 @@ -version 2016.09.04 +version Core -* If the first segment of DASH fails, abort the whole download process to - prevent throttling (#10497) +* In DASH downloader if the first segment fails, abort the whole download + process to prevent throttling (#10497) ++ Add support for --skip-unavailable-fragments and --fragment retries in + hlsnative downloader (#10165, #10448). 
++ Add support for --skip-unavailable-fragments in DASH downloader ++ Introduce --skip-unavailable-fragments option for fragment based downloaders + that allows to skip fragments unavailable due to a HTTP error +* Fix extraction of video/audio entries with src attribute in + _parse_html5_media_entries (#10540) Extractors +* [theplatform] Relax URL regular expression (#10546) +* [youtube:playlist] Extend URL regular expression +* [rottentomatoes] Delegate extraction to internetvideoarchive extractor +* [internetvideoarchive] Extract all formats * [pornvoisines] Fix extraction (#10469) * [rottentomatoes] Fix extraction (#10467) +* [espn] Extend URL regular expression (#10549) +* [vimple] Extend URL regular expression (#10547) +* [youtube:watchlater] Fix extraction (#10544) * [youjizz] Fix extraction (#10437) + [foxnews] Add support for FoxNews Insider (#10445) + [fc2] Recognize Flash player URLs (#10512) @@ -19,7 +33,6 @@ Core _extract_m3u8_formats (#10522) * Handle semicolon in mimetype2ext - Extractors + [youtube] Add support for rental videos' previews (#10532) * [youtube:playlist] Fallback to video extraction for video/playlist URLs when From 48094901086534533ca89283067f2ab732857654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 4 Sep 2016 20:58:28 +0700 Subject: [PATCH 51/92] release 2016.09.04.1 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- youtube_dl/version.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1ddb3ef85..c03092442 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.04*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.04** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.04.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.04.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.09.04 +[debug] youtube-dl version 2016.09.04.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index a542496a3..d392513ce 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.09.04.1 Core * In DASH downloader if the first segment fails, abort the whole download diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 3d12a47e8..b2ea6dac6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.09.04' +__version__ = '2016.09.04.1' From 78e762d23c48f85c61a8afcae29307912000a7dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Cepl?= Date: Thu, 1 Sep 2016 17:31:08 +0200 Subject: [PATCH 52/92] Add new extractor for TV Noe (Czech Christian TV). 
Fixes #10520 --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tvnoe.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 youtube_dl/extractor/tvnoe.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8c6ee0503..e47adc26c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -916,6 +916,7 @@ from .tvc import ( ) from .tvigle import TvigleIE from .tvland import TVLandIE +from .tvnoe import TVNoeIE from .tvp import ( TVPEmbedIE, TVPIE, diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py new file mode 100644 index 000000000..d50261ddd --- /dev/null +++ b/youtube_dl/extractor/tvnoe.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .jwplatform import JWPlatformBaseIE +from ..utils import clean_html, get_element_by_class, js_to_json + + +class TVNoeIE(JWPlatformBaseIE): + _VALID_URL = r'https?://(www\.)?tvnoe\.cz/video/(?P[0-9]+)' + _TEST = { + 'url': 'http://www.tvnoe.cz/video/10362', + 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', + 'info_dict': { + 'id': '10362', + 'ext': 'mp4', + 'series': 'Noční univerzita', + 'title': 'prof. Tomáš Halík, Th.D. 
- ' + + 'Návrat náboženství a střet civilizací', + 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex(r']+src="([^"]+)"', + webpage, 'iframe src attribute') + + ifs_page = self._download_webpage(iframe_url, video_id) + jwplayer_data = self._parse_json(self._find_jwplayer_data(ifs_page), + video_id, transform_source=js_to_json) + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=iframe_url) + + info_dict.update({ + 'id': video_id, + 'title': clean_html( + get_element_by_class('field-name-field-podnazev', webpage)), + 'description': clean_html(get_element_by_class('field-name-body', + webpage)), + 'series': clean_html(get_element_by_class('title', webpage)) + }) + return info_dict From 9127e1533d294eb672d783d1eeed15aeb9b2cbe1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 5 Sep 2016 13:37:36 +0800 Subject: [PATCH 53/92] [tvnoe] PEP8 and coding style --- youtube_dl/extractor/tvnoe.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index d50261ddd..1cd3e6a58 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .jwplatform import JWPlatformBaseIE -from ..utils import clean_html, get_element_by_class, js_to_json +from ..utils import ( + clean_html, + get_element_by_class, + js_to_json, +) class TVNoeIE(JWPlatformBaseIE): @@ -14,8 +18,7 @@ class TVNoeIE(JWPlatformBaseIE): 'id': '10362', 'ext': 'mp4', 'series': 'Noční univerzita', - 'title': 'prof. Tomáš Halík, Th.D. - ' + - 'Návrat náboženství a střet civilizací', + 'title': 'prof. Tomáš Halík, Th.D. 
- Návrat náboženství a střet civilizací', 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', } } @@ -24,21 +27,23 @@ class TVNoeIE(JWPlatformBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - iframe_url = self._search_regex(r']+src="([^"]+)"', - webpage, 'iframe src attribute') + iframe_url = self._search_regex( + r']+src="([^"]+)"', webpage, 'iframe URL') ifs_page = self._download_webpage(iframe_url, video_id) - jwplayer_data = self._parse_json(self._find_jwplayer_data(ifs_page), - video_id, transform_source=js_to_json) + jwplayer_data = self._parse_json( + self._find_jwplayer_data(ifs_page), + video_id, transform_source=js_to_json) info_dict = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=iframe_url) info_dict.update({ 'id': video_id, - 'title': clean_html( - get_element_by_class('field-name-field-podnazev', webpage)), - 'description': clean_html(get_element_by_class('field-name-body', - webpage)), + 'title': clean_html(get_element_by_class( + 'field-name-field-podnazev', webpage)), + 'description': clean_html(get_element_by_class( + 'field-name-body', webpage)), 'series': clean_html(get_element_by_class('title', webpage)) }) + return info_dict From b49ad71ce1d985165e07fd0f59f80f677434ad84 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 5 Sep 2016 13:38:55 +0800 Subject: [PATCH 54/92] [ChangeLog] Update for #10524 --- ChangeLog | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ChangeLog b/ChangeLog index d392513ce..0be9b0fbb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +version + +Extractors ++ [tvnoe] New extractor (#10524) + + version 2016.09.04.1 Core From 95be19d436d1938d104310e194e85ea5a10c3353 Mon Sep 17 00:00:00 2001 From: Xie Yanbo Date: Sun, 4 Sep 2016 23:23:40 +0800 Subject: [PATCH 55/92] [miaopai] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/miaopai.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 45 
insertions(+) create mode 100644 youtube_dl/extractor/miaopai.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8c6ee0503..d511b04bc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -471,6 +471,7 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, diff --git a/youtube_dl/extractor/miaopai.py b/youtube_dl/extractor/miaopai.py new file mode 100644 index 000000000..c36b441b8 --- /dev/null +++ b/youtube_dl/extractor/miaopai.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import sanitized_Request + + +class MiaoPaiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P[-A-Za-z0-9~_]+).htm' + _TEST = { + 'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm', + 'md5': '095ed3f1cd96b821add957bdc29f845b', + 'info_dict': { + 'id': 'n~0hO7sfV1nBEw4Y29-Hqg__', + 'ext': 'mp4', + 'title': '西游记音乐会的秒拍视频', + 'thumbnail': 're:^https?://.*/n~0hO7sfV1nBEw4Y29-Hqg___m.jpg', + } + } + + _USER_AGENT_IPAD = 'User-Agent:Mozilla/5.0 ' \ + '(iPad; CPU OS 9_1 like Mac OS X) ' \ + 'AppleWebKit/601.1.46 (KHTML, like Gecko) ' \ + 'Version/9.0 Mobile/13B143 Safari/601.1' + + def _real_extract(self, url): + video_id = self._match_id(url) + request = sanitized_Request(url) + request.add_header('User-Agent', self._USER_AGENT_IPAD) + webpage = self._download_webpage(request, video_id) + + title = self._html_search_regex(r'([^<]*)', + webpage, + 'title') + regex = r"""
]*data-url=['"]([^'"]*\.jpg)['"]""" + thumbnail = self._html_search_regex(regex, webpage, '') + regex = r"""', webpage, 'comment_count', + r'([0-9]+) Kommentare', webpage, 'comment_count', fatal=False)) - return { + info_dict.update({ 'id': video_id, - 'title': title, 'url': url, 'ext': 'mp4', - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date, - 'duration': duration, 'view_count': view_count, 'comment_count': comment_count - } + }) + + return info_dict From f87feb4b688f7b6e02714f8405c8343e27956400 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 8 Sep 2016 00:28:33 +0800 Subject: [PATCH 73/92] [miaopai] Coding style (#10556) --- youtube_dl/extractor/miaopai.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/miaopai.py b/youtube_dl/extractor/miaopai.py index 2477d1009..f9e35ac7f 100644 --- a/youtube_dl/extractor/miaopai.py +++ b/youtube_dl/extractor/miaopai.py @@ -2,11 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import sanitized_Request class MiaoPaiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P[-A-Za-z0-9~_]+).htm' + _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P[-A-Za-z0-9~_]+)' _TEST = { 'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm', 'md5': '095ed3f1cd96b821add957bdc29f845b', @@ -18,27 +17,24 @@ class MiaoPaiIE(InfoExtractor): } } - _USER_AGENT_IPAD = 'User-Agent:Mozilla/5.0 ' \ - '(iPad; CPU OS 9_1 like Mac OS X) ' \ - 'AppleWebKit/601.1.46 (KHTML, like Gecko) ' \ - 'Version/9.0 Mobile/13B143 Safari/601.1' + _USER_AGENT_IPAD = 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1' def _real_extract(self, url): video_id = self._match_id(url) - request = sanitized_Request(url) - request.add_header('User-Agent', self._USER_AGENT_IPAD) - webpage = 
self._download_webpage(request, video_id) + webpage = self._download_webpage( + url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) - title = self._html_search_regex(r'([^<]*)', - webpage, - 'title') - regex = r"""
]*data-url=['"]([^'"]*\.jpg)['"]""" - thumbnail = self._html_search_regex(regex, webpage, '') + title = self._html_search_regex( + r'([^<]+)', webpage, 'title') + thumbnail = self._html_search_regex( + r']+class=(?P[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P[\'"])(?P[^\'"]+)(?P=q2)', + webpage, 'thumbnail', fatal=False, group='url') videos = self._parse_html5_media_entries(url, webpage, video_id) info = videos[0] - info.update({'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - }) + info.update({ + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + }) return info From b84d311d53a4cc023eb458072c09cf382f45c762 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 8 Sep 2016 00:29:55 +0800 Subject: [PATCH 74/92] [ChangeLog] Update for #10556 --- ChangeLog | 1 + 1 file changed, 1 insertion(+) diff --git a/ChangeLog b/ChangeLog index 8322af9d1..52c066c99 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors ++ [miaopai] New extractor (#10556) * [gamestar] Fix metadata extraction (#10479) + [bilibili] Support episodes (#10190) + [tvnoe] New extractor (#10524) From d7e794928d1b1386a8f5960ce6ece3ae88d975a1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Sep 2016 17:29:06 +0100 Subject: [PATCH 75/92] [tlc] fix query string parsing --- youtube_dl/extractor/tlc.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index abad3ff64..88eb83d74 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -1,10 +1,14 @@ # encoding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE -from ..compat import compat_parse_qs +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) class TlcDeIE(InfoExtractor): @@ -35,5 +39,5 @@ class TlcDeIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) 
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) From 6656a8248166039d83a3fb4401a7e11e03202ac9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 7 Sep 2016 17:32:35 +0100 Subject: [PATCH 76/92] [rmcdecouverte] Add new extractor(closes #9709) --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/rmcdecouverte.py | 39 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 youtube_dl/extractor/rmcdecouverte.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 522ef7d8b..8d9c2ae13 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -724,6 +724,7 @@ from .revision3 import ( ) from .rice import RICEIE from .ringtv import RingTVIE +from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE from .roosterteeth import RoosterTeethIE diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py new file mode 100644 index 000000000..f3bb4fa66 --- /dev/null +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -0,0 +1,39 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P\d+)' + + _TEST = { + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'info_dict': { + 'id': '5111223049001', + 'ext': 'mp4', + 'title': ': LES HEROS DU 88e ETAGE', + 
'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'uploader_id': '1969646226001', + 'upload_date': '20160904', + 'timestamp': 1472951103, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Only works from France', + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) From e78a5428b6c725bd01064f080d84700b83030b66 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 8 Sep 2016 01:59:31 +0800 Subject: [PATCH 77/92] [foxgay] Fix extraction (closes #10480) --- ChangeLog | 1 + youtube_dl/extractor/foxgay.py | 48 ++++++++++++++++++++++------------ 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/ChangeLog b/ChangeLog index 52c066c99..cec87d5cd 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,7 @@ version Extractors +* [foxgay] Fix extraction (#10480) + [miaopai] New extractor (#10556) * [gamestar] Fix metadata extraction (#10479) + [bilibili] Support episodes (#10190) diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 70c1a815d..39174fcec 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -1,18 +1,24 @@ from __future__ import unicode_literals +import itertools + from .common import InfoExtractor +from ..utils import ( + get_element_by_id, + remove_end, +) class FoxgayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' _TEST = { 'url': 
'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', - 'md5': '80d72beab5d04e1655a56ad37afe6841', + 'md5': '344558ccfea74d33b7adbce22e577f54', 'info_dict': { 'id': '2582', 'ext': 'mp4', - 'title': 'md5:6122f7ae0fc6b21ebdf59c5e083ce25a', - 'description': 'md5:5e51dc4405f1fd315f7927daed2ce5cf', + 'title': 'Fuck Turkish-style', + 'description': 'md5:6ae2d9486921891efe89231ace13ffdf', 'age_limit': 18, 'thumbnail': 're:https?://.*\.jpg$', }, @@ -22,27 +28,35 @@ class FoxgayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'(?P<title>.*?)', - webpage, 'title', fatal=False) - description = self._html_search_regex( - r'

(?P.*?)

', - webpage, 'description', fatal=False) + title = remove_end(self._html_search_regex( + r'([^<]+)', webpage, 'title'), ' - Foxgay.com') + description = get_element_by_id('inf_tit', webpage) + # The default user-agent with foxgay cookies leads to pages without videos + self._downloader.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. + iframe_url = self._html_search_regex( + r']+src=([\'"])(?P[^\'"]+)\1', webpage, + 'video frame', group='url') iframe = self._download_webpage( - self._html_search_regex(r'iframe src="(?P.*?)"', webpage, 'video frame'), - video_id) - video_url = self._html_search_regex( - r"v_path = '(?Phttp://.*?)'", iframe, 'url') - thumb_url = self._html_search_regex( - r"t_path = '(?Phttp://.*?)'", iframe, 'thumbnail', fatal=False) + iframe_url, video_id, headers={'User-Agent': 'curl/7.50.1'}, + note='Downloading video frame') + video_data = self._parse_json(self._search_regex( + r'video_data\s*=\s*([^;]+);', iframe, 'video data'), video_id) + + formats = [{ + 'url': source, + 'height': resolution, + } for source, resolution in zip( + video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'description': description, - 'thumbnail': thumb_url, + 'thumbnail': video_data.get('act_vid', {}).get('thumb'), 'age_limit': 18, } From 89f257d6e57131a266efae629334fe5f4bcf96e9 Mon Sep 17 00:00:00 2001 From: stepshal Date: Thu, 8 Sep 2016 13:52:22 +0700 Subject: [PATCH 78/92] Add support for https for rest of the exctractors. 
--- youtube_dl/extractor/abcnews.py | 2 +- youtube_dl/extractor/dailymotion.py | 2 +- youtube_dl/extractor/karaoketv.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index b61a6327c..8a5ae42f0 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -12,7 +12,7 @@ from ..compat import compat_urlparse class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' - _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' + _VALID_URL = 'https?://abcnews.go.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 496883d15..62b0747a5 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -394,7 +394,7 @@ class DailymotionUserIE(DailymotionPlaylistIE): class DailymotionCloudIE(DailymotionBaseInfoExtractor): - _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/' + _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/' _VALID_URL = r'%s[^/]+/(?P[^/?]+)' % _VALID_URL_PREFIX _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index a6050c4de..2f37786df 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://www.karaoketv.co.il/[^/]+/(?P\d+)' + _VALID_URL = r'https?://www.karaoketv.co.il/[^/]+/(?P\d+)' _TEST = { 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F', 'info_dict': { From 881f35479d50f22955fa21b148aa849a64d9dfc0 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 8 Sep 2016 
17:22:43 +0800 Subject: [PATCH 79/92] Credit @xyb for miaopai extractor (#10556) --- AUTHORS | 1 + 1 file changed, 1 insertion(+) diff --git a/AUTHORS b/AUTHORS index 78660f014..937742c5d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -184,3 +184,4 @@ Pratyush Singh Aleksander Nitecki Sebastian Blunt Matěj Cepl +Xie Yanbo From 3f612f076708973fce52d6b6053b24e2234a9c26 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 8 Sep 2016 17:39:29 +0800 Subject: [PATCH 80/92] Fix _VALID_URLs further (#10594) --- youtube_dl/extractor/abcnews.py | 2 +- youtube_dl/extractor/karaoketv.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 8a5ae42f0..01247fead 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -12,7 +12,7 @@ from ..compat import compat_urlparse class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' - _VALID_URL = 'https?://abcnews.go.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' + _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index 2f37786df..bad46005b 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class KaraoketvIE(InfoExtractor): - _VALID_URL = r'https?://www.karaoketv.co.il/[^/]+/(?P\d+)' + _VALID_URL = r'https?://www\.karaoketv\.co\.il/[^/]+/(?P\d+)' _TEST = { 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F', 'info_dict': { From 25042f73722c37e4ec88030cf69e23ae76c4359b Mon Sep 17 00:00:00 2001 From: stepshal Date: Thu, 8 Sep 2016 17:04:57 +0700 Subject: [PATCH 81/92] Add missing r prefix for _VALID_URLs --- youtube_dl/extractor/abcnews.py | 2 +- 
youtube_dl/extractor/ard.py | 2 +- youtube_dl/extractor/globo.py | 4 ++-- youtube_dl/extractor/onet.py | 2 +- youtube_dl/extractor/rutube.py | 2 +- youtube_dl/extractor/spiegel.py | 2 +- youtube_dl/extractor/twitter.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 01247fead..6ae5d9a96 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -49,7 +49,7 @@ class AbcNewsVideoIE(AMPIE): class AbcNewsIE(InfoExtractor): IE_NAME = 'abcnews' - _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' + _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 07e67dd33..3a806a69b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -238,7 +238,7 @@ class ARDMediathekIE(InfoExtractor): class ARDIE(InfoExtractor): - _VALID_URL = '(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' _TEST = { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'md5': 'd216c3a86493f9322545e045ddc3eb35', diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dbacbfc61..5638be48f 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -19,7 +19,7 @@ from ..utils import ( class GloboIE(InfoExtractor): - _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' + _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' 
_API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' @@ -396,7 +396,7 @@ class GloboIE(InfoExtractor): class GloboArticleIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)(?:\.html)?' + _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P[^/]+)(?:\.html)?' _VIDEOID_REGEXES = [ r'\bdata-video-id=["\'](\d{7,})', diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index fc22ad5eb..9cbc7c2e2 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -90,7 +90,7 @@ class OnetBaseIE(InfoExtractor): class OnetIE(OnetBaseIE): - _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' + _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' IE_NAME = 'onet.tv' _TEST = { diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 9ca4ae147..5d0ace5bf 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -88,7 +88,7 @@ class RutubeIE(InfoExtractor): class RutubeEmbedIE(InfoExtractor): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' - _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' _TESTS = [{ 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 3c552807e..74cb3a08a 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -103,7 +103,7 @@ class SpiegelIE(InfoExtractor): class SpiegelArticleIE(InfoExtractor): - _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P[0-9]+)\.html' + _VALID_URL = r'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P[0-9]+)\.html' IE_NAME = 'Spiegel:Article' IE_DESC = 'Articles on spiegel.de' _TESTS = 
[{ diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index b73842986..c5a5843b6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -342,7 +342,7 @@ class TwitterIE(InfoExtractor): class TwitterAmplifyIE(TwitterBaseIE): IE_NAME = 'twitter:amplify' - _VALID_URL = 'https?://amp\.twimg\.com/v/(?P[0-9a-f\-]{36})' + _VALID_URL = r'https?://amp\.twimg\.com/v/(?P[0-9a-f\-]{36})' _TEST = { 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 8fc26bd02..5082cb589 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -2417,7 +2417,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' From 010d034fca60d94435d719e554663fba1894f8b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 22:38:49 +0700 Subject: [PATCH 82/92] [videomore] Fix extraction (Closes #10592) --- youtube_dl/extractor/videomore.py | 72 +++++++++++-------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 04e95c66e..328b5b7fb 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -6,8 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - parse_age_limit, - parse_iso8601, + xpath_element, xpath_text, ) @@ -17,38 +16,32 @@ class VideomoreIE(InfoExtractor): _VALID_URL = 
r'videomore:(?P\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P\d+)(?:[/?#&]|\.(?:xml|json)|$)' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', - 'md5': '70875fbf57a1cd004709920381587185', + 'md5': '44455a346edc0d509ac5b5a5b531dc35', 'info_dict': { 'id': '367617', 'ext': 'flv', - 'title': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'description': 'В гостях – лучшие романтические комедии года, «Выживший» Иньярриту и «Стив Джобс» Дэнни Бойла.', + 'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук', 'series': 'Кино в деталях', 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'episode_number': None, - 'season': 'Сезон 2015', - 'season_number': 5, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 2910, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', 'ext': 'flv', - 'title': '80 серия', - 'description': '«Медведей» ждет решающий матч. Макеев выясняет отношения со Стрельцовым. 
Парни узнают подробности прошлого Макеева.', + 'title': 'Молодежка 2 сезон 40 серия', 'series': 'Молодежка', - 'episode': '80 серия', - 'episode_number': 40, - 'season': '2 сезон', - 'season_number': 2, + 'episode': '40 серия', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 2809, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, 'params': { 'skip_download': True, @@ -58,13 +51,8 @@ class VideomoreIE(InfoExtractor): 'info_dict': { 'id': '341073', 'ext': 'flv', - 'title': 'Команда проиграла из-за Бакина?', - 'description': 'Молодежка 3 сезон скоро', - 'series': 'Молодежка', + 'title': 'Промо Команда проиграла из-за Бакина?', 'episode': 'Команда проиграла из-за Бакина?', - 'episode_number': None, - 'season': 'Промо', - 'season_number': 99, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 29, 'age_limit': 16, @@ -109,43 +97,33 @@ class VideomoreIE(InfoExtractor): 'http://videomore.ru/video/tracks/%s.xml' % video_id, video_id, 'Downloading video XML') - video_url = xpath_text(video, './/video_url', 'video url', fatal=True) + item = xpath_element(video, './/playlist/item', fatal=True) + + title = xpath_text( + item, ('./title', './episode_name'), 'title', fatal=True) + + video_url = xpath_text(item, './video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') self._sort_formats(formats) - data = self._download_json( - 'http://videomore.ru/video/tracks/%s.json' % video_id, - video_id, 'Downloading video JSON') + thumbnail = xpath_text(item, './thumbnail_url') + duration = int_or_none(xpath_text(item, './duration')) + view_count = int_or_none(xpath_text(item, './views')) + comment_count = int_or_none(xpath_text(item, './count_comments')) + age_limit = int_or_none(xpath_text(item, './min_age')) - title = data.get('title') or data['project_title'] - description = data.get('description') or data.get('description_raw') - timestamp = parse_iso8601(data.get('published_at')) - duration = 
int_or_none(data.get('duration')) - view_count = int_or_none(data.get('views')) - age_limit = parse_age_limit(data.get('min_age')) - thumbnails = [{ - 'url': thumbnail, - } for thumbnail in data.get('big_thumbnail_urls', [])] - - series = data.get('project_title') - episode = data.get('title') - episode_number = int_or_none(data.get('episode_of_season') or None) - season = data.get('season_title') - season_number = int_or_none(data.get('season_pos') or None) + series = xpath_text(item, './project_name') + episode = xpath_text(item, './episode_name') return { 'id': video_id, 'title': title, - 'description': description, 'series': series, 'episode': episode, - 'episode_number': episode_number, - 'season': season, - 'season_number': season_number, - 'thumbnails': thumbnails, - 'timestamp': timestamp, + 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, + 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } From 2fdc7b0e0438e7f72380cfff11b619fe36234ebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 22:40:02 +0700 Subject: [PATCH 83/92] [viafree] PEP 8 --- youtube_dl/extractor/tvplay.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index c2a6e4e39..c0fec2594 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -390,7 +390,7 @@ class ViafreeIE(InfoExtractor): if thumbnail: video_id = self._search_regex( r'https?://[^/]+/imagecache/(?:[^/]+/)+seasons/\d+/(\d{6,})/', - thumbnail, 'video id', default=None) + thumbnail, 'video id', default=None) if not video_id: video_id = self._search_regex( From 7a979da8cb91abd7386cc2986d2ec2f4a2debb4c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 8 Sep 2016 16:42:47 +0100 Subject: [PATCH 84/92] [yahoo] Look for Brightcove Legacy Studio embeds(closes #9345) --- youtube_dl/extractor/yahoo.py | 10 +++++++++- 1 file changed, 9 
insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d7a81ab8c..91f0a0dbb 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -19,7 +19,10 @@ from ..utils import ( determine_ext, ) -from .brightcove import BrightcoveNewIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .nbc import NBCSportsVPlayerIE @@ -223,6 +226,11 @@ class YahooIE(InfoExtractor): if nbc_sports_url: return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + # Look for Brightcove Legacy Studio embeds + bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + # Look for Brightcove New Studio embeds bc_url = BrightcoveNewIE._extract_url(webpage) if bc_url: From 6cfcb8ac3634f1735093a791fa56b96bddabe14b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 22:44:34 +0700 Subject: [PATCH 85/92] [tvnoe] Do not capture unused groups in _VALID_URL --- youtube_dl/extractor/tvnoe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py index 1cd3e6a58..6d5c74826 100644 --- a/youtube_dl/extractor/tvnoe.py +++ b/youtube_dl/extractor/tvnoe.py @@ -10,7 +10,7 @@ from ..utils import ( class TVNoeIE(JWPlatformBaseIE): - _VALID_URL = r'https?://(www\.)?tvnoe\.cz/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P[0-9]+)' _TEST = { 'url': 'http://www.tvnoe.cz/video/10362', 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', From 37720844f6d40878bd2f29ea8311c7988ed3fc6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 22:52:39 +0700 Subject: [PATCH 86/92] [jwplatform] Extract height from label --- youtube_dl/extractor/jwplatform.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/jwplatform.py 
b/youtube_dl/extractor/jwplatform.py index ce3126943..7aaa65476 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -63,10 +63,17 @@ class JWPlatformBaseIE(InfoExtractor): 'ext': ext, }) else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. + height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), + 'height': height, 'ext': ext, } if source_url.startswith('rtmp'): From ad0e2b3359862eb6ac786d8fefd5e20035b051cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 23:15:58 +0700 Subject: [PATCH 87/92] [abcotvs] Add support for ABC Owned Television Stations --- youtube_dl/extractor/abcotvs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 53a900e50..054bb0596 100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -12,6 +12,7 @@ from ..utils import ( class ABCOTVSIE(InfoExtractor): IE_NAME = 'abcotvs' + IE_DESC = 'ABC Owned Television Stations' _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' _TESTS = [ { From 2abad67e52c034ff4a94bad00c31f9ba4aa496f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 23:32:16 +0700 Subject: [PATCH 88/92] [ChangeLog] Actualize --- ChangeLog | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index cec87d5cd..e74f64b9a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,11 +1,25 @@ version Extractors ++ [jwplatform] Extract height from format label ++ [yahoo] Extract Brightcove Legacy Studio embeds (#9345) +* [videomore] Fix extraction (#10592) * 
[foxgay] Fix extraction (#10480) -+ [miaopai] New extractor (#10556) ++ [rmcdecouverte] Add extractor for rmcdecouverte.bfmtv.com (#9709) * [gamestar] Fix metadata extraction (#10479) -+ [bilibili] Support episodes (#10190) -+ [tvnoe] New extractor (#10524) +* [puls4] Fix extraction (#10583) ++ [cctv] Add extractor for CCTV and CNTV (#8153) ++ [lci] Add extractor for lci.fr (#10573) ++ [wat] Extract DASH formats ++ [viafree] Improve video id detection (#10569) ++ [trutv] Add extractor for trutv.com (#10519) ++ [nick] Add support for nickelodeon.nl (#10559) ++ [abcotvs:clips] Add support for clips.abcotvs.com ++ [abcotvs] Add support for ABC Owned Television Stations sites (#9551) ++ [miaopai] Add extractor for miaopai.com (#10556) +* [gamestar] Fix metadata extraction (#10479) ++ [bilibili] Add support for episodes (#10190) ++ [tvnoe] Add extractor for tvnoe.cz (#10524) version 2016.09.04.1 From b71783719000af0806bbd6995e934de2520792fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 8 Sep 2016 23:46:14 +0700 Subject: [PATCH 89/92] release 2016.09.08 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 9 ++++++++- youtube_dl/version.py | 2 +- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c03092442..a983bf432 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.04.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.04.1** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.08*. 
If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.08** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.09.04.1 +[debug] youtube-dl version 2016.09.08 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index e74f64b9a..d84f447ba 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2016.09.08 Extractors + [jwplatform] Extract height from format label diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9e21016f7..e6be746a8 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -19,9 +19,10 @@ - **9now.com.au** - **abc.net.au** - **abc.net.au:iview** - - **Abc7News** - **abcnews** - **abcnews:video** + - **abcotvs**: ABC Owned Television Stations + - **abcotvs:clips** - **AcademicEarth:Course** - **acast** - **acast:channel** @@ -128,6 +129,7 @@ - **CBSNews**: CBS News - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** + - **CCTV** - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 @@ -352,6 +354,7 @@ - **kuwo:song**: 酷我音乐 - **la7.it** - **Laola1Tv** + - **LCI** - **Lcp** - **LcpPlay** - **Le**: 乐视网 @@ -390,6 +393,7 @@ - **Metacritic** - **Mgoon** - **MGTV**: 芒果TV + - **MiaoPai** - 
**Minhateca** - **MinistryGrid** - **Minoto** @@ -576,6 +580,7 @@ - **revision3:embed** - **RICE** - **RingTV** + - **RMCDecouverte** - **RockstarGames** - **RoosterTeeth** - **RottenTomatoes** @@ -721,6 +726,7 @@ - **TrailerAddict** (Currently broken) - **Trilulilu** - **trollvids** + - **TruTV** - **Tube8** - **TubiTv** - **tudou** @@ -742,6 +748,7 @@ - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** + - **TVNoe** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index b2ea6dac6..941ffb3f6 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.09.04.1' +__version__ = '2016.09.08' From 4614ad7b591577793321b8e761510cf08ceb558b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 8 Sep 2016 20:45:13 +0100 Subject: [PATCH 90/92] [parliamentliveuk] fix extraction(closes #9137) --- youtube_dl/extractor/parliamentliveuk.py | 57 +++++++++--------------- 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py index 0a423a08f..874aacc55 100644 --- a/youtube_dl/extractor/parliamentliveuk.py +++ b/youtube_dl/extractor/parliamentliveuk.py @@ -1,53 +1,40 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class ParliamentLiveUKIE(InfoExtractor): IE_NAME = 'parliamentlive.tv' IE_DESC = 'UK parliament videos' - _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TEST = { - 'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia', + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'info_dict': { - 'id': 
'15121', - 'ext': 'asf', - 'title': 'hoc home affairs committee, 18 mar 2014.pm', - 'description': 'md5:033b3acdf83304cd43946b2d5e5798d1', + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'uploader_id': 'FFMPEG-01', + 'timestamp': 1422696664, + 'upload_date': '20150131', }, - 'params': { - 'skip_download': True, # Requires mplayer (mms) - } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - asx_url = self._html_search_regex( - r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage, - 'metadata URL') - asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata') - video_url = asx.find('.//REF').attrib['HREF'] - - title = self._search_regex( - r'''(?x)player\.setClipDetails\( - (?:(?:[0-9]+|"[^"]+"),\s*){2} - "([^"]+",\s*"[^"]+)" - ''', - webpage, 'title').replace('", "', ', ') - description = self._html_search_regex( - r'(?s)(.*?)', - webpage, 'description') - + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) + widget_config = self._parse_json(self._search_regex( + r'kWidgetConfig\s*=\s*({.+});', + webpage, 'kaltura widget config'), video_id) + kaltura_url = 'kaltura:%s:%s' % (widget_config['wid'][1:], widget_config['entry_id']) + event_title = self._download_json( + 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] return { + '_type': 'url_transparent', 'id': video_id, - 'ext': 'asf', - 'url': video_url, - 'title': title, - 'description': description, + 'title': event_title, + 'description': '', + 'url': kaltura_url, + 'ie_key': 'Kaltura', } From 4d5726b0d7562aff975c91303b8ae8cfc4de8c51 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 8 Sep 2016 22:53:44 +0100 Subject: [PATCH 91/92] [telequebec] Add new extractor(closes #1999) --- youtube_dl/extractor/telequebec.py | 36 
++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 youtube_dl/extractor/telequebec.py diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py new file mode 100644 index 000000000..4043fcb92 --- /dev/null +++ b/youtube_dl/extractor/telequebec.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TeleQuebecIE(InfoExtractor): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P\d+)' + _TEST = { + 'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york', + 'md5': 'fe95a0957e5707b1b01f5013e725c90f', + 'info_dict': { + 'id': '20984', + 'ext': 'mp4', + 'title': 'Le couronnement de New York', + 'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', + 'upload_date': '20160220', + 'timestamp': 1455965438, + } + } + + def _real_extract(self, url): + media_id = self._match_id(url) + media_data = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media_id)['media'] + return { + '_type': 'url_transparent', + 'id': media_id, + 'url': 'limelight:media:' + media_data['streamInfo']['sourceId'], + 'title': media_data['title'], + 'description': media_data.get('descriptions', [{'text': None}])[0].get('text'), + 'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000), + 'ie_key': 'LimelightMedia', + } From cb9cbd84ed2471140ce58480b8ed373f8abb5816 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 8 Sep 2016 22:55:27 +0100 Subject: [PATCH 92/92] [extractors] add import for TeleQuebecIE --- youtube_dl/extractor/extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 8d9c2ae13..b7b630e9d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -861,6 +861,7 @@ from .telebruxelles import 
TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE +from .telequebec import TeleQuebecIE from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .testurl import TestURLIE