From 5a16c9d9d37389d163b0004f1c9332764a50ef83 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 25 May 2018 23:12:18 +0100 Subject: [PATCH 01/27] [utils] keep the original TV_PARENTAL_GUIDELINES dict --- youtube_dl/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d61af8837..7b4fd882f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2253,12 +2253,12 @@ US_RATINGS = { TV_PARENTAL_GUIDELINES = { - 'Y': 0, - 'Y7': 7, - 'G': 0, - 'PG': 0, - '14': 14, - 'MA': 17, + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, } @@ -2272,9 +2272,9 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - m = re.match(r'^TV[_-]?(%s)$' % '|'.join(TV_PARENTAL_GUIDELINES.keys()), s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) if m: - return TV_PARENTAL_GUIDELINES[m.group(1)] + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] return None From 38e4e8ab80b784f59b3a3ef6d313a70e13f17cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 12:58:34 +0700 Subject: [PATCH 02/27] [ChangeLog] Actualize [ci skip] --- ChangeLog | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/ChangeLog b/ChangeLog index 08233cd5b..9d0264bf7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,31 @@ +version + +Core +* [utils] Improve parse_age_limit + +Extractors +* [audiomack] Stringify video id (#15310) +* [izlesene] Fix extraction (#16233, #16271, #16407) ++ [indavideo] Add support for generic embeds (#11989) +* [indavideo] Fix extraction (#11221) +* [indavideo] Sign download URLs (#16174) ++ [peertube] Add support for PeerTube based sites (#16301, #16329) +* [imgur] Fix extraction (#16537) ++ [hidive] Add support for authentication (#16534) ++ [nbc] Add support for stream.nbcsports.com (#13911) ++ [viewlift] Add support for hoichoi.tv (#16536) +* [go90] Extract age limit and detect DRM protection(#10127) +* [viewlift] fix extraction for snagfilms.com (#15766) +* [globo] Improve extraction (#4189) + * Add support for authentication + * Simplify URL signing + * Extract DASH and MSS formats +* [leeco] Fix extraction (#16464) +* [teamcoco] Add fallback for format extraction (#16484) +* [teamcoco] Improve URL regular expression (#16484) +* [imdb] Improve extraction (#4085, #14557) + + version 2018.05.18 Extractors From 0934c9d4faadbfd2b076d13c7e24f4bf039cdc79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 13:02:21 +0700 Subject: [PATCH 03/27] release 2018.05.26 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- README.md | 13 ++++++------- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 7d9de5171..c4d4e534e 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.18 +[debug] youtube-dl version 2018.05.26 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 9d0264bf7..280390ea0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.26 Core * [utils] Improve parse_age_limit diff --git a/README.md b/README.md index 20982b0f1..499a0c206 100644 --- a/README.md +++ b/README.md @@ -93,8 +93,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. - To enable experimental SOCKS proxy, specify - a proper scheme. For example + To enable SOCKS proxy, specify a proper + scheme. For example socks5://127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection --socket-timeout SECONDS Time to wait before giving up, in seconds @@ -109,16 +109,15 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo option is not present) is used for the actual downloading. --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header (experimental) + X-Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header - (experimental) --geo-bypass-country CODE Force bypass geographic restriction with explicitly provided two-letter ISO 3166-2 - country code (experimental) + country code --geo-bypass-ip-block IP_BLOCK Force bypass geographic restriction with explicitly provided IP block in CIDR - notation (experimental) + notation ## Video Selection: --playlist-start NUMBER Playlist video to start at (default is 1) @@ -209,7 +208,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with - expected file size (experimental) + expected file size --hls-prefer-native Use the native HLS downloader instead of ffmpeg --hls-prefer-ffmpeg Use ffmpeg instead of the native HLS diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c1048cc4c..b60f2ff23 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -365,7 +365,6 @@ - **ImgurAlbum** - **Ina** - **Inc** - - **Indavideo** - **IndavideoEmbed** - **InfoQ** - **Instagram** @@ -526,6 +525,7 @@ - **nbcolympics** - **nbcolympics:stream** - **NBCSports** + - **NBCSportsStream** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk - **ndr:embed** @@ -625,6 +625,7 @@ - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** - **PearVideo** + - **PeerTube** - **People** - **PerformGroup** - **periscope**: Periscope diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a43eec860..2253da927 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.18' +__version__ = '2018.05.26' From c678192af3f004205b18a16b7418cbd937c1b584 Mon Sep 17 00:00:00 2001 From: Zack Fernandes Date: Sun, 31 Dec 2017 13:55:35 -0800 Subject: [PATCH 04/27] [tumblr] Add support for authentication --- youtube_dl/extractor/tumblr.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 786143525..58ac66755 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,11 +4,18 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + ExtractorError, + int_or_none, + sanitized_Request, + urlencode_postdata +) class TumblrIE(InfoExtractor): _VALID_URL = r'https?://(?P[^/?#&]+)\.tumblr\.com/(?:post|video)/(?P[0-9]+)(?:$|[/?#])' + _NETRC_MACHINE = 'tumblr' + _LOGIN_URL = 'https://www.tumblr.com/login' _TESTS = [{ 'url': 'http://tatianamaslanydaily.tumblr.com/post/54196191430/orphan-black-dvd-extra-behind-the-scenes', 'md5': '479bb068e5b16462f5176a6828829767', @@ -97,6 +104,31 @@ class TumblrIE(InfoExtractor): 'add_ie': ['Instagram'], }] + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + webpage = self._download_webpage(self._LOGIN_URL, None, False) + form = self._hidden_inputs(webpage) + form.update({ + 'user[email]': username, + 'user[password]': password + }) + login_response = self._download_webpage( + sanitized_Request(self._LOGIN_URL, urlencode_postdata(form), { + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL + }), None, False, 'Wrong login info') + + # Check the login response from Tumblr for an error message and fail the extraction if we find one. + login_errors = self._search_regex(r'Tumblr\.RegistrationForm\.errors\s*=\s*\[[\"|\'](.+)[\"|\']\]', login_response, 'login errors', False) + if login_errors: + raise ExtractorError('Error logging in: %s' % login_errors, expected=True) + def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) video_id = m_url.group('id') From 56cd31f32015cce131fb40a112d323da57fdda8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 19:53:32 +0700 Subject: [PATCH 05/27] [tumblr] Improve authentication (closes #15133) --- youtube_dl/extractor/tumblr.py | 39 ++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 58ac66755..758ccbb44 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata ) @@ -111,23 +110,37 @@ class TumblrIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: return - self.report_login() - webpage = self._download_webpage(self._LOGIN_URL, None, False) - form = self._hidden_inputs(webpage) - form.update({ + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + login_form.update({ 'user[email]': username, 'user[password]': password }) - login_response = self._download_webpage( - sanitized_Request(self._LOGIN_URL, urlencode_postdata(form), { - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': self._LOGIN_URL - }), None, False, 'Wrong login info') - # Check the login response from Tumblr for an error message and fail the extraction if we find one. - login_errors = self._search_regex(r'Tumblr\.RegistrationForm\.errors\s*=\s*\[[\"|\'](.+)[\"|\']\]', login_response, 'login errors', False) + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + + # Successful login + if '/dashboard' in urlh.geturl(): + return + + login_errors = self._parse_json( + self._search_regex( + r'RegistrationForm\.errors\s*=\s*(\[.+?\])\s*;', response, + 'login errors', default='[]'), + None, fatal=False) if login_errors: - raise ExtractorError('Error logging in: %s' % login_errors, expected=True) + raise ExtractorError( + 'Unable to login: %s' % login_errors[0], expected=True) + + self.report_warning('Login has probably failed') def _real_extract(self, url): m_url = re.match(self._VALID_URL, url) From 97b01144bd9771f224749ffca10156a1cd7e9c1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 20:00:00 +0700 Subject: [PATCH 06/27] [tumblr] Detect and report sensitive media (closes #13829) --- youtube_dl/extractor/tumblr.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 758ccbb44..89e6eb5ab 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, int_or_none, @@ -150,11 +151,19 @@ class TumblrIE(InfoExtractor): url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) webpage, urlh = self._download_webpage_handle(url, video_id) + redirect_url = compat_str(urlh.geturl()) + if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'): + raise ExtractorError( + 'This Tumblr may contain sensitive media. ' + 'Disable safe mode in your account settings ' + 'at https://www.tumblr.com/settings/account#safe_mode', + expected=True) + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url', default=None) if iframe_url is None: - return self.url_result(urlh.geturl(), 'Generic') + return self.url_result(redirect_url, 'Generic') iframe = self._download_webpage(iframe_url, video_id, 'Downloading iframe page') From 986c0b0215b127713825fa1523966ac66e03157b Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Sat, 26 May 2018 08:05:54 -0500 Subject: [PATCH 07/27] [cbc] Fix playlist title extraction (closes #16502) --- youtube_dl/extractor/cbc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 54b4b9be9..ce8e3d346 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -20,6 +20,7 @@ from ..utils import ( parse_duration, parse_iso8601, parse_age_limit, + strip_or_none, int_or_none, ExtractorError, ) @@ -129,6 +130,9 @@ class CBCIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', default=None) or self._html_search_regex( + r'([^<]+)', webpage, 'title', fatal=False) entries = [ self._extract_player_init(player_init, display_id) for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] @@ -136,8 +140,7 @@ class CBCIE(InfoExtractor): self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r']+src="[^"]+?mediaId=(\d+)"', webpage)]) return self.playlist_result( - entries, display_id, - self._og_search_title(webpage, fatal=False), + entries, display_id, strip_or_none(title), self._og_search_description(webpage)) From c0fd20abcad16bb2e377b6342a894a374c219763 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 14:34:13 +0100 Subject: [PATCH 08/27] [soundcloud] detect format extension(closes #16549) --- youtube_dl/extractor/soundcloud.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 46332e5c2..81c81c8d5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -181,7 +181,6 @@ class SoundcloudIE(InfoExtractor): thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url') if isinstance(thumbnail, compat_str): thumbnail = thumbnail.replace('-large', '-t500x500') - ext = 'mp3' result = { 'id': track_id, 'uploader': info.get('user', {}).get('username'), @@ -215,8 +214,11 @@ class SoundcloudIE(InfoExtractor): track_id, 'Downloading track url', query=query) for key, stream_url in format_dict.items(): - abr = int_or_none(self._search_regex( - r'_(\d+)_url', key, 'audio bitrate', default=None)) + ext, abr = 'mp3', None + mobj = re.search(r'_([^_]+)_(\d+)_url', key) + if mobj: + ext, abr = mobj.groups() + abr = int(abr) if key.startswith('http'): stream_formats = [{ 'format_id': key, @@ -234,13 +236,14 @@ class SoundcloudIE(InfoExtractor): }] elif key.startswith('hls'): stream_formats = self._extract_m3u8_formats( - stream_url, track_id, 'mp3', entry_protocol='m3u8_native', + stream_url, track_id, ext, entry_protocol='m3u8_native', m3u8_id=key, fatal=False) else: continue - for f in stream_formats: - f['abr'] = abr + if abr: + for f in stream_formats: + f['abr'] = abr formats.extend(stream_formats) @@ -250,7 +253,7 @@ class SoundcloudIE(InfoExtractor): formats.append({ 'format_id': 'fallback', 'url': update_url_query(info['stream_url'], query), - 'ext': ext, + 'ext': 'mp3', }) for f in formats: From 261f47306c594614edb8a5f0b8f5f3b8a87ce9c0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 14:35:47 +0100 Subject: [PATCH 09/27] [utils] fix style id extraction for namespaced id attribute(closes #16551) --- youtube_dl/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 7b4fd882f..63f24c0b6 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2667,6 +2667,7 @@ def dfxp2srt(dfxp_data): ] _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', 'ttml': 'http://www.w3.org/ns/ttml', 'tts': 'http://www.w3.org/ns/ttml#styling', }) @@ -2758,7 +2759,9 @@ def dfxp2srt(dfxp_data): repeat = False while True: for style in dfxp.findall(_x('.//ttml:style')): - style_id = style.get('id') + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue parent_style_id = style.get('style') if parent_style_id: if parent_style_id not in styles: From 2a49d01992e0b4b87d78da8f83af2f6e57fb8ba8 Mon Sep 17 00:00:00 2001 From: mars67857 Date: Sat, 14 Oct 2017 22:09:44 -0700 Subject: [PATCH 10/27] [cammodels] Add extractor --- youtube_dl/extractor/cammodels.py | 93 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + 2 files changed, 94 insertions(+) create mode 100644 youtube_dl/extractor/cammodels.py diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py new file mode 100644 index 000000000..1711d7096 --- /dev/null +++ b/youtube_dl/extractor/cammodels.py @@ -0,0 +1,93 @@ +from __future__ import unicode_literals +from .common import InfoExtractor +from .common import ExtractorError +import json +import re +from ..utils import int_or_none + + +class CamModelsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P\w+)' + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' + # Needed because server doesn't return links to video URLs if a browser-like User-Agent is not used + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, + video_id, + headers=self._HEADERS) + manifest_url_root = self._html_search_regex( + r'manifestUrlRoot=(?Phttps?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))', + webpage, + 'manifest', + None, + False) + if not manifest_url_root: + offline = self._html_search_regex( + r'(?PI\'m offline, but let\'s stay connected!)', + webpage, + 'offline indicator', + None, + False) + private = self._html_search_regex( + r'(?PI’m in a private show right now)', + webpage, + 'private show indicator', + None, + False) + err = 'This user is currently offline, so nothing can be downloaded.' if offline \ + else 'This user is doing a private show, which requires payment. This extractor currently does not support private streams.' if private \ + else 'Unable to find link to stream info on webpage. Room is not offline, so something else is wrong.' + raise ExtractorError( + err, + expected=True if offline or private else False, + video_id=video_id + ) + manifest_url = manifest_url_root + video_id + '.json' + manifest = self._download_json( + manifest_url, + video_id, + 'Downloading links to streams.', + 'Link to stream URLs was found, but we couldn\'t access it.', + headers=self._HEADERS) + try: + formats = [] + for fmtName in ['mp4-rtmp', 'mp4-hls']: + for encoding in manifest['formats'][fmtName]['encodings']: + formats.append({ + 'ext': 'mp4', + 'url': encoding['location'], + 'width': int_or_none(encoding.get('videoWidth')), + 'height': int_or_none(encoding.get('videoHeight')), + 'vbr': int_or_none(encoding.get('videoKbps')), + 'abr': int_or_none(encoding.get('audioKbps')), + 'format_id': fmtName + str(encoding.get('videoWidth')) + }) + # If they change the JSON format, then fallback to parsing out RTMP links via regex. + except KeyError: + manifest_json = json.dumps(manifest) + manifest_links = re.finditer( + r'(?Prtmp?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#&//=]*))', + manifest_json) + if not manifest_links: + raise ExtractorError( + 'Link to stream info was found, but we couldn\'t read the response. This is probably a bug.', + expected=False, + video_id=video_id) + formats = [] + for manifest_link in manifest_links: + url = manifest_link.group('id') + formats.append({ + 'ext': 'mp4', + 'url': url, + 'format_id': url.split(sep='/')[-1] + }) + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'formats': formats + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index c9b49a0cd..d54e8df9f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -145,6 +145,7 @@ from .camdemy import ( CamdemyIE, CamdemyFolderIE ) +from .cammodels import CamModelsIE from .camwithher import CamWithHerIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE From 8b1da46e8f6dd0de790a54a4809d224041262537 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 21:25:01 +0700 Subject: [PATCH 11/27] [cammodels] Improve and simplify (closes #14499) --- youtube_dl/extractor/cammodels.py | 159 +++++++++++++++--------------- 1 file changed, 80 insertions(+), 79 deletions(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 1711d7096..4f1b88d14 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -1,93 +1,94 @@ +# coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor -from .common import ExtractorError -import json -import re -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, +) class CamModelsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P\w+)' - _HEADERS = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36' - # Needed because server doesn't return links to video URLs if a browser-like User-Agent is not used - } + _VALID_URL = r'https?://(?:www\.)?cammodels\.com/cam/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.cammodels.com/cam/AutumnKnight/', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - url, - video_id, - headers=self._HEADERS) - manifest_url_root = self._html_search_regex( - r'manifestUrlRoot=(?Phttps?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))', - webpage, - 'manifest', - None, - False) - if not manifest_url_root: - offline = self._html_search_regex( - r'(?PI\'m offline, but let\'s stay connected!)', - webpage, - 'offline indicator', - None, - False) - private = self._html_search_regex( - r'(?PI’m in a private show right now)', - webpage, - 'private show indicator', - None, - False) - err = 'This user is currently offline, so nothing can be downloaded.' if offline \ - else 'This user is doing a private show, which requires payment. This extractor currently does not support private streams.' if private \ - else 'Unable to find link to stream info on webpage. Room is not offline, so something else is wrong.' - raise ExtractorError( - err, - expected=True if offline or private else False, - video_id=video_id + user_id = self._match_id(url) + + webpage = self._download_webpage(url, user_id) + + manifest_root = self._html_search_regex( + r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) + + if not manifest_root: + ERRORS = ( + ("I'm offline, but let's stay connected", 'This user is currently offline'), + ('in a private show', 'This user is in a private show'), ) - manifest_url = manifest_url_root + video_id + '.json' + for pattern, message in ERRORS: + if pattern in webpage: + error = message + expected = True + break + else: + error = 'Unable to find manifest URL root' + expected = False + raise ExtractorError(error, expected=expected) + manifest = self._download_json( - manifest_url, - video_id, - 'Downloading links to streams.', - 'Link to stream URLs was found, but we couldn\'t access it.', - headers=self._HEADERS) - try: - formats = [] - for fmtName in ['mp4-rtmp', 'mp4-hls']: - for encoding in manifest['formats'][fmtName]['encodings']: - formats.append({ + '%s%s.json' % (manifest_root, user_id), user_id) + + formats = [] + for format_id, format_dict in manifest['formats'].items(): + if not isinstance(format_dict, dict): + continue + encodings = format_dict.get('encodings') + if not isinstance(encodings, list): + continue + vcodec = format_dict.get('videoCodec') + acodec = format_dict.get('audioCodec') + for media in encodings: + if not isinstance(media, dict): + continue + media_url = media.get('location') + if not media_url or not isinstance(media_url, compat_str): + continue + + format_id_list = [format_id] + height = int_or_none(media.get('videoHeight')) + if height is not None: + format_id_list.append('%dp' % height) + f = { + 'url': media_url, + 'format_id': '-'.join(format_id_list), + 'width': int_or_none(media.get('videoWidth')), + 'height': height, + 'vbr': int_or_none(media.get('videoKbps')), + 'abr': int_or_none(media.get('audioKbps')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': vcodec, + 'acodec': acodec, + } + if 'rtmp' in format_id: + f['ext'] = 'flv' + elif 'hls' in format_id: + f.update({ 'ext': 'mp4', - 'url': encoding['location'], - 'width': int_or_none(encoding.get('videoWidth')), - 'height': int_or_none(encoding.get('videoHeight')), - 'vbr': int_or_none(encoding.get('videoKbps')), - 'abr': int_or_none(encoding.get('audioKbps')), - 'format_id': fmtName + str(encoding.get('videoWidth')) + # hls skips fragments, preferring rtmp + 'preference': -1, }) - # If they change the JSON format, then fallback to parsing out RTMP links via regex. - except KeyError: - manifest_json = json.dumps(manifest) - manifest_links = re.finditer( - r'(?Prtmp?:\/\/[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#&//=]*))', - manifest_json) - if not manifest_links: - raise ExtractorError( - 'Link to stream info was found, but we couldn\'t read the response. This is probably a bug.', - expected=False, - video_id=video_id) - formats = [] - for manifest_link in manifest_links: - url = manifest_link.group('id') - formats.append({ - 'ext': 'mp4', - 'url': url, - 'format_id': url.split(sep='/')[-1] - }) + else: + continue + formats.append(f) self._sort_formats(formats) + return { - 'id': video_id, - 'title': self._live_title(video_id), - 'formats': formats + 'id': user_id, + 'title': self._live_title(user_id), + 'is_live': True, + 'formats': formats, } From ec2f3d2800185920629a7e6946701edebbf14dd6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 15:34:36 +0100 Subject: [PATCH 12/27] [ufctv] add support for authentication(closes #16542) --- youtube_dl/extractor/ufctv.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py index ab823814b..f3eaee6b3 100644 --- a/youtube_dl/extractor/ufctv.py +++ b/youtube_dl/extractor/ufctv.py @@ -3,13 +3,16 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + ExtractorError, parse_duration, parse_iso8601, + urlencode_postdata, ) class UFCTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P[^/]+)' + _NETRC_MACHINE = 'ufctv' _TEST = { 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode', 'info_dict': { @@ -26,6 +29,21 @@ class UFCTVIE(InfoExtractor): } } + def _real_initialize(self): + username, password = self._get_login_info() + if username is None: + return + + code = self._download_json( + 'https://www.ufc.tv/secure/authenticate', + None, 'Logging in', data=urlencode_postdata({ + 'username': username, + 'password': password, + 'format': 'json', + })).get('code') + if code and code != 'loginsuccess': + raise ExtractorError(code, expected=True) + def _real_extract(self, url): display_id = self._match_id(url) video_data = self._download_json(url, display_id, query={ From 68217024e83c8e7965f2800e9ff7a9575f049b5c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 26 May 2018 16:12:44 +0100 Subject: [PATCH 13/27] remove unnecessary assignment parenthesis --- youtube_dl/extractor/animeondemand.py | 2 +- youtube_dl/extractor/atresplayer.py | 2 +- youtube_dl/extractor/bambuser.py | 2 +- youtube_dl/extractor/crunchyroll.py | 2 +- youtube_dl/extractor/curiositystream.py | 2 +- youtube_dl/extractor/dramafever.py | 2 +- youtube_dl/extractor/facebook.py | 2 +- youtube_dl/extractor/fc2.py | 2 +- youtube_dl/extractor/funimation.py | 2 +- youtube_dl/extractor/gdcvault.py | 2 +- youtube_dl/extractor/globo.py | 5 ----- youtube_dl/extractor/hidive.py | 7 +------ youtube_dl/extractor/hrti.py | 2 +- youtube_dl/extractor/iqiyi.py | 2 +- youtube_dl/extractor/niconico.py | 2 +- youtube_dl/extractor/noco.py | 2 +- youtube_dl/extractor/packtpub.py | 2 +- youtube_dl/extractor/patreon.py | 2 +- youtube_dl/extractor/pluralsight.py | 2 +- youtube_dl/extractor/roosterteeth.py | 2 +- youtube_dl/extractor/safari.py | 2 +- youtube_dl/extractor/sina.py | 2 +- youtube_dl/extractor/tennistv.py | 2 +- youtube_dl/extractor/tubitv.py | 2 +- youtube_dl/extractor/tumblr.py | 2 +- youtube_dl/extractor/twitch.py | 2 +- youtube_dl/extractor/udemy.py | 2 +- youtube_dl/extractor/vessel.py | 2 +- youtube_dl/extractor/viki.py | 2 +- youtube_dl/extractor/vimeo.py | 2 +- youtube_dl/extractor/vk.py | 2 +- youtube_dl/extractor/youtube.py | 2 +- youtube_dl/extractor/zattoo.py | 2 +- 33 files changed, 32 insertions(+), 42 deletions(-) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index e4fa72f46..1fe5d5e56 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -52,7 +52,7 @@ class AnimeOnDemandIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 1a31ebe08..ae1c09427 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -74,7 +74,7 @@ class AtresPlayerIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py index 633c57553..34f1b3d83 100644 --- a/youtube_dl/extractor/bambuser.py +++ b/youtube_dl/extractor/bambuser.py @@ -44,7 +44,7 @@ class BambuserIE(InfoExtractor): } def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 3efdc8c21..311da515d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -49,7 +49,7 @@ class CrunchyrollBaseIE(InfoExtractor): }) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index 8e45923e3..35b1e7a34 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -35,7 +35,7 @@ class CuriosityStreamBaseIE(InfoExtractor): return result['data'] def _real_initialize(self): - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return result = self._download_json( diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index ffbd2623d..ab32ba4ff 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -42,7 +42,7 @@ class DramaFeverBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 220ada3a6..0971ce356 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -226,7 +226,7 @@ class FacebookIE(InfoExtractor): return urls def _login(self): - (useremail, password) = self._get_login_info() + useremail, password = self._get_login_info() if useremail is None: return diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 448647d72..435561147 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -46,7 +46,7 @@ class FC2IE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: return False diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index 107f658ba..07d01caec 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -51,7 +51,7 @@ class FunimationIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index f71d9092e..8806dc48a 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -91,7 +91,7 @@ class GDCVaultIE(InfoExtractor): ] def _login(self, webpage_url, display_id): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None or password is None: self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.') return None diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index 81d6d36d3..c2140c362 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -23,7 +23,6 @@ from ..utils import ( class GloboIE(InfoExtractor): _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P\d{7,})' - _LOGGED_IN = False _NETRC_MACHINE = 'globo' _TESTS = [{ 'url': 'http://g1.globo.com/carros/autoesporte/videos/t/exclusivos-do-g1/v/mercedes-benz-gla-passa-por-teste-de-colisao-na-europa/3607726/', @@ -68,9 +67,6 @@ class GloboIE(InfoExtractor): }] def _real_initialize(self): - if self._LOGGED_IN: - return - email, password = self._get_login_info() if email is None: return @@ -91,7 +87,6 @@ class GloboIE(InfoExtractor): resp = self._parse_json(e.cause.read(), None) raise ExtractorError(resp.get('userMessage') or resp['id'], expected=True) raise - self._LOGGED_IN = True def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py index d8f2e682f..39fabe8a5 100644 --- a/youtube_dl/extractor/hidive.py +++ b/youtube_dl/extractor/hidive.py @@ -18,7 +18,6 @@ class HiDiveIE(InfoExtractor): # so disabling geo bypass completely _GEO_BYPASS = False _NETRC_MACHINE = 'hidive' - _LOGGED_IN = False _LOGIN_URL = 'https://www.hidive.com/account/login' _TESTS = [{ @@ -38,10 +37,7 @@ class HiDiveIE(InfoExtractor): }] def _real_initialize(self): - if self._LOGGED_IN: - return - - (email, password) = self._get_login_info() + email, password = self._get_login_info() if email is None: return @@ -56,7 +52,6 @@ class HiDiveIE(InfoExtractor): }) self._download_webpage( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(data)) - self._LOGGED_IN = True def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py index 6424d34ac..9ba1aa703 100644 --- a/youtube_dl/extractor/hrti.py +++ b/youtube_dl/extractor/hrti.py @@ -66,7 +66,7 @@ class HRTiBaseIE(InfoExtractor): self._logout_url = modules['user']['resources']['logout']['uri'] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # TODO: figure out authentication with cookies if username is None or password is None: self.raise_login_required() diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index fdfa7de9e..4b081bd46 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -239,7 +239,7 @@ class IqiyiIE(InfoExtractor): return ohdave_rsa_encrypt(data, e, N) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index df7f528be..dbe871f16 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -163,7 +163,7 @@ class NiconicoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if not username: return True diff --git a/youtube_dl/extractor/noco.py b/youtube_dl/extractor/noco.py index a9f9b10c4..58b371ed7 100644 --- a/youtube_dl/extractor/noco.py +++ b/youtube_dl/extractor/noco.py @@ -65,7 +65,7 @@ class NocoIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py index 8ed3c6347..56a2a1083 100644 --- a/youtube_dl/extractor/packtpub.py +++ b/youtube_dl/extractor/packtpub.py @@ -42,7 +42,7 @@ class PacktPubIE(PacktPubBaseIE): _TOKEN = None def _real_initialize(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return try: diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py index d4b1d34ca..9eb027679 100644 --- a/youtube_dl/extractor/patreon.py +++ b/youtube_dl/extractor/patreon.py @@ -53,7 +53,7 @@ class PatreonIE(InfoExtractor): # needed. Keeping this commented for when this inevitably changes. ''' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index 3c508c9ca..a207ca9cb 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -94,7 +94,7 @@ class PluralsightIE(PluralsightBaseIE): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py index 8b703800e..857434540 100644 --- a/youtube_dl/extractor/roosterteeth.py +++ b/youtube_dl/extractor/roosterteeth.py @@ -50,7 +50,7 @@ class RoosterTeethIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index cc6698f88..8a5d48fc2 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -27,7 +27,7 @@ class SafariBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py index 8fc66732a..07b766b4a 100644 --- a/youtube_dl/extractor/sina.py +++ b/youtube_dl/extractor/sina.py @@ -64,7 +64,7 @@ class SinaIE(InfoExtractor): # The video id is in the redirected url self.to_screen('Getting video id') request = HEADRequest(url) - (_, urlh) = self._download_webpage_handle(request, 'NA', False) + _, urlh = self._download_webpage_handle(request, 'NA', False) return self._real_extract(urlh.geturl()) else: pseudo_id = mobj.group('pseudo_id') diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py index 0c6f70784..a586f30ad 100644 --- a/youtube_dl/extractor/tennistv.py +++ b/youtube_dl/extractor/tennistv.py @@ -32,7 +32,7 @@ class TennisTVIE(InfoExtractor): _NETRC_MACHINE = 'tennistv' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/tubitv.py b/youtube_dl/extractor/tubitv.py index 36f6c1673..a51fa6515 100644 --- a/youtube_dl/extractor/tubitv.py +++ b/youtube_dl/extractor/tubitv.py @@ -36,7 +36,7 @@ class TubiTvIE(InfoExtractor): }] def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 89e6eb5ab..edbb0aa69 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -108,7 +108,7 @@ class TumblrIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 3ee2af52e..e01f11331 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -61,7 +61,7 @@ class TwitchBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 0a74a9768..a7196997e 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -151,7 +151,7 @@ class UdemyIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py index 80a643dfe..31eee0ba7 100644 --- a/youtube_dl/extractor/vessel.py +++ b/youtube_dl/extractor/vessel.py @@ -75,7 +75,7 @@ class VesselIE(InfoExtractor): 'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True) def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return self.report_login() diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ad2a2a4b7..546de95d8 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -88,7 +88,7 @@ class VikiBaseIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 8dfd8891c..3baa2d075 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -37,7 +37,7 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_URL = 'https://vimeo.com/log_in' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b50d4f170..29002b35f 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -32,7 +32,7 @@ class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index e4eec7c30..379559825 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -85,7 +85,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised. """ - (username, password) = self._get_login_info() + username, password = self._get_login_info() # No authentication to be performed if username is None: if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None: diff --git a/youtube_dl/extractor/zattoo.py b/youtube_dl/extractor/zattoo.py index 773073d85..b5a3a0716 100644 --- a/youtube_dl/extractor/zattoo.py +++ b/youtube_dl/extractor/zattoo.py @@ -24,7 +24,7 @@ class ZattooBaseIE(InfoExtractor): _power_guide_hash = None def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if not username or not password: self.raise_login_required( 'A valid %s account is needed to access this media.' From ddd8486a448ee94134a62f2488e5e39bbd72880e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:10:08 +0700 Subject: [PATCH 14/27] [downloader/rtmp] Gracefully handle live streams interrupted by user --- youtube_dl/downloader/rtmp.py | 119 +++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index b823b5171..63e2b5c89 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -24,71 +24,78 @@ class RtmpFD(FileDownloader): def real_download(self, filename, info_dict): def run_rtmpdump(args): start = time.time() - resume_percent = None - resume_downloaded_data_len = None proc = subprocess.Popen(args, stderr=subprocess.PIPE) cursor_in_new_line = True - proc_stderr_closed = False - while not proc_stderr_closed: - # read line from stderr - line = '' - while True: - char = proc.stderr.read(1) - if not char: - proc_stderr_closed = True - break - if char in [b'\r', b'\n']: - break - line += char.decode('ascii', 'replace') - if not line: - # proc_stderr_closed is True - continue - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) - if mobj: - downloaded_data_len = int(float(mobj.group(1)) * 1024) - percent = float(mobj.group(2)) - if not resume_percent: - resume_percent = percent - resume_downloaded_data_len = downloaded_data_len - time_now = time.time() - eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) - speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) - data_len = None - if percent > 0: - data_len = int(downloaded_data_len * 100 / percent) - self._hook_progress({ - 'status': 'downloading', - 'downloaded_bytes': downloaded_data_len, - 'total_bytes_estimate': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'elapsed': time_now - start, - 'speed': speed, - }) - cursor_in_new_line = False - else: - # no percent for live streams - mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + + def dl(): + resume_percent = None + resume_downloaded_data_len = None + proc_stderr_closed = False + while not proc_stderr_closed: + # read line from stderr + line = '' + while True: + char = proc.stderr.read(1) + if not char: + proc_stderr_closed = True + break + if char in [b'\r', b'\n']: + break + line += char.decode('ascii', 'replace') + if not line: + # proc_stderr_closed is True + continue + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line) if mobj: downloaded_data_len = int(float(mobj.group(1)) * 1024) + percent = float(mobj.group(2)) + if not resume_percent: + resume_percent = percent + resume_downloaded_data_len = downloaded_data_len time_now = time.time() - speed = self.calc_speed(start, time_now, downloaded_data_len) + eta = self.calc_eta(start, time_now, 100 - resume_percent, percent - resume_percent) + speed = self.calc_speed(start, time_now, downloaded_data_len - resume_downloaded_data_len) + data_len = None + if percent > 0: + data_len = int(downloaded_data_len * 100 / percent) self._hook_progress({ + 'status': 'downloading', 'downloaded_bytes': downloaded_data_len, + 'total_bytes_estimate': data_len, 'tmpfilename': tmpfilename, 'filename': filename, - 'status': 'downloading', + 'eta': eta, 'elapsed': time_now - start, 'speed': speed, }) cursor_in_new_line = False - elif self.params.get('verbose', False): - if not cursor_in_new_line: - self.to_screen('') - cursor_in_new_line = True - self.to_screen('[rtmpdump] ' + line) - proc.wait() + else: + # no percent for live streams + mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec', line) + if mobj: + downloaded_data_len = int(float(mobj.group(1)) * 1024) + time_now = time.time() + speed = self.calc_speed(start, time_now, downloaded_data_len) + self._hook_progress({ + 'downloaded_bytes': downloaded_data_len, + 'tmpfilename': tmpfilename, + 'filename': filename, + 'status': 'downloading', + 'elapsed': time_now - start, + 'speed': speed, + }) + cursor_in_new_line = False + elif self.params.get('verbose', False): + if not cursor_in_new_line: + self.to_screen('') + cursor_in_new_line = True + self.to_screen('[rtmpdump] ' + line) + + try: + dl() + finally: + proc.wait() + if not cursor_in_new_line: self.to_screen('') return proc.returncode @@ -163,7 +170,13 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 - retval = run_rtmpdump(args) + try: + retval = run_rtmpdump(args) + except KeyboardInterrupt: + if not info_dict.get('is_live'): + raise + retval = RD_SUCCESS + self.to_screen('\n[rtmpdump] Interrupted by user') if retval == RD_NO_CONNECT: self.report_error('[rtmpdump] Could not connect to RTMP server.') From f16f48779cbad4a6d39a908e131a8d55941d1671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:14:09 +0700 Subject: [PATCH 15/27] [downloader/rtmp] Generalize download messages and report time elapsed on finish --- youtube_dl/downloader/rtmp.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index 63e2b5c89..9e0ddbb18 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -170,6 +170,8 @@ class RtmpFD(FileDownloader): RD_INCOMPLETE = 2 RD_NO_CONNECT = 3 + started = time.time() + try: retval = run_rtmpdump(args) except KeyboardInterrupt: @@ -184,7 +186,7 @@ class RtmpFD(FileDownloader): while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % prevsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed args = basic_args + ['--resume'] if retval == RD_FAILED: @@ -201,13 +203,14 @@ class RtmpFD(FileDownloader): break if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('[rtmpdump] %s bytes' % fsize) + self.to_screen('[rtmpdump] Downloaded %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, 'total_bytes': fsize, 'filename': filename, 'status': 'finished', + 'elapsed': time.time() - started, }) return True else: From 2ce35d9f43328e82108bae6661c2ac0ba2a0498c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:21:55 +0700 Subject: [PATCH 16/27] [cammodels] Add another error pattern --- youtube_dl/extractor/cammodels.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 4f1b88d14..17f7ac043 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -28,6 +28,7 @@ class CamModelsIE(InfoExtractor): ERRORS = ( ("I'm offline, but let's stay connected", 'This user is currently offline'), ('in a private show', 'This user is in a private show'), + ('is currently performing LIVE', 'This model is currently performing live'), ) for pattern, message in ERRORS: if pattern in webpage: From 8882840ec5d9536772d7de75b7fb6389103a3a1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 26 May 2018 22:22:28 +0700 Subject: [PATCH 17/27] [cammodels] Use geo verification headers --- youtube_dl/extractor/cammodels.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/cammodels.py b/youtube_dl/extractor/cammodels.py index 17f7ac043..ee0165dba 100644 --- a/youtube_dl/extractor/cammodels.py +++ b/youtube_dl/extractor/cammodels.py @@ -19,7 +19,8 @@ class CamModelsIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id) + webpage = self._download_webpage( + url, user_id, headers=self.geo_verification_headers()) manifest_root = self._html_search_regex( r'manifestUrlRoot=([^&\']+)', webpage, 'manifest', default=None) From c9e12a618c9420c2bb21c09bf47b9469785f492e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 May 2018 12:10:12 +0100 Subject: [PATCH 18/27] [9c9media] extract mpd formats and subtitles --- youtube_dl/extractor/ctvnews.py | 4 +- youtube_dl/extractor/extractors.py | 5 +- youtube_dl/extractor/ninecninemedia.py | 93 ++++++++++---------------- youtube_dl/extractor/rds.py | 2 +- 4 files changed, 41 insertions(+), 63 deletions(-) diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py index 55a127b76..03f8cefb7 100644 --- a/youtube_dl/extractor/ctvnews.py +++ b/youtube_dl/extractor/ctvnews.py @@ -11,10 +11,10 @@ class CTVNewsIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P[0-9.]+)' _TESTS = [{ 'url': 'http://www.ctvnews.ca/video?clipId=901995', - 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'md5': '9b8624ba66351a23e0b6e1391971f9af', 'info_dict': { 'id': '901995', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Extended: \'That person cannot be me\' Johnson says', 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', 'timestamp': 1467286284, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d54e8df9f..2f485012f 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -718,10 +718,7 @@ from .nick import ( NickRuIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE -from .ninecninemedia import ( - NineCNineMediaStackIE, - NineCNineMediaIE, -) +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py index 8961309fd..875665d43 100644 --- a/youtube_dl/extractor/ninecninemedia.py +++ b/youtube_dl/extractor/ninecninemedia.py @@ -13,38 +13,11 @@ from ..utils import ( ) -class NineCNineMediaBaseIE(InfoExtractor): - _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' - - -class NineCNineMediaStackIE(NineCNineMediaBaseIE): - IE_NAME = '9c9media:stack' - _GEO_COUNTRIES = ['CA'] - _VALID_URL = r'9c9media:stack:(?P[^:]+):(?P\d+):(?P\d+):(?P\d+)' - - def _real_extract(self, url): - destination_code, content_id, package_id, stack_id = re.match(self._VALID_URL, url).groups() - stack_base_url_template = self._API_BASE_TEMPLATE + 'contentpackages/%s/stacks/%s/manifest.' - stack_base_url = stack_base_url_template % (destination_code, content_id, package_id, stack_id) - - formats = [] - formats.extend(self._extract_m3u8_formats( - stack_base_url + 'm3u8', stack_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - stack_base_url + 'f4m', stack_id, - f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - return { - 'id': stack_id, - 'formats': formats, - } - - -class NineCNineMediaIE(NineCNineMediaBaseIE): +class NineCNineMediaIE(InfoExtractor): IE_NAME = '9c9media' + _GEO_COUNTRIES = ['CA'] _VALID_URL = r'9c9media:(?P[^:]+):(?P\d+)' + _API_BASE_TEMPLATE = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' def _real_extract(self, url): destination_code, content_id = re.match(self._VALID_URL, url).groups() @@ -58,13 +31,26 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): content_package = content['ContentPackages'][0] package_id = content_package['Id'] content_package_url = api_base_url + 'contentpackages/%s/' % package_id - content_package = self._download_json(content_package_url, content_id) + content_package = self._download_json( + content_package_url, content_id, query={ + '$include': '[HasClosedCaptions]', + }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type') == 'adobe-drm': + if content_package.get('Constraints', {}).get('Security', {}).get('Type'): raise ExtractorError('This video is DRM protected.', expected=True) - stacks = self._download_json(content_package_url + 'stacks/', package_id)['Items'] - multistacks = len(stacks) > 1 + manifest_base_url = content_package_url + 'manifest.' + formats = [] + formats.extend(self._extract_m3u8_formats( + manifest_base_url + 'm3u8', content_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + manifest_base_url + 'f4m', content_id, + f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + manifest_base_url + 'mpd', content_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) thumbnails = [] for image in content.get('Images', []): @@ -85,10 +71,12 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): continue container.append(e_name) - description = content.get('Desc') or content.get('ShortDesc') season = content.get('Season', {}) - base_info = { - 'description': description, + + info = { + 'id': content_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), @@ -97,26 +85,19 @@ class NineCNineMediaIE(NineCNineMediaBaseIE): 'series': content.get('Media', {}).get('Name'), 'tags': tags, 'categories': categories, + 'duration': float_or_none(content_package.get('Duration')), + 'formats': formats, } - entries = [] - for stack in stacks: - stack_id = compat_str(stack['Id']) - entry = { - '_type': 'url_transparent', - 'url': '9c9media:stack:%s:%s:%s:%s' % (destination_code, content_id, package_id, stack_id), - 'id': stack_id, - 'title': '%s_part%s' % (title, stack['Name']) if multistacks else title, - 'duration': float_or_none(stack.get('Duration')), - 'ie_key': 'NineCNineMediaStack', + if content_package.get('HasClosedCaptions'): + info['subtitles'] = { + 'en': [{ + 'url': manifest_base_url + 'vtt', + 'ext': 'vtt', + }, { + 'url': manifest_base_url + 'srt', + 'ext': 'srt', + }] } - entry.update(base_info) - entries.append(entry) - return { - '_type': 'multi_video', - 'id': content_id, - 'title': title, - 'description': description, - 'entries': entries, - } + return info diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index bf200ea4d..8c016a77d 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -19,7 +19,7 @@ class RDSIE(InfoExtractor): 'info_dict': { 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Fowler Jr. prend la direction de Jacksonville', 'description': 'Dante Fowler Jr. est le troisième choix du repêchage 2015 de la NFL. ', 'timestamp': 1430397346, From 9c65c4a6cd981e081f4a99d11206e984999f51ff Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 27 May 2018 12:11:53 +0100 Subject: [PATCH 19/27] [bellmedia] add support for bnnbloomberg.ca(#16560) --- youtube_dl/extractor/bellmedia.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py index 8820a3914..f36a2452d 100644 --- a/youtube_dl/extractor/bellmedia.py +++ b/youtube_dl/extractor/bellmedia.py @@ -12,7 +12,7 @@ class BellMediaIE(InfoExtractor): (?: ctv| tsn| - bnn| + bnn(?:bloomberg)?| thecomedynetwork| discovery| discoveryvelocity| @@ -27,17 +27,16 @@ class BellMediaIE(InfoExtractor): much\.com )/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6,})''' _TESTS = [{ - 'url': 'http://www.ctv.ca/video/player?vid=706966', - 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', + 'md5': '36d3ef559cfe8af8efe15922cd3ce950', 'info_dict': { - 'id': '706966', - 'ext': 'mp4', - 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', - 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', - 'upload_date': '20150919', - 'timestamp': 1442624700, + 'id': '1403070', + 'ext': 'flv', + 'title': 'David Cockfield\'s Top Picks', + 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', + 'upload_date': '20180525', + 'timestamp': 1527288600, }, - 'expected_warnings': ['HTTP Error 404'], }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', 'only_matching': True, @@ -70,6 +69,7 @@ class BellMediaIE(InfoExtractor): 'investigationdiscovery': 'invdisc', 'animalplanet': 'aniplan', 'etalk': 'ctv', + 'bnnbloomberg': 'bnn', } def _real_extract(self, url): From cfd7f2a6365e4d4ed9036b7fd873747be5e91d44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 27 May 2018 18:24:37 +0700 Subject: [PATCH 20/27] [apa] Add extractor (closes #15041, closes #15672) --- youtube_dl/extractor/apa.py | 94 ++++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/generic.py | 23 ++++++++ 3 files changed, 118 insertions(+) create mode 100644 youtube_dl/extractor/apa.py diff --git a/youtube_dl/extractor/apa.py b/youtube_dl/extractor/apa.py new file mode 100644 index 000000000..a30a935aa --- /dev/null +++ b/youtube_dl/extractor/apa.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + js_to_json, +) + + +class APAIE(InfoExtractor): + _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TESTS = [{ + 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', + 'md5': '2b12292faeb0a7d930c778c7a5b4759b', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + }, { + 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', + 'only_matching': True, + }, { + 'url': 'http://uvp-rma.sf.apa.at/embed/70404cca-2f47-4855-bbb8-20b1fae58f76', + 'only_matching': True, + }, { + 'url': 'http://uvp-kleinezeitung.sf.apa.at/embed/f1c44979-dba2-4ebf-b021-e4cf2cac3c81', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r']+\bsrc=(["\'])(?P(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + jwplatform_id = self._search_regex( + r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, + 'jwplatform id', default=None) + + if jwplatform_id: + return self.url_result( + 'jwplatform:' + jwplatform_id, ie='JWPlatform', + video_id=video_id) + + sources = self._parse_json( + self._search_regex( + r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), + video_id, transform_source=js_to_json) + + formats = [] + for source in sources: + if not isinstance(source, dict): + continue + source_url = source.get('file') + if not source_url or not isinstance(source_url, compat_str): + continue + ext = determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': source_url, + }) + self._sort_formats(formats) + + thumbnail = self._search_regex( + r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + + return { + 'id': video_id, + 'title': video_id, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2f485012f..5f829c72c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -44,6 +44,7 @@ from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE from .aliexpress import AliExpressLiveIE +from .apa import APAIE from .aparat import AparatIE from .appleconnect import AppleConnectIE from .appletrailers import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 0292e0458..dad951b75 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -110,6 +110,7 @@ from .xfileshare import XFileShareIE from .cloudflarestream import CloudflareStreamIE from .peertube import PeerTubeIE from .indavideo import IndavideoEmbedIE +from .apa import APAIE class GenericIE(InfoExtractor): @@ -2041,6 +2042,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # APA embed via JWPlatform embed + 'url': 'http://www.vol.at/blue-man-group/5593454', + 'info_dict': { + 'id': 'jjv85FdZ', + 'ext': 'mp4', + 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 254, + 'timestamp': 1519211149, + 'upload_date': '20180221', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'http://share-videos.se/auto/video/83645793?uid=13', 'md5': 'b68d276de422ab07ee1d49388103f457', @@ -3068,6 +3086,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) + apa_urls = APAIE._extract_urls(webpage) + if apa_urls: + return self.playlist_from_matches( + apa_urls, video_id, video_title, ie=APAIE.ie_key()) + sharevideos_urls = [mobj.group('url') for mobj in re.finditer( r']+?\bsrc\s*=\s*(["\'])(?P(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', webpage)] From a07879d6b2edc474b0595a29932726fa7aa14b3a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Mon, 28 May 2018 00:10:01 +0100 Subject: [PATCH 21/27] [spiegel] fix info extraction(#16538) --- youtube_dl/extractor/spiegel.py | 78 +++++++++++---------------------- 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index fc995e8c1..4df7f4ddc 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -11,9 +11,9 @@ from .nexx import ( from .spiegeltv import SpiegeltvIE from ..compat import compat_urlparse from ..utils import ( - extract_attributes, - unified_strdate, - get_element_by_attribute, + parse_duration, + strip_or_none, + unified_timestamp, ) @@ -21,35 +21,38 @@ class SpiegelIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': '2c2754212136f35fb4b19767d242f66e', + 'md5': 'b57399839d055fccfeb9a0455c439868', 'info_dict': { - 'id': '1259285', + 'id': '563747', 'ext': 'mp4', 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, 'upload_date': '20130311', + 'timestamp': 1362994320, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': 'f2cdf638d7aa47654e251e1aee360af1', + 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', 'info_dict': { - 'id': '1309159', + 'id': '580988', 'ext': 'mp4', 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, 'upload_date': '20131115', + 'timestamp': 1384546642, }, }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': 'd8eeca6bfc8f1cd6f490eb1f44695d51', + 'md5': '97b91083a672d72976faa8433430afb9', 'info_dict': { - 'id': '1519126', + 'id': '601883', 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', 'upload_date': '20140904', + 'timestamp': 1409834160, } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -62,59 +65,28 @@ class SpiegelIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage, handle = self._download_webpage_handle(url, video_id) + metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id + handle = self._request_webpage(metadata_url, video_id) # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - nexx_id = self._search_regex( - r'nexxOmniaId\s*:\s*(\d+)', webpage, 'nexx id', default=None) - if nexx_id: - domain_id = NexxIE._extract_domain_id(webpage) or '748' - return self.url_result( - 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), - video_id=nexx_id) - - video_data = extract_attributes(self._search_regex(r'(]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) - - title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) - description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') - - base_url = self._search_regex( - [r'server\s*:\s*(["\'])(?P.+?)\1', r'var\s+server\s*=\s*"(?P[^"]+)\"'], - webpage, 'server URL', group='url') - - xml_url = base_url + video_id + '.xml' - idoc = self._download_xml(xml_url, video_id) - - formats = [] - for n in list(idoc): - if n.tag.startswith('type') and n.tag != 'type6': - format_id = n.tag.rpartition('type')[2] - video_url = base_url + n.find('./filename').text - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'width': int(n.find('./width').text), - 'height': int(n.find('./height').text), - 'abr': int(n.find('./audiobitrate').text), - 'vbr': int(n.find('./videobitrate').text), - 'vcodec': n.find('./codec').text, - 'acodec': 'MP4A', - }) - duration = float(idoc[0].findall('./duration')[0].text) - - self._check_formats(formats, video_id) - self._sort_formats(formats) + video_data = self._parse_json(self._webpage_read_content( + handle, metadata_url, video_id), video_id) + title = video_data['title'] + nexx_id = video_data['nexxOmniaId'] + domain_id = video_data.get('nexxOmniaDomain') or '748' return { + '_type': 'url_transparent', 'id': video_id, + 'url': 'nexx:%s:%s' % (domain_id, nexx_id), 'title': title, - 'description': description.strip() if description else None, - 'duration': duration, - 'upload_date': unified_strdate(video_data.get('data-video-date')), - 'formats': formats, + 'description': strip_or_none(video_data.get('teaser')), + 'duration': parse_duration(video_data.get('duration')), + 'timestamp': unified_timestamp(video_data.get('datum')), + 'ie_key': NexxIE.ie_key(), } From e0d42dd4b270d06a953822c091afefd946bd93f2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 May 2018 13:21:07 +0100 Subject: [PATCH 22/27] [teamcoco] Fix extraction for full episodes(closes #16573) --- youtube_dl/extractor/tbs.py | 61 ++++++------------ youtube_dl/extractor/teamcoco.py | 102 ++++++++++++++++++------------- youtube_dl/extractor/turner.py | 47 +++++++++++++- 3 files changed, 122 insertions(+), 88 deletions(-) diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index edc31729d..784f8ed66 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -4,6 +4,10 @@ from __future__ import unicode_literals import re from .turner import TurnerBaseIE +from ..compat import ( + compat_urllib_parse_urlparse, + compat_parse_qs, +) from ..utils import ( float_or_none, int_or_none, @@ -38,48 +42,22 @@ class TBSIE(TurnerBaseIE): def _real_extract(self, url): site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_data = self._parse_json(self._search_regex( + drupal_settings = self._parse_json(self._search_regex( r']+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})', - webpage, 'drupal setting'), display_id)['turner_playlist'][0] + webpage, 'drupal setting'), display_id) + video_data = drupal_settings['turner_playlist'][0] media_id = video_data['mediaID'] title = video_data['title'] + tokenizer_query = compat_parse_qs(compat_urllib_parse_urlparse( + drupal_settings['ngtv_token_url']).query) - streams_data = self._download_json( - 'http://medium.ngtv.io/media/%s/tv' % media_id, - media_id)['media']['tv'] - duration = None - chapters = [] - formats = [] - for supported_type in ('unprotected', 'bulkaes'): - stream_data = streams_data.get(supported_type, {}) - m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') - if not m3u8_url: - continue - if stream_data.get('playlistProtection') == 'spe': - m3u8_url = self._add_akamai_spe_token( - 'http://token.vgtf.net/token/token_spe', - m3u8_url, media_id, { - 'url': url, - 'site_name': site[:3].upper(), - 'auth_required': video_data.get('authRequired') == '1', - }) - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - duration = float_or_none(stream_data.get('totalRuntime') or video_data.get('duration')) - - if not chapters: - for chapter in stream_data.get('contentSegments', []): - start_time = float_or_none(chapter.get('start')) - duration = float_or_none(chapter.get('duration')) - if start_time is None or duration is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': start_time + duration, - }) - self._sort_formats(formats) + info = self._extract_ngtv_info( + media_id, tokenizer_query, { + 'url': url, + 'site_name': site[:3].upper(), + 'auth_required': video_data.get('authRequired') == '1', + }) thumbnails = [] for image_id, image in video_data.get('images', {}).items(): @@ -98,15 +76,14 @@ class TBSIE(TurnerBaseIE): }) thumbnails.append(i) - return { + info.update({ 'id': media_id, 'title': title, 'description': strip_or_none(video_data.get('descriptionNoTags') or video_data.get('shortDescriptionNoTags')), - 'duration': duration, + 'duration': float_or_none(video_data.get('duration')) or info.get('duration'), 'timestamp': int_or_none(video_data.get('created')), 'season_number': int_or_none(video_data.get('season')), 'episode_number': int_or_none(video_data.get('episode')), - 'cahpters': chapters, 'thumbnails': thumbnails, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 63fd4fe1c..73469cc5d 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import json -from .common import InfoExtractor +from .turner import TurnerBaseIE from ..utils import ( determine_ext, ExtractorError, @@ -15,7 +15,7 @@ from ..utils import ( ) -class TeamcocoIE(InfoExtractor): +class TeamcocoIE(TurnerBaseIE): _VALID_URL = r'https?://teamcoco\.com/(?P([^/]+/)*[^/?#]+)' _TESTS = [ { @@ -110,6 +110,8 @@ class TeamcocoIE(InfoExtractor): name } duration + turnerMediaId + turnerMediaAuthToken } } ... on NotFoundSlug { @@ -123,53 +125,65 @@ class TeamcocoIE(InfoExtractor): record = response['record'] video_id = record['id'] - video_sources = self._graphql_call('''{ - %s(id: "%s") { - src - } -}''', 'RecordVideoSource', video_id) or {} - - formats = [] - get_quality = qualities(['low', 'sd', 'hd', 'uhd']) - for format_id, src in video_sources.get('src', {}).items(): - if not isinstance(src, dict): - continue - src_url = src.get('src') - if not src_url: - continue - ext = determine_ext(src_url, mimetype2ext(src.get('type'))) - if format_id == 'hls' or ext == 'm3u8': - # compat_urllib_parse.urljoin does not work here - if src_url.startswith('/'): - src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) - else: - if src_url.startswith('/mp4:protected/'): - # TODO Correct extraction for these files - continue - tbr = int_or_none(self._search_regex( - r'(\d+)k\.mp4', src_url, 'tbr', default=None)) - - formats.append({ - 'url': src_url, - 'ext': ext, - 'tbr': tbr, - 'format_id': format_id, - 'quality': get_quality(format_id), - }) - if not formats: - formats = self._extract_m3u8_formats( - record['file']['url'], video_id, 'mp4', fatal=False) - self._sort_formats(formats) - - return { + info = { 'id': video_id, 'display_id': display_id, - 'formats': formats, 'title': record['title'], 'thumbnail': record.get('thumb', {}).get('preview'), 'description': record.get('teaser'), 'duration': parse_duration(record.get('duration')), 'timestamp': parse_iso8601(record.get('publishOn')), } + + media_id = record.get('turnerMediaId') + if media_id: + self._initialize_geo_bypass({ + 'countries': ['US'], + }) + info.update(self._extract_ngtv_info(media_id, { + 'accessToken': record['turnerMediaAuthToken'], + 'accessTokenType': 'jws', + })) + else: + video_sources = self._graphql_call('''{ + %s(id: "%s") { + src + } +}''', 'RecordVideoSource', video_id) or {} + + formats = [] + get_quality = qualities(['low', 'sd', 'hd', 'uhd']) + for format_id, src in video_sources.get('src', {}).items(): + if not isinstance(src, dict): + continue + src_url = src.get('src') + if not src_url: + continue + ext = determine_ext(src_url, mimetype2ext(src.get('type'))) + if format_id == 'hls' or ext == 'm3u8': + # compat_urllib_parse.urljoin does not work here + if src_url.startswith('/'): + src_url = 'http://ht.cdn.turner.com/tbs/big/teamcoco' + src_url + formats.extend(self._extract_m3u8_formats( + src_url, video_id, 'mp4', m3u8_id=format_id, fatal=False)) + else: + if src_url.startswith('/mp4:protected/'): + # TODO Correct extraction for these files + continue + tbr = int_or_none(self._search_regex( + r'(\d+)k\.mp4', src_url, 'tbr', default=None)) + + formats.append({ + 'url': src_url, + 'ext': ext, + 'tbr': tbr, + 'format_id': format_id, + 'quality': get_quality(format_id), + }) + if not formats: + formats = self._extract_m3u8_formats( + record['file']['url'], video_id, 'mp4', fatal=False) + self._sort_formats(formats) + info['formats'] = formats + + return info diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index e73b64aeb..2b7b0d6e1 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -9,6 +9,7 @@ from ..utils import ( xpath_text, int_or_none, determine_ext, + float_or_none, parse_duration, xpath_attr, update_url_query, @@ -23,14 +24,17 @@ class TurnerBaseIE(AdobePassIE): def _extract_timestamp(self, video_data): return int_or_none(xpath_attr(video_data, 'dateCreated', 'uts')) - def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data): + def _add_akamai_spe_token(self, tokenizer_src, video_url, content_id, ap_data, custom_tokenizer_query=None): secure_path = self._search_regex(r'https?://[^/]+(.+/)', video_url, 'secure path') + '*' token = self._AKAMAI_SPE_TOKEN_CACHE.get(secure_path) if not token: query = { 'path': secure_path, - 'videoId': content_id, } + if custom_tokenizer_query: + query.update(custom_tokenizer_query) + else: + query['videoId'] = content_id if ap_data.get('auth_required'): query['accessToken'] = self._extract_mvpd_auth(ap_data['url'], content_id, ap_data['site_name'], ap_data['site_name']) auth = self._download_xml( @@ -188,3 +192,42 @@ class TurnerBaseIE(AdobePassIE): 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), 'is_live': is_live, } + + def _extract_ngtv_info(self, media_id, tokenizer_query, ap_data=None): + streams_data = self._download_json( + 'http://medium.ngtv.io/media/%s/tv' % media_id, + media_id)['media']['tv'] + duration = None + chapters = [] + formats = [] + for supported_type in ('unprotected', 'bulkaes'): + stream_data = streams_data.get(supported_type, {}) + m3u8_url = stream_data.get('secureUrl') or stream_data.get('url') + if not m3u8_url: + continue + if stream_data.get('playlistProtection') == 'spe': + m3u8_url = self._add_akamai_spe_token( + 'http://token.ngtv.io/token/token_spe', + m3u8_url, media_id, ap_data or {}, tokenizer_query) + formats.extend(self._extract_m3u8_formats( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) + + duration = float_or_none(stream_data.get('totalRuntime')) + + if not chapters: + for chapter in stream_data.get('contentSegments', []): + start_time = float_or_none(chapter.get('start')) + chapter_duration = float_or_none(chapter.get('duration')) + if start_time is None or chapter_duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + chapter_duration, + }) + self._sort_formats(formats) + + return { + 'formats': formats, + 'chapters': chapters, + 'duration': duration, + } From bc3143ac5e18731502df014e30c5fe89554e9d6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 May 2018 21:52:03 +0700 Subject: [PATCH 23/27] [ChangeLog] Actualize [ci skip] --- ChangeLog | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ChangeLog b/ChangeLog index 280390ea0..95a5c556f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,27 @@ +version + +Core +* [downloader/rtmp] Generalize download messages and report time elapsed + on finish +* [downloader/rtmp] Gracefully handle live streams interrupted by user + +Extractors +* [teamcoco] Fix extraction for full episodes (#16573) +* [spiegel] Fix info extraction (#16538) ++ [apa] Add support for apa.at (#15041, #15672) ++ [bellmedia] Add support for bnnbloomberg.ca (#16560) ++ [9c9media] Extract MPD formats and subtitles +* [cammodels] Use geo verification headers ++ [ufctv] Add support for authentication (#16542) ++ [cammodels] Add support for cammodels.com (#14499) +* [utils] Fix style id extraction for namespaced id attribute in dfxp2srt + (#16551) +* [soundcloud] Detect format extension (#16549) +* [cbc] Fix playlist title extraction (#16502) ++ [tumblr] Detect and report sensitive media (#13829) ++ [tumblr] Add support for authentication (#15133) + + version 2018.05.26 Core From e425710554f1ed96504389fb526b898a942012dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 30 May 2018 21:54:30 +0700 Subject: [PATCH 24/27] release 2018.05.30 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c4d4e534e..b47a450a4 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.26*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.26** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2018.05.30*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2018.05.30** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2018.05.26 +[debug] youtube-dl version 2018.05.30 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 95a5c556f..4e989caf7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2018.05.30 Core * [downloader/rtmp] Generalize download messages and report time elapsed diff --git a/docs/supportedsites.md b/docs/supportedsites.md index b60f2ff23..c2d5401d6 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,7 +15,6 @@ - **8tracks** - **91porn** - **9c9media** - - **9c9media:stack** - **9gag** - **9now.com.au** - **abc.net.au** @@ -48,6 +47,7 @@ - **anitube.se** - **Anvato** - **AnySex** + - **APA** - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 @@ -128,6 +128,7 @@ - **BYUtv** - **Camdemy** - **CamdemyFolder** + - **CamModels** - **CamWithHer** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2253da927..0f15738b2 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.05.26' +__version__ = '2018.05.30' From 4fd1437d9d617069494a471ba40341c2ad6623b6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Wed, 30 May 2018 17:08:32 +0100 Subject: [PATCH 25/27] [rbmaradio] check formats availability(closes #16585) --- youtube_dl/extractor/rbmaradio.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index afa7b9161..9c4d72bbd 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -54,6 +54,7 @@ class RBMARadioIE(InfoExtractor): 'abr': abr, 'vcodec': 'none', } for abr in (96, 128, 256)] + self._check_formats(formats, episode_id) description = clean_html(episode.get('longTeaser')) thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) From 128b58ad139f2e62274ab6a649b965f5fa01a533 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 02:49:35 +0100 Subject: [PATCH 26/27] [nhl] remove old extractors --- youtube_dl/extractor/extractors.py | 7 +- youtube_dl/extractor/nhl.py | 345 +++++------------------------ 2 files changed, 62 insertions(+), 290 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 5f829c72c..93b22a8c3 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -705,12 +705,7 @@ from .nexx import ( from .nfb import NFBIE from .nfl import NFLIE from .nhk import NhkVodIE -from .nhl import ( - NHLVideocenterIE, - NHLNewsIE, - NHLVideocenterCategoryIE, - NHLIE, -) +from .nhl import NHLIE from .nick import ( NickIE, NickBrIE, diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py index 62ce800c0..cf440f713 100644 --- a/youtube_dl/extractor/nhl.py +++ b/youtube_dl/extractor/nhl.py @@ -1,18 +1,10 @@ from __future__ import unicode_literals import re -import json -import os from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( - unified_strdate, determine_ext, int_or_none, parse_iso8601, @@ -20,236 +12,77 @@ from ..utils import ( ) -class NHLBaseInfoExtractor(InfoExtractor): - @staticmethod - def _fix_json(json_string): - return json_string.replace('\\\'', '\'') +class NHLBaseIE(InfoExtractor): + def _real_extract(self, url): + site, tmp_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://%s/%s/%sid/v1/%s/details/web-v1.json' + % (self._CONTENT_DOMAIN, site[:3], 'item/' if site == 'mlb' else '', tmp_id), tmp_id) + if video_data.get('type') != 'video': + video_data = video_data['media'] + video = video_data.get('video') + if video: + video_data = video + else: + videos = video_data.get('videos') + if videos: + video_data = videos[0] - def _real_extract_video(self, video_id): - vid_parts = video_id.split(',') - if len(vid_parts) == 3: - video_id = '%s0%s%s-X-h' % (vid_parts[0][:4], vid_parts[1], vid_parts[2].rjust(4, '0')) - json_url = 'http://video.nhl.com/videocenter/servlets/playlist?ids=%s&format=json' % video_id - data = self._download_json( - json_url, video_id, transform_source=self._fix_json) - return self._extract_video(data[0]) + video_id = compat_str(video_data['id']) + title = video_data['title'] - def _extract_video(self, info): - video_id = info['id'] - self.report_extraction(video_id) + formats = [] + for playback in video_data.get('playbacks', []): + playback_url = playback.get('url') + if not playback_url: + continue + ext = determine_ext(playback_url) + if ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + playback_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=playback.get('name', 'hls'), fatal=False) + self._check_formats(m3u8_formats, video_id) + formats.extend(m3u8_formats) + else: + height = int_or_none(playback.get('height')) + formats.append({ + 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), + 'url': playback_url, + 'width': int_or_none(playback.get('width')), + 'height': height, + 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), + }) + self._sort_formats(formats) - initial_video_url = info['publishPoint'] - if info['formats'] == '1': - parsed_url = compat_urllib_parse_urlparse(initial_video_url) - filename, ext = os.path.splitext(parsed_url.path) - path = '%s_sd%s' % (filename, ext) - data = compat_urllib_parse_urlencode({ - 'type': 'fvod', - 'path': compat_urlparse.urlunparse(parsed_url[:2] + (path,) + parsed_url[3:]) + thumbnails = [] + cuts = video_data.get('image', {}).get('cuts') or [] + if isinstance(cuts, dict): + cuts = cuts.values() + for thumbnail_data in cuts: + thumbnail_url = thumbnail_data.get('src') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_data.get('width')), + 'height': int_or_none(thumbnail_data.get('height')), }) - path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data - path_doc = self._download_xml( - path_url, video_id, 'Downloading final video url') - video_url = path_doc.find('path').text - else: - video_url = initial_video_url - - join = compat_urlparse.urljoin - ret = { - 'id': video_id, - 'title': info['name'], - 'url': video_url, - 'description': info['description'], - 'duration': int(info['duration']), - 'thumbnail': join(join(video_url, '/u/'), info['bigImage']), - 'upload_date': unified_strdate(info['releaseDate'].split('.')[0]), - } - if video_url.startswith('rtmp:'): - mobj = re.match(r'(?Prtmp://[^/]+/(?P[a-z0-9/]+))/(?Pmp4:.*)', video_url) - ret.update({ - 'tc_url': mobj.group('tc_url'), - 'play_path': mobj.group('play_path'), - 'app': mobj.group('app'), - 'no_resume': True, - }) - return ret - - -class NHLVideocenterIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter' - _VALID_URL = r'https?://video(?P\.[^.]*)?\.nhl\.com/videocenter/(?:console|embed)?(?:\?(?:.*?[?&])?)(?:id|hlg|playlist)=(?P[-0-9a-zA-Z,]+)' - - _TESTS = [{ - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=6?id=453614', - 'md5': 'db704a4ea09e8d3988c85e36cc892d09', - 'info_dict': { - 'id': '453614', - 'ext': 'mp4', - 'title': 'Quick clip: Weise 4-3 goal vs Flames', - 'description': 'Dale Weise scores his first of the season to put the Canucks up 4-3.', - 'duration': 18, - 'upload_date': '20131006', - }, - }, { - 'url': 'http://video.nhl.com/videocenter/console?id=2014020024-628-h', - 'md5': 'd22e82bc592f52d37d24b03531ee9696', - 'info_dict': { - 'id': '2014020024-628-h', - 'ext': 'mp4', - 'title': 'Alex Galchenyuk Goal on Ray Emery (14:40/3rd)', - 'description': 'Home broadcast - Montreal Canadiens at Philadelphia Flyers - October 11, 2014', - 'duration': 0, - 'upload_date': '20141011', - }, - }, { - 'url': 'http://video.mapleleafs.nhl.com/videocenter/console?id=58665&catid=802', - 'md5': 'c78fc64ea01777e426cfc202b746c825', - 'info_dict': { - 'id': '58665', - 'ext': 'flv', - 'title': 'Classic Game In Six - April 22, 1979', - 'description': 'It was the last playoff game for the Leafs in the decade, and the last time the Leafs and Habs played in the playoffs. Great game, not a great ending.', - 'duration': 400, - 'upload_date': '20100129' - }, - }, { - 'url': 'http://video.flames.nhl.com/videocenter/console?id=630616', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/?id=736722', - 'only_matching': True, - }, { - 'url': 'http://video.nhl.com/videocenter/console?hlg=20142015,2,299&lang=en', - 'md5': '076fcb88c255154aacbf0a7accc3f340', - 'info_dict': { - 'id': '2014020299-X-h', - 'ext': 'mp4', - 'title': 'Penguins at Islanders / Game Highlights', - 'description': 'Home broadcast - Pittsburgh Penguins at New York Islanders - November 22, 2014', - 'duration': 268, - 'upload_date': '20141122', - } - }, { - 'url': 'http://video.oilers.nhl.com/videocenter/console?id=691469&catid=4', - 'info_dict': { - 'id': '691469', - 'ext': 'mp4', - 'title': 'RAW | Craig MacTavish Full Press Conference', - 'description': 'Oilers GM Craig MacTavish addresses the media at Rexall Place on Friday.', - 'upload_date': '20141205', - }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } - }, { - 'url': 'http://video.nhl.com/videocenter/embed?playlist=836127', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._real_extract_video(video_id) - - -class NHLNewsIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:news' - IE_DESC = 'NHL news' - _VALID_URL = r'https?://(?:.+?\.)?nhl\.com/(?:ice|club)/news\.html?(?:\?(?:.*?[?&])?)id=(?P[-0-9a-zA-Z]+)' - - _TESTS = [{ - 'url': 'http://www.nhl.com/ice/news.htm?id=750727', - 'md5': '4b3d1262e177687a3009937bd9ec0be8', - 'info_dict': { - 'id': '736722', - 'ext': 'mp4', - 'title': 'Cal Clutterbuck has been fined $2,000', - 'description': 'md5:45fe547d30edab88b23e0dd0ab1ed9e6', - 'duration': 37, - 'upload_date': '20150128', - }, - }, { - # iframe embed - 'url': 'http://sabres.nhl.com/club/news.htm?id=780189', - 'md5': '9f663d1c006c90ac9fb82777d4294e12', - 'info_dict': { - 'id': '836127', - 'ext': 'mp4', - 'title': 'Morning Skate: OTT vs. BUF (9/23/15)', - 'description': "Brian Duff chats with Tyler Ennis prior to Buffalo's first preseason home game.", - 'duration': 93, - 'upload_date': '20150923', - }, - }] - - def _real_extract(self, url): - news_id = self._match_id(url) - webpage = self._download_webpage(url, news_id) - video_id = self._search_regex( - [r'pVid(\d+)', r"nlid\s*:\s*'(\d+)'", - r']+src=["\']https?://video.*?\.nhl\.com/videocenter/embed\?.*\bplaylist=(\d+)'], - webpage, 'video id') - return self._real_extract_video(video_id) - - -class NHLVideocenterCategoryIE(NHLBaseInfoExtractor): - IE_NAME = 'nhl.com:videocenter:category' - IE_DESC = 'NHL videocenter category' - _VALID_URL = r'https?://video\.(?P[^.]*)\.nhl\.com/videocenter/(console\?[^(id=)]*catid=(?P[0-9]+)(?![&?]id=).*?)?$' - _TEST = { - 'url': 'http://video.canucks.nhl.com/videocenter/console?catid=999', - 'info_dict': { - 'id': '999', - 'title': 'Highlights', - }, - 'playlist_count': 12, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - team = mobj.group('team') - webpage = self._download_webpage(url, team) - cat_id = self._search_regex( - [r'var defaultCatId = "(.+?)";', - r'{statusIndex:0,index:0,.*?id:(.*?),'], - webpage, 'category id') - playlist_title = self._html_search_regex( - r'tab0"[^>]*?>(.*?)', - webpage, 'playlist title', flags=re.DOTALL).lower().capitalize() - - data = compat_urllib_parse_urlencode({ - 'cid': cat_id, - # This is the default value - 'count': 12, - 'ptrs': 3, - 'format': 'json', - }) - path = '/videocenter/servlets/browse?' + data - request_url = compat_urlparse.urljoin(url, path) - response = self._download_webpage(request_url, playlist_title) - response = self._fix_json(response) - if not response.strip(): - self._downloader.report_warning('Got an empty response, trying ' - 'adding the "newvideos" parameter') - response = self._download_webpage(request_url + '&newvideos=true', - playlist_title) - response = self._fix_json(response) - videos = json.loads(response) return { - '_type': 'playlist', - 'title': playlist_title, - 'id': cat_id, - 'entries': [self._extract_video(v) for v in videos], + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('duration')), + 'thumbnails': thumbnails, + 'formats': formats, } -class NHLIE(InfoExtractor): +class NHLIE(NHLBaseIE): IE_NAME = 'nhl.com' _VALID_URL = r'https?://(?:www\.)?(?Pnhl|wch2016)\.com/(?:[^/]+/)*c-(?P\d+)' - _SITES_MAP = { - 'nhl': 'nhl', - 'wch2016': 'wch', - } + _CONTENT_DOMAIN = 'nhl.bamcontent.com' _TESTS = [{ # type=video 'url': 'https://www.nhl.com/video/anisimov-cleans-up-mess/t-277752844/c-43663503', @@ -293,59 +126,3 @@ class NHLIE(InfoExtractor): 'url': 'https://www.wch2016.com/news/3-stars-team-europe-vs-team-canada/c-282195068', 'only_matching': True, }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - tmp_id, site = mobj.group('id'), mobj.group('site') - video_data = self._download_json( - 'https://nhl.bamcontent.com/%s/id/v1/%s/details/web-v1.json' - % (self._SITES_MAP[site], tmp_id), tmp_id) - if video_data.get('type') == 'article': - video_data = video_data['media'] - - video_id = compat_str(video_data['id']) - title = video_data['title'] - - formats = [] - for playback in video_data.get('playbacks', []): - playback_url = playback.get('url') - if not playback_url: - continue - ext = determine_ext(playback_url) - if ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=playback.get('name', 'hls'), fatal=False) - self._check_formats(m3u8_formats, video_id) - formats.extend(m3u8_formats) - else: - height = int_or_none(playback.get('height')) - formats.append({ - 'format_id': playback.get('name', 'http' + ('-%dp' % height if height else '')), - 'url': playback_url, - 'width': int_or_none(playback.get('width')), - 'height': height, - }) - self._sort_formats(formats, ('preference', 'width', 'height', 'tbr', 'format_id')) - - thumbnails = [] - for thumbnail_id, thumbnail_data in video_data.get('image', {}).get('cuts', {}).items(): - thumbnail_url = thumbnail_data.get('src') - if not thumbnail_url: - continue - thumbnails.append({ - 'id': thumbnail_id, - 'url': thumbnail_url, - 'width': int_or_none(thumbnail_data.get('width')), - 'height': int_or_none(thumbnail_data.get('height')), - }) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('description'), - 'timestamp': parse_iso8601(video_data.get('date')), - 'duration': parse_duration(video_data.get('duration')), - 'thumbnails': thumbnails, - 'formats': formats, - } From acca2ac7f3f4c78bce775d47736caa63e6872e26 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Thu, 31 May 2018 02:50:14 +0100 Subject: [PATCH 27/27] [mlb] improve extraction(closes #16587) --- youtube_dl/extractor/mlb.py | 105 +++++++++--------------------------- 1 file changed, 24 insertions(+), 81 deletions(-) diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py index 675ff6873..b907f6b49 100644 --- a/youtube_dl/extractor/mlb.py +++ b/youtube_dl/extractor/mlb.py @@ -1,96 +1,90 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..utils import ( - parse_duration, - parse_iso8601, -) +from .nhl import NHLBaseIE -class MLBIE(InfoExtractor): +class MLBIE(NHLBaseIE): _VALID_URL = r'''(?x) https?:// - (?:[\da-z_-]+\.)*mlb\.com/ + (?:[\da-z_-]+\.)*(?Pmlb)\.com/ (?: (?: - (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)| + (?:[^/]+/)*c-| (?: shared/video/embed/(?:embed|m-internal-embed)\.html| (?:[^/]+/)+(?:play|index)\.jsp| )\?.*?\bcontent_id= ) - (?Pn?\d+)| - (?:[^/]+/)*(?P[^/]+) + (?P\d+) ) ''' + _CONTENT_DOMAIN = 'content.mlb.com' _TESTS = [ { - 'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea', - 'md5': 'ff56a598c2cf411a9a38a69709e97079', + 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', + 'md5': '632358dacfceec06bad823b83d21df2d', 'info_dict': { 'id': '34698933', 'ext': 'mp4', 'title': "Ackley's spectacular catch", 'description': 'md5:7f5a981eb4f3cbc8daf2aeffa2215bf0', 'duration': 66, - 'timestamp': 1405980600, - 'upload_date': '20140721', + 'timestamp': 1405995000, + 'upload_date': '20140722', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/81536970/v34496663/mianym-stanton-practices-for-the-home-run-derby', - 'md5': 'd9c022c10d21f849f49c05ae12a8a7e9', + 'url': 'https://www.mlb.com/video/stanton-prepares-for-derby/c-34496663', + 'md5': 'bf2619bf9cacc0a564fc35e6aeb9219f', 'info_dict': { 'id': '34496663', 'ext': 'mp4', 'title': 'Stanton prepares for Derby', 'description': 'md5:d00ce1e5fd9c9069e9c13ab4faedfa57', 'duration': 46, - 'timestamp': 1405105800, + 'timestamp': 1405120200, 'upload_date': '20140711', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/topic/vtp_hrd_sponsor/v34578115/hrd-cespedes-wins-2014-gillette-home-run-derby', - 'md5': '0e6e73d509321e142409b695eadd541f', + 'url': 'https://www.mlb.com/video/cespedes-repeats-as-derby-champ/c-34578115', + 'md5': '99bb9176531adc600b90880fb8be9328', 'info_dict': { 'id': '34578115', 'ext': 'mp4', 'title': 'Cespedes repeats as Derby champ', 'description': 'md5:08df253ce265d4cf6fb09f581fafad07', 'duration': 488, - 'timestamp': 1405399936, + 'timestamp': 1405414336, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/video/v34577915/bautista-on-derby-captaining-duties-his-performance', - 'md5': 'b8fd237347b844365d74ea61d4245967', + 'url': 'https://www.mlb.com/video/bautista-on-home-run-derby/c-34577915', + 'md5': 'da8b57a12b060e7663ee1eebd6f330ec', 'info_dict': { 'id': '34577915', 'ext': 'mp4', 'title': 'Bautista on Home Run Derby', 'description': 'md5:b80b34031143d0986dddc64a8839f0fb', 'duration': 52, - 'timestamp': 1405390722, + 'timestamp': 1405405122, 'upload_date': '20140715', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { - 'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer', - 'md5': 'aafaf5b0186fee8f32f20508092f8111', + 'url': 'https://www.mlb.com/news/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer/c-118550098', + 'md5': 'e09e37b552351fddbf4d9e699c924d68', 'info_dict': { 'id': '75609783', 'ext': 'mp4', 'title': 'Must C: Pillar climbs for catch', 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run', - 'timestamp': 1429124820, + 'timestamp': 1429139220, 'upload_date': '20150415', } }, @@ -111,7 +105,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728', + 'url': 'https://www.mlb.com/cardinals/video/piscottys-great-sliding-catch/c-51175783', 'only_matching': True, }, { @@ -120,58 +114,7 @@ class MLBIE(InfoExtractor): 'only_matching': True, }, { - 'url': 'http://washington.nationals.mlb.com/mlb/gameday/index.jsp?c_id=was&gid=2015_05_09_atlmlb_wasmlb_1&lang=en&content_id=108309983&mode=video#', + 'url': 'https://www.mlb.com/cut4/carlos-gomez-borrowed-sunglasses-from-an-as-fan/c-278912842', 'only_matching': True, } ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - if not video_id: - video_path = mobj.group('path') - webpage = self._download_webpage(url, video_path) - video_id = self._search_regex( - [r'data-video-?id="(\d+)"', r'content_id=(\d+)'], webpage, 'video id') - - detail = self._download_xml( - 'http://m.mlb.com/gen/multimedia/detail/%s/%s/%s/%s.xml' - % (video_id[-3], video_id[-2], video_id[-1], video_id), video_id) - - title = detail.find('./headline').text - description = detail.find('./big-blurb').text - duration = parse_duration(detail.find('./duration').text) - timestamp = parse_iso8601(detail.attrib['date'][:-5]) - - thumbnails = [{ - 'url': thumbnail.text, - } for thumbnail in detail.findall('./thumbnailScenarios/thumbnailScenario')] - - formats = [] - for media_url in detail.findall('./url'): - playback_scenario = media_url.attrib['playback_scenario'] - fmt = { - 'url': media_url.text, - 'format_id': playback_scenario, - } - m = re.search(r'(?P\d+)K_(?P\d+)X(?P\d+)', playback_scenario) - if m: - fmt.update({ - 'vbr': int(m.group('vbr')) * 1000, - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - formats.append(fmt) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - 'thumbnails': thumbnails, - }