From 0f4a5a73e70172c0accbd2e936d08988d065b3b1 Mon Sep 17 00:00:00 2001 From: Parmjit Virk Date: Thu, 22 Jun 2017 12:08:36 -0500 Subject: [PATCH 01/10] [drtuber] Fix formats extraction (fixes 12058) --- youtube_dl/extractor/drtuber.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 1eca82b3b..c5d56a9ad 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -44,8 +44,23 @@ class DrTuberIE(InfoExtractor): webpage = self._download_webpage( 'http://www.drtuber.com/video/%s' % video_id, display_id) - video_url = self._html_search_regex( - r']*><(?:p|h\d+)[^>]*>([^<]+)<', @@ -75,7 +90,7 @@ class DrTuberIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'thumbnail': thumbnail, 'like_count': like_count, From fa3ea7223ac4d547c848e2df44504158ee0099f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 00:42:42 +0700 Subject: [PATCH 02/10] [hgtv.com:show] Relax video config regex and update test (closes #13279, closes #13461) --- youtube_dl/extractor/hgtv.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py index e854300c7..4d4b06824 100644 --- a/youtube_dl/extractor/hgtv.py +++ b/youtube_dl/extractor/hgtv.py @@ -7,14 +7,19 @@ from .common import InfoExtractor class HGTVComShowIE(InfoExtractor): IE_NAME = 'hgtv.com:show' _VALID_URL = r'https?://(?:www\.)?hgtv\.com/shows/[^/]+/(?P[^/?#&]+)' - _TEST = { - 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-videos', + _TESTS = [{ + # data-module="video" + 'url': 'http://www.hgtv.com/shows/flip-or-flop/flip-or-flop-full-episodes-season-4-videos', 'info_dict': { - 'id': 'flip-or-flop-full-episodes-videos', + 'id': 'flip-or-flop-full-episodes-season-4-videos', 'title': 'Flip or Flop Full Episodes', }, 'playlist_mincount': 15, - } + }, { + # data-deferred-module="video" + 'url': 'http://www.hgtv.com/shows/good-bones/episodes/an-old-victorian-house-gets-a-new-facelift', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -23,7 +28,7 @@ class HGTVComShowIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'(?s)data-module=["\']video["\'][^>]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?)]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?) Date: Fri, 23 Jun 2017 02:00:19 +0700 Subject: [PATCH 03/10] [youtube] Adapt to new automatic captions rendition (closes #13467) --- youtube_dl/extractor/youtube.py | 62 ++++++++++++++++++++++----------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bf4f4e139..77cd271ef 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1269,37 +1269,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sub_lang_list[sub_lang] = sub_formats return sub_lang_list + def make_captions(sub_url, sub_langs): + parsed_sub_url = compat_urllib_parse_urlparse(sub_url) + caption_qs = compat_parse_qs(parsed_sub_url.query) + captions = {} + for sub_lang in sub_langs: + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], + }) + sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( + query=compat_urllib_parse_urlencode(caption_qs, True))) + sub_formats.append({ + 'url': sub_url, + 'ext': ext, + }) + captions[sub_lang] = sub_formats + return captions + + # New captions format as of 22.06.2017 + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + # Some videos don't provide ttsurl but rather caption_tracks and # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 caption_tracks = args['caption_tracks'] caption_translation_languages = args['caption_translation_languages'] caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - parsed_caption_url = compat_urllib_parse_urlparse(caption_url) - caption_qs = compat_parse_qs(parsed_caption_url.query) - - sub_lang_list = {} + sub_lang_list = [] for lang in caption_translation_languages.split(','): lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) sub_lang = lang_qs.get('lc', [None])[0] - if not sub_lang: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles - except (KeyError, ExtractorError): + except (KeyError, IndexError, ExtractorError): self._downloader.report_warning(err_msg) return {} From 38dad4737fd18ff380d38ce85ad200d1d0931b8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 02:10:54 +0700 Subject: [PATCH 04/10] [ChangeLog] Actualize --- ChangeLog | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ChangeLog b/ChangeLog index 7f077fd24..f12e74f1d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,22 @@ version +Core +* [adobepass] Fix extraction on older python 2.6 + Extractors +* [youtube] Adapt to new automatic captions rendition (#13467) +* [hgtv.com:show] Relax video config regular expression (#13279, #13461) +* [drtuber] Fix formats extraction (#12058) +* [youporn] Fix upload date extraction +* [youporn] Improve formats extraction +* [youporn] Fix title extraction (#13456) +* [googledrive] Fix formats sorting (#13443) +* [watchindianporn] Fix extraction (#13411, #13415) ++ [vimeo] Add fallback mp4 extension for original format ++ [ruv] Add support for ruv.is (#13396) +* [viu] Fix extraction on older python 2.6 * [pandora.tv] Fix upload_date extraction (#12846) ++ [asiancrush] Add support for asiancrush.com (#13420) version 2017.06.18 From 170719414da3b516af464c4b0fb8a6a94c398e14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 02:13:21 +0700 Subject: [PATCH 05/10] release 2017.06.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 3 +++ youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 002c1274a..4e7ceafd8 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.06.18*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.06.18** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.06.23*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.06.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.06.18 +[debug] youtube-dl version 2017.06.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index f12e74f1d..746250db9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2017.06.23 Core * [adobepass] Fix extraction on older python 2.6 diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0f21be0a2..e827ec0cf 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -67,6 +67,8 @@ - **arte.tv:info** - **arte.tv:magazine** - **arte.tv:playlist** + - **AsianCrush** + - **AsianCrushPlaylist** - **AtresPlayer** - **ATTTechChannel** - **ATVAt** @@ -686,6 +688,7 @@ - **rutube:person**: Rutube person videos - **RUTV**: RUTV.RU - **Ruutu** + - **Ruv** - **safari**: safaribooksonline.com online video - **safari:api** - **safari:course**: safaribooksonline.com online courses diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8782a6a1e..dfb69ab8d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.06.18' +__version__ = '2017.06.23' From ac7409eec5be3e120928e66af0d5fd7f38a06c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 02:54:12 +0700 Subject: [PATCH 06/10] [hgtv.com:show] Fix typo --- youtube_dl/extractor/hgtv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py index 4d4b06824..a4f332565 100644 --- a/youtube_dl/extractor/hgtv.py +++ b/youtube_dl/extractor/hgtv.py @@ -28,7 +28,7 @@ class HGTVComShowIE(InfoExtractor): config = self._parse_json( self._search_regex( - r'(?s)data-(?:deferred)?-module=["\']video["\'][^>]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?)]*>.*?]+type=["\']text/x-config["\'][^>]*>(.+?) Date: Fri, 23 Jun 2017 20:50:48 +0700 Subject: [PATCH 07/10] [ooyala] Skip empty format URLs (closes #13471, closes #13476) --- youtube_dl/extractor/ooyala.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index 84be2b1e3..ef8adccc1 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -41,7 +41,7 @@ class OoyalaBaseIE(InfoExtractor): for stream in cur_auth_data['streams']: s_url = base64.b64decode( stream['url']['data'].encode('ascii')).decode('utf-8') - if s_url in urls: + if not s_url or s_url in urls: continue urls.append(s_url) ext = determine_ext(s_url, None) From 4f4dd8d797538476a5a97f90e7074df2105d7573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 20:56:21 +0700 Subject: [PATCH 08/10] [ooyala] Make more robust --- youtube_dl/extractor/ooyala.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index ef8adccc1..a25f326b5 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -3,12 +3,14 @@ import re import base64 from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - int_or_none, - float_or_none, - ExtractorError, - unsmuggle_url, determine_ext, + ExtractorError, + float_or_none, + int_or_none, + try_get, + unsmuggle_url, ) from ..compat import compat_urllib_parse_urlencode @@ -39,13 +41,15 @@ class OoyalaBaseIE(InfoExtractor): formats = [] if cur_auth_data['authorized']: for stream in cur_auth_data['streams']: - s_url = base64.b64decode( - stream['url']['data'].encode('ascii')).decode('utf-8') + url_data = try_get(stream, lambda x: x['url']['data'], compat_str) + if not url_data: + continue + s_url = base64.b64decode(url_data.encode('ascii')).decode('utf-8') if not s_url or s_url in urls: continue urls.append(s_url) ext = determine_ext(s_url, None) - delivery_type = stream['delivery_type'] + delivery_type = stream.get('delivery_type') if delivery_type == 'hls' or ext == 'm3u8': formats.extend(self._extract_m3u8_formats( re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native', @@ -65,7 +69,7 @@ class OoyalaBaseIE(InfoExtractor): else: formats.append({ 'url': s_url, - 'ext': ext or stream.get('delivery_type'), + 'ext': ext or delivery_type, 'vcodec': stream.get('video_codec'), 'format_id': delivery_type, 'width': int_or_none(stream.get('width')), From b5f523ed62f6c84fe0c58274f1751e66c58282d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 20:56:48 +0700 Subject: [PATCH 09/10] [ooyala] Add test for missing stream['url']['data'] --- youtube_dl/extractor/ooyala.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index a25f326b5..52580baed 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -140,6 +140,11 @@ class OoyalaIE(OoyalaBaseIE): 'title': 'Divide Tool Path.mp4', 'duration': 204.405, } + }, + { + # empty stream['url']['data'] + 'url': 'http://player.ooyala.com/player.js?embedCode=w2bnZtYjE6axZ_dw1Cd0hQtXd_ige2Is', + 'only_matching': True, } ] From 73af5cc817ff19d21cb432c5a4e9e37dd35a353d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 23 Jun 2017 21:18:33 +0700 Subject: [PATCH 10/10] [YoutubeDL] Skip malformed formats for better extraction robustness --- youtube_dl/YoutubeDL.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index c05103bb6..b3a6d4d3b 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1448,17 +1448,25 @@ class YoutubeDL(object): if not formats: raise ExtractorError('No video formats found!') + def is_wellformed(f): + url = f.get('url') + valid_url = url and isinstance(url, compat_str) + if not valid_url: + self.report_warning( + '"url" field is missing or empty - skipping format, ' + 'there is an error in extractor') + return valid_url + + # Filter out malformed formats for better extraction robustness + formats = list(filter(is_wellformed, formats)) + formats_dict = {} # We check that all the formats have the format and format_id fields for i, format in enumerate(formats): - if 'url' not in format: - raise ExtractorError('Missing "url" key in result (index %d)' % i) - sanitize_string_field(format, 'format_id') sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) - if format.get('format_id') is None: format['format_id'] = compat_str(i) else: