From 2bfc1d9d68dec097fd8093dc0284dd0cd64beb2e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 Jan 2019 21:25:15 +0100 Subject: [PATCH 01/24] [extractor/common] imporove HLS video only format detection(closes #18923) --- youtube_dl/extractor/common.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9e7febcad..af621b74b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1596,6 +1596,7 @@ class InfoExtractor(object): # References: # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 # 2. https://github.com/rg3/youtube-dl/issues/12211 + # 3. https://github.com/rg3/youtube-dl/issues/18923 # We should try extracting formats only from master playlists [1, 4.3.4], # i.e. playlists that describe available qualities. On the other hand @@ -1667,11 +1668,16 @@ class InfoExtractor(object): rendition = stream_group[0] return rendition.get('NAME') or stream_group_id + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF inorder to have the + # chance to detect video only formats when EXT-X-STREAM-INF tags + # precede EXT-X-MEDIA tags in HLS manifest such as [3]. + for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-MEDIA:'): + extract_media(line) + for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_stream_inf = parse_m3u8_attributes(line) - elif line.startswith('#EXT-X-MEDIA:'): - extract_media(line) elif line.startswith('#') or not line.strip(): continue else: From f28363ad1fb45b4450ae6f09ff9aff8f93227e40 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 Jan 2019 21:25:53 +0100 Subject: [PATCH 02/24] [ted] correct acodec for http formats(#18923) --- youtube_dl/extractor/ted.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index d3e4205f5..645942dfd 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -265,6 +265,8 @@ class TEDIE(InfoExtractor): 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) + if f.get('acodec') == 'none': + del f['acodec'] formats.append(f) audio_download = talk_info.get('audioDownload') From 379306ef55b64c966392c072b17a450831fec252 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sat, 19 Jan 2019 21:35:02 +0100 Subject: [PATCH 03/24] [extractor/common] fix typo --- youtube_dl/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index af621b74b..6e36e6778 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1668,7 +1668,7 @@ class InfoExtractor(object): rendition = stream_group[0] return rendition.get('NAME') or stream_group_id - # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF inorder to have the + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the # chance to detect video only formats when EXT-X-STREAM-INF tags # precede EXT-X-MEDIA tags in HLS manifest such as [3]. for line in m3u8_doc.splitlines(): From 2cc779f497ae20d6e0e28fc546a25723cfea631a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Jan 2019 13:48:09 +0700 Subject: [PATCH 04/24] [YoutubeDL] Add negation support for string comparisons in format selection expressions (closes #18600, closes #18805) --- README.md | 3 ++- test/test_YoutubeDL.py | 46 +++++++++++++++++++++++++++++++++++++++++ youtube_dl/YoutubeDL.py | 9 +++++--- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 70bcfaccf..886696015 100644 --- a/README.md +++ b/README.md @@ -667,13 +667,14 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, ` - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate -Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begins with), `$=` (ends with), `*=` (contains) and following string meta fields: +Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends with), `*=` (contains) and following string meta fields: - `ext`: File extension - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format +Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index f0f5a8470..df8994b84 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -239,6 +239,52 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-vcodec-dot') + def test_format_selection_string_ops(self): + formats = [ + {'format_id': 'abc-cba', 'ext': 'mp4', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + # equals (=) + ydl = YDL({'format': '[format_id=abc-cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not equal (!=) + ydl = YDL({'format': '[format_id!=abc-cba]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # starts with (^=) + ydl = YDL({'format': '[format_id^=abc]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not start with (!^=) + ydl = YDL({'format': '[format_id!^=abc-cba]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # ends with ($=) + ydl = YDL({'format': '[format_id$=cba]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not end with (!$=) + ydl = YDL({'format': '[format_id!$=abc-cba]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + + # contains (*=) + ydl = YDL({'format': '[format_id*=-]'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'abc-cba') + + # does not contain (!*=) + ydl = YDL({'format': '[format_id!*=-]'}) + self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '17', '36', '13', diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 4493fd0e1..a827414dc 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1063,21 +1063,24 @@ class YoutubeDL(object): if not m: STR_OPERATORS = { '=': operator.eq, - '!=': operator.ne, '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) \s*(?Pext|acodec|vcodec|container|protocol|format_id) - \s*(?P%s)(?P\s*\?)? + \s*(?P!\s*)?(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.search(filter_spec) if m: comparison_value = m.group('value') - op = STR_OPERATORS[m.group('op')] + str_op = STR_OPERATORS[m.group('op')] + if m.group('negation'): + op = lambda attr, value: not str_op + else: + op = str_op if not m: raise ValueError('Invalid filter specification %r' % filter_spec) From 4e58d9fabbfef80b2ecc0d20959bdf9dd3705e73 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Sun, 20 Jan 2019 14:23:35 +0700 Subject: [PATCH 05/24] [README.md] Fix formatting --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 886696015..4ba982907 100644 --- a/README.md +++ b/README.md @@ -674,6 +674,7 @@ Also filtering work for comparisons `=` (equals), `^=` (starts with), `$=` (ends - `container`: Name of the container format - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format + Any string comparison may be prefixed with negation `!` in order to produce an opposite comparison, e.g. `!*=` (does not contain). Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. From fc746c3fdd7d40935a11e72b5cb69a8aee840e94 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 20 Jan 2019 09:04:51 +0100 Subject: [PATCH 06/24] [test/test_InfoExtractor] add test for #18923 --- test/test_InfoExtractor.py | 59 ++++++++++++++++++++++++++++++- test/testdata/m3u8/ted_18923.m3u8 | 28 +++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 test/testdata/m3u8/ted_18923.m3u8 diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 06be72616..75fa0bbb7 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -497,7 +497,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ 'width': 1280, 'height': 720, }] - ) + ), + ( + # https://github.com/rg3/youtube-dl/issues/18923 + # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa + 'ted_18923', + 'http://hls.ted.com/talks/31241.m3u8', + [{ + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '600k-Audio', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '68', + 'vcodec': 'none', + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '163', + 'acodec': 'none', + 'width': 320, + 'height': 180, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '481', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '769', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '984', + 'acodec': 'none', + 'width': 512, + 'height': 288, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1255', + 'acodec': 'none', + 'width': 640, + 'height': 360, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '1693', + 'acodec': 'none', + 'width': 853, + 'height': 480, + }, { + 'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b', + 'format_id': '2462', + 'acodec': 'none', + 'width': 1280, + 'height': 720, + }] + ), ] for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: diff --git a/test/testdata/m3u8/ted_18923.m3u8 b/test/testdata/m3u8/ted_18923.m3u8 new file mode 100644 index 000000000..52a27118b --- /dev/null +++ b/test/testdata/m3u8/ted_18923.m3u8 @@ -0,0 +1,28 @@ +#EXTM3U +#EXT-X-VERSION:4 +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1255659,PROGRAM-ID=1,CODECS="avc1.42c01e,mp4a.40.2",RESOLUTION=640x360 +/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=163154,PROGRAM-ID=1,CODECS="avc1.42c00c,mp4a.40.2",RESOLUTION=320x180 +/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=481701,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=769968,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=984037,PROGRAM-ID=1,CODECS="avc1.42c015,mp4a.40.2",RESOLUTION=512x288 +/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=1693925,PROGRAM-ID=1,CODECS="avc1.4d401f,mp4a.40.2",RESOLUTION=853x480 +/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=2462469,PROGRAM-ID=1,CODECS="avc1.640028,mp4a.40.2",RESOLUTION=1280x720 +/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b +#EXT-X-STREAM-INF:AUDIO="600k",BANDWIDTH=68101,PROGRAM-ID=1,CODECS="mp4a.40.2",DEFAULT=YES +/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b + +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=74298,PROGRAM-ID=1,CODECS="avc1.42c00c",RESOLUTION=320x180,URI="/videos/BorisHesser_2018S/video/64k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=216200,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/180k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=304717,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/320k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=350933,PROGRAM-ID=1,CODECS="avc1.42c015",RESOLUTION=512x288,URI="/videos/BorisHesser_2018S/video/450k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=495850,PROGRAM-ID=1,CODECS="avc1.42c01e",RESOLUTION=640x360,URI="/videos/BorisHesser_2018S/video/600k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=810750,PROGRAM-ID=1,CODECS="avc1.4d401f",RESOLUTION=853x480,URI="/videos/BorisHesser_2018S/video/950k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" +#EXT-X-I-FRAME-STREAM-INF:BANDWIDTH=1273700,PROGRAM-ID=1,CODECS="avc1.640028",RESOLUTION=1280x720,URI="/videos/BorisHesser_2018S/video/1500k_iframe.m3u8?nobumpers=true&uniqueId=76011e2b" + +#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="600k",LANGUAGE="en",NAME="Audio",AUTOSELECT=YES,DEFAULT=YES,URI="/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b",BANDWIDTH=614400 From 15870747f02effba9376fd1a1500c32d36d1b811 Mon Sep 17 00:00:00 2001 From: aviperes Date: Sun, 20 Jan 2019 10:15:01 +0200 Subject: [PATCH 07/24] [odnoklassniki] Detect paid videos --- youtube_dl/extractor/odnoklassniki.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 190d8af4d..114b93c07 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -115,6 +115,10 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'https://m.ok.ru/dk?st.cmd=movieLayer&st.discId=863789452017&st.retLoc=friend&st.rtu=%2Fdk%3Fst.cmd%3DfriendMovies%26st.mode%3Down%26st.mrkId%3D%257B%2522uploadedMovieMarker%2522%253A%257B%2522marker%2522%253A%25221519410114503%2522%252C%2522hasMore%2522%253Atrue%257D%252C%2522sharedMovieMarker%2522%253A%257B%2522marker%2522%253Anull%252C%2522hasMore%2522%253Afalse%257D%257D%26st.friendId%3D561722190321%26st.frwd%3Don%26_prevCmd%3DfriendMovies%26tkn%3D7257&st.discType=MOVIE&st.mvId=863789452017&_prevCmd=friendMovies&tkn=3648#lst#', 'only_matching': True, + }, { + # Paid video + 'url': 'https://ok.ru/video/954886983203', + 'only_matching': True, }] def _real_extract(self, url): @@ -244,6 +248,11 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'flv', }) + if not formats: + payment_info = metadata.get('paymentInfo') + if payment_info: + raise ExtractorError('This video is paid, subscribe to download it', expected=True) + self._sort_formats(formats) info['formats'] = formats From 31fbedc06a349bc555ab934588544f75734e3a55 Mon Sep 17 00:00:00 2001 From: jhwgh1968 Date: Sun, 20 Jan 2019 09:10:46 +0000 Subject: [PATCH 08/24] [instagram] Add base extractor for playlists and tag extractor --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/instagram.py | 136 +++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index de38c6641..24361def4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -494,7 +494,11 @@ from .ina import InaIE from .inc import IncIE from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE -from .instagram import InstagramIE, InstagramUserIE +from .instagram import ( + InstagramIE, + InstagramUserIE, + InstagramTagIE, +) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 7e0e838f0..ffd87b55f 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -227,44 +227,37 @@ class InstagramIE(InfoExtractor): } -class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' - IE_DESC = 'Instagram user profile' - IE_NAME = 'instagram:user' - _TEST = { - 'url': 'https://instagram.com/porsche', - 'info_dict': { - 'id': 'porsche', - 'title': 'porsche', - }, - 'playlist_count': 5, - 'params': { - 'extract_flat': True, - 'skip_download': True, - 'playlistend': 5, - } - } +class InstagramPlaylistIE(InfoExtractor): + # A superclass for handling any kind of query based on GraphQL which + # results in a playlist. - _gis_tmpl = None + _gis_tmpl = None # used to cache GIS request type - def _entries(self, data): + def _parse_graphql(self, webpage, item_id): + # Reads a webpage and returns its GraphQL data. + return self._parse_json( + self._search_regex( + r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), + item_id) + + def _extract_graphql(self, data, url): + # Parses GraphQL queries containing videos and generates a playlist. def get_count(suffix): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' - self._set_cookie('instagram.com', 'ig_pr', '1') - cursor = '' for page_num in itertools.count(1): - variables = json.dumps({ - 'id': uploader_id, + variables = { 'first': 12, 'after': cursor, - }) + } + variables.update(self._query_vars_for(data)) + variables = json.dumps(variables) if self._gis_tmpl: gis_tmpls = [self._gis_tmpl] @@ -276,21 +269,26 @@ class InstagramUserIE(InfoExtractor): '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']), ] + # try all of the ways to generate a GIS query, and not only use the + # first one that works, but cache it for future requests for gis_tmpl in gis_tmpls: try: - media = self._download_json( + json_data = self._download_json( 'https://www.instagram.com/graphql/query/', uploader_id, 'Downloading JSON page %d' % page_num, headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-Instagram-GIS': hashlib.md5( ('%s:%s' % (gis_tmpl, variables)).encode('utf-8')).hexdigest(), }, query={ - 'query_hash': '42323d64886122307be10013ad2dcc44', + 'query_hash': self._QUERY_HASH, 'variables': variables, - })['data']['user']['edge_owner_to_timeline_media'] + }) + media = self._parse_timeline_from(json_data) self._gis_tmpl = gis_tmpl break except ExtractorError as e: + # if it's an error caused by a bad query, and there are + # more GIS templates to try, ignore it and keep trying if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue @@ -348,14 +346,80 @@ class InstagramUserIE(InfoExtractor): break def _real_extract(self, url): - username = self._match_id(url) + user_or_tag = self._match_id(url) + webpage = self._download_webpage(url, user_or_tag) + data = self._parse_graphql(webpage, user_or_tag) - webpage = self._download_webpage(url, username) - - data = self._parse_json( - self._search_regex( - r'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage, 'data'), - username) + self._set_cookie('instagram.com', 'ig_pr', '1') return self.playlist_result( - self._entries(data), username, username) + self._extract_graphql(data, url), user_or_tag, user_or_tag) + + +class InstagramUserIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + IE_DESC = 'Instagram user profile' + IE_NAME = 'instagram:user' + _TEST = { + 'url': 'https://instagram.com/porsche', + 'info_dict': { + 'id': 'porsche', + 'title': 'porsche', + }, + 'playlist_count': 5, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 5, + } + } + + _QUERY_HASH = '42323d64886122307be10013ad2dcc44', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['user']['edge_owner_to_timeline_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'id': data['entry_data']['ProfilePage'][0]['graphql']['user']['id'] + } + + +class InstagramTagIE(InstagramPlaylistIE): + _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P[^/]+)' + IE_DESC = 'Instagram hashtag search' + IE_NAME = 'instagram:tag' + _TEST = { + 'url': 'https://instagram.com/explore/tags/lolcats', + 'info_dict': { + 'id': 'lolcats', + 'title': 'lolcats', + }, + 'playlist_count': 50, + 'params': { + 'extract_flat': True, + 'skip_download': True, + 'playlistend': 50, + } + } + + _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', + + @staticmethod + def _parse_timeline_from(data): + # extracts the media timeline data from a GraphQL result + return data['data']['hashtag']['edge_hashtag_to_media'] + + @staticmethod + def _query_vars_for(data): + # returns a dictionary of variables to add to the timeline query based + # on the GraphQL of the original page + return { + 'tag_name': + data['entry_data']['TagPage'][0]['graphql']['hashtag']['name'] + } From 6ca3fa898cd066422daea3d5be53efdae22187d8 Mon Sep 17 00:00:00 2001 From: yonaikerlol Date: Sun, 20 Jan 2019 05:24:21 -0400 Subject: [PATCH 09/24] [streamango] Add support for fruithosts.net --- youtube_dl/extractor/streamango.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index fcaa5ac0b..efb259f96 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -14,7 +14,7 @@ from ..utils import ( class StreamangoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net)/(?:f|embed)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', 'md5': 'e992787515a182f55e38fc97588d802a', @@ -38,6 +38,9 @@ class StreamangoIE(InfoExtractor): }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', 'only_matching': True, + }, { + 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4', + 'only_matching': True, }] def _real_extract(self, url): From 289ef490f77cb35c43d98b9d2ea4cf529c24895e Mon Sep 17 00:00:00 2001 From: Anthony Fok Date: Sun, 30 Dec 2018 02:44:40 -0700 Subject: [PATCH 10/24] [hketv] Add extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/hketv.py | 185 +++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 youtube_dl/extractor/hketv.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 24361def4..574a47e6d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -452,6 +452,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE +from .hketv import HKETVIE from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py new file mode 100644 index 000000000..b5790cdee --- /dev/null +++ b/youtube_dl/extractor/hketv.py @@ -0,0 +1,185 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + merge_dicts, + str_or_none, + str_to_int, + try_get, + unified_strdate, + urlencode_postdata, + urljoin, +) + + +class HKETVIE(InfoExtractor): + IE_NAME = 'hketv' + IE_DESC = '香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['HK'] + _VALID_URL = r'https?://(?:www\.)?hkedcity\.net/etv/resource/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.hkedcity.net/etv/resource/2932360618', + 'md5': 'f193712f5f7abb208ddef3c5ea6ed0b7', + 'info_dict': { + 'id': '2932360618', + 'ext': 'mp4', + 'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)', + 'description': '本節目輯錄了「閱讀滿Fun嘉年華」和「二○一八響應世界閱讀日――悅愛閱讀・愈讀愈愛」的活動花絮,並由學者、作家、演藝界人士等,分享培養子女閱讀興趣和習慣的方法,以及呼籲大家一同分享閱讀的樂趣。', + 'upload_date': '20181024', + 'duration': 900, + 'subtitles': { + 'en': [{ + 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=en', + 'ext': 'srt', + }], + 'zh-Hant': [{ + 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=qmt', + 'ext': 'srt', + }], + } + }, + }, { + 'url': 'https://www.hkedcity.net/etv/resource/972641418', + 'md5': '1ed494c1c6cf7866a8290edad9b07dc9', + 'info_dict': { + 'id': '972641418', + 'ext': 'mp4', + 'title': '衣冠楚楚 (天使系列之一)', + 'description': '天國仙境,有兩位可愛的天使小姐妹。她們對幾千年來天使衣著一成不變頗有不滿。她們下望人世間:只見人們穿著七彩繽紛、款式各異的服裝,漂亮極了。天使姐妹決定下凡考察衣著,以設計天使新裝。 下到人間,姐妹試穿各式各樣的衣著,引發連串奇特有趣的情節:她們穿著校服在街上閒逛時,被女警誤認為逃學而送回學校,到校後又被體育老師誤認為是新同學,匆匆忙忙換上運動服後在操場上大顯神通。她們穿著護士服在醫院散步時,又被誤認為當班護士,而投入追尋失蹤病童、治病救人的工作中去。姐妹倆還到過玩具店,與布娃娃們談論衣著。她們也去過服裝設計學校,被當成小模特兒而試穿各式服裝。最令姐妹倆興奮的是一場盛大的民族服裝表演會。身穿盛裝的十二個民族的少女在台上翩翩起舞,各種服飾七彩繽紛、美不勝收。姐妹們情不自禁地穿上民族服裝,小天使變成了少數民族姑娘……最後天使姐妹回到天上,對於天使究竟穿甚麼樣的衣服好,她們還是拿不定主意。 節目通過天使姐妹的奇特經歷,反復示範各式衣服鞋襪的正確讀音及談論衣著時的常用句式,並以盛大的民族服裝表演活動,帶出有關服裝的文化知識。內容豐富而饒有趣味。', + 'upload_date': '20070109', + 'duration': 907, + 'subtitles': {}, + }, + }] + + _CC_LANGS = { + '中文(繁體中文)': 'zh-Hant', + '中文(简体中文)': 'zh-Hans', + 'English': 'en', + 'Bahasa Indonesia': 'id', + '\u0939\u093f\u0928\u094d\u0926\u0940': 'hi', + '\u0928\u0947\u092a\u093e\u0932\u0940': 'ne', + 'Tagalog': 'tl', + '\u0e44\u0e17\u0e22': 'th', + '\u0627\u0631\u062f\u0648': 'ur', + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta('ed_title', webpage, fatal=True) + + file_id = self._html_search_regex(r'post_var\["file_id"\]\s*=\s*(.+?);', webpage, 'file ID') + curr_url = self._html_search_regex(r'post_var\["curr_url"\]\s*=\s*"(.+?)";', webpage, 'curr URL') + data = { + 'action': 'get_info', + 'curr_url': curr_url, + 'file_id': file_id, + 'video_url': file_id, + } + _APPS_BASE_URL = 'https://apps.hkedcity.net' + handler_url = _APPS_BASE_URL + '/media/play/handler.php' + + response = self._download_json( + handler_url, video_id, + data=urlencode_postdata(data), + headers=merge_dicts({'Content-Type': 'application/x-www-form-urlencoded'}, + self.geo_verification_headers())) + + result = response['result'] + + formats = [] + subtitles = {} + + if response.get('success') and response.get('access'): + width = int_or_none(result.get('width')) + height = int_or_none(result.get('height')) + + playlist0 = try_get(result, lambda x: x['playlist'][0], dict) + fmts = playlist0.get('sources') + for fmt in fmts: + file_path = fmt.get('file') + if file_path: + file_url = urljoin(_APPS_BASE_URL, file_path) + # If we ever wanted to provide the final resolved URL that + # does not require cookies, albeit with a shorter lifespan: + # urlh = self._downloader.urlopen(file_url) + # resolved_url = urlh.geturl() + + label = fmt.get('label') + w = None + h = None + if label == 'HD': + h = 720 + elif label == 'SD': + h = 360 + if h: + if width and height: + w = h * width // height + else: + w = h * 4 // 3 + + formats.append({ + 'format_id': label, + 'ext': fmt.get('type'), + 'url': file_url, + 'width': w, + 'height': h, + }) + + tracks = playlist0.get('tracks', []) + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = str_or_none(track.get('kind')) + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(_APPS_BASE_URL, track.get('file')) + if not track_url: + continue + track_label = track.get('label') + subtitles.setdefault(self._CC_LANGS.get(track_label, track_label), []).append({ + 'url': self._proto_relative_url(track_url), + 'ext': 'srt', + }) + + else: + error = clean_html(response.get('access_err_msg')) + if 'Video streaming is not available in your country' in error: + self.raise_geo_restricted(msg=error, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError(error) + + # Likes + emotion = self._download_json( + 'https://emocounter.hkedcity.net/handler.php', + video_id, + data=urlencode_postdata({ + 'action': 'get_emotion', + 'data[bucket_id]': 'etv', + 'data[identifier]': video_id, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}, + fatal=False) + like_count = int_or_none(try_get(emotion, lambda x: x['data']['emotion_data'][0]['count'])) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'upload_date': unified_strdate(self._html_search_meta('ed_date', webpage, fatal=False), day_first=False), + 'duration': int_or_none(result.get('length')), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': urljoin(_APPS_BASE_URL, result.get('image')), + 'view_count': str_to_int(result.get('view_count')), + 'like_count': like_count, + } From 73c19aaa9f74e9383a7aaf0dfb3c608727d5b6b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Jan 2019 17:42:25 +0700 Subject: [PATCH 11/24] [hketv] Improve and simplify (closes #18696) --- youtube_dl/extractor/hketv.py | 180 ++++++++++++++++++---------------- 1 file changed, 93 insertions(+), 87 deletions(-) diff --git a/youtube_dl/extractor/hketv.py b/youtube_dl/extractor/hketv.py index b5790cdee..b57927fc1 100644 --- a/youtube_dl/extractor/hketv.py +++ b/youtube_dl/extractor/hketv.py @@ -8,8 +8,8 @@ from ..utils import ( ExtractorError, int_or_none, merge_dicts, + parse_count, str_or_none, - str_to_int, try_get, unified_strdate, urlencode_postdata, @@ -30,20 +30,12 @@ class HKETVIE(InfoExtractor): 'id': '2932360618', 'ext': 'mp4', 'title': '喜閱一生(共享閱讀樂) (中、英文字幕可供選擇)', - 'description': '本節目輯錄了「閱讀滿Fun嘉年華」和「二○一八響應世界閱讀日――悅愛閱讀・愈讀愈愛」的活動花絮,並由學者、作家、演藝界人士等,分享培養子女閱讀興趣和習慣的方法,以及呼籲大家一同分享閱讀的樂趣。', + 'description': 'md5:d5286d05219ef50e0613311cbe96e560', 'upload_date': '20181024', 'duration': 900, - 'subtitles': { - 'en': [{ - 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=en', - 'ext': 'srt', - }], - 'zh-Hant': [{ - 'url': 'https://apps.hkedcity.net/media/mediaplayer/caption.php?f=74395&lang=qmt', - 'ext': 'srt', - }], - } + 'subtitles': 'count:2', }, + 'skip': 'Geo restricted to HK', }, { 'url': 'https://www.hkedcity.net/etv/resource/972641418', 'md5': '1ed494c1c6cf7866a8290edad9b07dc9', @@ -51,11 +43,15 @@ class HKETVIE(InfoExtractor): 'id': '972641418', 'ext': 'mp4', 'title': '衣冠楚楚 (天使系列之一)', - 'description': '天國仙境,有兩位可愛的天使小姐妹。她們對幾千年來天使衣著一成不變頗有不滿。她們下望人世間:只見人們穿著七彩繽紛、款式各異的服裝,漂亮極了。天使姐妹決定下凡考察衣著,以設計天使新裝。 下到人間,姐妹試穿各式各樣的衣著,引發連串奇特有趣的情節:她們穿著校服在街上閒逛時,被女警誤認為逃學而送回學校,到校後又被體育老師誤認為是新同學,匆匆忙忙換上運動服後在操場上大顯神通。她們穿著護士服在醫院散步時,又被誤認為當班護士,而投入追尋失蹤病童、治病救人的工作中去。姐妹倆還到過玩具店,與布娃娃們談論衣著。她們也去過服裝設計學校,被當成小模特兒而試穿各式服裝。最令姐妹倆興奮的是一場盛大的民族服裝表演會。身穿盛裝的十二個民族的少女在台上翩翩起舞,各種服飾七彩繽紛、美不勝收。姐妹們情不自禁地穿上民族服裝,小天使變成了少數民族姑娘……最後天使姐妹回到天上,對於天使究竟穿甚麼樣的衣服好,她們還是拿不定主意。 節目通過天使姐妹的奇特經歷,反復示範各式衣服鞋襪的正確讀音及談論衣著時的常用句式,並以盛大的民族服裝表演活動,帶出有關服裝的文化知識。內容豐富而饒有趣味。', + 'description': 'md5:10bb3d659421e74f58e5db5691627b0f', 'upload_date': '20070109', 'duration': 907, 'subtitles': {}, }, + 'params': { + 'geo_verification_proxy': '', + }, + 'skip': 'Geo restricted to HK', }] _CC_LANGS = { @@ -69,117 +65,127 @@ class HKETVIE(InfoExtractor): '\u0e44\u0e17\u0e22': 'th', '\u0627\u0631\u062f\u0648': 'ur', } + _FORMAT_HEIGHTS = { + 'SD': 360, + 'HD': 720, + } + _APPS_BASE_URL = 'https://apps.hkedcity.net' def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_meta('ed_title', webpage, fatal=True) - file_id = self._html_search_regex(r'post_var\["file_id"\]\s*=\s*(.+?);', webpage, 'file ID') - curr_url = self._html_search_regex(r'post_var\["curr_url"\]\s*=\s*"(.+?)";', webpage, 'curr URL') + title = ( + self._html_search_meta( + ('ed_title', 'search.ed_title'), webpage, default=None) or + self._search_regex( + r'data-favorite_title_(?:eng|chi)=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'title', default=None, group='url') or + self._html_search_regex( + r'

([^<]+)

', webpage, 'title', default=None) or + self._og_search_title(webpage) + ) + + file_id = self._search_regex( + r'post_var\[["\']file_id["\']\s*\]\s*=\s*(.+?);', + webpage, 'file ID') + curr_url = self._search_regex( + r'post_var\[["\']curr_url["\']\s*\]\s*=\s*"(.+?)";', + webpage, 'curr URL') data = { 'action': 'get_info', 'curr_url': curr_url, 'file_id': file_id, 'video_url': file_id, } - _APPS_BASE_URL = 'https://apps.hkedcity.net' - handler_url = _APPS_BASE_URL + '/media/play/handler.php' response = self._download_json( - handler_url, video_id, + self._APPS_BASE_URL + '/media/play/handler.php', video_id, data=urlencode_postdata(data), - headers=merge_dicts({'Content-Type': 'application/x-www-form-urlencoded'}, - self.geo_verification_headers())) + headers=merge_dicts({ + 'Content-Type': 'application/x-www-form-urlencoded'}, + self.geo_verification_headers())) result = response['result'] + if not response.get('success') or not response.get('access'): + error = clean_html(response.get('access_err_msg')) + if 'Video streaming is not available in your country' in error: + self.raise_geo_restricted( + msg=error, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError(error, expected=True) + formats = [] + + width = int_or_none(result.get('width')) + height = int_or_none(result.get('height')) + + playlist0 = result['playlist'][0] + for fmt in playlist0['sources']: + file_url = urljoin(self._APPS_BASE_URL, fmt.get('file')) + if not file_url: + continue + # If we ever wanted to provide the final resolved URL that + # does not require cookies, albeit with a shorter lifespan: + # urlh = self._downloader.urlopen(file_url) + # resolved_url = urlh.geturl() + label = fmt.get('label') + h = self._FORMAT_HEIGHTS.get(label) + w = h * width // height if h and width and height else None + formats.append({ + 'format_id': label, + 'ext': fmt.get('type'), + 'url': file_url, + 'width': w, + 'height': h, + }) + self._sort_formats(formats) + subtitles = {} - - if response.get('success') and response.get('access'): - width = int_or_none(result.get('width')) - height = int_or_none(result.get('height')) - - playlist0 = try_get(result, lambda x: x['playlist'][0], dict) - fmts = playlist0.get('sources') - for fmt in fmts: - file_path = fmt.get('file') - if file_path: - file_url = urljoin(_APPS_BASE_URL, file_path) - # If we ever wanted to provide the final resolved URL that - # does not require cookies, albeit with a shorter lifespan: - # urlh = self._downloader.urlopen(file_url) - # resolved_url = urlh.geturl() - - label = fmt.get('label') - w = None - h = None - if label == 'HD': - h = 720 - elif label == 'SD': - h = 360 - if h: - if width and height: - w = h * width // height - else: - w = h * 4 // 3 - - formats.append({ - 'format_id': label, - 'ext': fmt.get('type'), - 'url': file_url, - 'width': w, - 'height': h, - }) - - tracks = playlist0.get('tracks', []) - for track in tracks: - if not isinstance(track, dict): - continue - track_kind = str_or_none(track.get('kind')) - if not track_kind or not isinstance(track_kind, compat_str): - continue - if track_kind.lower() not in ('captions', 'subtitles'): - continue - track_url = urljoin(_APPS_BASE_URL, track.get('file')) - if not track_url: - continue - track_label = track.get('label') - subtitles.setdefault(self._CC_LANGS.get(track_label, track_label), []).append({ + tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] + for track in tracks: + if not isinstance(track, dict): + continue + track_kind = str_or_none(track.get('kind')) + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): + continue + track_url = urljoin(self._APPS_BASE_URL, track.get('file')) + if not track_url: + continue + track_label = track.get('label') + subtitles.setdefault(self._CC_LANGS.get( + track_label, track_label), []).append({ 'url': self._proto_relative_url(track_url), 'ext': 'srt', }) - else: - error = clean_html(response.get('access_err_msg')) - if 'Video streaming is not available in your country' in error: - self.raise_geo_restricted(msg=error, countries=self._GEO_COUNTRIES) - else: - raise ExtractorError(error) - # Likes emotion = self._download_json( - 'https://emocounter.hkedcity.net/handler.php', - video_id, + 'https://emocounter.hkedcity.net/handler.php', video_id, data=urlencode_postdata({ 'action': 'get_emotion', 'data[bucket_id]': 'etv', 'data[identifier]': video_id, }), headers={'Content-Type': 'application/x-www-form-urlencoded'}, - fatal=False) - like_count = int_or_none(try_get(emotion, lambda x: x['data']['emotion_data'][0]['count'])) + fatal=False) or {} + like_count = int_or_none(try_get( + emotion, lambda x: x['data']['emotion_data'][0]['count'])) return { 'id': video_id, 'title': title, - 'description': self._html_search_meta('description', webpage, fatal=False), - 'upload_date': unified_strdate(self._html_search_meta('ed_date', webpage, fatal=False), day_first=False), + 'description': self._html_search_meta( + 'description', webpage, fatal=False), + 'upload_date': unified_strdate(self._html_search_meta( + 'ed_date', webpage, fatal=False), day_first=False), 'duration': int_or_none(result.get('length')), 'formats': formats, 'subtitles': subtitles, - 'thumbnail': urljoin(_APPS_BASE_URL, result.get('image')), - 'view_count': str_to_int(result.get('view_count')), + 'thumbnail': urljoin(self._APPS_BASE_URL, result.get('image')), + 'view_count': parse_count(result.get('view_count')), 'like_count': like_count, } From a1a460759815414c6194bc921ac77a5533b6e02e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Jan 2019 18:21:31 +0700 Subject: [PATCH 12/24] [vimeo] Fix video password verification for videos protected by Referer HTTP header --- youtube_dl/extractor/vimeo.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index fd37f919b..6215b3258 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -435,6 +435,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, } + # https://gettingthingsdone.com/workflowmap/ + # vimeo embed with check-password page protected by Referer header ] @staticmethod @@ -465,20 +467,22 @@ class VimeoIE(VimeoBaseInfoExtractor): urls = VimeoIE._extract_urls(url, webpage) return urls[0] if urls else None - def _verify_player_video_password(self, url, video_id): + def _verify_player_video_password(self, url, video_id, headers): password = self._downloader.params.get('videopassword') if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') data = urlencode_postdata({ 'password': base64.b64encode(password.encode()), }) - pass_url = url + '/check-password' - password_request = sanitized_Request(pass_url, data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Referer', url) - return self._download_json( - password_request, video_id, - 'Verifying the password', 'Wrong password') + headers = merge_dicts(headers, { + 'Content-Type': 'application/x-www-form-urlencoded', + }) + checked = self._download_json( + url + '/check-password', video_id, + 'Verifying the password', data=data, headers=headers) + if checked is False: + raise ExtractorError('Wrong video password', expected=True) + return checked def _real_initialize(self): self._login() @@ -591,7 +595,7 @@ class VimeoIE(VimeoBaseInfoExtractor): cause=e) else: if config.get('view') == 4: - config = self._verify_player_video_password(redirect_url, video_id) + config = self._verify_player_video_password(redirect_url, video_id, headers) vod = config.get('video', {}).get('vod', {}) From 29cfcb43da8ac60e6c2eddad095a41c800d4d95a Mon Sep 17 00:00:00 2001 From: Alexandre Huot Date: Sun, 20 Jan 2019 06:33:09 -0500 Subject: [PATCH 13/24] [radiocanada] Relax DRM check --- youtube_dl/extractor/radiocanada.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index b952e59b4..302f67d96 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -49,6 +49,16 @@ class RadioCanadaIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, + { + # with protectionType but not actually DRM protected + 'url': 'radiocanada:toutv:140872', + 'info_dict': { + 'id': '140872', + 'title': 'Épisode 1', + 'series': 'District 31', + }, + 'only_matching': True, } ] @@ -67,8 +77,10 @@ class RadioCanadaIE(InfoExtractor): el = find_xpath_attr(metadata, './/Meta', 'name', name) return el.text if el is not None else None + # protectionType does not necessarily mean the video is DRM protected (see + # https://github.com/rg3/youtube-dl/pull/18609). if get_meta('protectionType'): - raise ExtractorError('This video is DRM protected.', expected=True) + self.report_warning('This video is probably DRM protected.') device_types = ['ipad'] if not smuggled_data: From 6945b9e78f38284eb4e440b7badea2fc60b66c2f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Sun, 20 Jan 2019 13:31:41 +0100 Subject: [PATCH 14/24] [extractor/common] improve jwplayer relative url handling(closes #18892) --- youtube_dl/extractor/common.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6e36e6778..95456b291 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2630,7 +2630,7 @@ class InfoExtractor(object): 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, @@ -2657,12 +2657,9 @@ class InfoExtractor(object): for source in jwplayer_sources_data: if not isinstance(source, dict): continue - source_url = self._proto_relative_url(source.get('file')) - if not source_url: - continue - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - if source_url in urls: + source_url = urljoin( + base_url, self._proto_relative_url(source.get('file'))) + if not source_url or source_url in urls: continue urls.append(source_url) source_type = source.get('type') or '' From fad4ceb53404227f471af2f3544c4c14a5df4acb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Jan 2019 20:21:24 +0700 Subject: [PATCH 15/24] [utils] Fix urljoin for paths with non-http(s) schemes --- test/test_utils.py | 2 ++ youtube_dl/utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index 9e28e008f..409482c3b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -507,6 +507,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(urljoin('http://foo.de/', ''), None) self.assertEqual(urljoin('http://foo.de/', ['foobar']), None) self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt') + self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de') + self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de') def test_url_or_none(self): self.assertEqual(url_or_none(None), None) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d2d3c1a9f..d0cb65814 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1868,7 +1868,7 @@ def urljoin(base, path): path = path.decode('utf-8') if not isinstance(path, compat_str) or not path: return None - if re.match(r'^(?:https?:)?//', path): + if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): base = base.decode('utf-8') From 07f9febc4b86a9cd819329f3a7daafdbe9455f40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 20 Jan 2019 22:07:01 +0700 Subject: [PATCH 16/24] [tnaflix] Pass Referer in metadata request (closes #18925) --- youtube_dl/extractor/tnaflix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 6798ef4c3..b3573c6e0 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -96,7 +96,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): cfg_xml = self._download_xml( cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands) + transform_source=fix_xml_ampersands, headers={'Referer': url}) formats = [] From 19d6991312405f5af108af28b3721966720fc72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 22 Jan 2019 03:03:53 +0700 Subject: [PATCH 17/24] [videomore] Improve extraction and fix season extractor (closes #18908) --- youtube_dl/extractor/videomore.py | 96 ++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 9b56630de..e3eda3327 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -4,8 +4,14 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, + orderedSet, + parse_duration, + str_or_none, + unified_strdate, + url_or_none, xpath_element, xpath_text, ) @@ -13,7 +19,19 @@ from ..utils import ( class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' - _VALID_URL = r'videomore:(?P\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P\d+)(?:[/?#&]|\.(?:xml|json)|$)' + _VALID_URL = r'''(?x) + videomore:(?P\d+)$| + https?://(?:player\.)?videomore\.ru/ + (?: + (?: + embed| + [^/]+/[^/]+ + )/| + [^/]*\?.*?\btrack_id= + ) + (?P\d+) + (?:[/?#&]|\.(?:xml|json)|$) + ''' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', 'md5': '44455a346edc0d509ac5b5a5b531dc35', @@ -79,6 +97,9 @@ class VideomoreIE(InfoExtractor): }, { 'url': 'videomore:367617', 'only_matching': True, + }, { + 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', + 'only_matching': True, }] @staticmethod @@ -136,7 +157,7 @@ class VideomoreIE(InfoExtractor): class VideomoreVideoIE(InfoExtractor): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -176,6 +197,9 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', + 'only_matching': True, }] @classmethod @@ -196,13 +220,16 @@ class VideomoreVideoIE(InfoExtractor): r'track-id=["\'](\d+)', r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') video_url = 'videomore:%s' % video_id + else: + video_id = None - return self.url_result(video_url, VideomoreIE.ie_key()) + return self.url_result( + video_url, ie=VideomoreIE.ie_key(), video_id=video_id) class VideomoreSeasonIE(InfoExtractor): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P[^/]+/[^/?#&]+)[/?#&]*$' + _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ 'url': 'http://videomore.ru/molodezhka/sezon_promo', 'info_dict': { @@ -210,8 +237,16 @@ class VideomoreSeasonIE(InfoExtractor): 'title': 'Молодежка Промо', }, 'playlist_mincount': 12, + }, { + 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', + 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return (False if (VideomoreIE.suitable(url) or VideomoreVideoIE.suitable(url)) + else super(VideomoreSeasonIE, cls).suitable(url)) + def _real_extract(self, url): display_id = self._match_id(url) @@ -219,9 +254,54 @@ class VideomoreSeasonIE(InfoExtractor): title = self._og_search_title(webpage) - entries = [ - self.url_result(item) for item in re.findall( - r']+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] + data = self._parse_json( + self._html_search_regex( + r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P{.+?})\1', + webpage, 'data', default='{}', group='value'), + display_id, fatal=False) + + entries = [] + + if data: + episodes = data.get('episodes') + if isinstance(episodes, list): + for ep in episodes: + if not isinstance(ep, dict): + continue + ep_id = int_or_none(ep.get('id')) + ep_url = url_or_none(ep.get('url')) + if ep_id: + e = { + 'url': 'videomore:%s' % ep_id, + 'id': compat_str(ep_id), + } + elif ep_url: + e = {'url': ep_url} + else: + continue + e.update({ + '_type': 'url', + 'ie_key': VideomoreIE.ie_key(), + 'title': str_or_none(ep.get('title')), + 'thumbnail': url_or_none(ep.get('image')), + 'duration': parse_duration(ep.get('duration')), + 'episode_number': int_or_none(ep.get('number')), + 'upload_date': unified_strdate(ep.get('date')), + }) + entries.append(e) + + if not entries: + entries = [ + self.url_result( + 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), + video_id=video_id) + for video_id in orderedSet(re.findall( + r':(?:id|key)=["\'](\d+)["\']', webpage))] + + if not entries: + entries = [ + self.url_result(item) for item in re.findall( + r']+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' + % display_id, webpage)] return self.playlist_result(entries, display_id, title) From 4b85f0f9db9329ef1774a66c3e2fd4da558a5201 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 22 Jan 2019 14:38:29 +0100 Subject: [PATCH 18/24] [vrv] add support for authentication(closes #14307) --- youtube_dl/extractor/vrv.py | 95 ++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 483a3be3a..014513051 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -11,10 +11,12 @@ import time from .common import InfoExtractor from ..compat import ( + compat_HTTPError, compat_urllib_parse_urlencode, compat_urllib_parse, ) from ..utils import ( + ExtractorError, float_or_none, int_or_none, ) @@ -24,29 +26,41 @@ class VRVBaseIE(InfoExtractor): _API_DOMAIN = None _API_PARAMS = {} _CMS_SIGNING = {} + _TOKEN = None + _TOKEN_SECRET = '' def _call_api(self, path, video_id, note, data=None): + # https://tools.ietf.org/html/rfc5849#section-3 base_url = self._API_DOMAIN + '/core/' + path - encoded_query = compat_urllib_parse_urlencode({ + query = { 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), 'oauth_signature_method': 'HMAC-SHA1', 'oauth_timestamp': int(time.time()), - 'oauth_version': '1.0', - }) + } + if self._TOKEN: + query['oauth_token'] = self._TOKEN + encoded_query = compat_urllib_parse_urlencode(query) headers = self.geo_verification_headers() if data: data = json.dumps(data).encode() headers['Content-Type'] = 'application/json' - method = 'POST' if data else 'GET' - base_string = '&'.join([method, compat_urllib_parse.quote(base_url, ''), compat_urllib_parse.quote(encoded_query, '')]) + base_string = '&'.join([ + 'POST' if data else 'GET', + compat_urllib_parse.quote(base_url, ''), + compat_urllib_parse.quote(encoded_query, '')]) oauth_signature = base64.b64encode(hmac.new( - (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), + (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), base_string.encode(), hashlib.sha1).digest()).decode() encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') - return self._download_json( - '?'.join([base_url, encoded_query]), video_id, - note='Downloading %s JSON metadata' % note, headers=headers, data=data) + try: + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + raise ExtractorError(json.loads(e.cause.read().decode())['message'], expected=True) + raise def _call_cms(self, path, video_id, note): if not self._CMS_SIGNING: @@ -55,19 +69,22 @@ class VRVBaseIE(InfoExtractor): self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) - def _set_api_params(self, webpage, video_id): - if not self._API_PARAMS: - self._API_PARAMS = self._parse_json(self._search_regex( - r'window\.__APP_CONFIG__\s*=\s*({.+?})', - webpage, 'api config'), video_id)['cxApiParams'] - self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') - def _get_cms_resource(self, resource_key, video_id): return self._call_api( 'cms_resource', video_id, 'resource path', data={ 'resource_key': resource_key, })['__links__']['cms_resource']['href'] + def _real_initialize(self): + webpage = self._download_webpage( + 'https://vrv.co/', None, headers=self.geo_verification_headers()) + self._API_PARAMS = self._parse_json(self._search_regex( + [ + r'window\.__APP_CONFIG__\s*=\s*({.+?})(?:|;)', + r'window\.__APP_CONFIG__\s*=\s*({.+})' + ], webpage, 'app config'), None)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + class VRVIE(VRVBaseIE): IE_NAME = 'vrv' @@ -86,6 +103,22 @@ class VRVIE(VRVBaseIE): 'skip_download': True, }, }] + _NETRC_MACHINE = 'vrv' + + def _real_initialize(self): + super(VRVIE, self)._real_initialize() + + email, password = self._get_login_info() + if email is None: + return + + token_credentials = self._call_api( + 'authenticate/by:credentials', None, 'Token Credentials', data={ + 'email': email, + 'password': password, + }) + self._TOKEN = token_credentials['oauth_token'] + self._TOKEN_SECRET = token_credentials['oauth_token_secret'] def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): if not url or stream_format not in ('hls', 'dash'): @@ -116,28 +149,16 @@ class VRVIE(VRVBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, - headers=self.geo_verification_headers()) - media_resource = self._parse_json(self._search_regex( - [ - r'window\.__INITIAL_STATE__\s*=\s*({.+?})(?:|;)', - r'window\.__INITIAL_STATE__\s*=\s*({.+})' - ], webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} - video_data = media_resource.get('json') - if not video_data: - self._set_api_params(webpage, video_id) - episode_path = self._get_cms_resource( - 'cms:/episodes/' + video_id, video_id) - video_data = self._call_cms(episode_path, video_id, 'video') + episode_path = self._get_cms_resource( + 'cms:/episodes/' + video_id, video_id) + video_data = self._call_cms(episode_path, video_id, 'video') title = video_data['title'] - streams_json = media_resource.get('streams', {}).get('json', {}) - if not streams_json: - self._set_api_params(webpage, video_id) - streams_path = video_data['__links__']['streams']['href'] - streams_json = self._call_cms(streams_path, video_id, 'streams') + streams_path = video_data['__links__'].get('streams', {}).get('href') + if not streams_path: + self.raise_login_required() + streams_json = self._call_cms(streams_path, video_id, 'streams') audio_locale = streams_json.get('audio_locale') formats = [] @@ -202,11 +223,7 @@ class VRVSeriesIE(VRVBaseIE): def _real_extract(self, url): series_id = self._match_id(url) - webpage = self._download_webpage( - url, series_id, - headers=self.geo_verification_headers()) - self._set_api_params(webpage, series_id) seasons_path = self._get_cms_resource( 'cms:/seasons?series_id=' + series_id, series_id) seasons_data = self._call_cms(seasons_path, series_id, 'seasons') From 503b604a316837b9dd6ef32045e4e9bbfb6a1363 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Tue, 22 Jan 2019 18:21:37 +0100 Subject: [PATCH 19/24] [vrv] fix oauth signing for python 2(#14307) --- youtube_dl/extractor/vrv.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 014513051..6c060ae76 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -32,14 +32,14 @@ class VRVBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, data=None): # https://tools.ietf.org/html/rfc5849#section-3 base_url = self._API_DOMAIN + '/core/' + path - query = { - 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], - 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - 'oauth_signature_method': 'HMAC-SHA1', - 'oauth_timestamp': int(time.time()), - } + query = [ + ('oauth_consumer_key', self._API_PARAMS['oAuthKey']), + ('oauth_nonce', ''.join([random.choice(string.ascii_letters) for _ in range(32)])), + ('oauth_signature_method', 'HMAC-SHA1'), + ('oauth_timestamp', int(time.time())), + ] if self._TOKEN: - query['oauth_token'] = self._TOKEN + query.append(('oauth_token', self._TOKEN)) encoded_query = compat_urllib_parse_urlencode(query) headers = self.geo_verification_headers() if data: From 278d061a0c5eae20963c0a6df4b9b13fd1537186 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Jan 2019 03:51:29 +0700 Subject: [PATCH 20/24] [pornhub] Bypass scrape detection (closes #5930) --- youtube_dl/extractor/pornhub.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index e377de196..f5f3e6593 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -10,7 +10,9 @@ from .common import InfoExtractor from ..compat import ( compat_HTTPError, compat_str, + compat_urllib_request, ) +from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, int_or_none, @@ -126,6 +128,26 @@ class PornHubIE(InfoExtractor): 'only_matching': True, }] + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r']+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + @staticmethod def _extract_urls(webpage): return re.findall( From 6510a3aa971c00525969040ad654249c0c73f125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Jan 2019 03:55:41 +0700 Subject: [PATCH 21/24] [crunchyroll] Extend _VALID_URL (closes #18955) --- youtube_dl/extractor/crunchyroll.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 4a68d092b..5e2cbe41d 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -144,7 +144,7 @@ class CrunchyrollBaseIE(InfoExtractor): class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -269,6 +269,9 @@ class CrunchyrollIE(CrunchyrollBaseIE, VRVIE): }, { 'url': 'http://www.crunchyroll.com/media-723735', 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', + 'only_matching': True, }] _FORMAT_IDS = { From 71a1f61700789fb0d61fc6ad9681b6f0899d2f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Jan 2019 04:12:06 +0700 Subject: [PATCH 22/24] [pornhub] Apply scrape detection bypass for all extractors --- youtube_dl/extractor/pornhub.py | 46 +++++++++++++++++---------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index f5f3e6593..be93d5d48 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -24,7 +24,29 @@ from ..utils import ( ) -class PornHubIE(InfoExtractor): +class PornHubBaseIE(InfoExtractor): + def _download_webpage_handle(self, *args, **kwargs): + def dl(*args, **kwargs): + return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) + + webpage, urlh = dl(*args, **kwargs) + + if any(re.search(p, webpage) for p in ( + r']+\bonload=["\']go\(\)', + r'document\.cookie\s*=\s*["\']RNKEY=', + r'document\.location\.reload\(true\)')): + url_or_request = args[0] + url = (url_or_request.get_full_url() + if isinstance(url_or_request, compat_urllib_request.Request) + else url_or_request) + phantom = PhantomJSwrapper(self, required_version='2.0') + phantom.get(url, html=webpage) + webpage, urlh = dl(*args, **kwargs) + + return webpage, urlh + + +class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' _VALID_URL = r'''(?x) https?:// @@ -128,26 +150,6 @@ class PornHubIE(InfoExtractor): 'only_matching': True, }] - def _download_webpage_handle(self, *args, **kwargs): - def dl(*args, **kwargs): - return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs) - - webpage, urlh = dl(*args, **kwargs) - - if any(re.search(p, webpage) for p in ( - r']+\bonload=["\']go\(\)', - r'document\.cookie\s*=\s*["\']RNKEY=', - r'document\.location\.reload\(true\)')): - url_or_request = args[0] - url = (url_or_request.get_full_url() - if isinstance(url_or_request, compat_urllib_request.Request) - else url_or_request) - phantom = PhantomJSwrapper(self, required_version='2.0') - phantom.get(url, html=webpage) - webpage, urlh = dl(*args, **kwargs) - - return webpage, urlh - @staticmethod def _extract_urls(webpage): return re.findall( @@ -329,7 +331,7 @@ class PornHubIE(InfoExtractor): } -class PornHubPlaylistBaseIE(InfoExtractor): +class PornHubPlaylistBaseIE(PornHubBaseIE): def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see From 0670bdd8f2bca39fdeadc63e1cd53b5d5b3e638c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Jan 2019 04:43:55 +0700 Subject: [PATCH 23/24] [ChangeLog] Actualize [ci skip] --- ChangeLog | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/ChangeLog b/ChangeLog index 902301765..687796a0e 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,32 @@ +version + +Core +* [utils] Fix urljoin for paths with non-http(s) schemes +* [extractor/common] Improve jwplayer relative URL handling (#18892) ++ [YoutubeDL] Add negation support for string comparisons in format selection + expressions (#18600, #18805) +* [extractor/common] Improve HLS video-only format detection (#18923) + +Extractors +* [crunchyroll] Extend URL regular expression (#18955) +* [pornhub] Bypass scrape detection (#4822, #5930, #7074, #10175, #12722, + #17197, #18338 #18842, #18899) ++ [vrv] Add support for authentication (#14307) +* [videomore:season] Fix extraction +* [videomore] Improve extraction (#18908) ++ [tnaflix] Pass Referer in metadata request (#18925) +* [radiocanada] Relax DRM check (#18608, #18609) +* [vimeo] Fix video password verification for videos protected by + Referer HTTP header ++ [hketv] Add support for hkedcity.net (#18696) ++ [streamango] Add support for fruithosts.net (#18710) ++ [instagram] Add support for tags (#18757) ++ [odnoklassniki] Detect paid videos (#18876) +* [ted] Correct acodec for HTTP formats (#18923) +* [cartoonnetwork] Fix extraction (#15664, #17224) +* [vimeo] Fix extraction for password protected player URLs (#18889) + + version 2019.01.17 Extractors From 435e382423f860aca82a58d7c3db58cbfa242b40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 23 Jan 2019 04:46:55 +0700 Subject: [PATCH 24/24] release 2019.01.23 --- .github/ISSUE_TEMPLATE.md | 6 +++--- ChangeLog | 2 +- docs/supportedsites.md | 2 ++ youtube_dl/version.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 841bca914..db3ebaeed 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.17*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.17** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.01.23*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.01.23** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2019.01.17 +[debug] youtube-dl version 2019.01.23 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/ChangeLog b/ChangeLog index 687796a0e..55ad44315 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,4 @@ -version +version 2019.01.23 Core * [utils] Fix urljoin for paths with non-http(s) schemes diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c01409419..d759d0273 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -361,6 +361,7 @@ - **hitbox** - **hitbox:live** - **HitRecord** + - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - **HornBunny** - **HotNewHipHop** - **hotstar** @@ -386,6 +387,7 @@ - **IndavideoEmbed** - **InfoQ** - **Instagram** + - **instagram:tag**: Instagram hashtag search - **instagram:user**: Instagram user profile - **Internazionale** - **InternetVideoArchive** diff --git a/youtube_dl/version.py b/youtube_dl/version.py index ea3f62928..d77949eed 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2019.01.17' +__version__ = '2019.01.23'