From 3395958d2befc710181bbde872074ce81eee9158 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Tue, 20 Mar 2018 23:07:11 +0100 Subject: [PATCH 1/4] libsyn: adapt to new page structure and replace testcase --- youtube_dl/extractor/libsyn.py | 52 +++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py index 4750b03a3..f7311f483 100644 --- a/youtube_dl/extractor/libsyn.py +++ b/youtube_dl/extractor/libsyn.py @@ -1,24 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + parse_duration, + unified_strdate, +) class LibsynIE(InfoExtractor): _VALID_URL = r'(?Phttps?://html5-player\.libsyn\.com/embed/episode/id/(?P[0-9]+))' _TESTS = [{ - 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', - 'md5': '443360ee1b58007bc3dcf09b41d093bb', + 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', + 'md5': '2a55e75496c790cdeb058e7e6c087746', 'info_dict': { - 'id': '3377616', + 'id': '6385796', 'ext': 'mp3', - 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", - 'description': 'md5:601cb790edd05908957dae8aaa866465', - 'upload_date': '20150220', + 'title': "Champion Minded - Developing a Growth Mindset", + 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.', + 'upload_date': '20180320', 'thumbnail': 're:^https?://.*', }, }, { @@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor): url = m.group('mainurl') webpage = self._download_webpage(url, video_id) - formats = [{ - 'url': media_url, - } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))] - podcast_title = self._search_regex( - r'

([^<]+)

', webpage, 'podcast title', default=None) + r'

([^<]+)

', webpage, 'podcast title', default=None) + if podcast_title: + podcast_title = podcast_title.strip() episode_title = self._search_regex( - r'(?:
|

)([^<]+)|

)([^<]+)(.+?)

', webpage, + r'(.+?)

', webpage, 'description', default=None) - thumbnail = self._search_regex( - r']+class="info-show-icon"[^>]+src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + if description: + # Strip non-breaking and normal spaces + description = description.replace('\u00A0', ' ').strip() release_date = unified_strdate(self._search_regex( r'
Released: ([^<]+)<', webpage, 'release date', fatal=False)) + data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block') + data = json.loads(data_json) + + formats = [{ + 'url': data['media_url'], + 'format_id': 'main', + }, { + 'url': data['media_url_libsyn'], + 'format_id': 'libsyn', + }] + thumbnail = data.get('thumbnail_url') + duration = parse_duration(data.get('duration')) + return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': release_date, + 'duration': duration, 'formats': formats, } From 328ddf56a151830ae002842b7088464e4e391b5d Mon Sep 17 00:00:00 2001 From: Vijay Singh Date: Wed, 21 Mar 2018 12:13:31 +0530 Subject: [PATCH 2/4] [Youku] Update ccode --- youtube_dl/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 5b0b248cd..2f5a7b023 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0507', + 'ccode': '0590', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From cba5d1b6b36d79fcafe0600d9805e6b82ed5388f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Wed, 21 Mar 2018 23:43:03 +0700 Subject: [PATCH 3/4] [instagram:user] Add pagination (closes #15934) --- youtube_dl/extractor/instagram.py | 96 +++++++++++++++++++------------ 1 file changed, 59 insertions(+), 37 deletions(-) diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index ac9d92a8d..f9cd11b8e 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import itertools import json import re @@ -242,48 +243,69 @@ class InstagramUserIE(InfoExtractor): return int_or_none(try_get( node, lambda x: x['edge_media_' + suffix]['count'])) - edges = self._download_json( - 'https://www.instagram.com/graphql/query/', uploader_id, query={ - 'query_hash': '472f257a40c653c64c666ce877d59d2b', - 'variables': json.dumps({ - 'id': uploader_id, - 'first': 999999999, + cursor = '' + for page_num in itertools.count(1): + media = self._download_json( + 'https://www.instagram.com/graphql/query/', uploader_id, + 'Downloading JSON page %d' % page_num, query={ + 'query_hash': '472f257a40c653c64c666ce877d59d2b', + 'variables': json.dumps({ + 'id': uploader_id, + 'first': 100, + 'after': cursor, + }) + })['data']['user']['edge_owner_to_timeline_media'] + + edges = media.get('edges') + if not edges or not isinstance(edges, list): + break + + for edge in edges: + node = edge.get('node') + if not node or not isinstance(node, dict): + continue + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + video_id = node.get('shortcode') + if not video_id: + continue + + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) + + description = try_get( + node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('taken_at_timestamp')) + + comment_count = get_count('to_comment') + like_count = get_count('preview_like') + view_count = int_or_none(node.get('video_view_count')) + + info.update({ + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'comment_count': comment_count, + 'like_count': like_count, + 'view_count': view_count, }) - })['data']['user']['edge_owner_to_timeline_media']['edges'] - for edge in edges: - node = edge['node'] + yield info - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue + page_info = media.get('page_info') + if not page_info or not isinstance(page_info, dict): + break - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) + has_next_page = page_info.get('has_next_page') + if not has_next_page: + break - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info + cursor = page_info.get('end_cursor') + if not cursor or not isinstance(cursor, compat_str): + break def _real_extract(self, url): username = self._match_id(url) From 8b7340a45eb0e3aeaa996896ff8690b6c3a32af6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Thu, 22 Mar 2018 22:55:28 +0700 Subject: [PATCH 4/4] [lenta] Add extractor (closes #15953) --- youtube_dl/extractor/extractors.py | 5 +-- youtube_dl/extractor/generic.py | 18 ---------- youtube_dl/extractor/lenta.py | 53 ++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 20 deletions(-) create mode 100644 youtube_dl/extractor/lenta.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3bde40eb3..de48a37ad 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -532,13 +532,14 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE -from .lego import LEGOIE -from .lemonde import LemondeIE from .leeco import ( LeIE, LePlaylistIE, LetvCloudIE, ) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE from .libraryofcongress import LibraryOfCongressIE from .libsyn import LibsynIE from .lifenews import ( diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1cc491b19..cf64398e3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1270,24 +1270,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # EaglePlatform embed (generic URL) - { - 'url': 'http://lenta.ru/news/2015/03/06/navalny/', - # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used - 'info_dict': { - 'id': '227304', - 'ext': 'mp4', - 'title': 'Навальный вышел на свободу', - 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 87, - 'view_count': int, - 'age_limit': 0, - }, - 'params': { - 'skip_download': True, - }, - }, # referrer protected EaglePlatform embed { 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', diff --git a/youtube_dl/extractor/lenta.py b/youtube_dl/extractor/lenta.py new file mode 100644 index 000000000..2ebd4e577 --- /dev/null +++ b/youtube_dl/extractor/lenta.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class LentaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/', + 'info_dict': { + 'id': '964400', + 'ext': 'mp4', + 'title': 'Надежду Савченко задержали', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 61, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { + # EaglePlatform iframe embed + 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + 'info_dict': { + 'id': '227304', + 'ext': 'mp4', + 'title': 'Навальный вышел на свободу', + 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 87, + 'view_count': int, + 'age_limit': 0, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id', + default=None) + if video_id: + return self.url_result( + 'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id, + ie='EaglePlatform', video_id=video_id) + + return self.url_result(url, ie='Generic')