Merge remote-tracking branch 'upstream/master' into XVideos-issue-15799

2018-03-22 15:10:37 -05:00 · 2018-03-22 15:10:37 -05:00 · b3c7aea8ac
commit b3c7aea8ac
parent e2845da44e 8b7340a45e
6 changed files with 151 additions and 75 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -532,13 +532,14 @@ from .lcp import (
 )
 from .learnr import LearnrIE
 from .lecture2go import Lecture2GoIE
-from .lego import LEGOIE
-from .lemonde import LemondeIE
 from .leeco import (
    LeIE,
    LePlaylistIE,
    LetvCloudIE,
 )
+from .lego import LEGOIE
+from .lemonde import LemondeIE
+from .lenta import LentaIE
 from .libraryofcongress import LibraryOfCongressIE
 from .libsyn import LibsynIE
 from .lifenews import (
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1270,24 +1270,6 @@ class GenericIE(InfoExtractor):
            },
            'add_ie': ['Kaltura'],
        },
-        # EaglePlatform embed (generic URL)
-        {
-            'url': 'http://lenta.ru/news/2015/03/06/navalny/',
-            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
-            'info_dict': {
-                'id': '227304',
-                'ext': 'mp4',
-                'title': 'Навальный вышел на свободу',
-                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'duration': 87,
-                'view_count': int,
-                'age_limit': 0,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
        # referrer protected EaglePlatform embed
        {
            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals

+import itertools
 import json
 import re

@@ -242,48 +243,69 @@ class InstagramUserIE(InfoExtractor):
            return int_or_none(try_get(
                node, lambda x: x['edge_media_' + suffix]['count']))

-        edges = self._download_json(
-            'https://www.instagram.com/graphql/query/', uploader_id, query={
-                'query_hash': '472f257a40c653c64c666ce877d59d2b',
-                'variables': json.dumps({
-                    'id': uploader_id,
-                    'first': 999999999,
+        cursor = ''
+        for page_num in itertools.count(1):
+            media = self._download_json(
+                'https://www.instagram.com/graphql/query/', uploader_id,
+                'Downloading JSON page %d' % page_num, query={
+                    'query_hash': '472f257a40c653c64c666ce877d59d2b',
+                    'variables': json.dumps({
+                        'id': uploader_id,
+                        'first': 100,
+                        'after': cursor,
+                    })
+                })['data']['user']['edge_owner_to_timeline_media']
+
+            edges = media.get('edges')
+            if not edges or not isinstance(edges, list):
+                break
+
+            for edge in edges:
+                node = edge.get('node')
+                if not node or not isinstance(node, dict):
+                    continue
+                if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
+                    continue
+                video_id = node.get('shortcode')
+                if not video_id:
+                    continue
+
+                info = self.url_result(
+                    'https://instagram.com/p/%s/' % video_id,
+                    ie=InstagramIE.ie_key(), video_id=video_id)
+
+                description = try_get(
+                    node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                    compat_str)
+                thumbnail = node.get('thumbnail_src') or node.get('display_src')
+                timestamp = int_or_none(node.get('taken_at_timestamp'))
+
+                comment_count = get_count('to_comment')
+                like_count = get_count('preview_like')
+                view_count = int_or_none(node.get('video_view_count'))
+
+                info.update({
+                    'description': description,
+                    'thumbnail': thumbnail,
+                    'timestamp': timestamp,
+                    'comment_count': comment_count,
+                    'like_count': like_count,
+                    'view_count': view_count,
                })
-            })['data']['user']['edge_owner_to_timeline_media']['edges']

-        for edge in edges:
-            node = edge['node']
+                yield info

-            if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
-                continue
-            video_id = node.get('shortcode')
-            if not video_id:
-                continue
+            page_info = media.get('page_info')
+            if not page_info or not isinstance(page_info, dict):
+                break

-            info = self.url_result(
-                'https://instagram.com/p/%s/' % video_id,
-                ie=InstagramIE.ie_key(), video_id=video_id)
+            has_next_page = page_info.get('has_next_page')
+            if not has_next_page:
+                break

-            description = try_get(
-                node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
-                compat_str)
-            thumbnail = node.get('thumbnail_src') or node.get('display_src')
-            timestamp = int_or_none(node.get('taken_at_timestamp'))
-
-            comment_count = get_count('to_comment')
-            like_count = get_count('preview_like')
-            view_count = int_or_none(node.get('video_view_count'))
-
-            info.update({
-                'description': description,
-                'thumbnail': thumbnail,
-                'timestamp': timestamp,
-                'comment_count': comment_count,
-                'like_count': like_count,
-                'view_count': view_count,
-            })
-
-            yield info
+            cursor = page_info.get('end_cursor')
+            if not cursor or not isinstance(cursor, compat_str):
+                break

    def _real_extract(self, url):
        username = self._match_id(url)
--- a/youtube_dl/extractor/lenta.py
+++ b/youtube_dl/extractor/lenta.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LentaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/',
+        'info_dict': {
+            'id': '964400',
+            'ext': 'mp4',
+            'title': 'Надежду Савченко задержали',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 61,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # EaglePlatform iframe embed
+        'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+        'info_dict': {
+            'id': '227304',
+            'ext': 'mp4',
+            'title': 'Навальный вышел на свободу',
+            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 87,
+            'view_count': int,
+            'age_limit': 0,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id',
+            default=None)
+        if video_id:
+            return self.url_result(
+                'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id,
+                ie='EaglePlatform', video_id=video_id)
+
+        return self.url_result(url, ie='Generic')
--- a/youtube_dl/extractor/libsyn.py
+++ b/youtube_dl/extractor/libsyn.py
@@ -1,24 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import json
 import re

 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+    parse_duration,
+    unified_strdate,
+)


 class LibsynIE(InfoExtractor):
    _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'

    _TESTS = [{
-        'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
-        'md5': '443360ee1b58007bc3dcf09b41d093bb',
+        'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
+        'md5': '2a55e75496c790cdeb058e7e6c087746',
        'info_dict': {
-            'id': '3377616',
+            'id': '6385796',
            'ext': 'mp3',
-            'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
-            'description': 'md5:601cb790edd05908957dae8aaa866465',
-            'upload_date': '20150220',
+            'title': "Champion Minded - Developing a Growth Mindset",
+            'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
+            'upload_date': '20180320',
            'thumbnail': 're:^https?://.*',
        },
    }, {
@@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor):
        url = m.group('mainurl')
        webpage = self._download_webpage(url, video_id)

-        formats = [{
-            'url': media_url,
-        } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
-
        podcast_title = self._search_regex(
-            r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None)
+            r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
+        if podcast_title:
+            podcast_title = podcast_title.strip()
        episode_title = self._search_regex(
-            r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title')
+            r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
+        if episode_title:
+            episode_title = episode_title.strip()

        title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title

        description = self._html_search_regex(
-            r'<div id="info_text_body">(.+?)</div>', webpage,
+            r'<p\s+id="info_text_body">(.+?)</p>', webpage,
            'description', default=None)
-        thumbnail = self._search_regex(
-            r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
+        if description:
+            # Strip non-breaking and normal spaces
+            description = description.replace('\u00A0', ' ').strip()
        release_date = unified_strdate(self._search_regex(
            r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))

+        data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
+        data = json.loads(data_json)
+
+        formats = [{
+            'url': data['media_url'],
+            'format_id': 'main',
+        }, {
+            'url': data['media_url_libsyn'],
+            'format_id': 'libsyn',
+        }]
+        thumbnail = data.get('thumbnail_url')
+        duration = parse_duration(data.get('duration'))
+
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': release_date,
+            'duration': duration,
            'formats': formats,
        }
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor):
        # request basic data
        basic_data_params = {
            'vid': video_id,
-            'ccode': '0507',
+            'ccode': '0590',
            'client_ip': '192.168.1.1',
            'utid': cna,
            'client_ts': time.time() / 1000,