Merge remote-tracking branch 'upstream/master' into XVideos-issue-15799

2018-03-22 15:10:37 -05:00 · 2018-03-22 15:10:37 -05:00 · b3c7aea8ac
commit b3c7aea8ac
parent e2845da44e 8b7340a45e
6 changed files with 151 additions and 75 deletions
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -532,13 +532,14 @@ from .lcp import (
 )
 from .learnr import LearnrIE
 from .lecture2go import Lecture2GoIE
 from .lego import LEGOIE
 from .lemonde import LemondeIE
 from .leeco import (
    LeIE,
    LePlaylistIE,
    LetvCloudIE,
 )
 from .lego import LEGOIE
 from .lemonde import LemondeIE
 from .lenta import LentaIE
 from .libraryofcongress import LibraryOfCongressIE
 from .libsyn import LibsynIE
 from .lifenews import (
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -1270,24 +1270,6 @@ class GenericIE(InfoExtractor):
            },
            'add_ie': ['Kaltura'],
        },
        # EaglePlatform embed (generic URL)
        {
            'url': 'http://lenta.ru/news/2015/03/06/navalny/',
            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
            'info_dict': {
                'id': '227304',
                'ext': 'mp4',
                'title': 'Навальный вышел на свободу',
                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 87,
                'view_count': int,
                'age_limit': 0,
            },
            'params': {
                'skip_download': True,
            },
        },
        # referrer protected EaglePlatform embed
        {
            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 import itertools
 import json
 import re
@ -242,18 +243,27 @@ class InstagramUserIE(InfoExtractor):
            return int_or_none(try_get(
                node, lambda x: x['edge_media_' + suffix]['count']))
-        edges = self._download_json(
+        cursor = ''
-            'https://www.instagram.com/graphql/query/', uploader_id, query={
+        for page_num in itertools.count(1):
            media = self._download_json(
                'https://www.instagram.com/graphql/query/', uploader_id,
                'Downloading JSON page %d' % page_num, query={
                    'query_hash': '472f257a40c653c64c666ce877d59d2b',
                    'variables': json.dumps({
                        'id': uploader_id,
-                    'first': 999999999,
+                        'first': 100,
                        'after': cursor,
                    })
-            })['data']['user']['edge_owner_to_timeline_media']['edges']
+                })['data']['user']['edge_owner_to_timeline_media']
            edges = media.get('edges')
            if not edges or not isinstance(edges, list):
                break
            for edge in edges:
-            node = edge['node']
+                node = edge.get('node')
-
+                if not node or not isinstance(node, dict):
                    continue
                if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
                    continue
                video_id = node.get('shortcode')
@ -285,6 +295,18 @@ class InstagramUserIE(InfoExtractor):
                yield info
            page_info = media.get('page_info')
            if not page_info or not isinstance(page_info, dict):
                break
            has_next_page = page_info.get('has_next_page')
            if not has_next_page:
                break
            cursor = page_info.get('end_cursor')
            if not cursor or not isinstance(cursor, compat_str):
                break
    def _real_extract(self, url):
        username = self._match_id(url)
        uploader_id = self._download_json(
--- a/youtube_dl/extractor/lenta.py
+++ b/youtube_dl/extractor/lenta.py
@ -0,0 +1,53 @@
 # coding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 class LentaIE(InfoExtractor):
    """Extractor for lenta.ru article pages.

    The article page is downloaded and searched for a numeric
    EaglePlatform video id. When one is present, extraction is delegated
    to the EaglePlatform extractor on the lentaru.media.eagleplatform.com
    host; otherwise the original URL is handed to the Generic extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)'
    _TESTS = [{
        # Article whose page carries the EaglePlatform id directly
        'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/',
        'info_dict': {
            'id': '964400',
            'ext': 'mp4',
            'title': 'Надежду Савченко задержали',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 61,
            'view_count': int,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # EaglePlatform iframe embed
        'url': 'http://lenta.ru/news/2015/03/06/navalny/',
        'info_dict': {
            'id': '227304',
            'ext': 'mp4',
            'title': 'Навальный вышел на свободу',
            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 87,
            'view_count': int,
            'age_limit': 0,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Return a url_result that delegates extraction of ``url``.

        The result points at the EaglePlatform extractor when a numeric
        id is found in the page, and at the Generic extractor otherwise.
        """
        article_slug = self._match_id(url)
        page = self._download_webpage(url, article_slug)

        eagle_id = self._search_regex(
            r'vid\s*:\s*["\']?(\d+)', page, 'eagleplatform id',
            default=None)
        if not eagle_id:
            # No EaglePlatform id in the page: let the Generic extractor
            # look for any other embed.
            return self.url_result(url, ie='Generic')

        eagle_url = (
            'eagleplatform:lentaru.media.eagleplatform.com:%s' % eagle_id)
        return self.url_result(
            eagle_url, ie='EaglePlatform', video_id=eagle_id)
--- a/youtube_dl/extractor/libsyn.py
+++ b/youtube_dl/extractor/libsyn.py
@ -1,24 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import json
 import re
 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
    parse_duration,
    unified_strdate,
 )
 class LibsynIE(InfoExtractor):
    _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
    _TESTS = [{
-        'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
+        'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
-        'md5': '443360ee1b58007bc3dcf09b41d093bb',
+        'md5': '2a55e75496c790cdeb058e7e6c087746',
        'info_dict': {
-            'id': '3377616',
+            'id': '6385796',
            'ext': 'mp3',
-            'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
+            'title': "Champion Minded - Developing a Growth Mindset",
-            'description': 'md5:601cb790edd05908957dae8aaa866465',
+            'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
-            'upload_date': '20150220',
+            'upload_date': '20180320',
            'thumbnail': 're:^https?://.*',
        },
    }, {
@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor):
        url = m.group('mainurl')
        webpage = self._download_webpage(url, video_id)
        formats = [{
            'url': media_url,
        } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
        podcast_title = self._search_regex(
-            r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None)
+            r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
        if podcast_title:
            podcast_title = podcast_title.strip()
        episode_title = self._search_regex(
-            r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title')
+            r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
        if episode_title:
            episode_title = episode_title.strip()
        title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
        description = self._html_search_regex(
-            r'<div id="info_text_body">(.+?)</div>', webpage,
+            r'<p\s+id="info_text_body">(.+?)</p>', webpage,
            'description', default=None)
-        thumbnail = self._search_regex(
+        if description:
-            r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
+            # Strip non-breaking and normal spaces
-            webpage, 'thumbnail', fatal=False)
+            description = description.replace('\u00A0', ' ').strip()
        release_date = unified_strdate(self._search_regex(
            r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
        data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
        data = json.loads(data_json)
        formats = [{
            'url': data['media_url'],
            'format_id': 'main',
        }, {
            'url': data['media_url_libsyn'],
            'format_id': 'libsyn',
        }]
        thumbnail = data.get('thumbnail_url')
        duration = parse_duration(data.get('duration'))
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'upload_date': release_date,
            'duration': duration,
            'formats': formats,
        }
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor):
        # request basic data
        basic_data_params = {
            'vid': video_id,
-            'ccode': '0507',
+            'ccode': '0590',
            'client_ip': '192.168.1.1',
            'utid': cna,
            'client_ts': time.time() / 1000,