From 3395958d2befc710181bbde872074ce81eee9158 Mon Sep 17 00:00:00 2001
From: Philipp Hagemeister <phihag@phihag.de>
Date: Tue, 20 Mar 2018 23:07:11 +0100
Subject: [PATCH 1/4] [libsyn] Adapt to new page structure and replace test case

---
 youtube_dl/extractor/libsyn.py | 52 +++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 17 deletions(-)
diff --git a/youtube_dl/extractor/libsyn.py b/youtube_dl/extractor/libsyn.py
index 4750b03a3..f7311f483 100644
--- a/youtube_dl/extractor/libsyn.py
+++ b/youtube_dl/extractor/libsyn.py
@@ -1,24 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
 import re
 
 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+    parse_duration,
+    unified_strdate,
+)
 
 
 class LibsynIE(InfoExtractor):
     _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
 
     _TESTS = [{
-        'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
-        'md5': '443360ee1b58007bc3dcf09b41d093bb',
+        'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
+        'md5': '2a55e75496c790cdeb058e7e6c087746',
         'info_dict': {
-            'id': '3377616',
+            'id': '6385796',
             'ext': 'mp3',
-            'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
-            'description': 'md5:601cb790edd05908957dae8aaa866465',
-            'upload_date': '20150220',
+            'title': "Champion Minded - Developing a Growth Mindset",
+            'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
+            'upload_date': '20180320',
             'thumbnail': 're:^https?://.*',
         },
     }, {
@@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor):
         url = m.group('mainurl')
         webpage = self._download_webpage(url, video_id)
 
-        formats = [{
-            'url': media_url,
-        } for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
-
         podcast_title = self._search_regex(
-            r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None)
+            r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
+        if podcast_title:
+            podcast_title = podcast_title.strip()
         episode_title = self._search_regex(
-            r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title')
+            r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
+        if episode_title:
+            episode_title = episode_title.strip()
 
         title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
 
         description = self._html_search_regex(
-            r'<div id="info_text_body">(.+?)</div>', webpage,
+            r'<p\s+id="info_text_body">(.+?)</p>', webpage,
             'description', default=None)
-        thumbnail = self._search_regex(
-            r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
-            webpage, 'thumbnail', fatal=False)
+        if description:
+            # Convert non-breaking spaces to regular spaces, then trim both ends
+            description = description.replace('\u00A0', ' ').strip()
         release_date = unified_strdate(self._search_regex(
             r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
 
+        data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
+        data = json.loads(data_json)
+
+        formats = [{
+            'url': data['media_url'],
+            'format_id': 'main',
+        }, {
+            'url': data['media_url_libsyn'],
+            'format_id': 'libsyn',
+        }]
+        thumbnail = data.get('thumbnail_url')
+        duration = parse_duration(data.get('duration'))
+
         return {
             'id': video_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
             'upload_date': release_date,
+            'duration': duration,
             'formats': formats,
         }

From 328ddf56a151830ae002842b7088464e4e391b5d Mon Sep 17 00:00:00 2001
From: Vijay Singh <sudovijay@users.noreply.github.com>
Date: Wed, 21 Mar 2018 12:13:31 +0530
Subject: [PATCH 2/4] [Youku] Update ccode

---
 youtube_dl/extractor/youku.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index 5b0b248cd..2f5a7b023 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor):
         # request basic data
         basic_data_params = {
             'vid': video_id,
-            'ccode': '0507',
+            'ccode': '0590',
             'client_ip': '192.168.1.1',
             'utid': cna,
             'client_ts': time.time() / 1000,

From cba5d1b6b36d79fcafe0600d9805e6b82ed5388f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Wed, 21 Mar 2018 23:43:03 +0700
Subject: [PATCH 3/4] [instagram:user] Add pagination (closes #15934)

---
 youtube_dl/extractor/instagram.py | 96 +++++++++++++++++++------------
 1 file changed, 59 insertions(+), 37 deletions(-)

diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index ac9d92a8d..f9cd11b8e 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 
+import itertools
 import json
 import re
 
@@ -242,48 +243,69 @@ class InstagramUserIE(InfoExtractor):
             return int_or_none(try_get(
                 node, lambda x: x['edge_media_' + suffix]['count']))
 
-        edges = self._download_json(
-            'https://www.instagram.com/graphql/query/', uploader_id, query={
-                'query_hash': '472f257a40c653c64c666ce877d59d2b',
-                'variables': json.dumps({
-                    'id': uploader_id,
-                    'first': 999999999,
+        cursor = ''
+        for page_num in itertools.count(1):
+            media = self._download_json(
+                'https://www.instagram.com/graphql/query/', uploader_id,
+                'Downloading JSON page %d' % page_num, query={
+                    'query_hash': '472f257a40c653c64c666ce877d59d2b',
+                    'variables': json.dumps({
+                        'id': uploader_id,
+                        'first': 100,
+                        'after': cursor,
+                    })
+                })['data']['user']['edge_owner_to_timeline_media']
+
+            edges = media.get('edges')
+            if not edges or not isinstance(edges, list):
+                break
+
+            for edge in edges:
+                node = edge.get('node')
+                if not node or not isinstance(node, dict):
+                    continue
+                if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
+                    continue
+                video_id = node.get('shortcode')
+                if not video_id:
+                    continue
+
+                info = self.url_result(
+                    'https://instagram.com/p/%s/' % video_id,
+                    ie=InstagramIE.ie_key(), video_id=video_id)
+
+                description = try_get(
+                    node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                    compat_str)
+                thumbnail = node.get('thumbnail_src') or node.get('display_src')
+                timestamp = int_or_none(node.get('taken_at_timestamp'))
+
+                comment_count = get_count('to_comment')
+                like_count = get_count('preview_like')
+                view_count = int_or_none(node.get('video_view_count'))
+
+                info.update({
+                    'description': description,
+                    'thumbnail': thumbnail,
+                    'timestamp': timestamp,
+                    'comment_count': comment_count,
+                    'like_count': like_count,
+                    'view_count': view_count,
                 })
-            })['data']['user']['edge_owner_to_timeline_media']['edges']
 
-        for edge in edges:
-            node = edge['node']
+                yield info
 
-            if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
-                continue
-            video_id = node.get('shortcode')
-            if not video_id:
-                continue
+            page_info = media.get('page_info')
+            if not page_info or not isinstance(page_info, dict):
+                break
 
-            info = self.url_result(
-                'https://instagram.com/p/%s/' % video_id,
-                ie=InstagramIE.ie_key(), video_id=video_id)
+            has_next_page = page_info.get('has_next_page')
+            if not has_next_page:
+                break
 
-            description = try_get(
-                node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
-                compat_str)
-            thumbnail = node.get('thumbnail_src') or node.get('display_src')
-            timestamp = int_or_none(node.get('taken_at_timestamp'))
-
-            comment_count = get_count('to_comment')
-            like_count = get_count('preview_like')
-            view_count = int_or_none(node.get('video_view_count'))
-
-            info.update({
-                'description': description,
-                'thumbnail': thumbnail,
-                'timestamp': timestamp,
-                'comment_count': comment_count,
-                'like_count': like_count,
-                'view_count': view_count,
-            })
-
-            yield info
+            cursor = page_info.get('end_cursor')
+            if not cursor or not isinstance(cursor, compat_str):
+                break
 
     def _real_extract(self, url):
         username = self._match_id(url)

From 8b7340a45eb0e3aeaa996896ff8690b6c3a32af6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Thu, 22 Mar 2018 22:55:28 +0700
Subject: [PATCH 4/4] [lenta] Add extractor (closes #15953)

---
 youtube_dl/extractor/extractors.py |  5 +--
 youtube_dl/extractor/generic.py    | 18 ----------
 youtube_dl/extractor/lenta.py      | 53 ++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 20 deletions(-)
 create mode 100644 youtube_dl/extractor/lenta.py

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 3bde40eb3..de48a37ad 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -532,13 +532,14 @@ from .lcp import (
 )
 from .learnr import LearnrIE
 from .lecture2go import Lecture2GoIE
-from .lego import LEGOIE
-from .lemonde import LemondeIE
 from .leeco import (
     LeIE,
     LePlaylistIE,
     LetvCloudIE,
 )
+from .lego import LEGOIE
+from .lemonde import LemondeIE
+from .lenta import LentaIE
 from .libraryofcongress import LibraryOfCongressIE
 from .libsyn import LibsynIE
 from .lifenews import (
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 1cc491b19..cf64398e3 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1270,24 +1270,6 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Kaltura'],
         },
-        # EaglePlatform embed (generic URL)
-        {
-            'url': 'http://lenta.ru/news/2015/03/06/navalny/',
-            # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
-            'info_dict': {
-                'id': '227304',
-                'ext': 'mp4',
-                'title': 'Навальный вышел на свободу',
-                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'duration': 87,
-                'view_count': int,
-                'age_limit': 0,
-            },
-            'params': {
-                'skip_download': True,
-            },
-        },
         # referrer protected EaglePlatform embed
         {
             'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
diff --git a/youtube_dl/extractor/lenta.py b/youtube_dl/extractor/lenta.py
new file mode 100644
index 000000000..2ebd4e577
--- /dev/null
+++ b/youtube_dl/extractor/lenta.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class LentaIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/',
+        'info_dict': {
+            'id': '964400',
+            'ext': 'mp4',
+            'title': 'Надежду Савченко задержали',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 61,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # EaglePlatform iframe embed
+        'url': 'http://lenta.ru/news/2015/03/06/navalny/',
+        'info_dict': {
+            'id': '227304',
+            'ext': 'mp4',
+            'title': 'Навальный вышел на свободу',
+            'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 87,
+            'view_count': int,
+            'age_limit': 0,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        video_id = self._search_regex(
+            r'vid\s*:\s*["\']?(\d+)', webpage, 'eagleplatform id',
+            default=None)
+        if video_id:
+            return self.url_result(
+                'eagleplatform:lentaru.media.eagleplatform.com:%s' % video_id,
+                ie='EaglePlatform', video_id=video_id)
+
+        return self.url_result(url, ie='Generic')