Merge remote-tracking branch 'upstream/master' into XVideos-issue-15799

This commit is contained in:
Parmjit Virk 2018-03-22 15:10:37 -05:00
commit b3c7aea8ac
6 changed files with 151 additions and 75 deletions

View File

@ -532,13 +532,14 @@ from .lcp import (
) )
from .learnr import LearnrIE from .learnr import LearnrIE
from .lecture2go import Lecture2GoIE from .lecture2go import Lecture2GoIE
from .lego import LEGOIE
from .lemonde import LemondeIE
from .leeco import ( from .leeco import (
LeIE, LeIE,
LePlaylistIE, LePlaylistIE,
LetvCloudIE, LetvCloudIE,
) )
from .lego import LEGOIE
from .lemonde import LemondeIE
from .lenta import LentaIE
from .libraryofcongress import LibraryOfCongressIE from .libraryofcongress import LibraryOfCongressIE
from .libsyn import LibsynIE from .libsyn import LibsynIE
from .lifenews import ( from .lifenews import (

View File

@ -1270,24 +1270,6 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': ['Kaltura'], 'add_ie': ['Kaltura'],
}, },
# EaglePlatform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
'info_dict': {
'id': '227304',
'ext': 'mp4',
'title': 'Навальный вышел на свободу',
'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 87,
'view_count': int,
'age_limit': 0,
},
'params': {
'skip_download': True,
},
},
# referrer protected EaglePlatform embed # referrer protected EaglePlatform embed
{ {
'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',

View File

@ -1,5 +1,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools
import json import json
import re import re
@ -242,18 +243,27 @@ class InstagramUserIE(InfoExtractor):
return int_or_none(try_get( return int_or_none(try_get(
node, lambda x: x['edge_media_' + suffix]['count'])) node, lambda x: x['edge_media_' + suffix]['count']))
edges = self._download_json( cursor = ''
'https://www.instagram.com/graphql/query/', uploader_id, query={ for page_num in itertools.count(1):
media = self._download_json(
'https://www.instagram.com/graphql/query/', uploader_id,
'Downloading JSON page %d' % page_num, query={
'query_hash': '472f257a40c653c64c666ce877d59d2b', 'query_hash': '472f257a40c653c64c666ce877d59d2b',
'variables': json.dumps({ 'variables': json.dumps({
'id': uploader_id, 'id': uploader_id,
'first': 999999999, 'first': 100,
'after': cursor,
}) })
})['data']['user']['edge_owner_to_timeline_media']['edges'] })['data']['user']['edge_owner_to_timeline_media']
edges = media.get('edges')
if not edges or not isinstance(edges, list):
break
for edge in edges: for edge in edges:
node = edge['node'] node = edge.get('node')
if not node or not isinstance(node, dict):
continue
if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
continue continue
video_id = node.get('shortcode') video_id = node.get('shortcode')
@ -285,6 +295,18 @@ class InstagramUserIE(InfoExtractor):
yield info yield info
page_info = media.get('page_info')
if not page_info or not isinstance(page_info, dict):
break
has_next_page = page_info.get('has_next_page')
if not has_next_page:
break
cursor = page_info.get('end_cursor')
if not cursor or not isinstance(cursor, compat_str):
break
def _real_extract(self, url): def _real_extract(self, url):
username = self._match_id(url) username = self._match_id(url)
uploader_id = self._download_json( uploader_id = self._download_json(

View File

@ -0,0 +1,53 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class LentaIE(InfoExtractor):
    """Extractor for lenta.ru article pages.

    The page is scanned for a numeric ``vid`` value; when one is found the
    result points at the EaglePlatform extractor, otherwise the original URL
    is passed on to the Generic extractor.
    """

    _VALID_URL = r'https?://(?:www\.)?lenta\.ru/[^/]+/\d+/\d+/\d+/(?P<id>[^/?#&]+)'
    _TESTS = [
        {
            'url': 'https://lenta.ru/news/2018/03/22/savshenko_go/',
            'info_dict': {
                'id': '964400',
                'ext': 'mp4',
                'title': 'Надежду Савченко задержали',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 61,
                'view_count': int,
            },
            'params': {
                'skip_download': True,
            },
        },
        {
            # EaglePlatform iframe embed
            'url': 'http://lenta.ru/news/2015/03/06/navalny/',
            'info_dict': {
                'id': '227304',
                'ext': 'mp4',
                'title': 'Навальный вышел на свободу',
                'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
                'thumbnail': r're:^https?://.*\.jpg$',
                'duration': 87,
                'view_count': int,
                'age_limit': 0,
            },
            'params': {
                'skip_download': True,
            },
        },
    ]

    def _real_extract(self, url):
        # The URL slug is only used as the display id while downloading.
        page_id = self._match_id(url)
        html = self._download_webpage(url, page_id)

        # default=None makes a missing id non-fatal so the fallback can run.
        eagle_id = self._search_regex(
            r'vid\s*:\s*["\']?(\d+)', html, 'eagleplatform id',
            default=None)

        if not eagle_id:
            # No EaglePlatform id on the page: defer to the Generic extractor.
            return self.url_result(url, ie='Generic')

        eagle_url = (
            'eagleplatform:lentaru.media.eagleplatform.com:%s' % eagle_id)
        return self.url_result(
            eagle_url, ie='EaglePlatform', video_id=eagle_id)

View File

@ -1,24 +1,28 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import json
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import unified_strdate from ..utils import (
parse_duration,
unified_strdate,
)
class LibsynIE(InfoExtractor): class LibsynIE(InfoExtractor):
_VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))'
_TESTS = [{ _TESTS = [{
'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/', 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/',
'md5': '443360ee1b58007bc3dcf09b41d093bb', 'md5': '2a55e75496c790cdeb058e7e6c087746',
'info_dict': { 'info_dict': {
'id': '3377616', 'id': '6385796',
'ext': 'mp3', 'ext': 'mp3',
'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart", 'title': "Champion Minded - Developing a Growth Mindset",
'description': 'md5:601cb790edd05908957dae8aaa866465', 'description': 'In this episode, Allistair talks about the importance of developing a growth mindset, not only in sports, but in life too.',
'upload_date': '20150220', 'upload_date': '20180320',
'thumbnail': 're:^https?://.*', 'thumbnail': 're:^https?://.*',
}, },
}, { }, {
@ -39,31 +43,45 @@ class LibsynIE(InfoExtractor):
url = m.group('mainurl') url = m.group('mainurl')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
formats = [{
'url': media_url,
} for media_url in set(re.findall(r'var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
podcast_title = self._search_regex( podcast_title = self._search_regex(
r'<h2>([^<]+)</h2>', webpage, 'podcast title', default=None) r'<h3>([^<]+)</h3>', webpage, 'podcast title', default=None)
if podcast_title:
podcast_title = podcast_title.strip()
episode_title = self._search_regex( episode_title = self._search_regex(
r'(?:<div class="episode-title">|<h3>)([^<]+)</', webpage, 'episode title') r'(?:<div class="episode-title">|<h4>)([^<]+)</', webpage, 'episode title')
if episode_title:
episode_title = episode_title.strip()
title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
description = self._html_search_regex( description = self._html_search_regex(
r'<div id="info_text_body">(.+?)</div>', webpage, r'<p\s+id="info_text_body">(.+?)</p>', webpage,
'description', default=None) 'description', default=None)
thumbnail = self._search_regex( if description:
r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"', # Strip non-breaking and normal spaces
webpage, 'thumbnail', fatal=False) description = description.replace('\u00A0', ' ').strip()
release_date = unified_strdate(self._search_regex( release_date = unified_strdate(self._search_regex(
r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False)) r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
data_json = self._search_regex(r'var\s+playlistItem\s*=\s*(\{.*?\});\n', webpage, 'JSON data block')
data = json.loads(data_json)
formats = [{
'url': data['media_url'],
'format_id': 'main',
}, {
'url': data['media_url_libsyn'],
'format_id': 'libsyn',
}]
thumbnail = data.get('thumbnail_url')
duration = parse_duration(data.get('duration'))
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': description, 'description': description,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
'upload_date': release_date, 'upload_date': release_date,
'duration': duration,
'formats': formats, 'formats': formats,
} }

View File

@ -154,7 +154,7 @@ class YoukuIE(InfoExtractor):
# request basic data # request basic data
basic_data_params = { basic_data_params = {
'vid': video_id, 'vid': video_id,
'ccode': '0507', 'ccode': '0590',
'client_ip': '192.168.1.1', 'client_ip': '192.168.1.1',
'utid': cna, 'utid': cna,
'client_ts': time.time() / 1000, 'client_ts': time.time() / 1000,