diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py index 594c88c9c..6d71c5ad5 100644 --- a/youtube_dl/extractor/asiancrush.py +++ b/youtube_dl/extractor/asiancrush.py @@ -8,7 +8,6 @@ from .kaltura import KalturaIE from ..utils import ( extract_attributes, remove_end, - urlencode_postdata, ) @@ -34,19 +33,40 @@ class AsianCrushIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - data = self._download_json( - 'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id, - data=urlencode_postdata({ - 'postid': video_id, - 'action': 'get_channel_kaltura_vars', - })) + webpage = self._download_webpage(url, video_id) - entry_id = data['entry_id'] + entry_id, partner_id, title = [None] * 3 + + vars = self._parse_json( + self._search_regex( + r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', + default='{}'), video_id, fatal=False) + if vars: + entry_id = vars.get('entry_id') + partner_id = vars.get('partner_id') + title = vars.get('vid_label') + + if not entry_id: + entry_id = self._search_regex( + r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') + + player = self._download_webpage( + 'https://api.asiancrush.com/embeddedVideoPlayer', video_id, + query={'id': entry_id}) + + kaltura_id = self._search_regex( + r'entry_id["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', player, + 'kaltura id', group='id') + + if not partner_id: + partner_id = self._search_regex( + r'/p(?:artner_id)?/(\d+)', player, 'partner id', + default='513551') return self.url_result( - 'kaltura:%s:%s' % (data['partner_id'], entry_id), - ie=KalturaIE.ie_key(), video_id=entry_id, - video_title=data.get('vid_label')) + 'kaltura:%s:%s' % (partner_id, kaltura_id), + ie=KalturaIE.ie_key(), video_id=kaltura_id, + video_title=title) class AsianCrushPlaylistIE(InfoExtractor): diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index b8bbaf81a..2dbf81e6e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -211,6 +211,11 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -1701,9 +1706,9 @@ class InfoExtractor(object): # However, this is not always respected, for example, [2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': audio_group = groups.get(audio_group_id) diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index ba8b9fa7e..af786d096 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -445,6 +445,10 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'vilos media', default='{}'), video_id) media_metadata = media.get('metadata') or {} + language = self._search_regex( + r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'language', default=None, group='lang') + video_title = self._html_search_regex( r'(?s)]*>((?:(?!]+itemprop=["\']title["\'][^>]*>(?:(?!', webpage, 'video_title') @@ -466,9 +470,22 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text formats = [] for stream in media.get('streams', []): - formats.extend(self._extract_vrv_formats( + audio_lang = stream.get('audio_lang') + hardsub_lang = stream.get('hardsub_lang') + vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), - stream.get('audio_lang'), stream.get('hardsub_lang'))) + audio_lang, hardsub_lang) + for f in vrv_formats: + if not hardsub_lang: + f['preference'] = 1 + language_preference = 0 + if audio_lang == language: + language_preference += 1 + if hardsub_lang == language: + language_preference += 1 + if language_preference: + f['language_preference'] = language_preference + formats.extend(vrv_formats) if not formats: available_fmts = [] for a, fmt in re.findall(r'(]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): @@ -557,7 +574,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'ext': 'flv', }) formats.append(format_info) - self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) + self._sort_formats(formats, ('preference', 'language_preference', 'height', 'width', 'tbr', 'fps')) metadata = self._call_rpc_api( 'VideoPlayer_GetMediaMetadata', video_id, diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index ad273a0e7..a9a1f911e 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,15 +3,45 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_b64decode, + compat_str, + compat_urllib_parse_unquote, + compat_urlparse, +) from ..utils import ( + int_or_none, parse_duration, parse_iso8601, + str_or_none, str_to_int, + try_get, + unified_timestamp, + url_or_none, ) class FourTubeBaseIE(InfoExtractor): + _TKN_HOST = 'tkn.kodicdn.com' + + def _extract_formats(self, url, video_id, media_id, sources): + token_url = 'https://%s/%s/desktop/%s' % ( + self._TKN_HOST, media_id, '+'.join(sources)) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) + formats = [{ + 'url': tokens[format]['token'], + 'format_id': format + 'p', + 'resolution': format + 'p', + 'quality': int(format), + } for format in sources] + self._sort_formats(formats) + return formats + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') @@ -68,21 +98,7 @@ class FourTubeBaseIE(InfoExtractor): media_id = params[0] sources = ['%s' % p for p in params[2]] - token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( - media_id, '+'.join(sources)) - - parsed_url = compat_urlparse.urlparse(url) - tokens = self._download_json(token_url, video_id, data=b'', headers={ - 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), - 'Referer': url, - }) - formats = [{ - 'url': tokens[format]['token'], - 'format_id': format + 'p', - 'resolution': format + 'p', - 'quality': int(format), - } for format in sources] - self._sort_formats(formats) + formats = self._extract_formats(url, video_id, media_id, sources) return { 'id': video_id, @@ -164,6 +180,7 @@ class FuxIE(FourTubeBaseIE): class PornTubeIE(FourTubeBaseIE): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?porntube\.com/(?:videos/(?P[^/]+)_|embed/)(?P\d+)' _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TKN_HOST = 'tkn.porntube.com' _TESTS = [{ 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', 'info_dict': { @@ -171,13 +188,32 @@ class PornTubeIE(FourTubeBaseIE): 'ext': 'mp4', 'title': 'Teen couple doing anal', 'uploader': 'Alexy', - 'uploader_id': 'Alexy', + 'uploader_id': '91488', 'upload_date': '20150606', 'timestamp': 1433595647, 'duration': 5052, 'view_count': int, 'like_count': int, - 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/videos/squirting-teen-ballerina-ecg_1331406', + 'info_dict': { + 'id': '1331406', + 'ext': 'mp4', + 'title': 'Squirting Teen Ballerina on ECG', + 'uploader': 'Exploited College Girls', + 'uploader_id': '665', + 'channel': 'Exploited College Girls', + 'channel_id': '665', + 'upload_date': '20130920', + 'timestamp': 1379685485, + 'duration': 851, + 'view_count': int, + 'like_count': int, 'age_limit': 18, }, 'params': { @@ -191,6 +227,55 @@ class PornTubeIE(FourTubeBaseIE): 'only_matching': True, }] + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + r'INITIALSTATE\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'data', group='value'), video_id, + transform_source=lambda x: compat_urllib_parse_unquote( + compat_b64decode(x).decode('utf-8')))['page']['video'] + + title = video['title'] + media_id = video['mediaId'] + sources = [compat_str(e['height']) + for e in video['encodings'] if e.get('height')] + formats = self._extract_formats(url, video_id, media_id, sources) + + thumbnail = url_or_none(video.get('masterThumb')) + uploader = try_get(video, lambda x: x['user']['username'], compat_str) + uploader_id = str_or_none(try_get( + video, lambda x: x['user']['id'], int)) + channel = try_get(video, lambda x: x['channel']['name'], compat_str) + channel_id = str_or_none(try_get( + video, lambda x: x['channel']['id'], int)) + like_count = int_or_none(video.get('likes')) + dislike_count = int_or_none(video.get('dislikes')) + view_count = int_or_none(video.get('playsQty')) + duration = int_or_none(video.get('durationInSeconds')) + timestamp = unified_timestamp(video.get('publishedAt')) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'uploader': uploader or channel, + 'uploader_id': uploader_id or channel_id, + 'channel': channel, + 'channel_id': channel_id, + 'timestamp': timestamp, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'view_count': view_count, + 'duration': duration, + 'age_limit': 18, + } + class PornerBrosIE(FourTubeBaseIE): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?pornerbros\.com/(?:videos/(?P[^/]+)_|embed/)(?P\d+)' diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index f1f308e68..066f3e8b0 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -40,6 +40,7 @@ class PornHubIE(InfoExtractor): 'ext': 'mp4', 'title': 'Seductive Indian beauty strips down and fingers her pink pussy', 'uploader': 'Babes', + 'upload_date': '20130628', 'duration': 361, 'view_count': int, 'like_count': int, @@ -57,6 +58,7 @@ class PornHubIE(InfoExtractor): 'ext': 'mp4', 'title': '重庆婷婷女王足交', 'uploader': 'Unknown', + 'upload_date': '20150213', 'duration': 1753, 'view_count': int, 'like_count': int, @@ -237,8 +239,14 @@ class PornHubIE(InfoExtractor): video_urls.append((video_url, None)) video_urls_set.add(video_url) + upload_date = None formats = [] for video_url, height in video_urls: + if not upload_date: + upload_date = self._search_regex( + r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None) + if upload_date: + upload_date = upload_date.replace('/', '') tbr = None mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) if mobj: @@ -278,6 +286,7 @@ class PornHubIE(InfoExtractor): return { 'id': video_id, 'uploader': video_uploader, + 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, 'duration': duration, diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index b39972b1e..26661fdf6 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -559,7 +559,8 @@ class TwitchStreamIE(TwitchBaseIE): TwitchAllVideosIE, TwitchUploadsIE, TwitchPastBroadcastsIE, - TwitchHighlightsIE)) + TwitchHighlightsIE, + TwitchClipsIE)) else super(TwitchStreamIE, cls).suitable(url)) def _real_extract(self, url): @@ -633,7 +634,7 @@ class TwitchStreamIE(TwitchBaseIE): class TwitchClipsIE(TwitchBaseIE): IE_NAME = 'twitch:clips' - _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat', @@ -653,6 +654,9 @@ class TwitchClipsIE(TwitchBaseIE): # multiple formats 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index e49b233f2..0a9239b62 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -299,10 +299,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/atencio', 'uploader_id': 'atencio', 'uploader': 'Peter Atencio', + 'channel_id': 'keypeele', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/keypeele', 'timestamp': 1380339469, 'upload_date': '20130928', 'duration': 187, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/76979871', @@ -355,11 +358,13 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/channels/tributes/6213729', 'info_dict': { 'id': '6213729', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Vimeo Tribute: The Shining', 'uploader': 'Casey Donahue', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', 'uploader_id': 'caseydonahue', + 'channel_url': r're:https?://(?:www\.)?vimeo\.com/channels/tributes', + 'channel_id': 'tributes', 'timestamp': 1250886430, 'upload_date': '20090821', 'description': 'md5:bdbf314014e58713e6e5b66eb252f4a6', @@ -465,6 +470,9 @@ class VimeoIE(VimeoBaseInfoExtractor): if 'Referer' not in headers: headers['Referer'] = url + channel_id = self._search_regex( + r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) + # Extract ID from URL mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -563,19 +571,23 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(redirect_url, video_id) + vod = config.get('video', {}).get('vod', {}) + def is_rented(): if '>You rented this title.<' in webpage: return True if config.get('user', {}).get('purchased'): return True - label = try_get( - config, lambda x: x['video']['vod']['purchase_options'][0]['label_string'], compat_str) - if label and label.startswith('You rented this'): - return True + for purchase_option in vod.get('purchase_options', []): + if purchase_option.get('purchased'): + return True + label = purchase_option.get('label_string') + if label and (label.startswith('You rented this') or label.endswith(' remaining')): + return True return False - if is_rented(): - feature_id = config.get('video', {}).get('vod', {}).get('feature_id') + if is_rented() and vod.get('is_trailer'): + feature_id = vod.get('feature_id') if feature_id and not data.get('force_feature_id', False): return self.url_result(smuggle_url( 'https://player.vimeo.com/player/%s' % feature_id, @@ -652,6 +664,8 @@ class VimeoIE(VimeoBaseInfoExtractor): r']+rel=["\']license["\'][^>]+href=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'license', default=None, group='license') + channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None + info_dict = { 'id': video_id, 'formats': formats, @@ -662,6 +676,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'like_count': like_count, 'comment_count': comment_count, 'license': cc_license, + 'channel_id': channel_id, + 'channel_url': channel_url, } info_dict = merge_dicts(info_dict, info_dict_config, json_ld) diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py index 921e9e172..ac0819c7c 100644 --- a/youtube_dl/extractor/vrv.py +++ b/youtube_dl/extractor/vrv.py @@ -90,7 +90,13 @@ class VRVIE(VRVBaseIE): def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): if not url or stream_format not in ('hls', 'dash'): return [] - stream_id = hardsub_lang or audio_lang + assert audio_lang or hardsub_lang + stream_id_list = [] + if audio_lang: + stream_id_list.append('audio-%s' % audio_lang) + if hardsub_lang: + stream_id_list.append('hardsub-%s' % hardsub_lang) + stream_id = '-'.join(stream_id_list) format_id = '%s-%s' % (stream_format, stream_id) if stream_format == 'hls': adaptive_formats = self._extract_m3u8_formats( diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 27047425d..2fe074cb4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -490,6 +490,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', @@ -1907,6 +1909,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') + channel_id = self._html_search_meta( + 'channelId', video_webpage, 'channel id') + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + # thumbnail image # We try first to get a high quality image: m_thumb = re.search(r'', @@ -2078,6 +2084,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, 'license': video_license, 'creator': video_creator or artist,