diff --git a/README.md b/README.md index df419abe8..e39f71281 100644 --- a/README.md +++ b/README.md @@ -319,7 +319,7 @@ which means you can modify it, redistribute it or use it however you like. --all-formats Download all available video formats --prefer-free-formats Prefer free video formats unless a specific one is requested - -F, --list-formats List all available formats of specified + -F, --list-formats List all available formats of requested videos --youtube-skip-dash-manifest Do not download the DASH manifests and related data on YouTube videos @@ -800,7 +800,21 @@ with youtube_dl.YoutubeDL(ydl_opts) as ydl: Bugs and suggestions should be reported at: . Unless you were prompted so or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the irc channel #youtube-dl on freenode. -**Please include the full output of youtube-dl when run with `-v`**. +**Please include the full output of youtube-dl when run with `-v`**, i.e. add `-v` flag to your command line, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: +``` +$ youtube-dl -v http://www.youtube.com/watch?v=BaW_jenozKcj +[debug] System config: [] +[debug] User config: [] +[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 +[debug] youtube-dl version 2015.12.06 +[debug] Git HEAD: 135392e +[debug] Python version 2.6.6 - Windows-2003Server-5.2.3790-SP2 +[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 +[debug] Proxy map: {} +... +``` +**Do not post screenshots of the verbose log; only plain text is acceptable.** The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. 
diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 1df408610..bf26fecd7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -15,8 +15,12 @@ - **abc.net.au** - **Abc7News** - **AcademicEarth:Course** + - **acast** + - **acast:channel** - **AddAnime** - **AdobeTV** + - **AdobeTVChannel** + - **AdobeTVShow** - **AdobeTVVideo** - **AdultSwim** - **Aftenposten** @@ -43,6 +47,7 @@ - **arte.tv:future** - **AtresPlayer** - **ATTTechChannel** + - **AudiMedia** - **audiomack** - **audiomack:album** - **Azubu** @@ -92,6 +97,7 @@ - **Clipfish** - **cliphunter** - **Clipsyndicate** + - **cloudtime**: CloudTime - **Cloudy** - **Clubic** - **Clyp** @@ -183,6 +189,7 @@ - **freespeech.org** - **FreeVideo** - **FunnyOrDie** + - **GameInformer** - **Gamekings** - **GameOne** - **gameone:playlist** @@ -307,7 +314,6 @@ - **MovieClips** - **MovieFap** - **Moviezine** - - **movshare**: MovShare - **MPORA** - **MSNBC** - **MTV** @@ -480,6 +486,8 @@ - **Shared**: shared.sx and vivo.sx - **ShareSix** - **Sina** + - **skynewsarabia:video** + - **skynewsarabia:video** - **Slideshare** - **Slutload** - **smotri**: Smotri.com @@ -665,6 +673,7 @@ - **WebOfStories** - **WebOfStoriesPlaylist** - **Weibo** + - **wholecloud**: WholeCloud - **Wimp** - **Wistia** - **WNL** diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 9a8c7da05..c642a1fbf 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1110,6 +1110,12 @@ class YoutubeDL(object): 'contain the video, try using ' '"-f %s+%s"' % (format_2, format_1)) return + # Formats must be opposite (video+audio) + if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': + self.report_error( + 'Both formats %s and %s are video-only, you must specify "-f video+audio"' + % (format_1, format_2)) + return output_ext = ( formats_info[0]['ext'] if self.params.get('merge_output_format') is None diff --git a/youtube_dl/extractor/__init__.py 
b/youtube_dl/extractor/__init__.py index bbf656090..2acebfef6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -3,9 +3,15 @@ from __future__ import unicode_literals from .abc import ABCIE from .abc7news import Abc7NewsIE from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) from .addanime import AddAnimeIE from .adobetv import ( AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, AdobeTVVideoIE, ) from .adultswim import AdultSwimIE @@ -38,6 +44,7 @@ from .arte import ( ) from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE +from .audimedia import AudiMediaIE from .audiomack import AudiomackIE, AudiomackAlbumIE from .azubu import AzubuIE from .baidu import BaiduVideoIE @@ -200,6 +207,7 @@ from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freevideo import FreeVideoIE from .funnyordie import FunnyOrDieIE +from .gameinformer import GameInformerIE from .gamekings import GamekingsIE from .gameone import ( GameOneIE, @@ -349,7 +357,6 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE -from .movshare import MovShareIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, @@ -415,7 +422,13 @@ from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE -from .novamov import NovaMovIE +from .novamov import ( + NovaMovIE, + WholeCloudIE, + NowVideoIE, + VideoWeedIE, + CloudTimeIE, +) from .nowness import ( NownessIE, NownessPlaylistIE, @@ -425,7 +438,6 @@ from .nowtv import ( NowTVIE, NowTVListIE, ) -from .nowvideo import NowVideoIE from .npo import ( NPOIE, NPOLiveIE, @@ -554,6 +566,10 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) from 
.slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( @@ -733,7 +749,6 @@ from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE from .videott import VideoTtIE -from .videoweed import VideoWeedIE from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE diff --git a/youtube_dl/extractor/acast.py b/youtube_dl/extractor/acast.py new file mode 100644 index 000000000..be7913bc7 --- /dev/null +++ b/youtube_dl/extractor/acast.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class ACastBaseIE(InfoExtractor): + _API_BASE_URL = 'https://www.acast.com/api/' + + +class ACastIE(ACastBaseIE): + IE_NAME = 'acast' + _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P[^/]+)/(?P[^/#?]+)' + _TEST = { + 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan', + 'md5': 'ada3de5a1e3a2a381327d749854788bb', + 'info_dict': { + 'id': '57de3baa-4bb0-487e-9418-2692c1277a34', + 'ext': 'mp3', + 'title': '"Where Are You?": Taipei 101, Taiwan', + 'timestamp': 1196172000000, + 'description': 'md5:0c5d8201dfea2b93218ea986c91eee6e', + 'duration': 211, + } + } + + def _real_extract(self, url): + channel, display_id = re.match(self._VALID_URL, url).groups() + cast_data = self._download_json(self._API_BASE_URL + 'channels/%s/acasts/%s/playback' % (channel, display_id), display_id) + + return { + 'id': compat_str(cast_data['id']), + 'display_id': display_id, + 'url': cast_data['blings'][0]['audio'], + 'title': cast_data['name'], + 'description': cast_data.get('description'), + 'thumbnail': cast_data.get('image'), + 'timestamp': int_or_none(cast_data.get('publishingDate')), + 'duration': int_or_none(cast_data.get('duration')), + } + + +class ACastChannelIE(ACastBaseIE): + IE_NAME = 'acast:channel' + _VALID_URL = 
r'https?://(?:www\.)?acast\.com/(?P[^/#?]+)' + _TEST = { + 'url': 'https://www.acast.com/condenasttraveler', + 'info_dict': { + 'id': '50544219-29bb-499e-a083-6087f4cb7797', + 'title': 'Condé Nast Traveler Podcast', + 'description': 'md5:98646dee22a5b386626ae31866638fbd', + }, + 'playlist_mincount': 20, + } + + @classmethod + def suitable(cls, url): + return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) + + def _real_extract(self, url): + display_id = self._match_id(url) + channel_data = self._download_json(self._API_BASE_URL + 'channels/%s' % display_id, display_id) + casts = self._download_json(self._API_BASE_URL + 'channels/%s/acasts' % display_id, display_id) + entries = [self.url_result('https://www.acast.com/%s/%s' % (display_id, cast['url']), 'ACast') for cast in casts] + + return self.playlist_result(entries, compat_str(channel_data['id']), channel_data['name'], channel_data.get('description')) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 5e43adc51..8753ee2cf 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -1,23 +1,32 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( parse_duration, unified_strdate, str_to_int, + int_or_none, float_or_none, ISO639Utils, + determine_ext, ) -class AdobeTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.adobe\.com/watch/[^/]+/(?P[^/]+)' +class AdobeTVBaseIE(InfoExtractor): + _API_BASE_URL = 'http://tv.adobe.com/api/v4/' + + +class AdobeTVIE(AdobeTVBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?watch/(?P[^/]+)/(?P[^/]+)' _TEST = { 'url': 'http://tv.adobe.com/watch/the-complete-picture-with-julieanne-kost/quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop/', 'md5': '9bc5727bcdd55251f35ad311ca74fa1e', 'info_dict': { - 'id': 'quick-tip-how-to-draw-a-circle-around-an-object-in-photoshop', + 'id': 
'10981', 'ext': 'mp4', 'title': 'Quick Tip - How to Draw a Circle Around an Object in Photoshop', 'description': 'md5:99ec318dc909d7ba2a1f2b038f7d2311', @@ -29,50 +38,106 @@ class AdobeTVIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + language, show_urlname, urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' - player = self._parse_json( - self._search_regex(r'html5player:\s*({.+?})\s*\n', webpage, 'player'), - video_id) - - title = player.get('title') or self._search_regex( - r'data-title="([^"]+)"', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate( - self._html_search_meta('datepublished', webpage, 'upload date')) - - duration = parse_duration( - self._html_search_meta('duration', webpage, 'duration') or - self._search_regex( - r'Runtime:\s*(\d{2}:\d{2}:\d{2})', - webpage, 'duration', fatal=False)) - - view_count = str_to_int(self._search_regex( - r'
\s*Views?:\s*([\d,.]+)\s*
', - webpage, 'view count')) + video_data = self._download_json( + self._API_BASE_URL + 'episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname), + urlname)['data'][0] formats = [{ - 'url': source['src'], - 'format_id': source.get('quality') or source['src'].split('-')[-1].split('.')[0] or None, - 'tbr': source.get('bitrate'), - } for source in player['sources']] + 'url': source['url'], + 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('video_data_rate')), + } for source in video_data['videos']] self._sort_formats(formats) return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'duration': duration, - 'view_count': view_count, + 'id': compat_str(video_data['id']), + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'upload_date': unified_strdate(video_data.get('start_date')), + 'duration': parse_duration(video_data.get('duration')), + 'view_count': str_to_int(video_data.get('playcount')), 'formats': formats, } +class AdobeTVPlaylistBaseIE(AdobeTVBaseIE): + def _parse_page_data(self, page_data): + return [self.url_result(self._get_element_url(element_data)) for element_data in page_data] + + def _extract_playlist_entries(self, url, display_id): + page = self._download_json(url, display_id) + entries = self._parse_page_data(page['data']) + for page_num in range(2, page['paging']['pages'] + 1): + entries.extend(self._parse_page_data( + self._download_json(url + '&page=%d' % page_num, display_id)['data'])) + return entries + + +class AdobeTVShowIE(AdobeTVPlaylistBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?show/(?P[^/]+)' + + _TEST = { + 'url': 
'http://tv.adobe.com/show/the-complete-picture-with-julieanne-kost', + 'info_dict': { + 'id': '36', + 'title': 'The Complete Picture with Julieanne Kost', + 'description': 'md5:fa50867102dcd1aa0ddf2ab039311b27', + }, + 'playlist_mincount': 136, + } + + def _get_element_url(self, element_data): + return element_data['urls'][0] + + def _real_extract(self, url): + language, show_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = 'language=%s&show_urlname=%s' % (language, show_urlname) + + show_data = self._download_json(self._API_BASE_URL + 'show/get/?%s' % query, show_urlname)['data'][0] + + return self.playlist_result( + self._extract_playlist_entries(self._API_BASE_URL + 'episode/?%s' % query, show_urlname), + compat_str(show_data['id']), + show_data['show_name'], + show_data['show_description']) + + +class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): + _VALID_URL = r'https?://tv\.adobe\.com/(?:(?Pfr|de|es|jp)/)?channel/(?P[^/]+)(?:/(?P[^/]+))?' + + _TEST = { + 'url': 'http://tv.adobe.com/channel/development', + 'info_dict': { + 'id': 'development', + }, + 'playlist_mincount': 96, + } + + def _get_element_url(self, element_data): + return element_data['url'] + + def _real_extract(self, url): + language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups() + if not language: + language = 'en' + query = 'language=%s&channel_urlname=%s' % (language, channel_urlname) + if category_urlname: + query += '&category_urlname=%s' % category_urlname + + return self.playlist_result( + self._extract_playlist_entries(self._API_BASE_URL + 'show/?%s' % query, channel_urlname), + channel_urlname) + + class AdobeTVVideoIE(InfoExtractor): _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' @@ -91,28 +156,25 @@ class AdobeTVVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - player_params = self._parse_json(self._search_regex( - 
r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), - video_id) + video_data = self._download_json(url + '?format=json', video_id) formats = [{ + 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')), 'url': source['src'], - 'width': source.get('width'), - 'height': source.get('height'), - 'tbr': source.get('bitrate'), - } for source in player_params['sources']] + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('bitrate')), + } for source in video_data['sources']] + self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. I just pick the max one duration = max(filter(None, [ float_or_none(source.get('duration'), scale=1000) - for source in player_params['sources']])) + for source in video_data['sources']])) subtitles = {} - for translation in player_params.get('translations', []): + for translation in video_data.get('translations', []): lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) if lang_id not in subtitles: subtitles[lang_id] = [] @@ -124,8 +186,9 @@ class AdobeTVVideoIE(InfoExtractor): return { 'id': video_id, 'formats': formats, - 'title': player_params['title'], - 'description': self._og_search_description(webpage), + 'title': video_data['title'], + 'description': video_data.get('description'), + 'thumbnail': video_data['video'].get('poster'), 'duration': duration, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/audimedia.py b/youtube_dl/extractor/audimedia.py new file mode 100644 index 000000000..b0b089dee --- /dev/null +++ b/youtube_dl/extractor/audimedia.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + sanitized_Request, +) + + +class AudiMediaIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?audimedia\.tv/(?:en|de)/vid/(?P[^/?#]+)' + _TEST = { + 'url': 'https://audimedia.tv/en/vid/60-seconds-of-audi-sport-104-2015-wec-bahrain-rookie-test', + 'md5': '79a8b71c46d49042609795ab59779b66', + 'info_dict': { + 'id': '1564', + 'ext': 'mp4', + 'title': '60 Seconds of Audi Sport 104/2015 - WEC Bahrain, Rookie Test', + 'description': 'md5:60e5d30a78ced725f7b8d34370762941', + 'upload_date': '20151124', + 'timestamp': 1448354940, + 'duration': 74022, + 'view_count': int, + } + } + # extracted from https://audimedia.tv/assets/embed/embedded-player.js (dataSourceAuthToken) + _AUTH_TOKEN = 'e25b42847dba18c6c8816d5d8ce94c326e06823ebf0859ed164b3ba169be97f2' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + raw_payload = self._search_regex(r']+class="amtv-embed"[^>]+id="([^"]+)"', webpage, 'raw payload') + _, stage_mode, video_id, lang = raw_payload.split('-') + + # TODO: handle s and e stage_mode (live streams and ended live streams) + if stage_mode not in ('s', 'e'): + request = sanitized_Request( + 'https://audimedia.tv/api/video/v1/videos/%s?embed[]=video_versions&embed[]=thumbnail_image&where[content_language_iso]=%s' % (video_id, lang), + headers={'X-Auth-Token': self._AUTH_TOKEN}) + json_data = self._download_json(request, video_id)['results'] + formats = [] + + stream_url_hls = json_data.get('stream_url_hls') + if stream_url_hls: + m3u8_formats = self._extract_m3u8_formats(stream_url_hls, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + + stream_url_hds = json_data.get('stream_url_hds') + if stream_url_hds: + f4m_formats = self._extract_f4m_formats(json_data.get('stream_url_hds') + '?hdcore=3.4.0', video_id, -1, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + + for video_version in json_data.get('video_versions'): + video_version_url = 
video_version.get('download_url') or video_version.get('stream_url') + if not video_version_url: + continue + formats.append({ + 'url': video_version_url, + 'width': int_or_none(video_version.get('width')), + 'height': int_or_none(video_version.get('height')), + 'abr': int_or_none(video_version.get('audio_bitrate')), + 'vbr': int_or_none(video_version.get('video_bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': json_data['title'], + 'description': json_data.get('subtitle'), + 'thumbnail': json_data.get('thumbnail_image', {}).get('file'), + 'timestamp': parse_iso8601(json_data.get('publication_date')), + 'duration': int_or_none(json_data.get('duration')), + 'view_count': int_or_none(json_data.get('view_count')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 5f7265a06..691aecc0d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -23,7 +23,7 @@ class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' _ID_REGEX = r'[pb][\da-z]{7}' - _VALID_URL = r'https?://(?:(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])|)(?P%s)' % _ID_REGEX + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:(?:programmes/(?!articles/)|iplayer(?:/[^/]+)?/(?:episode/|playlist/))|music/clips[/#])(?P%s)' % _ID_REGEX _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails @@ -47,9 +47,8 @@ class BBCCoUkIE(InfoExtractor): 'info_dict': { 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Kaleidoscope, Leonard Cohen', + 'title': 'Leonard Cohen, Kaleidoscope - BBC Radio 4', 'description': 'The Canadian poet and songwriter reflects on his musical career.', - 'duration': 1740, }, 'params': { # rtmp download @@ -112,7 +111,8 @@ class BBCCoUkIE(InfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'Episode is no longer available on BBC 
iPlayer Radio', }, { 'url': 'http://www.bbc.co.uk/music/clips/p02frcc3', 'note': 'Audio', @@ -454,6 +454,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') programme_id = None + duration = None tviplayer = self._search_regex( r'mediator\.bind\(({.+?})\s*,\s*document\.getElementById', @@ -473,7 +474,9 @@ class BBCCoUkIE(InfoExtractor): title = self._og_search_title(webpage) description = self._search_regex( r'

([^<]+)

', - webpage, 'description', fatal=False) + webpage, 'description', default=None) + if not description: + description = self._html_search_meta('description', webpage) else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) @@ -587,6 +590,7 @@ class BBCIE(BBCCoUkIE): 'ext': 'mp4', 'title': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', 'duration': 56, + 'description': '''Judge Mindy Glazer: "I'm sorry to see you here... I always wondered what happened to you"''', }, 'params': { 'skip_download': True, @@ -729,6 +733,7 @@ class BBCIE(BBCCoUkIE): # article with multiple videos embedded with playlist.sxml (e.g. # http://www.bbc.com/sport/0/football/34475836) playlists = re.findall(r']+name="playlist"[^>]+value="([^"]+)"', webpage) + playlists.extend(re.findall(r'data-media-id="([^"]+/playlist\.sxml)"', webpage)) if playlists: entries = [ self._extract_from_playlist_sxml(playlist_url, playlist_id, timestamp) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index 61bc2f744..d151d38c9 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,6 +1,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_chr, + compat_ord, + compat_urllib_parse_unquote, +) from ..utils import ( int_or_none, parse_iso8601, @@ -29,7 +34,24 @@ class BeegIE(InfoExtractor): video_id = self._match_id(url) video = self._download_json( - 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + 'http://beeg.com/api/v4/video/%s' % video_id, video_id) + + def decrypt_key(key): + # Reverse engineered from http://static.beeg.com/cpl/1067.js + a = '8RPUUCS35ZWp3ADnKcSmpH71ZusrROo' + e = compat_urllib_parse_unquote(key) + return ''.join([ + compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 25) + for n in range(len(e))]) + + def decrypt_url(encrypted_url): + encrypted_url = 
self._proto_relative_url( + encrypted_url.replace('{DATA_MARKERS}', ''), 'http:') + key = self._search_regex( + r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None) + if not key: + return encrypted_url + return encrypted_url.replace(key, decrypt_key(key)) formats = [] for format_id, video_url in video.items(): @@ -40,7 +62,7 @@ class BeegIE(InfoExtractor): if not height: continue formats.append({ - 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'url': decrypt_url(video_url), 'format_id': format_id, 'height': int(height), }) diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 6c66a1236..59beb11bc 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -2,143 +2,109 @@ from __future__ import unicode_literals import re -import itertools -import json from .common import InfoExtractor -from ..compat import ( - compat_etree_fromstring, -) +from ..compat import compat_str from ..utils import ( int_or_none, - unified_strdate, + unescapeHTML, ExtractorError, + xpath_text, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P[0-9]+)/' + _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P\d+)(?:/index_(?P\d+).html)?' 
_TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '2c301e4dab317596e837c3e7633e7d86', 'info_dict': { - 'id': '1074402_part1', + 'id': '1554319', 'ext': 'flv', 'title': '【金坷垃】金泡沫', - 'duration': 308, + 'duration': 308313, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', + 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', + 'timestamp': 1397983878, + 'uploader': '菊子桑', }, }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { 'id': '1041170', 'title': '【BD1080P】刀语【诸神&异域】', + 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'uploader': '枫叶逝去', + 'timestamp': 1396501299, }, 'playlist_count': 9, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + page_num = mobj.group('page_num') or '1' - if '(此视频不存在或被删除)' in webpage: - raise ExtractorError( - 'The video does not exist or was deleted', expected=True) + view_data = self._download_json( + 'http://api.bilibili.com/view?type=json&appkey=8e9fc618fbd41e28&id=%s&page=%s' % (video_id, page_num), + video_id) + if 'error' in view_data: + raise ExtractorError('%s said: %s' % (self.IE_NAME, view_data['error']), expected=True) - if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage: - raise ExtractorError( - 'The video is not available in your region due to copyright reasons', - expected=True) + cid = view_data['cid'] + title = unescapeHTML(view_data['title']) - video_code = self._search_regex( - r'(?s)
(.*?)
', webpage, 'video code') + doc = self._download_xml( + 'http://interface.bilibili.com/v_cdn_play?appkey=8e9fc618fbd41e28&cid=%s' % cid, + cid, + 'Downloading page %s/%s' % (page_num, view_data['pages']) + ) - title = self._html_search_meta( - 'media:title', video_code, 'title', fatal=True) - duration_str = self._html_search_meta( - 'duration', video_code, 'duration') - if duration_str is None: - duration = None - else: - duration_mobj = re.match( - r'^T(?:(?P[0-9]+)H)?(?P[0-9]+)M(?P[0-9]+)S$', - duration_str) - duration = ( - int_or_none(duration_mobj.group('hours'), default=0) * 3600 + - int(duration_mobj.group('minutes')) * 60 + - int(duration_mobj.group('seconds'))) - upload_date = unified_strdate(self._html_search_meta( - 'uploadDate', video_code, fatal=False)) - thumbnail = self._html_search_meta( - 'thumbnailUrl', video_code, 'thumbnail', fatal=False) - - cid = self._search_regex(r'cid=(\d+)', webpage, 'cid') + if xpath_text(doc, './result') == 'error': + raise ExtractorError('%s said: %s' % (self.IE_NAME, xpath_text(doc, './message')), expected=True) entries = [] - lq_page = self._download_webpage( - 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid, - video_id, - note='Downloading LQ video info' - ) - try: - err_info = json.loads(lq_page) - raise ExtractorError( - 'BiliBili said: ' + err_info['error_text'], expected=True) - except ValueError: - pass - - lq_doc = compat_etree_fromstring(lq_page) - lq_durls = lq_doc.findall('./durl') - - hq_doc = self._download_xml( - 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid, - video_id, - note='Downloading HQ video info', - fatal=False, - ) - if hq_doc is not False: - hq_durls = hq_doc.findall('./durl') - assert len(lq_durls) == len(hq_durls) - else: - hq_durls = itertools.repeat(None) - - i = 1 - for lq_durl, hq_durl in zip(lq_durls, hq_durls): + for durl in doc.findall('./durl'): + size = xpath_text(durl, ['./filesize', './size']) formats = [{ - 'format_id': 'lq', - 'quality': 1, - 
'url': lq_durl.find('./url').text, - 'filesize': int_or_none( - lq_durl.find('./size'), get_attr='text'), + 'url': durl.find('./url').text, + 'filesize': int_or_none(size), + 'ext': 'flv', }] - if hq_durl is not None: - formats.append({ - 'format_id': 'hq', - 'quality': 2, - 'ext': 'flv', - 'url': hq_durl.find('./url').text, - 'filesize': int_or_none( - hq_durl.find('./size'), get_attr='text'), - }) - self._sort_formats(formats) + backup_urls = durl.find('./backup_url') + if backup_urls is not None: + for backup_url in backup_urls.findall('./url'): + formats.append({'url': backup_url.text}) + formats.reverse() entries.append({ - 'id': '%s_part%d' % (video_id, i), + 'id': '%s_part%s' % (cid, xpath_text(durl, './order')), 'title': title, + 'duration': int_or_none(xpath_text(durl, './length'), 1000), 'formats': formats, - 'duration': duration, - 'upload_date': upload_date, - 'thumbnail': thumbnail, }) - i += 1 - - return { - '_type': 'multi_video', - 'entries': entries, - 'id': video_id, - 'title': title + info = { + 'id': compat_str(cid), + 'title': title, + 'description': view_data.get('description'), + 'thumbnail': view_data.get('pic'), + 'uploader': view_data.get('author'), + 'timestamp': int_or_none(view_data.get('created')), + 'view_count': int_or_none(view_data.get('play')), + 'duration': int_or_none(xpath_text(doc, './timelength')), } + + if len(entries) == 1: + entries[0].update(info) + return entries[0] + else: + info.update({ + '_type': 'multi_video', + 'id': video_id, + 'entries': entries, + }) + return info diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py index 3b2de517e..dda98059e 100644 --- a/youtube_dl/extractor/byutv.py +++ b/youtube_dl/extractor/byutv.py @@ -14,9 +14,10 @@ class BYUtvIE(InfoExtractor): 'info_dict': { 'id': 'studio-c-season-5-episode-5', 'ext': 'mp4', - 'description': 'md5:5438d33774b6bdc662f9485a340401cc', + 'description': 'md5:e07269172baff037f8e8bf9956bc9747', 'title': 'Season 5 Episode 5', - 'thumbnail': 
're:^https?://.*\.jpg$' + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 1486.486, }, 'params': { 'skip_download': True, diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 7af903571..3a47f6fa4 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,14 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( - determine_ext, int_or_none, - js_to_json, - parse_iso8601, - remove_end, + unified_strdate, ) @@ -21,48 +16,47 @@ class ClipfishIE(InfoExtractor): 'id': '3966754', 'ext': 'mp4', 'title': 'FIFA 14 - E3 2013 Trailer', - 'timestamp': 1370938118, + 'description': 'Video zu FIFA 14: E3 2013 Trailer', 'upload_date': '20130611', 'duration': 82, + 'view_count': int, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_info = self._parse_json( - js_to_json(self._html_search_regex( - '(?s)videoObject\s*=\s*({.+?});', webpage, 'video object')), - video_id) + video_info = self._download_json( + 'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, + video_id)['items'][0] formats = [] - for video_url in re.findall(r'var\s+videourl\s*=\s*"([^"]+)"', webpage): - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.append({ - 'url': video_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), - 'ext': 'mp4', - 'format_id': 'hls', - }) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - }) - self._sort_formats(formats) - title = remove_end(self._og_search_title(webpage), ' - Video') - thumbnail = self._og_search_thumbnail(webpage) - duration = int_or_none(video_info.get('length')) - timestamp = parse_iso8601(self._html_search_meta('uploadDate', webpage, 'upload date')) + m3u8_url = video_info.get('media_videourl_hls') + if m3u8_url: + formats.append({ + 'url': m3u8_url.replace('de.hls.fra.clipfish.de', 
'hls.fra.clipfish.de'), + 'ext': 'mp4', + 'format_id': 'hls', + }) + + mp4_url = video_info.get('media_videourl') + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + 'width': int_or_none(video_info.get('width')), + 'height': int_or_none(video_info.get('height')), + 'tbr': int_or_none(video_info.get('bitrate')), + }) return { 'id': video_id, - 'title': title, + 'title': video_info['title'], + 'description': video_info.get('descr'), 'formats': formats, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), + 'duration': int_or_none(video_info.get('media_length')), + 'upload_date': unified_strdate(video_info.get('pubDate')), + 'view_count': int_or_none(video_info.get('media_views')) } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index d46592cc5..2996b6b09 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -1,7 +1,7 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import determine_ext +from ..utils import int_or_none _translation_table = { @@ -42,31 +42,26 @@ class CliphunterIE(InfoExtractor): video_title = self._search_regex( r'mediaTitle = "([^"]+)"', webpage, 'title') - fmts = {} - for fmt in ('mp4', 'flv'): - fmt_list = self._parse_json(self._search_regex( - r'var %sjson\s*=\s*(\[.*?\]);' % fmt, webpage, '%s formats' % fmt), video_id) - for f in fmt_list: - fmts[f['fname']] = _decode(f['sUrl']) - - qualities = self._parse_json(self._search_regex( - r'var player_btns\s*=\s*(.*?);\n', webpage, 'quality info'), video_id) + gexo_files = self._parse_json( + self._search_regex( + r'var\s+gexoFiles\s*=\s*({.+?});', webpage, 'gexo files'), + video_id) formats = [] - for fname, url in fmts.items(): - f = { - 'url': url, - } - if fname in qualities: - qual = qualities[fname] - f.update({ - 'format_id': '%s_%sp' 
% (determine_ext(url), qual['h']), - 'width': qual['w'], - 'height': qual['h'], - 'tbr': qual['br'], - }) - formats.append(f) - + for format_id, f in gexo_files.items(): + video_url = f.get('url') + if not video_url: + continue + fmt = f.get('fmt') + height = f.get('h') + format_id = '%s_%sp' % (fmt, height) if fmt and height else format_id + formats.append({ + 'url': _decode(video_url), + 'format_id': format_id, + 'width': int_or_none(f.get('w')), + 'height': int_or_none(height), + 'tbr': int_or_none(f.get('br')), + }) self._sort_formats(formats) thumbnail = self._search_regex( diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb9bfa3d1..6ab2d68d6 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -167,7 +167,7 @@ class InfoExtractor(object): "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions - duration: Length of the video in seconds, as an integer. + duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. 
like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 92e8c571f..4c81271d3 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -37,8 +37,8 @@ class FC2IE(InfoExtractor): 'params': { 'username': 'ytdl@yt-dl.org', 'password': '(snip)', - 'skip': 'requires actual password' - } + }, + 'skip': 'requires actual password', }, { 'url': 'http://video.fc2.com/en/a/content/20130926eZpARwsF', 'only_matching': True, diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py new file mode 100644 index 000000000..25870c131 --- /dev/null +++ b/youtube_dl/extractor/gameinformer.py @@ -0,0 +1,43 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import int_or_none + + +class GameInformerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gameinformer\.com/(?:[^/]+/)*(?P.+)\.aspx' + _TEST = { + 'url': 'http://www.gameinformer.com/b/features/archive/2015/09/26/replay-animal-crossing.aspx', + 'info_dict': { + 'id': '4515472681001', + 'ext': 'm3u8', + 'title': 'Replay - Animal Crossing', + 'description': 'md5:2e211891b215c85d061adc7a4dd2d930', + 'timestamp': 1443457610706, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + bc_api_url = self._search_regex(r"getVideo\('([^']+)'", webpage, 'brightcove api url') + json_data = self._download_json( + bc_api_url + '&video_fields=id,name,shortDescription,publishedDate,videoStillURL,length,IOSRenditions', + display_id) + + return { + 'id': compat_str(json_data['id']), + 'display_id': display_id, + 'url': json_data['IOSRenditions'][0]['url'], + 'title': json_data['name'], + 'description': json_data.get('shortDescription'), + 
'timestamp': int_or_none(json_data.get('publishedDate')), + 'duration': int_or_none(json_data.get('length')), + } diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py index a6ab795ae..c3f031d9c 100644 --- a/youtube_dl/extractor/gametrailers.py +++ b/youtube_dl/extractor/gametrailers.py @@ -1,19 +1,62 @@ from __future__ import unicode_literals -from .mtv import MTVServicesInfoExtractor +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + url_basename, +) -class GametrailersIE(MTVServicesInfoExtractor): - _VALID_URL = r'http://www\.gametrailers\.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www\.gametrailers\.com/videos/view/[^/]+/(?P<id>.+)' + _TEST = { - 'url': 'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - 'md5': '4c8e67681a0ea7ec241e8c09b3ea8cf7', + 'url': 'http://www.gametrailers.com/videos/view/gametrailers-com/116437-Just-Cause-3-Review', + 'md5': 'f28c4efa0bdfaf9b760f6507955b6a6a', 'info_dict': { - 'id': '70e9a5d7-cf25-4a10-9104-6f3e7342ae0d', + 'id': '2983958', 'ext': 'mp4', - 'title': 'E3 2013: Debut Trailer', - 'description': 'Faith is back! 
Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', + 'display_id': '116437-Just-Cause-3-Review', + 'title': 'Just Cause 3 - Review', + 'description': 'It\'s a lot of fun to shoot at things and then watch them explode in Just Cause 3, but should there be more to the experience than that?', }, } - _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_regex( + r'<title>(.+?)\|', webpage, 'title').strip() + embed_url = self._proto_relative_url( + self._search_regex( + r'src=\'(//embed.gametrailers.com/embed/[^\']+)\'', webpage, + 'embed url'), + scheme='http:') + video_id = url_basename(embed_url) + embed_page = self._download_webpage(embed_url, video_id) + embed_vars_json = self._search_regex( + r'(?s)var embedVars = (\{.*?\})\s*</script>', embed_page, + 'embed vars') + info = self._parse_json(embed_vars_json, video_id) + + formats = [] + for media in info['media']: + if media['mediaPurpose'] == 'play': + formats.append({ + 'url': media['uri'], + 'height': media['height'], + 'width:': media['width'], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'thumbnail': info.get('thumbUri'), + 'description': self._og_search_description(webpage), + 'duration': int_or_none(info.get('videoLengthInSeconds')), + 'age_limit': parse_age_limit(info.get('audienceRating')), + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5075d131e..c2e8f9b62 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -54,6 +54,7 @@ from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE class 
GenericIE(InfoExtractor): @@ -339,6 +340,7 @@ class GenericIE(InfoExtractor): 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238.231, }, 'add_ie': ['Ooyala'], }, @@ -350,6 +352,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '"Steve Jobs: Man in the Machine" trailer', 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 'duration': 135.427, }, 'params': { 'skip_download': True, @@ -960,8 +963,9 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: Index/Match versus VLOOKUP.', + 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', + 'duration': 191.933, }, 'params': { # m3u8 downloads @@ -1501,7 +1505,7 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(mobj.group('ec')) + return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1509,7 +1513,7 @@ class GenericIE(InfoExtractor): embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') + embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -1738,10 +1742,9 @@ class GenericIE(InfoExtractor): return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 
'EaglePlatform') # Look for Pladform embeds - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Pladform') + pladform_url = PladformIE._extract_url(webpage) + if pladform_url: + return self.url_result(pladform_url) # Look for Playwire embeds mobj = re.search( diff --git a/youtube_dl/extractor/groupon.py b/youtube_dl/extractor/groupon.py index 8b9e0e2f8..63c05b6a6 100644 --- a/youtube_dl/extractor/groupon.py +++ b/youtube_dl/extractor/groupon.py @@ -18,6 +18,8 @@ class GrouponIE(InfoExtractor): 'id': 'tubGNycTo_9Uxg82uESj4i61EYX8nyuf', 'ext': 'mp4', 'title': 'Bikram Yoga Huntington Beach | Orange County', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 44.961, }, }], 'params': { diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 16677f179..e8f51e545 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -16,6 +16,7 @@ class HowcastIE(InfoExtractor): 'description': 'md5:dbe792e5f6f1489027027bf2eba188a3', 'timestamp': 1276081287, 'upload_date': '20100609', + 'duration': 56.823, }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index cca3dd498..b3706fe6d 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -28,15 +28,12 @@ class HypemIE(InfoExtractor): track_id = self._match_id(url) data = {'ax': 1, 'ts': time.time()} - data_encoded = compat_urllib_parse.urlencode(data) - complete_url = url + "?" + data_encoded - request = sanitized_Request(complete_url) + request = sanitized_Request(url + '?' 
+ compat_urllib_parse.urlencode(data)) response, urlh = self._download_webpage_handle( request, track_id, 'Downloading webpage with the url') - cookie = urlh.headers.get('Set-Cookie', '') html_tracks = self._html_search_regex( - r'(?ms)<script type="application/json" id="displayList-data">\s*(.*?)\s*</script>', + r'(?ms)<script type="application/json" id="displayList-data">(.+?)</script>', response, 'tracks') try: track_list = json.loads(html_tracks) @@ -46,15 +43,14 @@ class HypemIE(InfoExtractor): key = track['key'] track_id = track['id'] - artist = track['artist'] title = track['song'] - serve_url = "http://hypem.com/serve/source/%s/%s" % (track_id, key) request = sanitized_Request( - serve_url, '', {'Content-Type': 'application/json'}) - request.add_header('cookie', cookie) + 'http://hypem.com/serve/source/%s/%s' % (track_id, key), + '', {'Content-Type': 'application/json'}) song_data = self._download_json(request, track_id, 'Downloading metadata') - final_url = song_data["url"] + final_url = song_data['url'] + artist = track.get('artist') return { 'id': track_id, diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 2df1da3f0..f96e12e69 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -205,9 +205,8 @@ class IqiyiIE(InfoExtractor): def get_enc_key(self, swf_url, video_id): # TODO: automatic key extraction - # last update at 2015-10-22 for Zombie::bite - # '7223c67061dbea1259d0ceb44f44b6d62288f4f80c972170de5201d2321060270e05'[2:66][0::2] - enc_key = '2c76de15dcb44bd28ff0927d50d31620' + # last update at 2015-12-06 for Zombie::bite + enc_key = '3719f6a1da83ee0aee3488d8802d7696'[::-1] return enc_key def _real_extract(self, url): diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index d79261bb5..126ca13df 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -1,23 +1,25 @@ from __future__ import unicode_literals -import os import re 
from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse -from ..utils import sanitized_Request +from ..utils import ( + sanitized_Request, + url_basename, +) class KeezMoviesIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P<id>[0-9]+)(?:[/?&]|$)' _TEST = { 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', - 'md5': '6e297b7e789329923fcf83abb67c9289', + 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', 'info_dict': { 'id': '1214711', 'ext': 'mp4', 'title': 'Petite Asian Lady Mai Playing In Bathtub', 'age_limit': 18, + 'thumbnail': 're:^https?://.*\.jpg$', } } @@ -36,21 +38,29 @@ class KeezMoviesIE(InfoExtractor): video_title = self._html_search_regex( r'<h1 [^>]*>([^<]+)', webpage, 'title') - video_url = self._html_search_regex( - r'(?s)html5VideoPlayer = .*?src="([^"]+)"', webpage, 'video URL') - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[4].split('_')[:2] - format = "-".join(format) + flashvars = self._parse_json(self._search_regex( + r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id) + + formats = [] + for height in (180, 240, 480): + if flashvars.get('quality_%dp' % height): + video_url = flashvars['quality_%dp' % height] + a_format = { + 'url': video_url, + 'height': height, + 'format_id': '%dp' % height, + } + filename_parts = url_basename(video_url).split('_') + if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]): + a_format['tbr'] = int(filename_parts[1][:-1]) + formats.append(a_format) age_limit = self._rta_search(webpage) return { 'id': video_id, 'title': video_title, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, + 'formats': formats, 'age_limit': age_limit, + 'thumbnail': flashvars.get('image_url') } diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 3c786a36d..67d6271e1 
100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -154,10 +154,10 @@ class MetacafeIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) video_url = None - mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) + mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage) if mobj is not None: mediaURL = compat_urllib_parse_unquote(mobj.group(1)) - video_ext = mediaURL[-3:] + video_ext = determine_ext(mediaURL) # Extract gdaKey if available mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) @@ -229,7 +229,7 @@ class MetacafeIE(InfoExtractor): age_limit = ( 18 - if re.search(r'"contentRating":"restricted"', webpage) + if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) else 0) if isinstance(video_url, list): diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index d47aeceda..c2b7ed9ab 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -64,7 +64,8 @@ class MixcloudIE(InfoExtractor): preview_url = self._search_regex( r'\s(?:data-preview-url|m-preview)="([^"]+)"', webpage, 'preview url') - song_url = preview_url.replace('/previews/', '/c/originals/') + song_url = re.sub(r'audiocdn(\d+)', r'stream\1', preview_url) + song_url = song_url.replace('/previews/', '/c/originals/') if not self._check_url(song_url, track_id, 'mp3'): song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') if not self._check_url(song_url, track_id, 'm4a'): diff --git a/youtube_dl/extractor/movshare.py b/youtube_dl/extractor/movshare.py deleted file mode 100644 index 6101063f2..000000000 --- a/youtube_dl/extractor/movshare.py +++ /dev/null @@ -1,27 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class MovShareIE(NovaMovIE): - IE_NAME = 'movshare' - IE_DESC = 'MovShare' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'movshare\.(?:net|sx|ag)'} - - _HOST = 
'www.movshare.net' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' - _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' - - _TEST = { - 'url': 'http://www.movshare.net/video/559e28be54d96', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': '559e28be54d96', - 'ext': 'flv', - 'title': 'dissapeared image', - 'description': 'optical illusion dissapeared image magic illusion', - } - } diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 944096e1c..7c6b7841d 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -1,63 +1,102 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( - remove_end, parse_duration, + int_or_none, + xpath_text, + xpath_attr, ) class NBAIE(InfoExtractor): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P<id>/[^?]*?)/?(?:/index\.html)?(?:\?.*)?$' + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P<path>(?:[^/]+/)?video/(?P<id>[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': 'c0edcfc37607344e2ff8f13c378c88a4', + 'md5': '9e7729d3010a9c71506fd1248f74e4f4', 'info_dict': { - 'id': '0021200253-okc-bkn-recap.nba', - 'ext': 'mp4', + 'id': '0021200253-okc-bkn-recap', + 'ext': 'flv', 'title': 'Thunder vs. 
Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, + 'timestamp': 1354638466, + 'upload_date': '20121204', }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, }, { - 'url': 'http://watch.nba.com/nba/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': '0041400301-cle-atl-recap.nba', + 'id': '0041400301-cle-atl-recap', 'ext': 'mp4', - 'title': 'NBA GAME TIME | Video: Hawks vs. Cavaliers Game 1', + 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - }, - 'params': { - 'skip_download': True, + 'timestamp': 1432134543, + 'upload_date': '20150520', } }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + path, video_id = re.match(self._VALID_URL, url).groups() + if path.startswith('nba/'): + path = path[3:] + video_info = self._download_xml('http://www.nba.com/%s.xml' % path, video_id) + video_id = xpath_text(video_info, 'slug') + title = xpath_text(video_info, 'headline') + description = xpath_text(video_info, 'description') + duration = parse_duration(xpath_text(video_info, 'length')) + timestamp = int_or_none(xpath_attr(video_info, 'dateCreated', 'uts')) - video_url = 'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + thumbnails = [] + for image in video_info.find('images'): + thumbnails.append({ + 'id': image.attrib.get('cut'), + 'url': image.text, + 'width': int_or_none(image.attrib.get('width')), + 'height': int_or_none(image.attrib.get('height')), + }) - shortened_video_id = video_id.rpartition('/')[2] - title = remove_end( - self._og_search_title(webpage, default=shortened_video_id), 
' : NBA.com') - - description = self._og_search_description(webpage) - duration_str = self._html_search_meta( - 'duration', webpage, 'duration', default=None) - if not duration_str: - duration_str = self._html_search_regex( - r'Duration:</b>\s*(\d+:\d+)', webpage, 'duration', fatal=False) - duration = parse_duration(duration_str) + formats = [] + for video_file in video_info.findall('.//file'): + video_url = video_file.text + if video_url.startswith('/'): + continue + if video_url.endswith('.m3u8'): + m3u8_formats = self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif video_url.endswith('.f4m'): + f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.4.1.1', video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + else: + key = video_file.attrib.get('bitrate') + format_info = { + 'format_id': key, + 'url': video_url, + } + mobj = re.search(r'(\d+)x(\d+)(?:_(\d+))?', key) + if mobj: + format_info.update({ + 'width': int(mobj.group(1)), + 'height': int(mobj.group(2)), + 'tbr': int_or_none(mobj.group(3)), + }) + formats.append(format_info) + self._sort_formats(formats) return { - 'id': shortened_video_id, - 'url': video_url, + 'id': video_id, 'title': title, 'description': description, 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'formats': formats, } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index e683d24c4..4c1eca96f 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -11,6 +11,7 @@ from ..utils import ( ExtractorError, find_xpath_attr, lowercase_escape, + smuggle_url, unescapeHTML, ) @@ -62,12 +63,13 @@ class NBCIE(InfoExtractor): theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( [ r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', + r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', 
r'"embedURL"\s*:\s*"([^"]+)"' ], webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) if theplatform_url.startswith('//'): theplatform_url = 'http:' + theplatform_url - return self.url_result(theplatform_url) + return self.url_result(smuggle_url(theplatform_url, {'source_url': url})) class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/novamov.py b/youtube_dl/extractor/novamov.py index 6163e8855..837c91559 100644 --- a/youtube_dl/extractor/novamov.py +++ b/youtube_dl/extractor/novamov.py @@ -92,3 +92,89 @@ class NovaMovIE(InfoExtractor): 'title': title, 'description': description } + + +class WholeCloudIE(NovaMovIE): + IE_NAME = 'wholecloud' + IE_DESC = 'WholeCloud' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': '(?:wholecloud\.net|movshare\.(?:net|sx|ag))'} + + _HOST = 'www.wholecloud.net' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<strong>Title:</strong> ([^<]+)</p>' + _DESCRIPTION_REGEX = r'<strong>Description:</strong> ([^<]+)</p>' + + _TEST = { + 'url': 'http://www.wholecloud.net/video/559e28be54d96', + 'md5': 'abd31a2132947262c50429e1d16c1bfd', + 'info_dict': { + 'id': '559e28be54d96', + 'ext': 'flv', + 'title': 'dissapeared image', + 'description': 'optical illusion dissapeared image magic illusion', + } + } + + +class NowVideoIE(NovaMovIE): + IE_NAME = 'nowvideo' + IE_DESC = 'NowVideo' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} + + _HOST = 'www.nowvideo.to' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _FILEKEY_REGEX = r'var fkzd="([^"]+)";' + _TITLE_REGEX = r'<h4>([^<]+)</h4>' + _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' + + _TEST = { + 'url': 'http://www.nowvideo.to/video/0mw0yow7b6dxa', + 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817', + 'info_dict': { + 'id': '0mw0yow7b6dxa', + 'ext': 'flv', + 'title': 'youtubedl test video _BaW_jenozKc.mp4', + 'description': 
'Description', + } + } + + +class VideoWeedIE(NovaMovIE): + IE_NAME = 'videoweed' + IE_DESC = 'VideoWeed' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} + + _HOST = 'www.videoweed.es' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' + + _TEST = { + 'url': 'http://www.videoweed.es/file/b42178afbea14', + 'md5': 'abd31a2132947262c50429e1d16c1bfd', + 'info_dict': { + 'id': 'b42178afbea14', + 'ext': 'flv', + 'title': 'optical illusion dissapeared image magic illusion', + 'description': '' + }, + } + + +class CloudTimeIE(NovaMovIE): + IE_NAME = 'cloudtime' + IE_DESC = 'CloudTime' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'cloudtime\.to'} + + _HOST = 'www.cloudtime.to' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'<div[^>]+class=["\']video_det["\'][^>]*>\s*<strong>([^<]+)</strong>' + + _TEST = None diff --git a/youtube_dl/extractor/nowtv.py b/youtube_dl/extractor/nowtv.py index 67e34b294..fd107aca2 100644 --- a/youtube_dl/extractor/nowtv.py +++ b/youtube_dl/extractor/nowtv.py @@ -71,7 +71,7 @@ class NowTVBaseIE(InfoExtractor): class NowTVIE(NowTVBaseIE): - _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:list/[^/]+/)?(?P<id>[^/]+)/(?:player|preview)' + _VALID_URL = r'https?://(?:www\.)?nowtv\.(?:de|at|ch)/(?:rtl|rtl2|rtlnitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' _TESTS = [{ # rtl @@ -190,6 +190,9 @@ class NowTVIE(NowTVBaseIE): }, { 'url': 'http://www.nowtv.de/rtl2/echtzeit/list/aktuell/schnelles-geld-am-ende-der-welt/player', 'only_matching': True, + }, { + 'url': 'http://www.nowtv.de/rtl2/zuhause-im-glueck/jahr/2015/11/eine-erschuetternde-diagnose/player', + 'only_matching': True, }] def _real_extract(self, url): diff --git 
a/youtube_dl/extractor/nowvideo.py b/youtube_dl/extractor/nowvideo.py deleted file mode 100644 index 57ee3d366..000000000 --- a/youtube_dl/extractor/nowvideo.py +++ /dev/null @@ -1,28 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class NowVideoIE(NovaMovIE): - IE_NAME = 'nowvideo' - IE_DESC = 'NowVideo' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'nowvideo\.(?:to|ch|ec|sx|eu|at|ag|co|li)'} - - _HOST = 'www.nowvideo.to' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _FILEKEY_REGEX = r'var fkzd="([^"]+)";' - _TITLE_REGEX = r'<h4>([^<]+)</h4>' - _DESCRIPTION_REGEX = r'</h4>\s*<p>([^<]+)</p>' - - _TEST = { - 'url': 'http://www.nowvideo.ch/video/0mw0yow7b6dxa', - 'md5': 'f8fbbc8add72bd95b7850c6a02fc8817', - 'info_dict': { - 'id': '0mw0yow7b6dxa', - 'ext': 'flv', - 'title': 'youtubedl test video _BaW_jenozKc.mp4', - 'description': 'Description', - } - } diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index a262a9f6d..35067e271 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -1,108 +1,69 @@ from __future__ import unicode_literals import re -import json import base64 from .common import InfoExtractor from ..utils import ( - unescapeHTML, - ExtractorError, - determine_ext, int_or_none, + float_or_none, + ExtractorError, + unsmuggle_url, ) +from ..compat import compat_urllib_parse class OoyalaBaseIE(InfoExtractor): - def _extract_result(self, info, more_info): - embedCode = info['embedCode'] - video_url = info.get('ipad_url') or info['url'] - - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats(video_url, embedCode, ext='mp4') - else: - formats = [{ - 'url': video_url, - 'ext': 'mp4', - }] - - return { - 'id': embedCode, - 'title': unescapeHTML(info['title']), - 'formats': formats, - 'description': unescapeHTML(more_info['description']), - 'thumbnail': more_info['promo'], + def _extract(self, 
content_tree_url, video_id, domain='example.org'): + content_tree = self._download_json(content_tree_url, video_id)['content_tree'] + metadata = content_tree[list(content_tree)[0]] + embed_code = metadata['embed_code'] + pcode = metadata.get('asset_pcode') or embed_code + video_info = { + 'id': embed_code, + 'title': metadata['title'], + 'description': metadata.get('description'), + 'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), + 'duration': float_or_none(metadata.get('duration'), 1000), } - def _extract(self, player_url, video_id): - player = self._download_webpage(player_url, video_id) - mobile_url = self._search_regex(r'mobile_player_url="(.+?)&device="', - player, 'mobile player url') - # Looks like some videos are only available for particular devices - # (e.g. http://player.ooyala.com/player.js?embedCode=x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0 - # is only available for ipad) - # Working around with fetching URLs for all the devices found starting with 'unknown' - # until we succeed or eventually fail for each device. - devices = re.findall(r'device\s*=\s*"([^"]+)";', player) - devices.remove('unknown') - devices.insert(0, 'unknown') - for device in devices: - mobile_player = self._download_webpage( - '%s&device=%s' % (mobile_url, device), video_id, - 'Downloading mobile player JS for %s device' % device) - videos_info = self._search_regex( - r'var streams=window.oo_testEnv\?\[\]:eval\("\((\[{.*?}\])\)"\);', - mobile_player, 'info', fatal=False, default=None) - if videos_info: - break - - if not videos_info: - formats = [] + formats = [] + for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): auth_data = self._download_json( - 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?domain=www.example.org&supportedFormats=mp4,webm' % (video_id, video_id), - video_id) + 'http://player.ooyala.com/sas/player_api/v1/authorization/embed_code/%s/%s?' 
% (pcode, embed_code) + compat_urllib_parse.urlencode({'domain': domain, 'supportedFormats': supported_format}), + video_id, 'Downloading %s JSON' % supported_format) - cur_auth_data = auth_data['authorization_data'][video_id] + cur_auth_data = auth_data['authorization_data'][embed_code] - for stream in cur_auth_data['streams']: - formats.append({ - 'url': base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8'), - 'ext': stream.get('delivery_type'), - 'format': stream.get('video_codec'), - 'format_id': stream.get('profile'), - 'width': int_or_none(stream.get('width')), - 'height': int_or_none(stream.get('height')), - 'abr': int_or_none(stream.get('audio_bitrate')), - 'vbr': int_or_none(stream.get('video_bitrate')), - }) - if formats: - return { - 'id': video_id, - 'formats': formats, - 'title': 'Ooyala video', - } + if cur_auth_data['authorized']: + for stream in cur_auth_data['streams']: + url = base64.b64decode(stream['url']['data'].encode('ascii')).decode('utf-8') + delivery_type = stream['delivery_type'] + if delivery_type == 'remote_asset': + video_info['url'] = url + return video_info + if delivery_type == 'hls': + formats.extend(self._extract_m3u8_formats(url, embed_code, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + elif delivery_type == 'hds': + formats.extend(self._extract_f4m_formats(url, embed_code, -1, 'hds', fatal=False)) + else: + formats.append({ + 'url': url, + 'ext': stream.get('delivery_type'), + 'vcodec': stream.get('video_codec'), + 'format_id': '%s-%s-%sp' % (stream.get('profile'), delivery_type, stream.get('height')), + 'width': int_or_none(stream.get('width')), + 'height': int_or_none(stream.get('height')), + 'abr': int_or_none(stream.get('audio_bitrate')), + 'vbr': int_or_none(stream.get('video_bitrate')), + 'fps': float_or_none(stream.get('framerate')), + }) + else: + raise ExtractorError('%s said: %s' % (self.IE_NAME, cur_auth_data['message']), expected=True) + self._sort_formats(formats) - if not 
cur_auth_data['authorized']: - raise ExtractorError(cur_auth_data['message'], expected=True) - - if not videos_info: - raise ExtractorError('Unable to extract info') - videos_info = videos_info.replace('\\"', '"') - videos_more_info = self._search_regex( - r'eval\("\(({.*?\\"promo\\".*?})\)"', mobile_player, 'more info').replace('\\"', '"') - videos_info = json.loads(videos_info) - videos_more_info = json.loads(videos_more_info) - - if videos_more_info.get('lineup'): - videos = [self._extract_result(info, more_info) for (info, more_info) in zip(videos_info, videos_more_info['lineup'])] - return { - '_type': 'playlist', - 'id': video_id, - 'title': unescapeHTML(videos_more_info['title']), - 'entries': videos, - } - else: - return self._extract_result(videos_info[0], videos_more_info) + video_info['formats'] = formats + return video_info class OoyalaIE(OoyalaBaseIE): @@ -117,6 +78,7 @@ class OoyalaIE(OoyalaBaseIE): 'ext': 'mp4', 'title': 'Explaining Data Recovery from Hard Drives and SSDs', 'description': 'How badly damaged does a drive have to be to defeat Russell and his crew? 
Apparently, smashed to bits.', + 'duration': 853.386, }, }, { # Only available for ipad @@ -125,7 +87,7 @@ class OoyalaIE(OoyalaBaseIE): 'id': 'x1b3lqZDq9y_7kMyC2Op5qo-p077tXD0', 'ext': 'mp4', 'title': 'Simulation Overview - Levels of Simulation', - 'description': '', + 'duration': 194.948, }, }, { @@ -136,7 +98,8 @@ class OoyalaIE(OoyalaBaseIE): 'info_dict': { 'id': 'FiOG81ZTrvckcchQxmalf4aQj590qTEx', 'ext': 'mp4', - 'title': 'Ooyala video', + 'title': 'Divide Tool Path.mp4', + 'duration': 204.405, } } ] @@ -151,9 +114,11 @@ class OoyalaIE(OoyalaBaseIE): ie=cls.ie_key()) def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) embed_code = self._match_id(url) - player_url = 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code - return self._extract(player_url, embed_code) + domain = smuggled_data.get('domain') + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/embed_code/%s/%s' % (embed_code, embed_code) + return self._extract(content_tree_url, embed_code, domain) class OoyalaExternalIE(OoyalaBaseIE): @@ -170,7 +135,7 @@ class OoyalaExternalIE(OoyalaBaseIE): .*?&pcode= ) (?P<pcode>.+?) 
- (&|$) + (?:&|$) ''' _TEST = { @@ -179,7 +144,7 @@ class OoyalaExternalIE(OoyalaBaseIE): 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'ext': 'mp4', 'title': 'dm_140128_30for30Shorts___JudgingJewellv2', - 'description': '', + 'duration': 1302000, }, 'params': { # m3u8 download @@ -188,9 +153,6 @@ class OoyalaExternalIE(OoyalaBaseIE): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - partner_id = mobj.group('partner_id') - video_id = mobj.group('id') - pcode = mobj.group('pcode') - player_url = 'http://player.ooyala.com/player.js?externalId=%s:%s&pcode=%s' % (partner_id, video_id, pcode) - return self._extract(player_url, video_id) + partner_id, video_id, pcode = re.match(self._VALID_URL, url).groups() + content_tree_url = 'http://player.ooyala.com/player_api/v1/content_tree/external_id/%s/%s:%s' % (pcode, partner_id, video_id) + return self._extract(content_tree_url, video_id) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index b787e2a73..ab3c43cfe 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -18,7 +18,7 @@ class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: # Direct video URL - video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? | + (?:video\.pbs|watch\.knpb)\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
| # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player @@ -174,6 +174,10 @@ class PBSIE(InfoExtractor): { 'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, + }, + { + 'url': 'http://watch.knpb.org/video/2365616055/', + 'only_matching': True, } ] _ERRORS = { @@ -204,6 +208,7 @@ class PBSIE(InfoExtractor): MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed r'class="coveplayerid">([^<]+)<', # coveplayer + r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer ] diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index 551c8c9f0..bc559d1df 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -44,6 +46,13 @@ class PladformIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + if mobj: + return mobj.group('url') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py index aa7dbcb63..55c11b3bf 100644 --- a/youtube_dl/extractor/pluralsight.py +++ b/youtube_dl/extractor/pluralsight.py @@ -1,5 +1,6 @@ from __future__ import unicode_literals +import re import json import random import collections @@ -14,6 +15,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_duration, + qualities, sanitized_Request, ) @@ -140,15 +142,28 @@ class 
PluralsightIE(PluralsightBaseIE): 'low': {'width': 640, 'height': 480}, 'medium': {'width': 848, 'height': 640}, 'high': {'width': 1024, 'height': 768}, + 'high-widescreen': {'width': 1280, 'height': 720}, } + QUALITIES_PREFERENCE = ('low', 'medium', 'high', 'high-widescreen',) + quality_key = qualities(QUALITIES_PREFERENCE) + AllowedQuality = collections.namedtuple('AllowedQuality', ['ext', 'qualities']) ALLOWED_QUALITIES = ( - AllowedQuality('webm', ('high',)), - AllowedQuality('mp4', ('low', 'medium', 'high',)), + AllowedQuality('webm', ['high', ]), + AllowedQuality('mp4', ['low', 'medium', 'high', ]), ) + # Some courses also offer widescreen resolution for high quality (see + # https://github.com/rg3/youtube-dl/issues/7766) + widescreen = True if re.search( + r'courseSupportsWidescreenVideoFormats\s*:\s*true', webpage) else False + best_quality = 'high-widescreen' if widescreen else 'high' + if widescreen: + for allowed_quality in ALLOWED_QUALITIES: + allowed_quality.qualities.append(best_quality) + # In order to minimize the number of calls to ViewClip API and reduce # the probability of being throttled or banned by Pluralsight we will request # only single format until formats listing was explicitly requested. 
@@ -157,19 +172,19 @@ class PluralsightIE(PluralsightBaseIE): else: def guess_allowed_qualities(): req_format = self._downloader.params.get('format') or 'best' - req_format_split = req_format.split('-') + req_format_split = req_format.split('-', 1) if len(req_format_split) > 1: req_ext, req_quality = req_format_split for allowed_quality in ALLOWED_QUALITIES: if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities: return (AllowedQuality(req_ext, (req_quality, )), ) req_ext = 'webm' if self._downloader.params.get('prefer_free_formats') else 'mp4' - return (AllowedQuality(req_ext, ('high', )), ) + return (AllowedQuality(req_ext, (best_quality, )), ) allowed_qualities = guess_allowed_qualities() formats = [] - for ext, qualities in allowed_qualities: - for quality in qualities: + for ext, qualities_ in allowed_qualities: + for quality in qualities_: f = QUALITIES[quality].copy() clip_post = { 'a': author, @@ -205,6 +220,7 @@ class PluralsightIE(PluralsightBaseIE): 'url': clip_url, 'ext': ext, 'format_id': format_id, + 'quality': quality_key(quality), }) formats.append(f) self._sort_formats(formats) diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py new file mode 100644 index 000000000..05e1b02ad --- /dev/null +++ b/youtube_dl/extractor/skynewsarabia.py @@ -0,0 +1,117 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + parse_iso8601, + parse_duration, +) + + +class SkyNewsArabiaBaseIE(InfoExtractor): + _IMAGE_BASE_URL = 'http://www.skynewsarabia.com/web/images' + + def _call_api(self, path, value): + return self._download_json('http://api.skynewsarabia.com/web/rest/v2/%s/%s.json' % (path, value), value) + + def _get_limelight_media_id(self, url): + return self._search_regex(r'/media/[^/]+/([a-z0-9]{32})', url, 'limelight media id') + + def _get_image_url(self, image_path_template, width='1600', 
height='1200'): + return self._IMAGE_BASE_URL + image_path_template.format(width=width, height=height) + + def _extract_video_info(self, video_data): + video_id = compat_str(video_data['id']) + topic = video_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(video_data['videoUrl'][0]['url']), + 'id': video_id, + 'title': video_data['headline'], + 'description': video_data.get('summary'), + 'thumbnail': self._get_image_url(video_data['mediaAsset']['imageUrl']), + 'timestamp': parse_iso8601(video_data.get('date')), + 'duration': parse_duration(video_data.get('runTime')), + 'tags': video_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': 'http://www.skynewsarabia.com/web/video/%s' % video_id, + 'ie_key': 'LimelightMedia', + } + + +class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skynewsarabia.com/web/video/794902/%D9%86%D8%B5%D9%81-%D9%85%D9%84%D9%8A%D9%88%D9%86-%D9%85%D8%B5%D8%A8%D8%A7%D8%AD-%D8%B4%D8%AC%D8%B1%D8%A9-%D9%83%D8%B1%D9%8A%D8%B3%D9%85%D8%A7%D8%B3', + 'info_dict': { + 'id': '794902', + 'ext': 'flv', + 'title': 'نصف مليون مصباح على شجرة كريسماس', + 'description': 'md5:22f1b27f0850eeb10c7e59b1f16eb7c6', + 'upload_date': '20151128', + 'timestamp': 1448697198, + 'duration': 2119, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._call_api('video', video_id) + return self._extract_video_info(video_data) + + +class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): + IE_NAME = 'skynewsarabia:video' + _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 
'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', + 'info_dict': { + 'id': '794549', + 'ext': 'flv', + 'title': 'بالفيديو.. ألعاب ذكية تحاكي واقع المنطقة', + 'description': 'md5:0c373d29919a851e080ee4edd0c5d97f', + 'upload_date': '20151126', + 'timestamp': 1448559336, + 'duration': 281.6, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, { + 'url': 'http://www.skynewsarabia.com/web/article/794844/%D8%A7%D8%B3%D8%AA%D9%87%D8%AF%D8%A7%D9%81-%D9%82%D9%88%D8%A7%D8%B1%D8%A8-%D8%A7%D9%94%D8%B3%D9%84%D8%AD%D8%A9-%D9%84%D9%85%D9%8A%D9%84%D9%8A%D8%B4%D9%8A%D8%A7%D8%AA-%D8%A7%D9%84%D8%AD%D9%88%D8%AB%D9%8A-%D9%88%D8%B5%D8%A7%D9%84%D8%AD', + 'info_dict': { + 'id': '794844', + 'title': 'إحباط تهريب أسلحة لميليشيات الحوثي وصالح بجنوب اليمن', + 'description': 'md5:5c927b8b2e805796e7f693538d96fc7e', + }, + 'playlist_mincount': 2, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article_data = self._call_api('article', article_id) + media_asset = article_data['mediaAsset'] + if media_asset['type'] == 'VIDEO': + topic = article_data.get('topicTitle') + return { + '_type': 'url_transparent', + 'url': 'limelight:media:%s' % self._get_limelight_media_id(media_asset['videoUrl'][0]['url']), + 'id': article_id, + 'title': article_data['headline'], + 'description': article_data.get('summary'), + 'thumbnail': self._get_image_url(media_asset['imageUrl']), + 'timestamp': parse_iso8601(article_data.get('date')), + 'tags': article_data.get('tags', []), + 'categories': [topic] if topic else [], + 'webpage_url': url, + 'ie_key': 'LimelightMedia', + } + entries = [self._extract_video_info(item) for item in article_data.get('inlineItems', []) if item['type'] == 'VIDEO'] + return self.playlist_result(entries, article_id, 
article_data['headline'], article_data.get('summary')) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index daf6ad555..ea8fc258d 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -158,6 +158,7 @@ class SohuIE(InfoExtractor): 'file': clips_url[i], 'new': su[i], 'prod': 'flash', + 'rb': 1, } if cdnId is not None: diff --git a/youtube_dl/extractor/srf.py b/youtube_dl/extractor/srf.py index 77eec0bc7..16e1bf2d6 100644 --- a/youtube_dl/extractor/srf.py +++ b/youtube_dl/extractor/srf.py @@ -11,7 +11,7 @@ from ..utils import ( class SrfIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/tv/[^/]+/video/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})' + _VALID_URL = r'https?://(?:www\.srf\.ch/play(?:er)?/(?:tv|radio)/[^/]+/(?P<media_type>video|audio)/(?P<display_id>[^?]+)\?id=|tp\.srgssr\.ch/p/flash\?urn=urn:srf:ais:video:)(?P<id>[0-9a-f\-]{36})' _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'md5': '4cd93523723beff51bb4bee974ee238d', @@ -35,6 +35,20 @@ class SrfIE(InfoExtractor): 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', 'timestamp': 1373493600, }, + }, { + 'url': 'http://www.srf.ch/play/radio/hoerspielarchiv-srf-musikwelle/audio/saegel-ohni-wind-von-jakob-stebler?id=415bf3d3-6429-4de7-968d-95866e37cfbc', + 'md5': '', + 'info_dict': { + 'id': '415bf3d3-6429-4de7-968d-95866e37cfbc', + 'display_id': 'saegel-ohni-wind-von-jakob-stebler', + 'ext': 'mp3', + 'upload_date': '20080518', + 'title': '«Sägel ohni Wind» von Jakob Stebler', + 'timestamp': 1211112000, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, }, { 'url': 'http://www.srf.ch/player/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'only_matching': True, @@ -44,11 +58,13 @@ class SrfIE(InfoExtractor): }] def 
_real_extract(self, url): - video_id = self._match_id(url) - display_id = re.match(self._VALID_URL, url).group('display_id') or video_id + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + media_type = mobj.group('media_type') + display_id = mobj.group('display_id') or video_id video_data = self._download_xml( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/video/play/%s.xml' % video_id, + 'http://il.srgssr.ch/integrationlayer/1.0/ue/srf/%s/play/%s.xml' % (media_type, video_id), display_id) title = xpath_text( @@ -64,7 +80,7 @@ class SrfIE(InfoExtractor): for url_node in item.findall('url'): quality = url_node.attrib['quality'] full_url = url_node.text - original_ext = determine_ext(full_url) + original_ext = determine_ext(full_url).lower() format_id = '%s-%s' % (quality, item.attrib['protocol']) if original_ext == 'f4m': formats.extend(self._extract_f4m_formats( diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index 117afa9bf..e0477382c 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -16,6 +16,7 @@ class TeachingChannelIE(InfoExtractor): 'ext': 'mp4', 'title': 'A History of Teaming', 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', + 'duration': 422.255, }, 'params': { # m3u8 download diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 1555aa77c..0bf6726b5 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -16,11 +16,12 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, - xpath_with_ns, - unsmuggle_url, - int_or_none, - url_basename, float_or_none, + int_or_none, + sanitized_Request, + unsmuggle_url, + url_basename, + xpath_with_ns, ) default_ns = 'http://www.w3.org/2005/SMIL21/Language' @@ -204,7 +205,12 @@ class ThePlatformIE(ThePlatformBaseIE): smil_url = url # Explicitly specified SMIL (see 
https://github.com/rg3/youtube-dl/issues/7385) elif '/guid/' in url: - webpage = self._download_webpage(url, video_id) + headers = {} + source_url = smuggled_data.get('source_url') + if source_url: + headers['Referer'] = source_url + request = sanitized_Request(url, headers=headers) + webpage = self._download_webpage(request, video_id) smil_url = self._search_regex( r'<link[^>]+href=(["\'])(?P<url>.+?)\1[^>]+type=["\']application/smil\+xml', webpage, 'smil url', group='url') diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py index 185accc4b..a800449e9 100644 --- a/youtube_dl/extractor/trilulilu.py +++ b/youtube_dl/extractor/trilulilu.py @@ -1,80 +1,103 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, +) class TriluliluIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trilulilu\.ro/(?:video-[^/]+/)?(?P<id>[^/#\?]+)' - _TEST = { - 'url': 'http://www.trilulilu.ro/video-animatie/big-buck-bunny-1', - 'md5': 'c1450a00da251e2769b74b9005601cac', + _VALID_URL = r'https?://(?:(?:www|m)\.)?trilulilu\.ro/(?:[^/]+/)?(?P<id>[^/#\?]+)' + _TESTS = [{ + 'url': 'http://www.trilulilu.ro/big-buck-bunny-1', + 'md5': '68da087b676a6196a413549212f60cc6', 'info_dict': { 'id': 'ae2899e124140b', 'ext': 'mp4', 'title': 'Big Buck Bunny', 'description': ':) pentru copilul din noi', + 'uploader_id': 'chipy', + 'upload_date': '20120304', + 'timestamp': 1330830647, + 'uploader': 'chipy', + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - } + }, { + 'url': 'http://www.trilulilu.ro/adena-ft-morreti-inocenta', + 'md5': '929dfb8729dc71750463af88bbbbf4a4', + 'info_dict': { + 'id': 'f299710e3c91c5', + 'ext': 'mp4', + 'title': 'Adena ft. 
Morreti - Inocenta', + 'description': 'pop music', + 'uploader_id': 'VEVOmixt', + 'upload_date': '20151204', + 'uploader': 'VEVOmixt', + 'timestamp': 1449187937, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }] def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + media_info = self._download_json('http://m.trilulilu.ro/%s?format=json' % display_id, display_id) - if re.search(r'Fişierul nu este disponibil pentru vizionare în ţara dumneavoastră', webpage): - raise ExtractorError( - 'This video is not available in your country.', expected=True) - elif re.search('Fişierul poate fi accesat doar de către prietenii lui', webpage): + age_limit = 0 + errors = media_info.get('errors', {}) + if errors.get('friends'): raise ExtractorError('This video is private.', expected=True) + elif errors.get('geoblock'): + raise ExtractorError('This video is not available in your country.', expected=True) + elif errors.get('xxx_unlogged'): + age_limit = 18 - flashvars_str = self._search_regex( - r'block_flash_vars\s*=\s*(\{[^\}]+\})', webpage, 'flashvars', fatal=False, default=None) + media_class = media_info.get('class') + if media_class not in ('video', 'audio'): + raise ExtractorError('not a video or an audio') - if flashvars_str: - flashvars = self._parse_json(flashvars_str, display_id) + user = media_info.get('user', {}) + + thumbnail = media_info.get('cover_url') + if thumbnail: + thumbnail.format(width='1600', height='1200') + + # TODO: get correct ext for audio files + stream_type = media_info.get('stream_type') + formats = [{ + 'url': media_info['href'], + 'ext': stream_type, + }] + if media_info.get('is_hd'): + formats.append({ + 'format_id': 'hd', + 'url': media_info['hrefhd'], + 'ext': stream_type, + }) + if media_class == 'audio': + formats[0]['vcodec'] = 'none' else: - raise ExtractorError( - 'This page does not contain videos', expected=True) - - if flashvars['isMP3'] == 'true': - 
raise ExtractorError( - 'Audio downloads are currently not supported', expected=True) - - video_id = flashvars['hash'] - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage, default=None) - - format_url = ('http://fs%(server)s.trilulilu.ro/%(hash)s/' - 'video-formats2' % flashvars) - format_doc = self._download_xml( - format_url, video_id, - note='Downloading formats', - errnote='Error while downloading formats') - - video_url_template = ( - 'http://fs%(server)s.trilulilu.ro/stream.php?type=video' - '&source=site&hash=%(hash)s&username=%(userid)s&' - 'key=ministhebest&format=%%s&sig=&exp=' % - flashvars) - formats = [ - { - 'format_id': fnode.text.partition('-')[2], - 'url': video_url_template % fnode.text, - 'ext': fnode.text.partition('-')[0] - } - - for fnode in format_doc.findall('./formats/format') - ] + formats[0]['format_id'] = 'sd' return { - 'id': video_id, + 'id': media_info['identifier'].split('|')[1], 'display_id': display_id, 'formats': formats, - 'title': title, - 'description': description, + 'title': media_info['title'], + 'description': media_info.get('description'), 'thumbnail': thumbnail, + 'uploader_id': user.get('username'), + 'uploader': user.get('fullname'), + 'timestamp': parse_iso8601(media_info.get('published'), ' '), + 'duration': int_or_none(media_info.get('duration')), + 'view_count': int_or_none(media_info.get('count_views')), + 'like_count': int_or_none(media_info.get('count_likes')), + 'comment_count': int_or_none(media_info.get('count_comments')), + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index 01af7a995..3db6286e4 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -15,6 +15,7 @@ class ViceIE(InfoExtractor): 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', 'ext': 'mp4', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', + 'duration': 725.983, }, 'params': 
{ # Requires ffmpeg (m3u8 manifest) diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py deleted file mode 100644 index ca2e50935..000000000 --- a/youtube_dl/extractor/videoweed.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import unicode_literals - -from .novamov import NovaMovIE - - -class VideoWeedIE(NovaMovIE): - IE_NAME = 'videoweed' - IE_DESC = 'VideoWeed' - - _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} - - _HOST = 'www.videoweed.es' - - _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' - _TITLE_REGEX = r'<h1 class="text_shadow">([^<]+)</h1>' - - _TEST = { - 'url': 'http://www.videoweed.es/file/b42178afbea14', - 'md5': 'abd31a2132947262c50429e1d16c1bfd', - 'info_dict': { - 'id': 'b42178afbea14', - 'ext': 'flv', - 'title': 'optical illusion dissapeared image magic illusion', - 'description': '' - }, - } diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index d99a42a9f..90557fa61 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -18,6 +18,7 @@ from ..utils import ( unified_strdate, ) from .vimeo import VimeoIE +from .pladform import PladformIE class VKIE(InfoExtractor): @@ -164,6 +165,11 @@ class VKIE(InfoExtractor): # vk wrapper 'url': 'http://www.biqle.ru/watch/847655_160197695', 'only_matching': True, + }, + { + # pladform embed + 'url': 'https://vk.com/video-76116461_171554880', + 'only_matching': True, } ] @@ -254,10 +260,13 @@ class VKIE(InfoExtractor): if vimeo_url is not None: return self.url_result(vimeo_url) + pladform_url = PladformIE._extract_url(info_page) + if pladform_url: + return self.url_result(pladform_url) + m_rutube = re.search( r'\ssrc="((?:https?:)?//rutube\.ru\\?/video\\?/embed(?:.*?))\\?"', info_page) if m_rutube is not None: - self.to_screen('rutube video detected') rutube_url = self._proto_relative_url( m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) diff --git 
a/youtube_dl/extractor/wimp.py b/youtube_dl/extractor/wimp.py index e4f50e64c..041ff6c55 100644 --- a/youtube_dl/extractor/wimp.py +++ b/youtube_dl/extractor/wimp.py @@ -5,7 +5,7 @@ from .youtube import YoutubeIE class WimpIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)/' + _VALID_URL = r'http://(?:www\.)?wimp\.com/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.wimp.com/maruexhausted/', 'md5': 'ee21217ffd66d058e8b16be340b74883', @@ -28,18 +28,23 @@ class WimpIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - [r"[\"']file[\"']\s*[:,]\s*[\"'](.+?)[\"']", r"videoId\s*:\s*[\"']([^\"']+)[\"']"], - webpage, 'video URL') - if YoutubeIE.suitable(video_url): - self.to_screen('Found YouTube video') + + youtube_id = self._search_regex( + r"videoId\s*:\s*[\"']([0-9A-Za-z_-]{11})[\"']", + webpage, 'video URL', default=None) + if youtube_id: return { '_type': 'url', - 'url': video_url, + 'url': youtube_id, 'ie_key': YoutubeIE.ie_key(), } + video_url = self._search_regex( + r'<video[^>]+>\s*<source[^>]+src=(["\'])(?P<url>.+?)\1', + webpage, 'video URL', group='url') + return { 'id': video_id, 'url': video_url, diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bd0de9f53..a4e9d7072 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.11.27.1' +__version__ = '2015.12.06'