diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index c65462ba4..704a8b911 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.07*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.07** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.07.11*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.07.11** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.07.07 +[debug] youtube-dl version 2016.07.11 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/AUTHORS b/AUTHORS index f74b30d07..f762e8a16 100644 --- a/AUTHORS +++ b/AUTHORS @@ -177,3 +177,4 @@ Roman Tsiupa Artur Krysiak Jakub Adam Wieczorek Aleksandar Topuzović +Nehal Patel diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 9174c6f89..5bcd6de1c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -224,6 +224,7 @@ - **Firstpost** - **FiveTV** - **Flickr** + - **Flipagram** - **Folketinget**: Folketinget (ft.dk; Danish parliament) - **FootyRoom** - **Formula1** @@ -553,6 +554,7 @@ - **RICE** - **RingTV** - **RockstarGames** + - **RoosterTeeth** - **RottenTomatoes** - **Roxwel** - **RTBF** diff --git a/test/test_utils.py b/test/test_utils.py index 651079b5a..09494e87c 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -81,6 +81,7 @@ from youtube_dl.utils import ( cli_option, cli_valueless_option, cli_bool_option, + parse_codecs, ) from youtube_dl.compat import ( compat_chr, @@ -608,6 +609,29 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' 
in limit_length('foo bar baz asd', 12)) + def test_parse_codecs(self): + self.assertEqual(parse_codecs(''), {}) + self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { + 'vcodec': 'avc1.77.30', + 'acodec': 'mp4a.40.2', + }) + self.assertEqual(parse_codecs('mp4a.40.2'), { + 'vcodec': 'none', + 'acodec': 'mp4a.40.2', + }) + self.assertEqual(parse_codecs('mp4a.40.5,avc1.42001e'), { + 'vcodec': 'avc1.42001e', + 'acodec': 'mp4a.40.5', + }) + self.assertEqual(parse_codecs('avc3.640028'), { + 'vcodec': 'avc3.640028', + 'acodec': 'none', + }) + self.assertEqual(parse_codecs(', h264,,newcodec,aac'), { + 'vcodec': 'h264', + 'acodec': 'aac', + }) + def test_escape_rfc3986(self): reserved = "!*'();:@&=+$,/?#[]" unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~' diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9b01e38f5..9e28f2579 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor): _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' _TESTS = [{ + # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor): }, 'playlist_mincount': 4, }, { - # Film wording is used instead of Episode + # Film wording is used instead of Episode, ger/jap, Dub/OmU 'url': 'https://www.anime-on-demand.de/anime/39', 'only_matching': True, }, { - # Episodes without titles + # Episodes without titles, jap, OmU 'url': 'https://www.anime-on-demand.de/anime/162', 'only_matching': True, }, { # ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/169', 'only_matching': True, + }, { + # Full length film, non-series, ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/185', + 'only_matching': True, }] def _login(self): @@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - + def extract_info(html, video_id, num=None): + title, description = [None] * 2 formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): attributes = extract_attributes(input_) playlist_urls = [] for playlist_key in ('data-playlist', 'data-otherplaylist'): @@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(lang) if kind: format_id_list.append(kind) - if not format_id_list: + if not format_id_list and num is not None: format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note)))
@@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor): }) formats.extend(file_formats) - if formats: - self._sort_formats(formats) + return { + 'title': title, + 'description': description, + 'formats': formats, + } + + def extract_entries(html, video_id, common_info, num=None): + info = extract_info(html, video_id, num) + + if info['formats']: + self._sort_formats(info['formats']) f = common_info.copy() - f.update({ - 'title': title, - 'description': description, - 'formats': formats, - }) + f.update(info) entries.append(f) - # Extract teaser only when full episode is not available - if not formats: + # Extract teaser/trailer only when full episode is not available + if not info['formats']: m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', + html) if m: f = common_info.copy() f.update({ - 'id': '%s-teaser' % f['id'], + 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), 'url': compat_urlparse.urljoin(url, m.group('href')), }) entries.append(f) + def extract_episodes(html): + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: + continue + + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + + video_id = 'episode-%d' % episode_number + + common_info = { + 'id': video_id, + 'series': anime_title, + 'episode': episode_title, + 'episode_number': episode_number, + } + + extract_entries(episode_html, video_id, common_info) + + def extract_film(html, video_id): + common_info = { + 'id': anime_id, + 'title': anime_title, + 'description': anime_description, + } + extract_entries(html, video_id, common_info) + + extract_episodes(webpage) + + if not entries: + extract_film(webpage, anime_id) + return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index fd45b3e42..13a06396d 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -13,6 +13,7 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, + update_url_query, ) from ..compat import compat_etree_fromstring @@ -34,6 +35,7 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', @@ -44,6 +46,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', 'duration': 5252, }, + 'skip': 'HTTP Error 404: Not Found', }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -55,6 +58,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', 'duration': 3240, }, + 
'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, @@ -113,11 +117,14 @@ class ARDMediathekIE(InfoExtractor): continue if ext == 'f4m': formats.extend(self._extract_f4m_formats( - stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - video_id, preference=-1, f4m_id='hds', fatal=False)) + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), + video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { @@ -231,7 +238,8 @@ class ARDIE(InfoExtractor): 'title': 'Die Story im Ersten: Mission unter falscher Flagge', 'upload_date': '20140804', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'skip': 'HTTP Error 404: Not Found', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py index 133228133..7608c0a08 100644 --- a/youtube_dl/extractor/biobiochiletv.py +++ b/youtube_dl/extractor/biobiochiletv.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + ExtractorError, + remove_end, +) +from .rudo import RudoIE class BioBioChileTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' + _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' _TESTS = [{ 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', @@ -18,6 +22,7 @@ class BioBioChileTVIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Fernando Atria', }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', }, { # different uploader layout 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', @@ -32,6 +37,16 @@ class BioBioChileTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', + }, { + 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', + 'info_dict': { + 'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos', + 'ext': 'mp4', + 'uploader': '(none)', + 'upload_date': '20160708', + 'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos', + }, }, { 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', 'only_matching': True, @@ -45,42 +60,22 @@ class BioBioChileTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + rudo_url = RudoIE._extract_url(webpage) + if not rudo_url: + raise ExtractorError('No videos found') + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') - file_url = self._search_regex( - r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1', - webpage, 'file url', group='url') - - base_url = self._search_regex( - r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage, - 'base url', 
default='http://unlimited2-cl.digitalproserver.com/bbtv/', - group='url') - - formats = self._extract_m3u8_formats( - '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - f = { - 'url': '%s%s' % (base_url, file_url), - 'format_id': 'http', - 'protocol': 'http', - 'preference': 1, - } - if formats: - f_copy = formats[-1].copy() - f_copy.update(f) - f = f_copy - formats.append(f) - self._sort_formats(formats) - thumbnail = self._og_search_thumbnail(webpage) uploader = self._html_search_regex( - r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>', + r'<a[^>]+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) return { + '_type': 'url_transparent', + 'url': rudo_url, 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, - 'formats': formats, } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index be2b6ff66..df546da27 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, xpath_element, xpath_text, @@ -54,6 +55,8 @@ from ..utils import ( update_Request, update_url_query, parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -161,6 +164,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -803,15 +807,17 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', html, 'JSON-LD', group='json_ld', **kwargs) if not json_ld: return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) + return self._json_ld( + json_ld, video_id, fatal=kwargs.get('fatal', True), + expected_type=expected_type) - def _json_ld(self, json_ld, video_id, fatal=True): + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: @@ -819,6 +825,8 @@ class InfoExtractor(object): info = {} if json_ld.get('@context') == 'http://schema.org': item_type = json_ld.get('@type') + if expected_type is not None and expected_type != item_type: + return info if item_type == 'TVEpisode': info.update({ 'episode': unescapeHTML(json_ld.get('name')), @@ -837,6 +845,19 @@ class InfoExtractor(object): 'title': unescapeHTML(json_ld.get('headline')), 'description': unescapeHTML(json_ld.get('articleBody')), }) + elif item_type == 'VideoObject': + info.update({ + 'url': json_ld.get('contentUrl'), + 'title': unescapeHTML(json_ld.get('name')), + 'description': unescapeHTML(json_ld.get('description')), + 'thumbnail': json_ld.get('thumbnailUrl'), + 'duration': parse_duration(json_ld.get('duration')), + 'timestamp': unified_timestamp(json_ld.get('uploadDate')), + 'filesize': float_or_none(json_ld.get('contentSize')), + 'tbr': int_or_none(json_ld.get('bitrate')), + 'width': int_or_none(json_ld.get('width')), + 
'height': int_or_none(json_ld.get('height')), + }) return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -1616,6 +1637,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r'<source[^>]+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r'<track[^>]+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 12cc1b5f7..b08df41b4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -256,6 +256,7 @@ from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .fktv import FKTVIE from .flickr import FlickrIE +from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE @@ -679,6 +680,7 @@ from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE +from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rtbf import RTBFIE @@ -689,6 +691,7 @@ from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE from .rtvnh import RTVNHIE +from .rudo import RudoIE from .ruhd import RUHDIE from .ruleporn import RulePornIE from .rutube import ( diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index f5d4f966a..cdb093262 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -219,12 +219,25 @@ class FacebookIE(InfoExtractor): BEFORE = '{swf.addParam(param[0], param[1]);});' AFTER = 
'.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - m = re.search(re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) - if m: - swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') + PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER) + + for m in re.findall(PATTERN, webpage): + swf_params = m.replace('\\\\', '\\').replace('\\"', '"') data = dict(json.loads(swf_params)) params_raw = compat_urllib_parse_unquote(data['params']) - video_data = json.loads(params_raw)['video_data'] + video_data_candidate = json.loads(params_raw)['video_data'] + for _, f in video_data_candidate.items(): + if not f: + continue + if isinstance(f, dict): + f = [f] + if not isinstance(f, list): + continue + if f[0].get('video_id') == video_id: + video_data = video_data_candidate + break + if video_data: + break def video_data_list2dict(video_data): ret = {} diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py new file mode 100644 index 000000000..acb6133ff --- /dev/null +++ b/youtube_dl/extractor/flipagram.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + try_get, + unified_timestamp, +) + + +class FlipagramIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://flipagram.com/f/nyvTSJMKId', + 'md5': '888dcf08b7ea671381f00fab74692755', + 'info_dict': { + 'id': 'nyvTSJMKId', + 'ext': 'mp4', + 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + 'description': 'md5:d55e32edc55261cae96a41fa85ff630e', + 'duration': 35.571, + 'timestamp': 1461244995, + 'upload_date': '20160421', + 'uploader': 'kitty juria', + 'uploader_id': 'sjuria101', + 'creator': 'kitty juria', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'comments': list, + 'formats': 'mincount:2', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json( + self._search_regex( + r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), + video_id) + + flipagram = video_data['flipagram'] + video = flipagram['video'] + + json_ld = self._search_json_ld(webpage, video_id, default=False) + title = json_ld.get('title') or flipagram['captionText'] + description = json_ld.get('description') or flipagram.get('captionText') + + formats = [{ + 'url': video['url'], + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': int_or_none(video_data.get('size')), + }] + + preview_url = try_get( + flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) + if preview_url: + formats.append({ + 'url': preview_url, + 'ext': 'm4a', + 'vcodec': 'none', + }) + + self._sort_formats(formats) + + counts = flipagram.get('counts', {}) + user = flipagram.get('user', {}) + video_data = flipagram.get('video', {}) + + thumbnails = [{ + 'url': self._proto_relative_url(cover['url']), + 'width': int_or_none(cover.get('width')), + 'height': int_or_none(cover.get('height')), + 'filesize': int_or_none(cover.get('size')), + } for cover in flipagram.get('covers', []) if cover.get('url')] + + # Note that this only retrieves comments that are initially loaded. + # For videos with large amounts of comments, most won't be retrieved.
+ comments = [] + for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): + text = comment.get('comment') + if not text or not isinstance(text, list): + continue + comments.append({ + 'author': comment.get('user', {}).get('name'), + 'author_id': comment.get('user', {}).get('username'), + 'id': comment.get('id'), + 'text': text[0], + 'timestamp': unified_timestamp(comment.get('created')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': float_or_none(flipagram.get('duration'), 1000), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(flipagram.get('iso8601Created')), + 'uploader': user.get('name'), + 'uploader_id': user.get('username'), + 'creator': user.get('name'), + 'view_count': int_or_none(counts.get('plays')), + 'like_count': int_or_none(counts.get('likes')), + 'repost_count': int_or_none(counts.get('reflips')), + 'comment_count': int_or_none(counts.get('comments')), + 'comments': comments, + 'formats': formats, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 31527d1c6..cddd1a817 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1313,6 +1313,38 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Non-standard Vimeo embed + 'url': 'https://openclassrooms.com/courses/understanding-the-web', + 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', + 'info_dict': { + 'id': '148867247', + 'ext': 'mp4', + 'title': 'Understanding the web - Teaser', + 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', + 'upload_date': '20151214', + 'uploader': 'OpenClassrooms', + 'uploader_id': 'openclassrooms', + }, + 'add_ie': ['Vimeo'], + }, + # { + # # TODO: find another test + # # http://schema.org/VideoObject + # 'url': 'https://flipagram.com/f/nyvTSJMKId', + # 'md5': '888dcf08b7ea671381f00fab74692755', + # 'info_dict': { + # 'id': 'nyvTSJMKId', + # 'ext': 'mp4', + # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + # 'description': '#love for cats.', + # 'timestamp': 1461244995, + # 'upload_date': '20160421', + # }, + # 'params': { + # 'force_generic_extractor': True, + # }, + # } ] def report_following_redirect(self, new_url): @@ -2157,6 +2189,19 @@ class GenericIE(InfoExtractor): if embed_url: return self.url_result(embed_url) + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default=None, expected_type='VideoObject') + if json_ld and json_ld.get('url'): + info_dict.update({ + 'title': video_title or info_dict['title'], + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit + }) + info_dict.update(json_ld) + return info_dict + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 171b705c7..e9cc9aa59 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,6 +23,7 @@ from ..utils import ( str_or_none, url_basename, urshift, + update_url_query, ) @@ -89,6 +90,10 @@ class LeIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # reversed from http://jstatic.letvcdn.com/sdk/player.js + def get_mms_key(self, time): + return self.ror(time, 8) ^ 185025305 + # see M3U8Encryption class in KLetvPlayer.swf @staticmethod def decrypt_m3u8(encrypted_data): @@ -109,23 +114,7 @@ class 
LeIE(InfoExtractor): return bytes(_loc7_) - def _real_extract(self, url): - media_id = self._match_id(url) - page = self._download_webpage(url, media_id) - params = { - 'id': media_id, - 'platid': 1, - 'splatid': 101, - 'format': 1, - 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.le.com' - } - - play_json = self._download_json( - 'http://api.le.com/mms/out/video/playJson', - media_id, 'Downloading playJson data', query=params, - headers=self.geo_verification_headers()) - + def _check_errors(self, play_json): # Check for errors playstatus = play_json['playstatus'] if playstatus['status'] == 0: @@ -136,43 +125,99 @@ class LeIE(InfoExtractor): msg = 'Generic error. flag = %d' % flag raise ExtractorError(msg, expected=True) - playurl = play_json['playurl'] + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) - formats = ['350', '1000', '1300', '720p', '1080p'] - dispatch = playurl['dispatch'] + play_json_h5 = self._download_json( + 'http://api.le.com/mms/out/video/playJsonH5', + media_id, 'Downloading html5 playJson data', query={ + 'id': media_id, + 'platid': 3, + 'splatid': 304, + 'format': 1, + 'tkey': self.get_mms_key(int(time.time())), + 'domain': 'www.le.com', + 'tss': 'no', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_h5) - urls = [] - for format_id in formats: - if format_id in dispatch: - media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, + play_json_flash = self._download_json( + 'http://api.le.com/mms/out/video/playJson', + media_id, 'Downloading flash playJson data', query={ + 'id': media_id, + 'platid': 1, + 'splatid': 101, + 'format': 1, + 'tkey': self.calc_time_key(int(time.time())), + 'domain': 'www.le.com', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_flash) + + def get_h5_urls(media_url, format_id): + location = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id, query={ 'format': 1, 'expect': 3, - 'rateid': format_id, - }) + 'tss': 'no', + })['location'] - nodes_data = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + return { + 'http': update_url_query(location, {'tss': 'no'}), + 'hls': update_url_query(location, {'tss': 'ios'}), + } - req = self._request_webpage( - nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) + def get_flash_urls(media_url, format_id): + media_url += '&' + compat_urllib_parse_urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, + }) - m3u8_data = self.decrypt_m3u8(req.read()) + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) - url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), - 'ext': determine_ext(dispatch[format_id][1]), - 'format_id': format_id, - 'protocol': 'm3u8', - } + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) - if format_id[-1:] == 'p': - url_info_dict['height'] = int_or_none(format_id[:-1]) + m3u8_data = self.decrypt_m3u8(req.read()) - urls.append(url_info_dict) + return { + 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), + } + + extracted_formats = [] + formats = [] + for play_json, get_urls in ((play_json_h5, get_h5_urls), 
(play_json_flash, get_flash_urls)): + playurl = play_json['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) + self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), @@ -181,7 +226,7 @@ class LeIE(InfoExtractor): return { 'id': media_id, - 'formats': urls, + 'formats': formats, 'title': playurl['title'], 'thumbnail': playurl['pic'], 'description': description, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 1237e1573..a98c4c530 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE): _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - 'md5': '679734f6786145da3546585de9a356be', + # md5 is unstable 'info_dict': { 'id': '114408', 'ext': 'mp4', diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 9fbc74f5d..d970e94ec 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -26,7 +26,8 @@ class MGTVIE(InfoExtractor): video_id = self._match_id(url) api_data = self._download_json( 'http://v.api.mgtv.com/player/video', video_id, - query={'video_id': video_id})['data'] + query={'video_id': video_id}, + headers=self.geo_verification_headers())['data'] info = api_data['info'] formats = [] diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 170ebd9eb..937ba0f28 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import random from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( xpath_text, int_or_none, @@ -18,13 +19,16 @@ class MioMioIE(InfoExtractor): _TESTS = [{ # "type=video" in flashvars 'url': 'http://www.miomio.tv/watch/cc88912/', - 'md5': '317a5f7f6b544ce8419b784ca8edae65', 'info_dict': { 'id': '88912', 'ext': 'flv', 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', 'duration': 5923, }, + 'params': { + # The server provides broken file + 'skip_download': True, + } }, { 'url': 'http://www.miomio.tv/watch/cc184024/', 'info_dict': { @@ -32,7 +36,7 @@ class MioMioIE(InfoExtractor): 'title': '《动漫同人插画绘制》', }, 'playlist_mincount': 86, - 'skip': 'This video takes time too long for retrieving the URL', + 'skip': 'Unable to load videos', }, { 'url': 'http://www.miomio.tv/watch/cc173113/', 'info_dict': { @@ -40,20 +44,23 @@ class MioMioIE(InfoExtractor): 'title': 'The New Macbook 2015 上手试玩与简评' }, 'playlist_mincount': 2, + 'skip': 'Unable to load videos', + }, { + # new 'h5' player + 'url': 'http://www.miomio.tv/watch/cc273295/', + 'md5': '', + 'info_dict': { + 'id': '273295', + 'ext': 'mp4', + 'title': 'アウト×デラックス 20160526', + }, + 'params': { + # intermittent HTTP 500 + 'skip_download': True, + }, }] - def _real_extract(self, url): - video_id = 
self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - mioplayer_path = self._search_regex( - r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') - - http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} - + def _extract_mioplayer(self, webpage, video_id, title, http_headers): xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', webpage, 'xml config') @@ -92,10 +99,34 @@ class MioMioIE(InfoExtractor): 'http_headers': http_headers, }) + return entries + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + 'description', webpage, 'title', fatal=True) + + mioplayer_path = self._search_regex( + r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path') + + if '_h5' in mioplayer_path: + player_url = compat_urlparse.urljoin(url, mioplayer_path) + player_webpage = self._download_webpage( + player_url, video_id, + note='Downloading player webpage', headers={'Referer': url}) + entries = self._parse_html5_media_entries(player_url, player_webpage) + http_headers = {'Referer': player_url} + else: + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} + entries = self._extract_mioplayer(webpage, video_id, title, http_headers) + if len(entries) == 1: segment = entries[0] segment['id'] = video_id segment['title'] = title + segment['http_headers'] = http_headers return segment return { diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index e96013791..4935002d0 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -8,7 +8,7 @@ from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', @@ -52,6 +52,9 @@ class NickIE(MTVServicesInfoExtractor): } }, ], + }, { + 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', + 'only_matching': True, }] def _get_feed_query(self, uri): diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py index 2eb4fd96d..78d219299 100644 --- a/youtube_dl/extractor/playvid.py +++ b/youtube_dl/extractor/playvid.py @@ -15,7 +15,7 @@ from ..utils import ( class PlayvidIE(InfoExtractor): _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' - _TEST = { + _TESTS = [{ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', 'info_dict': { @@ -24,8 +24,19 @@ class PlayvidIE(InfoExtractor): 'title': 'md5:9256d01c6317e3f703848b5906880dc8', 'duration': 82, 'age_limit': 18, - } - } + }, + 'skip': 'Video removed due to ToS', + }, { + 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', + 'md5': '39d49df503ad7b8f23a4432cbf046477', + 'info_dict': { + 'id': 'hwb0GpNkzgH', + 'ext': 'mp4', + 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', + 'age_limit': 18, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index d3bebaea3..f559b899f 
100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -33,6 +33,7 @@ class PolskieRadioIE(InfoExtractor): 'timestamp': 1456594200, 'upload_date': '20160227', 'duration': 2364, + 'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], }, { @@ -68,6 +69,8 @@ class PolskieRadioIE(InfoExtractor): r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', webpage, 'timestamp', fatal=False)) + thumbnail_url = self._og_search_thumbnail(webpage) + entries = [] media_urls = set() @@ -87,6 +90,7 @@ class PolskieRadioIE(InfoExtractor): 'duration': int_or_none(media.get('length')), 'vcodec': 'none' if media.get('provider') == 'audio' else None, 'timestamp': timestamp, + 'thumbnail': thumbnail_url }) title = self._og_search_title(webpage).strip() diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 000000000..f5b2f560c --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + strip_or_none, + unescapeHTML, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)' + _LOGIN_URL = 'https://roosterteeth.com/login' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '26576', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', + 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', + 'thumbnail': 're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... 
The Game Announcement', + 'comment_count': int, + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, + note='Downloading login page', + errnote='Unable to download login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + login_request = self._download_webpage( + self._LOGIN_URL, None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._LOGIN_URL, + }) + + if not any(re.search(p, login_request) for p in ( + r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', + r'>Sign Out<')): + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', + login_request, 'alert', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + episode = strip_or_none(unescapeHTML(self._search_regex( + (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)</title>'), webpage, 'title', + default=None, group='title'))) + + title = strip_or_none(self._og_search_title( + webpage, default=None)) or episode + + m3u8_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not m3u8_url: + if re.search(r'<div[^>]+class=["\']non-sponsor', webpage): + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + + if re.search(r'<div[^>]+class=["\']golive-gate', webpage): + self.raise_login_required('%s is not available yet' % display_id) + + raise ExtractorError('Unable to extract m3u8 URL') + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = strip_or_none(self._og_search_description(webpage)) + thumbnail = self._proto_relative_url(self._og_search_thumbnail(webpage)) + + series = self._search_regex( + (r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'), + webpage, 'series', fatal=False) + + comment_count = int_or_none(self._search_regex( + r'>Comments \((\d+)\)<', webpage, + 'comment count', fatal=False)) + + video_id = self._search_regex( + (r'containerId\s*=\s*["\']episode-(\d+)\1', + r'<div[^<]+id=["\']episode-(\d+)'), webpage, + 'video id', default=display_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'comment_count': comment_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py new file mode 100644 --- /dev/null +++ b/youtube_dl/extractor/rudo.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + js_to_json, + get_element_by_class, + unified_strdate, +) + + +class RudoIE(JWPlatformBaseIE): + _VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)' + + _TEST = { + 'url': 'http://rudo.video/vod/oTzw0MGnyG', + 'md5': '2a03a5b32dd90a04c83b6d391cf7b415', + 'info_dict': { + 'id': 'oTzw0MGnyG', + 'ext': 'mp4', + 'title': 'Comentario Tomás Mosciatti', + 'upload_date': '20160617', + }, + } + + @classmethod + def _extract_url(self, webpage): + mobj = re.search( + '<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, encoding='iso-8859-1') + + jwplayer_data = self._parse_json(self._search_regex( + r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s))) + + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, m3u8_id='hls') + + info_dict.update({ + 'title': self._og_search_title(webpage), + 'upload_date': unified_strdate(get_element_by_class('date', webpage)), + }) + + return info_dict diff --git a/youtube_dl/extractor/srmediathek.py b/youtube_dl/extractor/srmediathek.py index a2569dfba..409d50304 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -11,7 +11,7 @@ from ..utils import ( class SRMediathekIE(ARDMediathekIE): IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' - _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', @@ -35,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE): # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'] + }, { + 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 3c78fb3d5..d49cc6cbc 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -9,8 +9,8 @@ from ..utils import ( class VidziIE(JWPlatformBaseIE): - _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', 'info_dict': { @@ -22,12 +22,16 @@ class VidziIE(JWPlatformBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', + 'skip_download': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://vidzi.tv/%s' % video_id, video_id) title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d9c9852d4..7e854f326 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -364,6 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return mobj.group(1) + # Look more for non-standard embedded Vimeo player + mobj = re.search( + r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) + if mobj: + return mobj.group('url') def _verify_player_video_password(self, url, video_id): password = self._downloader.params.get('videopassword') diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index eaa888f00..b73da5cd0 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -9,7 +9,7 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, - qualities, + remove_end, ) @@ -22,7 +22,7 @@ class VuClipIE(InfoExtractor): 'id': '922692425', 'ext': '3gp', 'title': 'The Toy Soldiers - Hollywood Movie Trailer', - 'duration': 180, + 'duration': 177, } } @@ -46,34 +46,21 @@ class VuClipIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, error_msg), expected=True) # These clowns alternate between two page types - links_code = self._search_regex( - r'''(?xs) - (?: - <img\s+src="/im/play.gif".*?>| - <!--\ player\ end\ -->\s*</div><!--\ thumb\ end--> - ) - (.*?) - (?: - <a\s+href="fblike|<div\s+class="social"> - ) - ''', webpage, 'links') - title = self._html_search_regex( - r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip() + video_url = self._search_regex( + r'<a[^>]+href="([^"]+)"[^>]*><img[^>]+src="[^"]*/play\.gif', + webpage, 'video URL', default=None) + if video_url: + formats = [{ + 'url': video_url, + }] + else: + formats = self._parse_html5_media_entries(url, webpage)[0]['formats'] - quality_order = qualities(['Reg', 'Hi']) - formats = [] - for url, q in re.findall( - r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code): - format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q - formats.append({ - 'format_id': format_id, - 'url': url, - 'quality': quality_order(q), - }) - self._sort_formats(formats) + title = remove_end(self._html_search_regex( + r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip(), ' - Video') - duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)', webpage, 'duration', fatal=False)) + duration = parse_duration(self._html_search_regex( + r'[(>]([0-9]+:[0-9]+)(?:<span', webpage, 'duration', fatal=False)) return { 'id': video_id, 'formats': formats, 'title': title, 'duration': duration, } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -147,7 +147,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): - if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None: + if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: tfa_code = self._get_tfa_info('2-step verification code') if not tfa_code: @@ -165,17 +165,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if tfa_results is False: return False - if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None: + if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None: self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') return False - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: + if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None: self._downloader.report_warning('unable to log in - did the page structure change?') return False if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: self._downloader.report_warning('Your Google account has a security notice.
Please log in on your web browser, resolve the notice, and try again.') return False - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: + if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False return True @@ -1978,10 +1978,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) else super(YoutubeChannelIE, cls).suitable(url)) + def _build_template_url(self, url, channel_id): + return self._TEMPLATE_URL % channel_id + def _real_extract(self, url): channel_id = self._match_id(url) - url = self._TEMPLATE_URL % channel_id + url = self._build_template_url(url, channel_id) # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) # Workaround by extracting as a playlist if managed to obtain channel playlist URL @@ -2038,8 +2041,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ @@ -2049,12 +2052,24 @@ class YoutubeUserIE(YoutubeChannelIE): 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', 'title': 'Uploads from The Linux Foundation', } + }, { + # Only available via https://www.youtube.com/c/12minuteathlete/videos + # but not https://www.youtube.com/user/12minuteathlete/videos + 'url': 'https://www.youtube.com/c/12minuteathlete/videos', + 'playlist_mincount': 249, + 'info_dict': { + 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', + 'title': 'Uploads from 12 Minute Athlete', + } }, { 'url': 'ytuser:phihag', 'only_matching': True, }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/gametrailers', + 'only_matching': True, }, { # This channel is not available.
'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', @@ -2071,6 +2086,10 @@ class YoutubeUserIE(YoutubeChannelIE): else: return super(YoutubeUserIE, cls).suitable(url) + def _build_template_url(self, url, channel_id): + mobj = re.match(self._VALID_URL, url) + return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + class YoutubeLiveIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com live streams' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5302b67cc..c4a85b2c0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -26,7 +26,11 @@ def parseOpts(overrideArguments=None): except IOError: return default # silently skip if file is not present try: - res = compat_shlex_split(optionf.read(), comments=True) + # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read() + if sys.version_info < (3,): + contents = contents.decode(preferredencoding()) + res = compat_shlex_split(contents, comments=True) finally: optionf.close() return res diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index fa99b0c2a..c1e9eb159 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -363,8 +363,10 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): input_files = [filename] + sub_filenames opts = [ - '-map', '0', - '-c', 'copy', + '-map', '0:v', + '-c:v', 'copy', + '-map', '0:a', + '-c:a', 'copy', # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 326aa810d..0fd1e71b4 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2126,6 +2126,42 @@ def mimetype2ext(mt): }.get(res, res) +def parse_codecs(codecs_str): + # http://tools.ietf.org/html/rfc6381 + if not codecs_str: + return {} + splited_codecs = list(filter(None, map( + lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) + vcodec, acodec = None, None + for full_codec in splited_codecs: + codec = full_codec.split('.')[0] + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): + if not vcodec: + vcodec = full_codec + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + if not acodec: + acodec = full_codec + else: + write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) + if not vcodec and not acodec: + if len(splited_codecs) == 2: + return { + 'vcodec': vcodec, + 'acodec': acodec, + } + elif len(splited_codecs) == 1: + return { + 'vcodec': 'none', + 'acodec': vcodec, + } + else: + return { + 'vcodec': vcodec or 'none', + 'acodec': acodec or 'none', + } + return {} + + def urlhandle_detect_ext(url_handle): getheader = url_handle.headers.get diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6396ad4c9..d60480223 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.07.07' +__version__ = '2016.07.11'