diff --git a/AUTHORS b/AUTHORS index 35e256d49..b8bf3cb6f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -103,3 +103,4 @@ Christopher Krooss Ondřej Caletka Dinesh S Johan K. Jensen +Yen Chi Hsuan diff --git a/README.md b/README.md index 078e9df82..1408ebba0 100644 --- a/README.md +++ b/README.md @@ -267,10 +267,22 @@ which means you can modify it, redistribute it or use it however you like. by extension for the extensions aac, m4a, mp3, mp4, ogg, wav, webm. You can also use the special names "best", "bestvideo", - "bestaudio", "worst". By default, youtube- - dl will pick the best quality. Use commas - to download multiple audio formats, such as - -f + "bestaudio", "worst". You can filter the + video results by putting a condition in + brackets, as in -f "best[height=720]" (or + -f "[filesize>10M]"). This works for + filesize, height, width, tbr, abr, and vbr + and the comparisons <, <=, >, >=, =, != . + Formats for which the value is not known + are excluded unless you put a question mark + (?) after the operator. You can combine + format filters, so -f "[height <=? + 720][tbr>500]" selects up to 720p videos + (or videos where the height is not known) + with a bitrate of at least 500 KBit/s. By + default, youtube-dl will pick the best + quality. Use commas to download multiple + audio formats, such as -f 136/137/mp4/bestvideo,140/m4a/bestaudio. You can merge the video and audio of two formats into a single file using -f 450]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'B') + + ydl = YDL({'format': 'best [filesize = 1000] [width!=450]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'C') + + ydl = YDL({'format': '[filesize>?1]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'G') + + ydl = YDL({'format': '[filesize<1M]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'E') + + ydl = YDL({'format': '[filesize<1MiB]'}) + ydl.process_ie_result(info_dict) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'G') + def test_add_extra_info(self): test_dict = { 'extractor': 'Foo', diff --git a/test/test_all_urls.py b/test/test_all_urls.py index bd4fe17bf..e66264b4b 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -14,7 +14,6 @@ from test.helper import gettestcases from youtube_dl.extractor import ( FacebookIE, gen_extractors, - TwitchIE, YoutubeIE, ) @@ -72,18 +71,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) - def test_twitch_channelid_matching(self): - self.assertTrue(TwitchIE.suitable('twitch.tv/vanillatv')) - self.assertTrue(TwitchIE.suitable('www.twitch.tv/vanillatv')) - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv')) - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/')) - - def test_twitch_videoid_matching(self): - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/vanillatv/b/328087483')) - - def test_twitch_chapterid_matching(self): - self.assertTrue(TwitchIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361')) - def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') @@ -115,8 +102,6 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch(':ythistory', ['youtube:history']) self.assertMatch(':thedailyshow', ['ComedyCentralShows']) self.assertMatch(':tds', ['ComedyCentralShows']) - self.assertMatch(':colbertreport', ['ComedyCentralShows']) - self.assertMatch(':cr', ['ComedyCentralShows']) def test_vimeo_matching(self): self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) diff --git a/test/test_utils.py b/test/test_utils.py index 206760d99..bdd7f268a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -28,6 +28,7 @@ from youtube_dl.utils import ( fix_xml_ampersands, InAdvancePagedList, intlist_to_bytes, + is_html, js_to_json, limit_length, OnDemandPagedList, @@ -417,5 +418,21 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') self.assertTrue(age_restricted(18, 14)) self.assertFalse(age_restricted(18, 18)) + def test_is_html(self): + self.assertFalse(is_html(b'\x49\x44\x43\xaaa')) + self.assertTrue(is_html( # UTF-8 with BOM + b'\xef\xbb\xbf\xaaa')) + self.assertTrue(is_html( # UTF-16-LE + b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00\xe4\x00' + )) + self.assertTrue(is_html( # UTF-16-BE + b'\xfe\xff\x00<\x00h\x00t\x00m\x00l\x00>\x00\xe4' + )) + self.assertTrue(is_html( # UTF-32-BE + b'\x00\x00\xFE\xFF\x00\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4')) + self.assertTrue(is_html( # UTF-32-LE + b'\xFF\xFE\x00\x00<\x00\x00\x00h\x00\x00\x00t\x00\x00\x00m\x00\x00\x00l\x00\x00\x00>\x00\x00\x00\xe4\x00\x00\x00')) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 772fddd45..8ef74e414 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -10,6 +10,7 @@ import io import itertools import json import locale +import operator import os import platform import re @@ -49,6 +50,7 @@ from .utils import ( make_HTTPS_handler, MaxDownloadsReached, PagedList, + parse_filesize, PostProcessingError, platform_name, preferredencoding, @@ -768,7 +770,59 @@ class YoutubeDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def _apply_format_filter(self, format_spec, available_formats): + " Returns a tuple of the remaining format_spec and filtered formats " + + OPERATORS = { + '<': operator.lt, + '<=': operator.le, + '>': operator.gt, + '>=': operator.ge, + '=': operator.eq, + '!=': operator.ne, + } + operator_rex = re.compile(r'''(?x)\s*\[ + (?Pwidth|height|tbr|abr|vbr|filesize) + \s*(?P%s)(?P\s*\?)?\s* + (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) + \]$ + ''' % '|'.join(map(re.escape, OPERATORS.keys()))) + m = operator_rex.search(format_spec) + if not m: + raise ValueError('Invalid format specification %r' % format_spec) + + try: + comparison_value = int(m.group('value')) + except ValueError: + comparison_value = parse_filesize(m.group('value')) + if comparison_value is None: + comparison_value = parse_filesize(m.group('value') + 'B') + if comparison_value is None: + raise ValueError( + 'Invalid value %r in format specification %r' % ( + m.group('value'), format_spec)) + op = OPERATORS[m.group('op')] + + def _filter(f): + actual_value = f.get(m.group('key')) + if actual_value is None: + return m.group('none_inclusive') + return op(actual_value, comparison_value) + new_formats = [f for f in available_formats if _filter(f)] + + new_format_spec = format_spec[:-len(m.group(0))] + if not new_format_spec: + new_format_spec = 'best' + + return (new_format_spec, new_formats) + def select_format(self, format_spec, available_formats): + while format_spec.endswith(']'): + format_spec, available_formats = self._apply_format_filter( + format_spec, available_formats) + if not available_formats: + return None + if format_spec == 'best' or format_spec is None: return available_formats[-1] elif format_spec == 'worst': diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6a97d4e7f..60e19bd0a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .abc import ABCIE +from .abc7news import Abc7NewsIE from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .adobetv import AdobeTVIE @@ -175,6 +176,7 @@ from .goshgay import GoshgayIE from .grooveshark import GroovesharkIE from .groupon import GrouponIE from .hark import HarkIE +from .hearthisat import HearThisAtIE from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE @@ -409,6 +411,7 @@ from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streetvoice import StreetVoiceIE from .sunporno import SunPornoIE from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE @@ -430,6 +433,7 @@ from .telemb import TeleMBIE from .teletask import TeleTaskIE from .tenplay import TenPlayIE from .testurl import TestURLIE +from .testtube import TestTubeIE from .tf1 import TF1IE from .theonion import TheOnionIE from .theplatform import ThePlatformIE @@ -458,7 +462,14 @@ from .tvigle import TvigleIE from .tvp import TvpIE, TvpSeriesIE from .tvplay import TVPlayIE from .twentyfourvideo import TwentyFourVideoIE -from .twitch import TwitchIE +from .twitch import ( + TwitchVideoIE, + TwitchChapterIE, + TwitchVodIE, + TwitchProfileIE, + TwitchPastBroadcastsIE, + TwitchStreamIE, +) from .ubu import UbuIE from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py new file mode 100644 index 000000000..c04949c21 --- /dev/null +++ b/youtube_dl/extractor/abc7news.py @@ -0,0 +1,68 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_iso8601 + + +class Abc7NewsIE(InfoExtractor): + _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' + _TESTS = [ + { + 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', + 'info_dict': { + 'id': '472581', + 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', + 'ext': 'mp4', + 'title': 'East Bay museum celebrates history of synthesized music', + 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1421123075, + 'upload_date': '20150113', + 'uploader': 'Jonathan Bloom', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://abc7news.com/472581', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + m3u8 = self._html_search_meta( + 'contentURL', webpage, 'm3u8 url', fatal=True) + + formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + self._sort_formats(formats) + + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + thumbnail = self._og_search_thumbnail(webpage) + timestamp = parse_iso8601(self._search_regex( + r'
\s*