diff --git a/README.md b/README.md index ac1c3adad..5bb6c6e4e 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ which means you can modify it, redistribute it or use it however you like. configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows) + --encoding ENCODING Force the specified encoding (experimental) ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 5b6d18a82..ed041ffda 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -144,7 +144,15 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) def test_ComedyCentralShows(self): - self.assertMatch('http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', + ['ComedyCentralShows']) if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 8a2af7808..7017e58ea 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -8,6 +8,7 @@ import datetime import errno import io import json +import locale import os import platform import re @@ -160,6 +161,7 @@ class YoutubeDL(object): include_ads: Download ads as well default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing + encoding: Use this encoding instead of the system-specified. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -1219,6 +1221,9 @@ class YoutubeDL(object): def print_debug_header(self): if not self.params.get('verbose'): return + + write_string('[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % + (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, self.get_encoding())) write_string('[debug] youtube-dl version ' + __version__ + '\n') try: sp = subprocess.Popen( @@ -1283,3 +1288,19 @@ class YoutubeDL(object): # (See https://github.com/rg3/youtube-dl/issues/1309 for details) opener.addheaders = [] self._opener = opener + + def encode(self, s): + if isinstance(s, bytes): + return s # Already encoded + + try: + return s.encode(self.get_encoding()) + except UnicodeEncodeError as err: + err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' + raise + + def get_encoding(self): + encoding = self.params.get('encoding') + if encoding is None: + encoding = preferredencoding() + return encoding diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index c74f1eeeb..5a63cbf82 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -51,6 +51,7 @@ __authors__ = ( 'David Wagner', 'Juan C. Olivares', 'Mattias Harrysson', + 'phaer', ) __license__ = 'Public Domain' @@ -256,13 +257,17 @@ def parseOpts(overrideArguments=None): general.add_option( '--bidi-workaround', dest='bidi_workaround', action='store_true', help=u'Work around terminals that lack bidirectional text support. 
Requires bidiv or fribidi executable in PATH') - general.add_option('--default-search', - dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') + general.add_option( + '--default-search', + dest='default_search', metavar='PREFIX', + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') general.add_option( '--ignore-config', action='store_true', help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') + general.add_option( + '--encoding', dest='encoding', metavar='ENCODING', + help='Force the specified encoding (experimental)') selection.add_option( '--playlist-start', @@ -542,8 +547,6 @@ def parseOpts(overrideArguments=None): write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') - write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' % - (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding())) return parser, opts, args @@ -677,7 +680,7 @@ def _real_main(argv=None): date = DateRange.day(opts.date) else: date = DateRange(opts.dateafter, opts.datebefore) - if opts.default_search not in ('auto', None) and ':' not in opts.default_search: + if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search: parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') # Do not download videos when there are audio-only formats @@ -789,6 +792,7 @@ def _real_main(argv=None): 'include_ads': opts.include_ads, 'default_search': opts.default_search, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'encoding': opts.encoding, } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 8e81fa619..e2e66c526 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -156,6 +156,7 @@ from .mtv import ( MTVIE, MTVIggyIE, ) +from .musicplayon import MusicPlayOnIE from .muzu import MuzuTVIE from .myspace import MySpaceIE from .myspass import MySpassIE @@ -285,7 +286,10 @@ from .vk import VKIE from .vube import VubeIE from .washingtonpost import WashingtonPostIE from .wat import WatIE -from .wdr import WDRIE +from .wdr import ( + WDRIE, + WDRMausIE, +) from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index fc5d6825e..dc8657b67 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -6,7 +6,6 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, - determine_ext, ) diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 2415ce403..25fb79e14 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -1,22 +1,21 @@ +from __future__ import unicode_literals + import 
re from .common import InfoExtractor -from .ooyala import OoyalaIE class BloombergIE(InfoExtractor): _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P.+?)\.html' _TEST = { - u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', - u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', - u'info_dict': { - u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', - u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', - }, - u'params': { - # Requires ffmpeg (m3u8 manifest) - u'skip_download': True, + 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + 'md5': '7bf08858ff7c203c870e8a6190e221e5', + 'info_dict': { + 'id': 'qurhIVlJSB6hzkVi229d8g', + 'ext': 'flv', + 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', + 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88', }, } @@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - embed_code = self._search_regex( - r'thedailyshow|thecolbertreport)\.(?:cc\.)?com/ (full-episodes/(?P.*)| (?P - (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*)))| + (?:videos/[^/]+/(?P[^/?#]+)) + |(the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) + |(watch/(?P[^/]*)/(?P.*)) + )| (?P extended-interviews/(?P[0-9a-z]+)/(?:playlist_tds_extended_)?(?P.*?)(/.*?)?))) - $''' + (?:[?#].*|$)''' _TEST = { 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', @@ -57,7 +59,7 @@ class ComedyCentralShowsIE(InfoExtractor): 'upload_date': '20121213', 'description': 'Kristen Stewart learns to let loose in "On the Road."', 'uploader': 'thedailyshow', - 'title': 'thedailyshow-kristen-stewart part 1', + 'title': 'thedailyshow kristen-stewart part 1', } } @@ -102,7 +104,9 @@ class ComedyCentralShowsIE(InfoExtractor): assert mobj is not None if mobj.group('clip'): - if mobj.group('showname') == 'thedailyshow': + if mobj.group('videotitle'): + epTitle = mobj.group('videotitle') + elif mobj.group('showname') == 'thedailyshow': epTitle = mobj.group('tdstitle') else: epTitle = mobj.group('cntitle') @@ -161,7 +165,7 @@ class ComedyCentralShowsIE(InfoExtractor): content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') duration = float_or_none(content.attrib.get('duration')) mediagen_url = content.attrib['url'] - guid = itemEl.find('.//guid').text.rpartition(':')[-1] + guid = itemEl.find('./guid').text.rpartition(':')[-1] cdoc = self._download_xml( mediagen_url, epTitle, diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index 885944c5e..2ae6ecc12 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -10,9 +10,10 @@ class DiscoveryIE(InfoExtractor): _VALID_URL = r'http://dsc\.discovery\.com\/[a-zA-Z0-9\-]*/[a-zA-Z0-9\-]*/videos/(?P[a-zA-Z0-9\-]*)(.htm)?' 
_TEST = { 'url': 'http://dsc.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm', - 'file': '614784.mp4', 'md5': 'e12614f9ee303a6ccef415cb0793eba2', 'info_dict': { + 'id': '614784', + 'ext': 'mp4', 'title': 'MythBusters: Mission Impossible Outtakes', 'description': ('Watch Jamie Hyneman and Adam Savage practice being' ' each other -- to the point of confusing Jamie\'s dog -- and ' @@ -34,7 +35,7 @@ class DiscoveryIE(InfoExtractor): formats = [] for f in info['mp4']: formats.append( - {'url': f['src'], r'ext': r'mp4', 'tbr': int(f['bitrate'][:-1])}) + {'url': f['src'], 'ext': 'mp4', 'tbr': int(f['bitrate'][:-1])}) return { 'id': info['contentId'], diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index fc1bedd57..3ad0e13f9 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -82,6 +82,17 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Brightcove'], }, + { + 'url': 'http://www.championat.com/video/football/v/87/87499.html', + 'md5': 'fb973ecf6e4a78a67453647444222983', + 'info_dict': { + 'id': '3414141473001', + 'ext': 'mp4', + 'title': 'Видео. Удаление Дзагоева (ЦСКА)', + 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', + 'uploader': 'Championat', + }, + }, # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', @@ -316,13 +327,16 @@ class GenericIE(InfoExtractor): if not parsed_url.scheme: default_search = self._downloader.params.get('default_search') if default_search is None: - default_search = 'auto' + default_search = 'auto_warning' - if default_search == 'auto': + if default_search in ('auto', 'auto_warning'): if '/' in url: self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) else: + if default_search == 'auto_warning': + self._downloader.report_warning( + 'Falling back to youtube search for %s . Set --default-search to "auto" to suppress this warning.' % url) return self.url_result('ytsearch:' + url) else: assert ':' in default_search diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py index 0d1ea6802..94e7cf790 100644 --- a/youtube_dl/extractor/huffpost.py +++ b/youtube_dl/extractor/huffpost.py @@ -21,9 +21,10 @@ class HuffPostIE(InfoExtractor): _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', - 'file': '52dd3e4b02a7602131000677.mp4', 'md5': '55f5e8981c1c80a64706a44b74833de8', 'info_dict': { + 'id': '52dd3e4b02a7602131000677', + 'ext': 'mp4', 'title': 'Legalese It! with @MikeSacksHP', 'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. 
', 'duration': 1549, diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py index 381af91e4..cfeaa4146 100644 --- a/youtube_dl/extractor/ign.py +++ b/youtube_dl/extractor/ign.py @@ -1,10 +1,8 @@ +from __future__ import unicode_literals + import re -import json from .common import InfoExtractor -from ..utils import ( - determine_ext, -) class IGNIE(InfoExtractor): @@ -14,52 +12,57 @@ class IGNIE(InfoExtractor): """ _VALID_URL = r'https?://.+?\.ign\.com/(?Pvideos|show_videos|articles|(?:[^/]*/feature))(/.+)?/(?P.+)' - IE_NAME = u'ign.com' + IE_NAME = 'ign.com' _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' - _DESCRIPTION_RE = [r'(.+?)', - r'id="my_show_video">.*?

(.*?)',
-                       ]
+    _DESCRIPTION_RE = [
+        r'(.+?)',
+        r'id="my_show_video">.*?(.*?)
', + ] _TESTS = [ { - u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - u'file': u'8f862beef863986b2785559b9e1aa599.mp4', - u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', - u'info_dict': { - u'title': u'The Last of Us Review', - u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', + 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + 'md5': 'eac8bdc1890980122c3b66f14bdd02e9', + 'info_dict': { + 'id': '8f862beef863986b2785559b9e1aa599', + 'ext': 'mp4', + 'title': 'The Last of Us Review', + 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', } }, { - u'url': u'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - u'playlist': [ + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', + 'playlist': [ { - u'file': u'5ebbd138523268b93c9141af17bec937.mp4', - u'info_dict': { - u'title': u'GTA 5 Video Review', - u'description': u'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', }, }, { - u'file': u'638672ee848ae4ff108df2a296418ee2.mp4', - u'info_dict': { - u'title': u'26 Twisted Moments from GTA 5 in Slow Motion', - u'description': u'The twisted beauty of GTA 5 in stunning slow motion.', + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', }, }, ], - u'params': { - u'skip_download': True, + 'params': { + 'skip_download': True, }, }, ] def _find_video_id(self, webpage): - res_id = [r'data-video-id="(.+?)"', - r'video)/id/(?P.+)' IE_NAME = '1up.com' _DESCRIPTION_RE = r'
(.+?)
' _TEST = { - u'url': u'http://gamevideos.1up.com/video/id/34976', - u'file': u'34976.mp4', - u'md5': u'68a54ce4ebc772e4b71e3123d413163d', - u'info_dict': { - u'title': u'Sniper Elite V2 - Trailer', - u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf', + 'url': 'http://gamevideos.1up.com/video/id/34976', + 'md5': '68a54ce4ebc772e4b71e3123d413163d', + 'info_dict': { + 'id': '34976', + 'ext': 'mp4', + 'title': 'Sniper Elite V2 - Trailer', + 'description': 'md5:5d289b722f5a6d940ca3136e9dae89cf', } } @@ -123,7 +122,6 @@ class OneUPIE(IGNIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - id = mobj.group('name_or_id') result = super(OneUPIE, self)._real_extract(url) - result['id'] = id + result['id'] = mobj.group('name_or_id') return result diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index 50bc883ef..961dd1aa6 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -1,37 +1,39 @@ +# encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor class KickStarterIE(InfoExtractor): - _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P\d*)/.*' + _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P[^/]*)/.*' _TEST = { - u"url": u"https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location", - u"file": u"1404461844.mp4", - u"md5": u"c81addca81327ffa66c642b5d8b08cab", - u"info_dict": { - u"title": u"Intersection: The Story of Josh Grant by Kyle Cowling", + 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant?ref=home_location', + 'md5': 'c81addca81327ffa66c642b5d8b08cab', + 'info_dict': { + 'id': '1404461844', + 'ext': 'mp4', + 'title': 'Intersection: The Story of Josh Grant by Kyle Cowling', + 'description': 'A unique motocross documentary that examines the ' + 'life and mind of one of sports most elite athletes: Josh Grant.', }, } def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('id') - webpage_src = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'data-video="(.*?)">', - webpage_src, u'video URL') - if 'mp4' in video_url: - ext = 'mp4' - else: - ext = 'flv' - video_title = self._html_search_regex(r"(.*?)", - webpage_src, u'title').rpartition(u'\u2014 Kickstarter')[0].strip() + video_url = self._search_regex(r'data-video-url="(.*?)"', + webpage, 'video URL') + video_title = self._html_search_regex(r'(.*?)', + webpage, 'title').rpartition('— Kickstarter')[0].strip() - results = [{ - 'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': ext, - }] - return results + return { + 'id': video_id, + 'url': video_url, + 'title': video_title, + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 465ac4916..07f072924 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -13,8 +13,9 @@ class MetacriticIE(InfoExtractor): _TEST = { 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', - 'file': '3698222.mp4', 'info_dict': { + 'id': '3698222', + 'ext': 'mp4', 'title': 'inFamous: Second Son - inSide Sucker Punch: Smoke & Mirrors', 'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of 
inFAMOUS Second Son on the PS4.', 'duration': 221, diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py new file mode 100644 index 000000000..42d7a82a5 --- /dev/null +++ b/youtube_dl/extractor/musicplayon.py @@ -0,0 +1,75 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import int_or_none + + +class MusicPlayOnIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=100&play)=(?P\d+)' + + _TEST = { + 'url': 'http://en.musicplayon.com/play?v=433377', + 'info_dict': { + 'id': '433377', + 'ext': 'mp4', + 'title': 'Rick Ross - Interview On Chelsea Lately (2014)', + 'description': 'Rick Ross Interview On Chelsea Lately', + 'duration': 342, + 'uploader': 'ultrafish', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + page = self._download_webpage(url, video_id) + + title = self._og_search_title(page) + description = self._og_search_description(page) + thumbnail = self._og_search_thumbnail(page) + duration = self._html_search_meta('video:duration', page, 'duration', fatal=False) + view_count = self._og_search_property('count', page, fatal=False) + uploader = self._html_search_regex( + r'', page, 'uploader', fatal=False) + + formats = [ + { + 'url': 'http://media0-eu-nl.musicplayon.com/stream-mobile?id=%s&type=.mp4' % video_id, + 'ext': 'mp4', + } + ] + + manifest = self._download_webpage( + 'http://en.musicplayon.com/manifest.m3u8?v=%s' % video_id, video_id, 'Downloading manifest') + + for entry in manifest.split('#')[1:]: + if entry.startswith('EXT-X-STREAM-INF:'): + meta, url, _ = entry.split('\n') + params = dict(param.split('=') for param in meta.split(',')[1:]) + formats.append({ + 'url': url, + 'ext': 'mp4', + 'tbr': int(params['BANDWIDTH']), + 'width': int(params['RESOLUTION'].split('x')[1]), + 'height': int(params['RESOLUTION'].split('x')[-1]), + 'format_note': params['NAME'].replace('"', '').strip(), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'duration': int_or_none(duration), + 'view_count': int_or_none(view_count), + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 7e421610e..633b42f72 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -6,12 +6,13 @@ from .common import InfoExtractor class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' + _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?:nba/)?video(?P/[^?]*?)(?:/index\.html)?(?:\?.*)?$' _TEST = { 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'file': u'0021200253-okc-bkn-recap.nba.mp4', 'md5': u'c0edcfc37607344e2ff8f13c378c88a4', 'info_dict': { + 'id': '0021200253-okc-bkn-recap.nba', + 'ext': 'mp4', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'title': 'Thunder vs. 
Nets', }, @@ -19,7 +20,7 @@ class NBAIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) @@ -33,7 +34,6 @@ class NBAIE(InfoExtractor): return { 'id': shortened_video_id, 'url': video_url, - 'ext': 'mp4', 'title': title, 'description': description, } diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py index e998d156e..8447a9b86 100644 --- a/youtube_dl/extractor/ntv.py +++ b/youtube_dl/extractor/ntv.py @@ -5,7 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( - RegexNotFoundError, + ExtractorError, unescapeHTML ) @@ -98,16 +98,15 @@ class NTVIE(InfoExtractor): page = self._download_webpage(url, video_id, 'Downloading page') - def extract(patterns, name, page, fatal=False): - for pattern in patterns: - mobj = re.search(pattern, page) - if mobj: - return mobj.group(1) - if fatal: - raise RegexNotFoundError(u'Unable to extract %s' % name) - return None + for pattern in self._VIDEO_ID_REGEXES: + mobj = re.search(pattern, page) + if mobj: + break - video_id = extract(self._VIDEO_ID_REGEXES, 'video id', page, fatal=True) + if not mobj: + raise ExtractorError('No media links available for %s' % video_id) + + video_id = mobj.group(1) player = self._download_xml('http://www.ntv.ru/vi%s/' % video_id, video_id, 'Downloading video XML') title = unescapeHTML(player.find('./data/title').text) diff --git a/youtube_dl/extractor/pyvideo.py b/youtube_dl/extractor/pyvideo.py index 33054591b..d2d909136 100644 --- a/youtube_dl/extractor/pyvideo.py +++ b/youtube_dl/extractor/pyvideo.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import os @@ -5,45 +7,50 @@ from .common import InfoExtractor class PyvideoIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?pyvideo\.org/video/(?P\d+)/(.*)' - _TESTS = [{ - u'url': u'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', - u'file': u'24_4WWkSmNo.mp4', - u'md5': u'de317418c8bc76b1fd8633e4f32acbc6', - u'info_dict': { - u"title": u"Become a logging expert in 30 minutes", - u"description": u"md5:9665350d466c67fb5b1598de379021f7", - u"upload_date": u"20130320", - u"uploader": u"NextDayVideo", - u"uploader_id": u"NextDayVideo", + _VALID_URL = r'http://(?:www\.)?pyvideo\.org/video/(?P\d+)/(.*)' + + _TESTS = [ + { + 'url': 'http://pyvideo.org/video/1737/become-a-logging-expert-in-30-minutes', + 'md5': 'de317418c8bc76b1fd8633e4f32acbc6', + 'info_dict': { + 'id': '24_4WWkSmNo', + 'ext': 'mp4', + 'title': 'Become a logging expert in 30 minutes', + 'description': 'md5:9665350d466c67fb5b1598de379021f7', + 'upload_date': '20130320', + 'uploader': 'NextDayVideo', + 'uploader_id': 'NextDayVideo', + }, + 'add_ie': ['Youtube'], }, - u'add_ie': ['Youtube'], - }, - { - u'url': u'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', - u'md5': u'5fe1c7e0a8aa5570330784c847ff6d12', - u'info_dict': { - u'id': u'2542', - u'ext': u'm4v', - u'title': u'Gloriajw-SpotifyWithErikBernhardsson182', + { + 'url': 'http://pyvideo.org/video/2542/gloriajw-spotifywitherikbernhardsson182m4v', + 'md5': '5fe1c7e0a8aa5570330784c847ff6d12', + 'info_dict': { + 'id': '2542', + 'ext': 'm4v', + 'title': 'Gloriajw-SpotifyWithErikBernhardsson182', + }, }, - }, ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', 
webpage) + webpage = self._download_webpage(url, video_id) + + m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', webpage) if m_youtube is not None: return self.url_result(m_youtube.group(1), 'Youtube') - title = self._html_search_regex(r'
.*?

([^>]+?)

', - webpage, u'title', flags=re.DOTALL) - video_url = self._search_regex([r'Download.*?.*?

([^>]+?)

', webpage, 'title', flags=re.DOTALL) + video_url = self._search_regex( + [r'Download.*?
.+?)(\.|\?|$)' _TEST = { - u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', - u'file': u'passionpittakeawalklive.flv', - u'md5': u'd9dea8360a1e7d485d2206db7fe13035', - u'info_dict': { - u'title': u'Take A Walk (live)', - u'uploader': u'Passion Pit', - u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', + 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', + 'info_dict': { + 'id': 'passionpittakeawalklive', + 'ext': 'flv', + 'title': 'Take A Walk (live)', + 'uploader': 'Passion Pit', + 'uploader_id': 'passionpit', + 'upload_date': '20120928', + 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', }, - u'skip': u'Requires rtmpdump', + 'params': { + # rtmp download + 'skip_download': True, + } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) filename = mobj.group('filename') info_url = 'http://www.roxwel.com/api/videos/%s' % filename - info_page = self._download_webpage(info_url, filename, - u'Downloading video info') + info = self._download_json(info_url, filename) - self.report_extraction(filename) - info = json.loads(info_page) rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) best_rate = rtmp_rates[-1] url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) - rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') + rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') ext = determine_ext(rtmp_url) if ext == 'f4v': rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) - return {'id': filename, - 'title': info['title'], - 'url': rtmp_url, - 'ext': 'flv', - 'description': info['description'], - 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), - 'uploader': info['artist'], - 'uploader_id': info['artistname'], - 'upload_date': unified_strdate(info['dbdate']), - } + return { + 'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 4922dd764..f1ce66433 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json import itertools from .common import InfoExtractor @@ -20,8 +19,9 @@ class RutubeIE(InfoExtractor): _TEST = { 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', - 'file': '3eac3b4561676c17df9132a9a1e62e3e.mp4', 'info_dict': { + 'id': '3eac3b4561676c17df9132a9a1e62e3e', + 'ext': 'mp4', 'title': 'Раненный кенгуру забежал в аптеку', 'description': 'http://www.ntdtv.ru ', 'duration': 80, @@ -38,15 +38,15 @@ class RutubeIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - - api_response = self._download_webpage('http://rutube.ru/api/video/%s/?format=json' % video_id, - video_id, 'Downloading video JSON') - video = json.loads(api_response) - - api_response = self._download_webpage('http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, - video_id, 'Downloading trackinfo JSON') - trackinfo = json.loads(api_response) - + + video = self._download_json( + 
'http://rutube.ru/api/video/%s/?format=json' % video_id, + video_id, 'Downloading video JSON') + + trackinfo = self._download_json( + 'http://rutube.ru/api/play/trackinfo/%s/?format=json' % video_id, + video_id, 'Downloading trackinfo JSON') + # Some videos don't have the author field author = trackinfo.get('author') or {} m3u8_url = trackinfo['video_balancer'].get('m3u8') @@ -79,10 +79,9 @@ class RutubeChannelIE(InfoExtractor): def _extract_videos(self, channel_id, channel_title=None): entries = [] for pagenum in itertools.count(1): - api_response = self._download_webpage( + page = self._download_json( self._PAGE_TEMPLATE % (channel_id, pagenum), channel_id, 'Downloading page %s' % pagenum) - page = json.loads(api_response) results = page['results'] if not results: break @@ -108,10 +107,9 @@ class RutubeMovieIE(RutubeChannelIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie_id = mobj.group('id') - api_response = self._download_webpage( + movie = self._download_json( self._MOVIE_TEMPLATE % movie_id, movie_id, 'Downloading movie JSON') - movie = json.loads(api_response) movie_name = movie['name'] return self._extract_videos(movie_id, movie_name) diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index 2c5c88be8..fdae17b1b 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -1,33 +1,37 @@ # coding: utf-8 +from __future__ import unicode_literals -import json import re from .common import InfoExtractor + class TF1IE(InfoExtractor): """TF1 uses the wat.tv player.""" - _VALID_URL = r'http://videos\.tf1\.fr/.*-(.*?)\.html' + _VALID_URL = r'http://videos\.tf1\.fr/.*-(?P.*?)\.html' _TEST = { - u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - u'file': u'10635995.mp4', - u'md5': u'2e378cc28b9957607d5e88f274e637d8', - u'info_dict': { - u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle', - u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', + 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', + 'info_dict': { + 'id': '10635995', + 'ext': 'mp4', + 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', + 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', + }, + 'params': { + # Sometimes wat serves the whole file with the --test option + 'skip_download': True, }, - u'skip': u'Sometimes wat serves the whole file with the --test option', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - id = mobj.group(1) - webpage = self._download_webpage(url, id) - embed_url = self._html_search_regex(r'"(https://www.wat.tv/embedframe/.*?)"', - webpage, 'embed url') - embed_page = self._download_webpage(embed_url, id, u'Downloading embed player page') + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + embed_url = self._html_search_regex( + r'"(https://www.wat.tv/embedframe/.*?)"', webpage, 'embed url') + embed_page = self._download_webpage(embed_url, video_id, + 'Downloading embed player page') wat_id = self._search_regex(r'UVID=(.*?)&', embed_page, 'wat id') - wat_info = self._download_webpage('http://www.wat.tv/interface/contentv3/%s' % wat_id, id, u'Downloading Wat info') - wat_info = json.loads(wat_info)['media'] - wat_url = wat_info['url'] - return self.url_result(wat_url, 'Wat') + wat_info = self._download_json( + 
'http://www.wat.tv/interface/contentv3/%s' % wat_id, video_id) + return self.url_result(wat_info['media']['url'], 'Wat') diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index 3b3bec92f..8b1432fec 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -16,7 +16,7 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' - _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P\d+).*?\bid=(?P\d+)|(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$))' + _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P-?\d+).*?\bid=(?P\d+)|(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$))' _NETRC_MACHINE = 'vk' _TESTS = [ diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 4fab6c6e8..a584e0896 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -1,37 +1,37 @@ # coding: utf-8 +from __future__ import unicode_literals -import json import re from .common import InfoExtractor - from ..utils import ( unified_strdate, ) class WatIE(InfoExtractor): - _VALID_URL=r'http://www\.wat\.tv/.*-(?P.*?)_.*?\.html' + _VALID_URL = r'http://www\.wat\.tv/.*-(?P.*?)_.*?\.html' IE_NAME = 'wat.tv' _TEST = { - u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', - u'file': u'10631273.mp4', - u'md5': u'd8b2231e1e333acd12aad94b80937e19', - u'info_dict': { - u'title': u'World War Z - Philadelphia VOST', - u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', + 'url': 'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html', + 'info_dict': { + 'id': '10631273', + 'ext': 'mp4', + 'title': 'World War Z - Philadelphia VOST', + 'description': 'La menace est partout. 
Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr', + }, + 'params': { + # Sometimes wat serves the whole file with the --test option + 'skip_download': True, }, - u'skip': u'Sometimes wat serves the whole file with the --test option', } - + def download_video_info(self, real_id): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them - info = self._download_webpage('http://www.wat.tv/interface/contentv3/' + real_id, real_id, 'Downloading video info') - info = json.loads(info) + info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) return info['media'] - def _real_extract(self, url): def real_id_for_chapter(chapter): return chapter['tc_start'].split('-')[0] @@ -56,17 +56,17 @@ class WatIE(InfoExtractor): entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] return self.playlist_result(entries, real_id, video_info['title']) + upload_date = None + if 'date_diffusion' in first_chapter: + upload_date = unified_strdate(first_chapter['date_diffusion']) # Otherwise we can continue and extract just one part, we have to use # the short id for getting the video url - info = {'id': real_id, - 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, - 'ext': 'mp4', - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], - } - if 'date_diffusion' in first_chapter: - info['upload_date'] = unified_strdate(first_chapter['date_diffusion']) - - return info + return { + 'id': real_id, + 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, + 'title': first_chapter['title'], + 'thumbnail': first_chapter['preview'], + 'description': first_chapter['description'], + 'view_count': video_info['views'], + 'upload_date': upload_date, + } diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 500b9146f..63691aa67 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -4,9 +4,10 @@ import re from .common import InfoExtractor from ..utils import ( - unified_strdate, + compat_parse_qs, compat_urlparse, determine_ext, + unified_strdate, ) @@ -111,4 +112,85 @@ class WDRIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, - } \ No newline at end of file + } + + +class WDRMausIE(InfoExtractor): + _VALID_URL = 'http://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P[^/?#]+)(?:/index\.php5|(?

Sendedatum:\s*([0-9\.]+)

', + webpage, 'air date') + title_str = self._html_search_regex( + r'

(.*?)

', webpage, 'title') + title = '%s - %s' % (title_date, title_str) + upload_date = unified_strdate( + self._html_search_meta('dc.date', webpage)) + + fields = compat_parse_qs(param_code) + video_url = fields['firstVideo'][0] + thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) + + formats = [{ + 'format_id': 'rtmp', + 'url': video_url, + }] + + jscode = self._download_webpage( + 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', + video_id, fatal=False, + note='Downloading URL translation table', + errnote='Could not download URL translation table') + if jscode: + for m in re.finditer( + r"stream:\s*'dslSrc=(?P[^']+)',\s*download:\s*'(?P
[^']+)'\s*\}", + jscode): + if video_url.startswith(m.group('stream')): + http_url = video_url.replace( + m.group('stream'), m.group('dl')) + formats.append({ + 'format_id': 'http', + 'url': http_url, + }) + break + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + } + +# TODO test _1 \ No newline at end of file diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 3a3a5a39e..b6a1884b5 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,13 +7,13 @@ import itertools import json import os.path import re -import string import struct import traceback import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor +from ..jsinterp import JSInterpreter from ..utils import ( compat_chr, compat_parse_qs, @@ -438,113 +438,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, - u'Initial JS player signature function name') + u'Initial JS player signature function name') - functions = {} - - def argidx(varname): - return string.lowercase.index(varname) - - def interpret_statement(stmt, local_vars, allow_recursion=20): - if allow_recursion < 0: - raise ExtractorError(u'Recursion limit reached') - - if stmt.startswith(u'var '): - stmt = stmt[len(u'var '):] - ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + - r'=(?P.*)$', stmt) - if ass_m: - if ass_m.groupdict().get('index'): - def assign(val): - lvar = local_vars[ass_m.group('out')] - idx = interpret_expression(ass_m.group('index'), - local_vars, allow_recursion) - assert isinstance(idx, int) - lvar[idx] = val - return val - expr = ass_m.group('expr') - else: - def assign(val): - local_vars[ass_m.group('out')] = val - return val - expr = ass_m.group('expr') - elif stmt.startswith(u'return '): - assign = lambda v: v - expr = stmt[len(u'return '):] - else: - raise ExtractorError( - u'Cannot determine left side of statement in %r' % stmt) - - v = interpret_expression(expr, local_vars, allow_recursion) - return assign(v) - - def interpret_expression(expr, local_vars, allow_recursion): - if expr.isdigit(): - return int(expr) - - if expr.isalpha(): - return local_vars[expr] - - m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) - if m: - member = m.group('member') - val = local_vars[m.group('in')] - if member == 'split("")': - return list(val) - if member == 'join("")': - return u''.join(val) - if member == 'length': - return len(val) - if member == 'reverse()': - return val[::-1] - slice_m = re.match(r'slice\((?P.*)\)', member) - if slice_m: - idx = interpret_expression( - slice_m.group('idx'), local_vars, allow_recursion-1) - return val[idx:] - - m = re.match( - r'^(?P[a-z]+)\[(?P.+)\]$', expr) - if m: - val = local_vars[m.group('in')] - idx = interpret_expression(m.group('idx'), local_vars, - allow_recursion-1) - return val[idx] - - m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) - if m: - a = interpret_expression(m.group('a'), - local_vars, allow_recursion) - b = interpret_expression(m.group('b'), - local_vars, allow_recursion) - return a % b - - m = re.match( - r'^(?P[a-zA-Z$]+)\((?P[a-z0-9,]+)\)$', expr) - if m: - fname = m.group('func') - if fname not in functions: - functions[fname] = extract_function(fname) - argvals = [int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')] - return 
functions[fname](argvals) - raise ExtractorError(u'Unsupported JS expression %r' % expr) - - def extract_function(funcname): - func_m = re.search( - r'function ' + re.escape(funcname) + - r'\((?P[a-z,]+)\){(?P[^}]+)}', - jscode) - argnames = func_m.group('args').split(',') - - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in func_m.group('code').split(';'): - res = interpret_statement(stmt, local_vars) - return res - return resf - - initial_function = extract_function(funcname) + jsi = JSInterpreter(jscode) + initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): @@ -1549,7 +1446,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num) + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] @@ -1712,7 +1611,7 @@ class YoutubeUserIE(InfoExtractor): class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' + _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _MAX_RESULTS = 1000 IE_NAME = u'youtube:search' _SEARCH_KEY = 'ytsearch' @@ -1723,9 +1622,12 @@ class YoutubeSearchIE(SearchInfoExtractor): video_ids = [] pagenum = 0 limit = n + PAGE_SIZE = 50 - while (50 * pagenum) < limit: - result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) + while (PAGE_SIZE * pagenum) < limit: + result_url = self._API_URL % ( + compat_urllib_parse.quote_plus(query.encode('utf-8')), + (PAGE_SIZE * pagenum) + 1) data_json = self._download_webpage( result_url, video_id=u'query "%s"' % query, note=u'Downloading page %s' % (pagenum + 1), @@ -1836,11 +1738,10 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_entries = [] paging = 0 for i in itertools.count(1): - info = self._download_webpage(self._FEED_TEMPLATE % paging, + info = self._download_json(self._FEED_TEMPLATE % paging, u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) - info = json.loads(info) - feed_html = info['feed_html'] + feed_html = info.get('feed_html') or info.get('content_html') m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) feed_entries.extend( diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py new file mode 100644 index 000000000..449482d3c --- /dev/null +++ b/youtube_dl/jsinterp.py @@ -0,0 +1,116 @@ +from __future__ import unicode_literals + +import re + +from .utils import ( + ExtractorError, +) + + +class JSInterpreter(object): + def __init__(self, code): + self.code = code + self._functions = {} + + def interpret_statement(self, stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExtractorError('Recursion limit reached') + + if stmt.startswith('var '): + stmt = stmt[len('var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' 
+
+                         r'=(?P<expr>.*)$', stmt)
+        if ass_m:
+            if ass_m.groupdict().get('index'):
+                def assign(val):
+                    lvar = local_vars[ass_m.group('out')]
+                    idx = self.interpret_expression(
+                        ass_m.group('index'), local_vars, allow_recursion)
+                    assert isinstance(idx, int)
+                    lvar[idx] = val
+                    return val
+                expr = ass_m.group('expr')
+            else:
+                def assign(val):
+                    local_vars[ass_m.group('out')] = val
+                    return val
+                expr = ass_m.group('expr')
+        elif stmt.startswith('return '):
+            assign = lambda v: v
+            expr = stmt[len('return '):]
+        else:
+            raise ExtractorError(
+                'Cannot determine left side of statement in %r' % stmt)
+
+        v = self.interpret_expression(expr, local_vars, allow_recursion)
+        return assign(v)
+
+    def interpret_expression(self, expr, local_vars, allow_recursion):
+        if expr.isdigit():
+            return int(expr)
+
+        if expr.isalpha():
+            return local_vars[expr]
+
+        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
+        if m:
+            member = m.group('member')
+            val = local_vars[m.group('in')]
+            if member == 'split("")':
+                return list(val)
+            if member == 'join("")':
+                return u''.join(val)
+            if member == 'length':
+                return len(val)
+            if member == 'reverse()':
+                return val[::-1]
+            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
+            if slice_m:
+                idx = self.interpret_expression(
+                    slice_m.group('idx'), local_vars, allow_recursion - 1)
+                return val[idx:]
+
+        m = re.match(
+            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
+        if m:
+            val = local_vars[m.group('in')]
+            idx = self.interpret_expression(
+                m.group('idx'), local_vars, allow_recursion - 1)
+            return val[idx]
+
+        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
+        if m:
+            a = self.interpret_expression(
+                m.group('a'), local_vars, allow_recursion)
+            b = self.interpret_expression(
+                m.group('b'), local_vars, allow_recursion)
+            return a % b
+
+        m = re.match(
+            r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+        if m:
+            fname = m.group('func')
+            if fname not in self._functions:
+                self._functions[fname] = self.extract_function(fname)
+            argvals = [int(v) if v.isdigit() else local_vars[v]
+                       for v in m.group('args').split(',')]
+            return self._functions[fname](argvals)
+        raise ExtractorError('Unsupported JS expression %r' % expr)
+
+    def extract_function(self, funcname):
+        func_m = re.search(
+            (r'(?:function %s|%s\s*=\s*function)' % (
+                re.escape(funcname), re.escape(funcname))) +
+            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+            self.code)
+        if func_m is None:
+            raise ExtractorError('Could not find JS function %r' % funcname)
+        argnames = func_m.group('args').split(',')
+
+        def resf(args):
+            local_vars = dict(zip(argnames, args))
+            for stmt in func_m.group('code').split(';'):
+                res = self.interpret_statement(stmt, local_vars)
+            return res
+        return resf
+
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index f7dcbe0cd..55c063fc4 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -55,8 +55,9 @@ class FFmpegPostProcessor(PostProcessor):
         if self._downloader.params.get('verbose', False):
             self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd))
 
-        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        stdout,stderr = p.communicate()
+        bcmd = [self._downloader.encode(c) for c in cmd]
+        p = subprocess.Popen(bcmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = p.communicate()
         if p.returncode != 0:
             stderr = stderr.decode('utf-8', 'replace')
             msg = stderr.strip().split('\n')[-1]
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 2d77aa4c3..16c061ad3 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -539,7 +539,6 @@ def encodeFilename(s, for_subprocess=False):
         encoding = 'utf-8'
     return s.encode(encoding, 'ignore')
 
-
 def decodeOption(optval):
     if optval is None:
         return optval
@@ -1269,8 +1268,8 @@ class PagedList(object):
 
 def uppercase_escape(s):
     return re.sub(
-        r'\\U([0-9a-fA-F]{8})',
-        lambda m: compat_chr(int(m.group(1), base=16)), s)
+        r'\\U[0-9a-fA-F]{8}',
+        lambda m: m.group(0).decode('unicode-escape'), s)
 
 try:
     struct.pack(u'!I', 0)
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 154aeca05..bf11c171c 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
-__version__ = '2014.03.28'
+__version__ = '2014.04.02'