diff --git a/AUTHORS b/AUTHORS index cc552bcb2..cdb56de3b 100644 --- a/AUTHORS +++ b/AUTHORS @@ -144,3 +144,6 @@ Lee Jenkins Anssi Hannula Lukáš Lalinský Qijiang Fan +Rémy Léone +Marco Ferragina +reiv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 32c2fd84c..09ce98ca2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ **Please include the full output of youtube-dl when run with `-v`**. -The output (including the first lines) contain important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. +The output (including the first lines) contains important debugging information. Issues without the full output are often not reproducible and therefore do not get solved in short order, if ever. Please re-read your issue once again to avoid a couple of common mistakes (you can and should use this as a checklist): @@ -114,12 +114,13 @@ If you want to add support for a new site, you can follow this quick list (assum webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'
Foo
Foo
''' - doc = xml.etree.ElementTree.fromstring(testxml) + doc = compat_etree_fromstring(testxml) self.assertEqual(xpath_attr(doc, 'div/p', 'x'), 'a') self.assertEqual(xpath_attr(doc, 'div/bar', 'x'), None) self.assertEqual(xpath_attr(doc, 'div/p', 'y'), None) @@ -425,6 +444,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) self.assertEqual(parse_iso8601('2014-03-23T22:04:26.1234Z'), 1395612266) + self.assertEqual(parse_iso8601('2015-09-29T08:27:31.727'), 1443515251) + self.assertEqual(parse_iso8601('2015-09-29T08-27-31.727'), None) def test_strip_jsonp(self): stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);') @@ -495,6 +516,9 @@ class TestUtil(unittest.TestCase): "playlist":[{"controls":{"all":null}}] }''') + inp = '''"The CW\\'s \\'Crazy Ex-Girlfriend\\'"''' + self.assertEqual(js_to_json(inp), '''"The CW's 'Crazy Ex-Girlfriend'"''') + inp = '"SAND Number: SAND 2013-7800P\\nPresenter: Tom Russo\\nHabanero Software Training - Xyce Software\\nXyce, Sandia\\u0027s"' json_code = js_to_json(inp) self.assertEqual(json.loads(json_code), json.loads(inp)) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 12977bf80..9a8c7da05 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -28,6 +28,7 @@ if os.name == 'nt': import ctypes from .compat import ( + compat_basestring, compat_cookiejar, compat_expanduser, compat_get_terminal_size, @@ -63,6 +64,7 @@ from .utils import ( SameFileError, sanitize_filename, sanitize_path, + sanitized_Request, std_headers, subtitles_filename, UnavailableVideoError, @@ -156,7 +158,7 @@ class YoutubeDL(object): writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files writesubtitles: Write the video subtitles to a file - writeautomaticsub: Write the automatic subtitles to a file + writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video @@ -572,7 +574,7 @@ class YoutubeDL(object): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL)) + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) tmpl = compat_expanduser(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 @@ -580,7 +582,7 @@ class YoutubeDL(object): # to workaround encoding issues with subprocess on python2 @ Windows if sys.version_info < (3, 0) and sys.platform == 'win32': filename = encodeFilename(filename, True).decode(preferredencoding()) - return filename + return sanitize_path(filename) except ValueError as err: self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None @@ -833,6 +835,7 @@ class YoutubeDL(object): extra_info=extra) playlist_results.append(entry_result) ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result elif result_type == 'compat_list': self.report_warning( @@ -937,7 +940,7 @@ class YoutubeDL(object): filter_parts.append(string) def _remove_unused_ops(tokens): - # Remove operators that we don't use and join them with the sourrounding strings + # Remove operators that we don't use and join them with the surrounding strings # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None @@ -1186,7 +1189,7 @@ class YoutubeDL(object): return res def _calc_cookies(self, info_dict): - pr = compat_urllib_request.Request(info_dict['url']) + pr = sanitized_Request(info_dict['url']) self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') @@ -1870,6 +1873,8 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ + if isinstance(req, compat_basestring): + req = sanitized_Request(req) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 5e2ed4d4b..9f131f5db 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -377,7 +377,7 @@ def _real_main(argv=None): with YoutubeDL(ydl_opts) as ydl: # Update version if opts.update_self: - update_self(ydl.to_screen, opts.verbose) + update_self(ydl.to_screen, opts.verbose, ydl._opener) # Remove cache dir if opts.rm_cachedir: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index d103ab9ad..a3e85264a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -14,6 +14,7 @@ import socket import subprocess import sys import itertools +import xml.etree.ElementTree try: @@ -212,6 +213,43 @@ try: except ImportError: # Python 2.6 from xml.parsers.expat import ExpatError as compat_xml_parse_error +if sys.version_info[0] >= 3: + compat_etree_fromstring = xml.etree.ElementTree.fromstring +else: + # python 2.x tries to encode unicode strings with ascii (see the + # XMLParser._fixtext method) + etree = xml.etree.ElementTree + + try: + _etree_iter = etree.Element.iter + except AttributeError: # Python <=2.6 + def _etree_iter(root): + for el in root.findall('*'): + yield el + for sub in _etree_iter(el): + yield sub + + # on 2.6 XML doesn't have a parser argument, function copied from CPython + # 2.7 source + def _XML(text, parser=None): + if not parser: + parser = etree.XMLParser(target=etree.TreeBuilder()) + parser.feed(text) + return parser.close() + + def _element_factory(*args, **kwargs): + el = etree.Element(*args, **kwargs) + for k, v in el.items(): + if isinstance(v, bytes): + el.set(k, v.decode('utf-8')) + return el + + def compat_etree_fromstring(text): + doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) + for el in _etree_iter(doc): + if el.text is not None and isinstance(el.text, bytes): + el.text = el.text.decode('utf-8') + return doc try: from urllib.parse import parse_qs as compat_parse_qs @@ -507,6 +545,7 @@ __all__ = [ 'compat_chr', 'compat_cookiejar', 'compat_cookies', + 'compat_etree_fromstring', 'compat_expanduser', 'compat_get_terminal_size', 'compat_getenv', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 29a4500d3..b8bf8daf8 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -42,7 +42,7 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. - (experimenatal) + (experimental) external_downloader_args: A list of additional command-line arguments for the external downloader. diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8b6fa2753..535f2a7fc 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re from .common import FileDownloader -from ..compat import compat_urllib_request +from ..utils import sanitized_Request class DashSegmentsFD(FileDownloader): @@ -22,7 +22,7 @@ class DashSegmentsFD(FileDownloader): def append_url_to_file(outf, target_url, target_name, remaining_bytes=None): self.to_screen('[DashSegments] %s: Downloading %s' % (info_dict['id'], target_name)) - req = compat_urllib_request.Request(target_url) + req = sanitized_Request(target_url) if remaining_bytes is not None: req.add_header('Range', 'bytes=0-%d' % (remaining_bytes - 1)) diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 174180db5..6170cc155 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -5,12 +5,13 @@ import io import itertools import os import time -import xml.etree.ElementTree as etree from .fragment import FragmentFD from ..compat import ( + compat_etree_fromstring, compat_urlparse, compat_urllib_error, + compat_urllib_parse_urlparse, ) from ..utils import ( encodeFilename, @@ -285,9 +286,11 @@ class F4mFD(FragmentFD): man_url = info_dict['url'] requested_bitrate = info_dict.get('tbr') self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) - manifest = self.ydl.urlopen(man_url).read() + urlh = self.ydl.urlopen(man_url) + man_url = urlh.geturl() + manifest = urlh.read() - doc = etree.fromstring(manifest) + doc = compat_etree_fromstring(manifest) formats = [(int(f.attrib.get('bitrate', -1)), f) for f in self._get_unencrypted_media(doc)] if requested_bitrate is None: @@ -329,20 +332,25 @@ class F4mFD(FragmentFD): if not live: write_metadata_tag(dest_stream, metadata) + base_url_parsed = compat_urllib_parse_urlparse(base_url) + self._start_frag_download(ctx) frags_filenames = [] while fragments_list: seg_i, frag_i = fragments_list.pop(0) name = 'Seg%d-Frag%d' % (seg_i, frag_i) - url = base_url + name + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) if akamai_pv: - url += '?' + akamai_pv.strip(';') + query.append(akamai_pv.strip(';')) if info_dict.get('extra_param_to_segment_url'): - url += info_dict.get('extra_param_to_segment_url') + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) frag_filename = '%s-%s' % (ctx['tmpfilename'], name) try: - success = ctx['dl'].download(frag_filename, {'url': url}) + success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()}) if not success: return False (down, frag_sanitized) = sanitize_open(frag_filename, 'rb') diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 9a83a73dd..b5a3e1167 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from ..utils import ( encodeArgument, encodeFilename, sanitize_open, + handle_youtubedl_headers, ) @@ -33,9 +34,10 @@ class HlsFD(FileDownloader): if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. + headers = handle_youtubedl_headers(info_dict['http_headers']) args += [ '-headers', - ''.join('%s: %s\r\n' % (key, val) for key, val in info_dict['http_headers'].items())] + ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())] args += ['-i', url, '-f', 'mp4', '-c', 'copy', '-bsf:a', 'aac_adtstoasc'] diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index a29f5cf31..56840e026 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -7,14 +7,12 @@ import time import re from .common import FileDownloader -from ..compat import ( - compat_urllib_request, - compat_urllib_error, -) +from ..compat import compat_urllib_error from ..utils import ( ContentTooShortError, encodeFilename, sanitize_open, + sanitized_Request, ) @@ -29,8 +27,8 @@ class HttpFD(FileDownloader): add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) - basic_request = compat_urllib_request.Request(url, None, headers) - request = compat_urllib_request.Request(url, None, headers) + basic_request = sanitized_Request(url, None, headers) + request = sanitized_Request(url, None, headers) is_test = self.params.get('test', False) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index f1d219ba9..14d56db47 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -117,7 +117,7 @@ class RtmpFD(FileDownloader): return False # Download using rtmpdump. rtmpdump returns exit code 2 when - # the connection was interrumpted and resuming appears to be + # the connection was interrupted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. basic_args = [ 'rtmpdump', '--verbose', '-r', url, diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd6eb6ae0..947b83683 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -45,6 +45,7 @@ from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbc import ( BBCCoUkIE, + BBCCoUkArticleIE, BBCIE, ) from .beeg import BeegIE @@ -59,7 +60,10 @@ from .bloomberg import BloombergIE from .bpb import BpbIE from .br import BRIE from .breakcom import BreakIE -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .buzzfeed import BuzzFeedIE from .byutv import BYUtvIE from .c56 import C56IE @@ -89,6 +93,7 @@ from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE from .cloudy import CloudyIE from .clubic import ClubicIE +from .clyp import ClypIE from .cmt import CMTIE from .cnet import CNETIE from .cnn import ( @@ -122,10 +127,12 @@ from .dbtv import DBTVIE from .dcn import DCNIE from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE +from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE from .douyutv import DouyuTVIE +from .dplay import DPlayIE from .dramafever import ( DramaFeverIE, DramaFeverSeriesIE, @@ -209,13 +216,15 @@ from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE from .glide import GlideIE -from .globo import GloboIE +from .globo import ( + GloboIE, + GloboArticleIE, +) from .godtube import GodTubeIE from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE -from .gorillavid import GorillaVidIE from .goshgay import GoshgayIE from .groupon import GrouponIE from .hark import HarkIE @@ -412,7 +421,10 @@ from .nowness import ( NownessPlaylistIE, NownessSeriesIE, ) -from .nowtv import NowTVIE +from .nowtv import ( + NowTVIE, + NowTVListIE, +) from .nowvideo import NowVideoIE from .npo import ( NPOIE, @@ -450,10 +462,7 @@ from .orf import ( from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE -from .periscope import ( - PeriscopeIE, - QuickscopeIE, -) +from .periscope import PeriscopeIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE @@ -567,7 +576,8 @@ from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE, - SoundcloudPlaylistIE + SoundcloudPlaylistIE, + SoundcloudSearchIE ) from .soundgasm import ( SoundgasmIE, @@ -586,6 +596,7 @@ from .spankwire import SpankwireIE from .spiegel import SpiegelIE, SpiegelArticleIE from .spiegeltv import SpiegeltvIE from .spike import SpikeIE +from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import ( SportBoxIE, @@ -717,7 +728,6 @@ from .vh1 import VH1IE from .vice import ViceIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE -from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videomega import VideoMegaIE from .videopremium import VideoPremiumIE @@ -727,6 +737,7 @@ from .vidme import VidmeIE from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE +from .viidea import ViideaIE from .vimeo import ( VimeoIE, VimeoAlbumIE, @@ -779,6 +790,7 @@ from .wrzuta import WrzutaIE from .wsj import WSJIE from .xbef import XBefIE from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE from .xhamster import ( XHamsterIE, XHamsterEmbedIE, @@ -822,6 +834,7 @@ from .youtube import ( YoutubeTruncatedIDIE, YoutubeTruncatedURLIE, YoutubeUserIE, + YoutubeUserPlaylistsIE, YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index f9a389f67..c0e5d1abf 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -12,7 +12,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'http://www\.abc\.net\.au/news/[^/]+/[^/]+/(?P