diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index a4685d307..a51e5fc3b 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -1,9 +1,19 @@
 from __future__ import unicode_literals
 
+import itertools
 import re
+import time
+import xml.etree.ElementTree as etree
 
 from .common import FileDownloader
-from ..compat import compat_urllib_request
+from ..compat import (
+    compat_str,
+    compat_urllib_request,
+)
+from ..utils import (
+    parse_iso8601,
+    xpath_with_ns,
+)
 
 
 class DashSegmentsFD(FileDownloader):
@@ -13,9 +23,6 @@ class DashSegmentsFD(FileDownloader):
     def real_download(self, filename, info_dict):
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
-        base_url = info_dict['url']
-        segment_urls = info_dict['segment_urls']
-
         is_test = self.params.get('test', False)
         remaining_bytes = self._TEST_FILE_SIZE if is_test else None
         byte_counter = 0
@@ -34,21 +41,73 @@ class DashSegmentsFD(FileDownloader):
             outf.write(data)
             return len(data)
 
-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s/%s' % (base_url, target_url)
+        if not info_dict.get('is_live'):
+            # Static download: info_dict carries the complete list of
+            # segments, each either absolute or relative to the base URL
+            base_url = info_dict['url']
+            segment_urls = info_dict['segment_urls']
+
+            def combine_url(base_url, target_url):
+                if re.match(r'^https?://', target_url):
+                    return target_url
+                return '%s/%s' % (base_url, target_url)
+
+            init_url = combine_url(base_url, info_dict['initialization_url'])
+            segment_urls = [combine_url(base_url, segment_url) for segment_url in segment_urls]
+
+        else:
+            # Live download: info_dict['url'] points at the manifest and
+            # segment URLs are generated from its SegmentTemplate
+            manifest_url = info_dict['url']
+            manifest_xml = self.ydl.urlopen(manifest_url).read()
+            manifest = etree.fromstring(manifest_xml)
+            _x = lambda p: xpath_with_ns(p, {'ns': 'urn:mpeg:DASH:schema:MPD:2011'})
+            ad = [e for e in manifest.findall(_x('ns:Period/ns:AdaptationSet')) if e.attrib.get('id') == info_dict['mpd_set_id']][0]
+            segment_template = ad.find(_x('ns:SegmentTemplate'))
+            # Fetch the representation named by the format; '1' is kept as
+            # the fallback for formats that do not name one
+            representation_id = info_dict.get('mpd_representation_id') or '1'
+
+            def subs_url_template(url_template, repr_id, number=None):
+                result = url_template.replace('$RepresentationID$', repr_id)
+                if number is not None:
+                    result = result.replace('$Number$', compat_str(number))
+                return result
+
+            start_time = parse_iso8601(manifest.attrib['availabilityStartTime'])
+            # float() keeps the division exact on Python 2, where int / int
+            # truncates and can even produce a zero duration
+            segment_duration = (float(segment_template.attrib['duration']) / int(segment_template.attrib['timescale']))  # in seconds
+            first_segment = int((int(time.time()) - start_time) / segment_duration)
+            init_url = subs_url_template(segment_template.attrib['initialization'], representation_id)
+
+            def build_live_segment_urls():
+                for nr in itertools.count(first_segment):
+                    # We have to avoid requesting a segment before its start time
+                    expected_time = start_time + nr * segment_duration
+                    wait_time = expected_time - time.time()
+                    if wait_time > 0:
+                        time.sleep(wait_time)
+                    yield subs_url_template(segment_template.attrib['media'], representation_id, nr)
+            segment_urls = build_live_segment_urls()
 
         with open(tmpfilename, 'wb') as outf:
             append_url_to_file(
-                outf, combine_url(base_url, info_dict['initialization_url']),
+                outf, init_url,
                 'initialization segment')
             for i, segment_url in enumerate(segment_urls):
+                note = 'segment %d' % (i + 1)
+                # The live segment generator is endless and has no len()
+                if not info_dict.get('is_live'):
+                    note += ' / %d' % len(segment_urls)
                 segment_len = append_url_to_file(
-                    outf, combine_url(base_url, segment_url),
-                    'segment %d / %d' % (i + 1, len(segment_urls)),
-                    remaining_bytes)
+                    outf, segment_url, note, remaining_bytes)
                 byte_counter += segment_len
+                self._hook_progress({
+                    'status': 'downloading',
+                    'downloaded_bytes': byte_counter,
+                    'filename': filename,
+                })
                 if remaining_bytes is not None:
                     remaining_bytes -= segment_len
                     if remaining_bytes <= 0:
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 50da08830..7d6423611 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -561,7 +561,10 @@ from .srf import SrfIE
 from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
-from .steam import SteamIE
+from .steam import (
+    SteamIE,
+    SteamBroadcastsIE,
+)
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py
index 183dcb03c..40b4eab73 100644
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -5,7 +5,9 @@ import re
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    int_or_none,
     unescapeHTML,
+    xpath_with_ns,
 )
 
 
@@ -121,3 +123,84 @@ class SteamIE(InfoExtractor):
             raise ExtractorError('Could not find any videos')
 
         return self.playlist_result(videos, playlist_id, playlist_title)
+
+
+class SteamBroadcastsIE(InfoExtractor):
+    """Extractor for live broadcasts on steamcommunity.com and dota2.com."""
+
+    IE_DESC = 'Steam and Dota 2 live broadcasts'
+    _VALID_URL = r'https?://(?:www\.)?(?:steamcommunity\.com/broadcast|dota2\.com)/watch/(?P<id>\d+)'
+
+    # Only livestreams, test urls can be obtained from
+    # https://steamcommunity.com/?subsection=broadcasts or
+    # https://www.dota2.com/watch/
+    _TESTS = [
+        {
+            'url': 'http://www.dota2.com/watch/76561197986987526',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://steamcommunity.com/broadcast/watch/76561197986987526',
+            'only_matching': True,
+        },
+    ]
+
+    def _extract_dash_manifest_formats(self, manifest_url, video_id):
+        """Build one format entry per Representation in the DASH manifest.
+
+        Every entry uses manifest_url as its 'url' and records the ids of
+        its AdaptationSet and Representation; the 'game' set is skipped.
+        """
+        manifest = self._download_xml(manifest_url, video_id)
+
+        _x = lambda p: xpath_with_ns(p, {'ns': 'urn:mpeg:DASH:schema:MPD:2011'})
+        formats = []
+        for ad_set in manifest.findall(_x('ns:Period/ns:AdaptationSet')):
+            set_id = ad_set.attrib['id']
+            if set_id == 'game':
+                continue
+            for representation in ad_set.findall(_x('ns:Representation')):
+                repr_id = representation.attrib['id']
+                if set_id == 'audio':
+                    ext = 'm4a'
+                    vcodec = 'none'
+                    acodec = representation.attrib.get('codecs')
+                    preference = -10
+                else:
+                    ext = 'mp4'
+                    vcodec = representation.attrib.get('codecs')
+                    acodec = 'none'
+                    preference = 0
+                formats.append({
+                    'url': manifest_url,
+                    'ext': ext,
+                    'format_id': '{0}-{1}'.format(set_id, repr_id),
+                    'protocol': 'http_dash_segments',
+                    'mpd_set_id': set_id,
+                    'mpd_representation_id': repr_id,
+                    'height': int_or_none(representation.attrib.get('height')),
+                    'width': int_or_none(representation.attrib.get('width')),
+                    'vcodec': vcodec,
+                    'acodec': acodec,
+                    'preference': preference,
+                })
+        return formats
+
+    def _real_extract(self, url):
+        """Look up the broadcast's manifest URL and title and return a live info dict."""
+        steamid = self._match_id(url)
+
+        broadcast_mpd_info = self._download_json('https://steamcommunity.com/broadcast/getbroadcastmpd/?steamid={0}&broadcastid=0'.format(steamid), steamid)
+        broadcast_id = broadcast_mpd_info['broadcastid']
+        broadcast_info = self._download_json('https://steamcommunity.com/broadcast/getbroadcastinfo/?steamid={0}&broadcastid={1}'.format(steamid, broadcast_id), steamid)
+
+        manifest_url = broadcast_mpd_info['url']
+        formats = self._extract_dash_manifest_formats(manifest_url, steamid)
+        self._sort_formats(formats)
+
+        return {
+            'id': steamid,
+            'title': broadcast_info['title'],
+            'formats': formats,
+            'is_live': True,
+        }