[steam] Add extractor for live broadcasts (#6012)

2015-07-21 19:24:20 +02:00 · 2015-07-21 19:24:20 +02:00 · f9775ae86e
commit f9775ae86e
parent 675a966176
3 changed files with 140 additions and 13 deletions
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@ -1,9 +1,19 @@
 from __future__ import unicode_literals
 import itertools
 import re
 import time
 import xml.etree.ElementTree as etree
 from .common import FileDownloader
-from ..compat import compat_urllib_request
+from ..compat import (
    compat_str,
    compat_urllib_request,
 )
 from ..utils import (
    parse_iso8601,
    xpath_with_ns,
 )
 class DashSegmentsFD(FileDownloader):
@ -13,9 +23,6 @@ class DashSegmentsFD(FileDownloader):
    def real_download(self, filename, info_dict):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)
        base_url = info_dict['url']
        segment_urls = info_dict['segment_urls']
        is_test = self.params.get('test', False)
        remaining_bytes = self._TEST_FILE_SIZE if is_test else None
        byte_counter = 0
@ -34,21 +41,63 @@ class DashSegmentsFD(FileDownloader):
            outf.write(data)
            return len(data)
-        def combine_url(base_url, target_url):
+        if not info_dict.get('is_live'):
-            if re.match(r'^https?://', target_url):
+            base_url = info_dict['url']
-                return target_url
+            segment_urls = info_dict['segment_urls']
-            return '%s/%s' % (base_url, target_url)
+
            def combine_url(base_url, target_url):
                if re.match(r'^https?://', target_url):
                    return target_url
                return '%s/%s' % (base_url, target_url)
            init_url = combine_url(base_url, info_dict['initialization_url'])
            segment_urls = [combine_url(base_url, segment_url) for segment_url in segment_urls]
        else:
            manifest_url = info_dict['url']
            manifest_xml = self.ydl.urlopen(manifest_url).read()
            manifest = etree.fromstring(manifest_xml)
            _x = lambda p: xpath_with_ns(p, {'ns': 'urn:mpeg:DASH:schema:MPD:2011'})
            ad = [e for e in manifest.findall(_x('ns:Period/ns:AdaptationSet')) if e.attrib['id'] == info_dict['mpd_set_id']][0]
            segment_template = ad.find(_x('ns:SegmentTemplate'))
            def subs_url_template(url_template, repr_id, number=None):
                result = url_template.replace('$RepresentationID$', repr_id)
                if number is not None:
                    result = result.replace('$Number$', compat_str(number))
                return result
            start_time = parse_iso8601(manifest.attrib['availabilityStartTime'])
            segment_duration = (int(segment_template.attrib['duration']) / int(segment_template.attrib['timescale']))  # in seconds
            first_segment = int((int(time.time()) - start_time) / segment_duration)
            init_url = subs_url_template(segment_template.attrib['initialization'], '1')
            def build_live_segment_urls():
                for nr in itertools.count(first_segment):
                    # We have to avoid requesting a segment before its start time
                    expected_time = start_time + nr * segment_duration
                    wait_time = expected_time - time.time()
                    if wait_time > 0:
                        time.sleep(wait_time)
                    yield subs_url_template(segment_template.attrib['media'], '1', nr)
            segment_urls = build_live_segment_urls()
        with open(tmpfilename, 'wb') as outf:
            append_url_to_file(
-                outf, combine_url(base_url, info_dict['initialization_url']),
+                outf, init_url,
                'initialization segment')
            for i, segment_url in enumerate(segment_urls):
                note = 'segment %d' % (i + 1)
                if not info_dict.get('is_live'):
                    note += ' / %d' % len(segment_urls)
                segment_len = append_url_to_file(
-                    outf, combine_url(base_url, segment_url),
+                    outf, segment_url, note, remaining_bytes)
                    'segment %d / %d' % (i + 1, len(segment_urls)),
                    remaining_bytes)
                byte_counter += segment_len
                self._hook_progress({
                    'status': 'downloading',
                    'downloaded_bytes': byte_counter,
                    'filename': filename,
                })
                if remaining_bytes is not None:
                    remaining_bytes -= segment_len
                    if remaining_bytes <= 0:
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -561,7 +561,10 @@ from .srf import SrfIE
 from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
-from .steam import SteamIE
+from .steam import (
    SteamIE,
    SteamBroadcastsIE,
 )
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@ -5,7 +5,9 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
    int_or_none,
    unescapeHTML,
    xpath_with_ns,
 )
@ -121,3 +123,76 @@ class SteamIE(InfoExtractor):
            raise ExtractorError('Could not find any videos')
        return self.playlist_result(videos, playlist_id, playlist_title)
 class SteamBroadcastsIE(InfoExtractor):
    """Extractor for live broadcasts on steamcommunity.com and dota2.com.

    Every returned format points at the same DASH manifest URL; the
    'mpd_set_id' and 'mpd_representation_id' fields record which
    AdaptationSet / Representation of that manifest the format stands for.
    """
    IE_DESC = 'Steam and Dota 2 live broadcasts'
    _VALID_URL = r'https?://(?:www\.)?(?:steamcommunity\.com/broadcast|dota2\.com)/watch/(?P<id>\d+)'
    # Only livestreams, test urls can be obtained from
    # https://steamcommunity.com/?subsection=broadcasts or
    # https://www.dota2.com/watch/
    _TESTS = [
        {
            'url': 'http://www.dota2.com/watch/76561197986987526',
            'only_matching': True,
        },
        {
            'url': 'https://steamcommunity.com/broadcast/watch/76561197986987526',
            'only_matching': True,
        },
    ]

    def _extract_dash_manifest_formats(self, manifest_url, video_id):
        """Return one format dict per Representation in the DASH manifest.

        manifest_url -- URL of the MPD manifest; also used as the 'url' of
                        every format that is returned
        video_id     -- id passed through to _download_xml

        The AdaptationSet with id 'game' is skipped, as is any
        AdaptationSet that has no 'id' attribute. The returned list is not
        sorted.
        """
        manifest = self._download_xml(manifest_url, video_id)
        _x = lambda p: xpath_with_ns(p, {'ns': 'urn:mpeg:DASH:schema:MPD:2011'})
        formats = []
        for ad_set in manifest.findall(_x('ns:Period/ns:AdaptationSet')):
            # The 'id' attribute is optional on an AdaptationSet; without it
            # the set cannot be referenced through 'mpd_set_id', so skip it
            # instead of failing with a KeyError.
            set_id = ad_set.attrib.get('id')
            if set_id is None:
                continue
            # NOTE(review): the 'game' set is presumably not an audio/video
            # stream -- confirm against a real manifest.
            if set_id == 'game':
                continue
            is_audio = set_id == 'audio'
            for representation in ad_set.findall(_x('ns:Representation')):
                repr_id = representation.attrib['id']
                codecs = representation.attrib.get('codecs')
                formats.append({
                    'url': manifest_url,
                    'ext': 'm4a' if is_audio else 'mp4',
                    'format_id': '{0}-{1}'.format(set_id, repr_id),
                    'protocol': 'http_dash_segments',
                    'mpd_set_id': set_id,
                    'mpd_representation_id': repr_id,
                    'height': int_or_none(representation.attrib.get('height')),
                    'width': int_or_none(representation.attrib.get('width')),
                    # Each set carries a single track, so the other codec
                    # is explicitly marked as absent.
                    'vcodec': 'none' if is_audio else codecs,
                    'acodec': codecs if is_audio else 'none',
                    'preference': -10 if is_audio else 0,
                })
        return formats

    def _real_extract(self, url):
        """Return the info dict for the broadcast of the Steam id in url.

        Raises ExtractorError if the MPD info response contains no manifest
        URL.
        """
        steamid = self._match_id(url)
        broadcast_mpd_info = self._download_json(
            'https://steamcommunity.com/broadcast/getbroadcastmpd/?steamid={0}&broadcastid=0'.format(steamid),
            steamid)
        # Check for the manifest before touching any other key so that a
        # response without a stream yields a readable error, not a KeyError.
        manifest_url = broadcast_mpd_info.get('url')
        if not manifest_url:
            raise ExtractorError(
                'No broadcast manifest available for {0}'.format(steamid),
                expected=True)
        broadcast_id = broadcast_mpd_info['broadcastid']
        broadcast_info = self._download_json(
            'https://steamcommunity.com/broadcast/getbroadcastinfo/?steamid={0}&broadcastid={1}'.format(steamid, broadcast_id),
            steamid)
        formats = self._extract_dash_manifest_formats(manifest_url, steamid)
        self._sort_formats(formats)
        return {
            'id': steamid,
            'title': broadcast_info['title'],
            'formats': formats,
            'is_live': True,
        }