[steam] Add extractor for live broadcasts (#6012)

2015-07-21 19:24:20 +02:00 · 2015-07-21 19:24:20 +02:00 · f9775ae86e
commit f9775ae86e
parent 675a966176
3 changed files with 140 additions and 13 deletions
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -1,9 +1,19 @@
 from __future__ import unicode_literals

+import itertools
 import re
+import time
+import xml.etree.ElementTree as etree

 from .common import FileDownloader
-from ..compat import compat_urllib_request
+from ..compat import (
+    compat_str,
+    compat_urllib_request,
+)
+from ..utils import (
+    parse_iso8601,
+    xpath_with_ns,
+)


 class DashSegmentsFD(FileDownloader):
@@ -13,9 +23,6 @@ class DashSegmentsFD(FileDownloader):
    def real_download(self, filename, info_dict):
        self.report_destination(filename)
        tmpfilename = self.temp_name(filename)
-        base_url = info_dict['url']
-        segment_urls = info_dict['segment_urls']
-
        is_test = self.params.get('test', False)
        remaining_bytes = self._TEST_FILE_SIZE if is_test else None
        byte_counter = 0
@@ -34,21 +41,77 @@ class DashSegmentsFD(FileDownloader):
            outf.write(data)
            return len(data)

-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s/%s' % (base_url, target_url)
+        if not info_dict.get('is_live'):
+            # On-demand stream: the info dict already lists every segment URL,
+            # each either absolute or relative to the base URL.
+            base_url = info_dict['url']
+            segment_urls = info_dict['segment_urls']
+
+            def combine_url(base_url, target_url):
+                if re.match(r'^https?://', target_url):
+                    return target_url
+                return '%s/%s' % (base_url, target_url)
+
+            init_url = combine_url(base_url, info_dict['initialization_url'])
+            segment_urls = [combine_url(base_url, segment_url) for segment_url in segment_urls]
+
+        else:
+            # Live stream: 'url' is the MPD manifest; segment URLs are generated
+            # from the SegmentTemplate of the selected AdaptationSet.
+            manifest_url = info_dict['url']
+            manifest_xml = self.ydl.urlopen(manifest_url).read()
+            manifest = etree.fromstring(manifest_xml)
+            _x = lambda p: xpath_with_ns(p, {'ns': 'urn:mpeg:DASH:schema:MPD:2011'})
+            # .get() so that an AdaptationSet without an id attribute is simply
+            # not matched instead of raising KeyError
+            ad = [e for e in manifest.findall(_x('ns:Period/ns:AdaptationSet')) if e.attrib.get('id') == info_dict['mpd_set_id']][0]
+            segment_template = ad.find(_x('ns:SegmentTemplate'))
+            # Use the id of the representation that was actually selected;
+            # '1' (the value that used to be hard-coded here) remains the
+            # fallback for info dicts that do not carry one.
+            repr_id = info_dict.get('mpd_representation_id', '1')
+
+            def subs_url_template(url_template, repr_id, number=None):
+                result = url_template.replace('$RepresentationID$', repr_id)
+                if number is not None:
+                    result = result.replace('$Number$', compat_str(number))
+                return result
+
+            start_time = parse_iso8601(manifest.attrib['availabilityStartTime'])
+            # float() forces true division: this module does not import
+            # division from __future__, so on Python 2 int / int would floor
+            # and a sub-second segment duration would become 0.
+            # NOTE(review): timescale falls back to 1, the DASH default - confirm.
+            segment_duration = float(int(segment_template.attrib['duration'])) / int(segment_template.attrib.get('timescale', 1))  # in seconds
+            first_segment = int((int(time.time()) - start_time) / segment_duration)
+            init_url = subs_url_template(segment_template.attrib['initialization'], repr_id)
+
+            def build_live_segment_urls():
+                for nr in itertools.count(first_segment):
+                    # We have to avoid requesting a segment before its start time
+                    expected_time = start_time + nr * segment_duration
+                    wait_time = expected_time - time.time()
+                    if wait_time > 0:
+                        time.sleep(wait_time)
+                    yield subs_url_template(segment_template.attrib['media'], repr_id, nr)
+            segment_urls = build_live_segment_urls()

        with open(tmpfilename, 'wb') as outf:
            append_url_to_file(
-                outf, combine_url(base_url, info_dict['initialization_url']),
+                outf, init_url,
                'initialization segment')
            for i, segment_url in enumerate(segment_urls):
+                note = 'segment %d' % (i + 1)
+                if not info_dict.get('is_live'):
+                    note += ' / %d' % len(segment_urls)
                segment_len = append_url_to_file(
-                    outf, combine_url(base_url, segment_url),
-                    'segment %d / %d' % (i + 1, len(segment_urls)),
-                    remaining_bytes)
+                    outf, segment_url, note, remaining_bytes)
                byte_counter += segment_len
+                self._hook_progress({
+                    'status': 'downloading',
+                    'downloaded_bytes': byte_counter,
+                    'filename': filename,
+                })
                if remaining_bytes is not None:
                    remaining_bytes -= segment_len
                    if remaining_bytes <= 0:
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -561,7 +561,10 @@ from .srf import SrfIE
 from .srmediathek import SRMediathekIE
 from .ssa import SSAIE
 from .stanfordoc import StanfordOpenClassroomIE
-from .steam import SteamIE
+from .steam import (
+    SteamIE,
+    SteamBroadcastsIE,
+)
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -5,7 +5,9 @@ import re
 from .common import InfoExtractor
 from ..utils import (
    ExtractorError,
+    int_or_none,
    unescapeHTML,
+    xpath_with_ns,
 )


@@ -121,3 +123,94 @@ class SteamIE(InfoExtractor):
            raise ExtractorError('Could not find any videos')

        return self.playlist_result(videos, playlist_id, playlist_title)
+
+
+class SteamBroadcastsIE(InfoExtractor):
+    """Extractor for live broadcasts on steamcommunity.com and dota2.com."""
+
+    IE_DESC = 'Steam and Dota 2 live broadcasts'
+    _VALID_URL = r'https?://(?:www\.)?(?:steamcommunity\.com/broadcast|dota2\.com)/watch/(?P<id>\d+)'
+
+    # Only livestreams exist, so the tests are match-only; live URLs can be
+    # obtained from https://steamcommunity.com/?subsection=broadcasts or
+    # https://www.dota2.com/watch/
+    _TESTS = [
+        {
+            'url': 'http://www.dota2.com/watch/76561197986987526',
+            'only_matching': True,
+        },
+        {
+            'url': 'https://steamcommunity.com/broadcast/watch/76561197986987526',
+            'only_matching': True,
+        },
+    ]
+
+    def _extract_dash_manifest_formats(self, manifest_url, video_id):
+        """Build one format dict per Representation of the DASH manifest.
+
+        Each format uses manifest_url as its 'url' and records the ids of its
+        AdaptationSet and Representation as 'mpd_set_id' and
+        'mpd_representation_id'.  The 'game' set and sets without an id
+        attribute are skipped.
+        """
+        manifest = self._download_xml(manifest_url, video_id)
+
+        _x = lambda p: xpath_with_ns(p, {'ns': 'urn:mpeg:DASH:schema:MPD:2011'})
+        formats = []
+        for ad_set in manifest.findall(_x('ns:Period/ns:AdaptationSet')):
+            # .get(): a set without an id is skipped instead of raising KeyError
+            set_id = ad_set.attrib.get('id')
+            if set_id is None or set_id == 'game':
+                continue
+            for representation in ad_set.findall(_x('ns:Representation')):
+                repr_id = representation.attrib['id']
+                if set_id == 'audio':
+                    ext = 'm4a'
+                    vcodec = 'none'
+                    acodec = representation.attrib.get('codecs')
+                    preference = -10
+                else:
+                    ext = 'mp4'
+                    vcodec = representation.attrib.get('codecs')
+                    acodec = 'none'
+                    preference = 0
+                formats.append({
+                    'url': manifest_url,
+                    'ext': ext,
+                    'format_id': '{0}-{1}'.format(set_id, repr_id),
+                    'protocol': 'http_dash_segments',
+                    'mpd_set_id': set_id,
+                    'mpd_representation_id': repr_id,
+                    'height': int_or_none(representation.attrib.get('height')),
+                    'width': int_or_none(representation.attrib.get('width')),
+                    'vcodec': vcodec,
+                    'acodec': acodec,
+                    'preference': preference,
+                })
+        return formats
+
+    def _real_extract(self, url):
+        """Return the info dict for the live broadcast of the Steam id in url.
+
+        Raises ExtractorError if the manifest lookup returns no manifest URL.
+        """
+        steamid = self._match_id(url)
+
+        broadcast_mpd_info = self._download_json('https://steamcommunity.com/broadcast/getbroadcastmpd/?steamid={0}&broadcastid=0'.format(steamid), steamid)
+        # Checked before any further request, so that a missing manifest is
+        # reported as a readable error rather than a bare KeyError
+        manifest_url = broadcast_mpd_info.get('url')
+        if not manifest_url:
+            raise ExtractorError('Unable to find a broadcast manifest for %s' % steamid, expected=True)
+        broadcast_id = broadcast_mpd_info['broadcastid']
+        broadcast_info = self._download_json('https://steamcommunity.com/broadcast/getbroadcastinfo/?steamid={0}&broadcastid={1}'.format(steamid, broadcast_id), steamid)
+
+        formats = self._extract_dash_manifest_formats(manifest_url, steamid)
+        self._sort_formats(formats)
+
+        return {
+            'id': steamid,
+            'title': broadcast_info['title'],
+            'formats': formats,
+            'is_live': True,
+        }