Merge remote-tracking branch 'origin/master'

2019-03-01 23:48:40 +11:00 · 2019-03-01 23:48:40 +11:00 · bdbbd28f35
commit bdbbd28f35
parent 4ffdcbdccd b5740349fa
11 changed files with 204 additions and 25 deletions
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -6,8 +6,8 @@

 ---

-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.02.18*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.02.18**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2019.03.01*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2019.03.01**

 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -36,7 +36,7 @@ Add the `-v` flag to **your command line** you run youtube-dl with (`youtube-dl
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2019.02.18
+[debug] youtube-dl version 2019.03.01
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+version 2019.03.01
+
+Core
++ [downloader/external] Add support for rate limit and retries for wget
+* [downloader/external] Fix infinite retries for curl (#19303)
+
+Extractors
+* [npo] Fix extraction (#20084)
+* [francetv:site] Extend video id regex (#20029, #20071)
++ [periscope] Extract width and height (#20015)
+* [servus] Fix extraction (#19297)
+* [bbccouk] Make subtitles non fatal (#19651)
+* [metacafe] Fix family filter bypass (#19287)
+
+
 version 2019.02.18

 Extractors
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -121,7 +121,11 @@ class CurlFD(ExternalFD):
        cmd += self._valueless_option('--silent', 'noprogress')
        cmd += self._valueless_option('--verbose', 'verbose')
        cmd += self._option('--limit-rate', 'ratelimit')
-        cmd += self._option('--retry', 'retries')
+        retry = self._option('--retry', 'retries')
+        if len(retry) == 2:
+            if retry[1] in ('inf', 'infinite'):
+                retry[1] = '2147483647'
+            cmd += retry
        cmd += self._option('--max-filesize', 'max_filesize')
        cmd += self._option('--interface', 'source_address')
        cmd += self._option('--proxy', 'proxy')
@@ -160,6 +164,12 @@ class WgetFD(ExternalFD):
        cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
        for key, val in info_dict['http_headers'].items():
            cmd += ['--header', '%s: %s' % (key, val)]
+        cmd += self._option('--limit-rate', 'ratelimit')
+        retry = self._option('--tries', 'retries')
+        if len(retry) == 2:
+            if retry[1] in ('inf', 'infinite'):
+                retry[1] = '0'
+            cmd += retry
        cmd += self._option('--bind-address', 'source_address')
        cmd += self._option('--proxy', 'proxy')
        cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate')
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -1,8 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import re
 import itertools
+import re
+import xml

 from .common import InfoExtractor
 from ..utils import (
@@ -17,6 +18,7 @@ from ..utils import (
    parse_iso8601,
    try_get,
    unescapeHTML,
+    url_or_none,
    urlencode_postdata,
    urljoin,
 )
@@ -310,7 +312,13 @@ class BBCCoUkIE(InfoExtractor):
    def _get_subtitles(self, media, programme_id):
        subtitles = {}
        for connection in self._extract_connections(media):
-            captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
+            cc_url = url_or_none(connection.get('href'))
+            if not cc_url:
+                continue
+            captions = self._download_xml(
+                cc_url, programme_id, 'Downloading captions', fatal=False)
+            if not isinstance(captions, xml.etree.ElementTree.Element):
+                continue
            lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
            subtitles[lang] = [
                {
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -271,7 +271,7 @@ class FranceTVSiteIE(FranceTVBaseInfoExtractor):

        catalogue = None
        video_id = self._search_regex(
-            r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
+            r'(?:data-main-video\s*=|videoId\s*:)\s*(["\'])(?P<id>(?:(?!\1).)+)\1',
            webpage, 'video id', default=None, group='id')

        if not video_id:
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -1,12 +1,13 @@
 from __future__ import unicode_literals

+import json
 import re

 from .common import InfoExtractor
 from ..compat import (
    compat_parse_qs,
+    compat_urllib_parse,
    compat_urllib_parse_unquote,
-    compat_urllib_parse_urlencode,
 )
 from ..utils import (
    determine_ext,
@@ -144,7 +145,7 @@ class MetacafeIE(InfoExtractor):

        headers = {
            # Disable family filter
-            'Cookie': 'user=%s; ' % compat_urllib_parse_urlencode({'ffilter': False})
+            'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False}))
        }

        # AnyClip videos require the flashversion cookie so that we get the link
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -12,11 +12,16 @@ from ..utils import (
    ExtractorError,
    fix_xml_ampersands,
    int_or_none,
+    merge_dicts,
    orderedSet,
    parse_duration,
    qualities,
+    str_or_none,
    strip_jsonp,
    unified_strdate,
+    unified_timestamp,
+    url_or_none,
+    urlencode_postdata,
 )


@@ -176,9 +181,118 @@ class NPOIE(NPOBaseIE):

    def _real_extract(self, url):
        video_id = self._match_id(url)
-        return self._get_info(video_id)
+        try:
+            return self._get_info(url, video_id)
+        except ExtractorError:
+            return self._get_old_info(video_id)

-    def _get_info(self, video_id):
+    def _get_info(self, url, video_id):
+        token = self._download_json(
+            'https://www.npostart.nl/api/token', video_id,
+            'Downloading token', headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+            })['token']
+
+        player = self._download_json(
+            'https://www.npostart.nl/player/%s' % video_id, video_id,
+            'Downloading player JSON', data=urlencode_postdata({
+                'autoplay': 0,
+                'share': 1,
+                'pageUrl': url,
+                'hasAdConsent': 0,
+                '_token': token,
+            }))
+
+        player_token = player['token']
+
+        format_urls = set()
+        formats = []
+        for profile in ('hls', 'dash-widevine', 'dash-playready', 'smooth'):
+            streams = self._download_json(
+                'https://start-player.npo.nl/video/%s/streams' % video_id,
+                video_id, 'Downloading %s profile JSON' % profile, fatal=False,
+                query={
+                    'profile': profile,
+                    'quality': 'npo',
+                    'tokenId': player_token,
+                    'streamType': 'broadcast',
+                })
+            if not streams:
+                continue
+            stream = streams.get('stream')
+            if not isinstance(stream, dict):
+                continue
+            stream_url = url_or_none(stream.get('src'))
+            if not stream_url or stream_url in format_urls:
+                continue
+            format_urls.add(stream_url)
+            if stream.get('protection') is not None:
+                continue
+            stream_type = stream.get('type')
+            stream_ext = determine_ext(stream_url)
+            if stream_type == 'application/dash+xml' or stream_ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    stream_url, video_id, mpd_id='dash', fatal=False))
+            elif stream_type == 'application/vnd.apple.mpegurl' or stream_ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, video_id, ext='mp4',
+                    entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+            elif '.ism/Manifest' in stream_url:
+                formats.extend(self._extract_ism_formats(
+                    stream_url, video_id, ism_id='mss', fatal=False))
+            else:
+                formats.append({
+                    'url': stream_url,
+                })
+
+        self._sort_formats(formats)
+
+        info = {
+            'id': video_id,
+            'title': video_id,
+            'formats': formats,
+        }
+
+        embed_url = url_or_none(player.get('embedUrl'))
+        if embed_url:
+            webpage = self._download_webpage(
+                embed_url, video_id, 'Downloading embed page', fatal=False)
+            if webpage:
+                video = self._parse_json(
+                    self._search_regex(
+                        r'\bvideo\s*=\s*({.+?})\s*;', webpage, 'video',
+                        default='{}'), video_id)
+                if video:
+                    title = video.get('episodeTitle')
+                    subtitles = {}
+                    subtitles_list = video.get('subtitles')
+                    if isinstance(subtitles_list, list):
+                        for cc in subtitles_list:
+                            cc_url = url_or_none(cc.get('src'))
+                            if not cc_url:
+                                continue
+                            lang = str_or_none(cc.get('language')) or 'nl'
+                            subtitles.setdefault(lang, []).append({
+                                'url': cc_url,
+                            })
+                    return merge_dicts({
+                        'title': title,
+                        'description': video.get('description'),
+                        'thumbnail': url_or_none(
+                            video.get('still_image_url') or video.get('orig_image_url')),
+                        'duration': int_or_none(video.get('duration')),
+                        'timestamp': unified_timestamp(video.get('broadcastDate')),
+                        'creator': video.get('channel'),
+                        'series': video.get('title'),
+                        'episode': title,
+                        'episode_number': int_or_none(video.get('episodeNumber')),
+                        'subtitles': subtitles,
+                    }, info)
+
+        return info
+
+    def _get_old_info(self, video_id):
        metadata = self._download_json(
            'http://e.omroep.nl/metadata/%s' % video_id,
            video_id,
@@ -280,7 +394,7 @@ class NPOIE(NPOBaseIE):
            # JSON
            else:
                video_url = stream_info.get('url')
-            if not video_url or video_url in urls:
+            if not video_url or 'vodnotavailable.' in video_url or video_url in urls:
                continue
            urls.add(video_url)
            if determine_ext(video_url) == 'm3u8':
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -5,6 +5,7 @@ import re

 from .common import InfoExtractor
 from ..utils import (
+    int_or_none,
    parse_iso8601,
    unescapeHTML,
 )
@@ -75,6 +76,14 @@ class PeriscopeIE(PeriscopeBaseIE):
            'url': broadcast[image],
        } for image in ('image_url', 'image_url_small') if broadcast.get(image)]

+        width = int_or_none(broadcast.get('width'))
+        height = int_or_none(broadcast.get('height'))
+
+        def add_width_and_height(f):
+            for key, val in (('width', width), ('height', height)):
+                if not f.get(key):
+                    f[key] = val
+
        video_urls = set()
        formats = []
        for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
@@ -83,16 +92,21 @@ class PeriscopeIE(PeriscopeBaseIE):
                continue
            video_urls.add(video_url)
            if format_id != 'rtmp':
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_formats = self._extract_m3u8_formats(
                    video_url, token, 'mp4',
                    entry_protocol='m3u8_native'
                    if state in ('ended', 'timed_out') else 'm3u8',
-                    m3u8_id=format_id, fatal=False))
+                    m3u8_id=format_id, fatal=False)
+                if len(m3u8_formats) == 1:
+                    add_width_and_height(m3u8_formats[0])
+                formats.extend(m3u8_formats)
                continue
-            formats.append({
+            rtmp_format = {
                'url': video_url,
                'ext': 'flv' if format_id == 'rtmp' else 'mp4',
-            })
+            }
+            add_width_and_height(rtmp_format)
+            formats.append(rtmp_format)
        self._sort_formats(formats)

        return {
--- a/youtube_dl/extractor/servus.py
+++ b/youtube_dl/extractor/servus.py
@@ -1,31 +1,44 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor


 class ServusIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?P<id>AA-\w+|\d+-\d+)'
+    _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)'
    _TESTS = [{
        'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
-        'md5': '046dee641cda1c4cabe13baef3be2c1c',
+        'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
        'info_dict': {
            'id': 'AA-1T6VBU5PW1W12',
            'ext': 'mp4',
-            'title': 'Die Grünen aus Volkssicht',
-            'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'title': 'Die Grünen aus Sicht des Volkes',
+            'description': 'md5:1247204d85783afe3682644398ff2ec4',
+            'thumbnail': r're:^https?://.*\.jpg',
        }
    }, {
        'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
        'only_matching': True,
+    }, {
+        'url': 'https://www.servus.com/tv/videos/aa-1t6vbu5pw1w12/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
-        video_id = self._match_id(url)
+        video_id = self._match_id(url).upper()
        webpage = self._download_webpage(url, video_id)

-        title = self._og_search_title(webpage)
+        title = self._search_regex(
+            (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
+            webpage, 'title', default=None,
+            group='title') or self._og_search_title(webpage)
+        title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
        description = self._og_search_description(webpage)
        thumbnail = self._og_search_thumbnail(webpage)

--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -502,7 +502,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        orig_url = url
-        if mobj.group('pro') or mobj.group('player'):
+        if mobj.group('pro'):
+            # some videos require portfolio_id to be present in player url
+            # https://github.com/rg3/youtube-dl/issues/20070
+            url = self._extract_url(url, self._download_webpage(url, video_id))
+        elif mobj.group('player'):
            url = 'https://player.vimeo.com/video/' + video_id
        elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
            url = 'https://vimeo.com/' + video_id
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2019.02.18'
+__version__ = '2019.03.01'