Merge branch 'master' into GoogleDrive-issue-13619

2017-08-04 19:47:05 -05:00 · 2017-08-04 19:47:05 -05:00 · 9cfc81df17
commit 9cfc81df17
parent 113c456728 1141e9104b
12 changed files with 123 additions and 93 deletions
--- a/README.md
+++ b/README.md
@ -584,7 +584,7 @@ If you are using an output template inside a Windows batch file then you must es

 #### Output template examples

-Note on Windows you may need to use double quotes instead of single.
+Note that on Windows you may need to use double quotes instead of single.

 ```bash
 $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc
@ -671,7 +671,7 @@ If you want to preserve the old format selection behavior (prior to youtube-dl 2

 #### Format selection examples

-Note on Windows you may need to use double quotes instead of single.
+Note that on Windows you may need to use double quotes instead of single.

 ```bash
 # Download best mp4 format available or any other best if no mp4 available
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -371,6 +371,19 @@ class TestFormatSelection(unittest.TestCase):
        ydl = YDL({'format': 'best[height>360]'})
        self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())

+    def test_format_selection_issue_10083(self):
+        # See https://github.com/rg3/youtube-dl/issues/10083
+        formats = [
+            {'format_id': 'regular', 'height': 360, 'url': TEST_URL},
+            {'format_id': 'video', 'height': 720, 'acodec': 'none', 'url': TEST_URL},
+            {'format_id': 'audio', 'vcodec': 'none', 'url': TEST_URL},
+        ]
+        info_dict = _make_result(formats)
+
+        ydl = YDL({'format': 'best[height>360]/bestvideo[height>360]+bestaudio'})
+        ydl.process_ie_result(info_dict.copy())
+        self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'video+audio')
+
    def test_invalid_format_specs(self):
        def assert_syntax_error(format_spec):
            ydl = YDL({'format': format_spec})
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@ -2,6 +2,7 @@ from __future__ import unicode_literals

 from .fragment import FragmentFD
 from ..compat import compat_urllib_error
+from ..utils import urljoin


 class DashSegmentsFD(FragmentFD):
@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD):
    FD_NAME = 'dashsegments'

    def real_download(self, filename, info_dict):
-        segments = info_dict['fragments'][:1] if self.params.get(
+        fragment_base_url = info_dict.get('fragment_base_url')
+        fragments = info_dict['fragments'][:1] if self.params.get(
            'test', False) else info_dict['fragments']

        ctx = {
            'filename': filename,
-            'total_frags': len(segments),
+            'total_frags': len(fragments),
        }

        self._prepare_and_start_frag_download(ctx)
@ -26,7 +28,7 @@ class DashSegmentsFD(FragmentFD):
        skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)

        frag_index = 0
-        for i, segment in enumerate(segments):
+        for i, fragment in enumerate(fragments):
            frag_index += 1
            if frag_index <= ctx['fragment_index']:
                continue
@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD):
            count = 0
            while count <= fragment_retries:
                try:
-                    success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
+                    fragment_url = fragment.get('url')
+                    if not fragment_url:
+                        assert fragment_base_url
+                        fragment_url = urljoin(fragment_base_url, fragment['path'])
+                    success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
                    if not success:
                        return False
                    self._append_fragment(ctx, frag_content)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -1892,9 +1892,13 @@ class InfoExtractor(object):
                                'Bandwidth': bandwidth,
                            }

+                        def location_key(location):
+                            return 'url' if re.match(r'^https?://', location) else 'path'
+
                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
@ -1904,7 +1908,7 @@ class InfoExtractor(object):
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
-                                    'url': media_template % {
+                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    },
@ -1928,7 +1932,7 @@ class InfoExtractor(object):
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
-                                        'url': segment_url,
+                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })

@ -1952,8 +1956,9 @@ class InfoExtractor(object):
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
+                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                    fragments.append({
-                                        'url': representation_ms_info['segment_urls'][segment_index],
+                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                                    })
                                    segment_index += 1
@ -1962,6 +1967,7 @@ class InfoExtractor(object):
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                            f.update({
+                                'fragment_base_url': base_url,
                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
@ -1969,10 +1975,8 @@ class InfoExtractor(object):
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
-                                f['fragments'].append({'url': initialization_url})
+                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
-                            for fragment in f['fragments']:
-                                fragment['url'] = urljoin(base_url, fragment['url'])
                        try:
                            existing_format = next(
                                fo for fo in formats
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -994,7 +994,6 @@ from .teachertube import (
 )
 from .teachingchannel import TeachingChannelIE
 from .teamcoco import TeamcocoIE
-from .teamfourstar import TeamFourStarIE
 from .techtalks import TechTalksIE
 from .ted import TEDIE
 from .tele13 import Tele13IE
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@ -15,7 +15,7 @@ class MLBIE(InfoExtractor):
                        (?:[\da-z_-]+\.)*mlb\.com/
                        (?:
                            (?:
-                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+                                (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)|
                                (?:
                                    shared/video/embed/(?:embed|m-internal-embed)\.html|
                                    (?:[^/]+/)+(?:play|index)\.jsp|
@ -84,7 +84,7 @@ class MLBIE(InfoExtractor):
        },
        {
            'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
-            'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+            'md5': 'aafaf5b0186fee8f32f20508092f8111',
            'info_dict': {
                'id': '75609783',
                'ext': 'mp4',
@ -94,6 +94,10 @@ class MLBIE(InfoExtractor):
                'upload_date': '20150415',
            }
        },
+        {
+            'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
+            'only_matching': True,
+        },
        {
            'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
            'only_matching': True,
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@ -189,7 +189,7 @@ class PBSIE(InfoExtractor):
           # Direct video URL
           (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
           # Article with embedded player (or direct video)
-           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
           # Player
           (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
        )
@ -345,6 +345,21 @@ class PBSIE(InfoExtractor):
                'formats': 'mincount:8',
            },
        },
+        {
+            # https://github.com/rg3/youtube-dl/issues/13801
+            'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+            'info_dict': {
+                'id': '3003333873',
+                'ext': 'mp4',
+                'title': 'PBS NewsHour - full episode July 31, 2017',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 3265,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
        {
            'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
            'only_matching': True,
@ -433,6 +448,9 @@ class PBSIE(InfoExtractor):
                if url:
                    break

+            if not url:
+                url = self._og_search_url(webpage)
+
            mobj = re.match(self._VALID_URL, url)

        player_id = mobj.group('player_id')
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor):
             r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')

        sources = self._parse_json(js_to_json(self._search_regex(
-            r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
+            r"(?s)sources'?\s*:\s*(\{.+?\})\s*\}[;,)]",
            webpage, 'sources', default='{}')), video_id)

        if not sources:
--- a/youtube_dl/extractor/teamfourstar.py
+++ b/youtube_dl/extractor/teamfourstar.py
@ -1,48 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from .jwplatform import JWPlatformIE
-from ..utils import unified_strdate
-
-
-class TeamFourStarIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)'
-    _TEST = {
-        'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/',
-        'info_dict': {
-            'id': '0WdZO31W',
-            'title': 'TFS Abridged Parody Episode 1',
-            'description': 'md5:d60bc389588ebab2ee7ad432bda953ae',
-            'ext': 'mp4',
-            'timestamp': 1394168400,
-            'upload_date': '20080508',
-        },
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        jwplatform_url = JWPlatformIE._extract_url(webpage)
-
-        video_title = self._html_search_regex(
-            r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>',
-            webpage, 'title')
-        video_date = unified_strdate(self._html_search_regex(
-            r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>',
-            webpage, 'date', fatal=False))
-        video_description = self._html_search_regex(
-            r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>',
-            webpage, 'description', fatal=False)
-        video_thumbnail = self._og_search_thumbnail(webpage)
-
-        return {
-            '_type': 'url_transparent',
-            'display_id': display_id,
-            'title': video_title,
-            'description': video_description,
-            'upload_date': video_date,
-            'thumbnail': video_thumbnail,
-            'url': jwplatform_url,
-        }
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@ -15,6 +15,7 @@ from ..utils import (
    ExtractorError,
    float_or_none,
    int_or_none,
+    js_to_json,
    sanitized_Request,
    unescapeHTML,
    urlencode_postdata,
@ -268,6 +269,25 @@ class UdemyIE(InfoExtractor):
                    f = add_output_format_meta(f, format_id)
                formats.append(f)

+        def extract_subtitles(track_list):
+            if not isinstance(track_list, list):
+                return
+            for track in track_list:
+                if not isinstance(track, dict):
+                    continue
+                if track.get('kind') != 'captions':
+                    continue
+                src = track.get('src')
+                if not src or not isinstance(src, compat_str):
+                    continue
+                lang = track.get('language') or track.get(
+                    'srclang') or track.get('label')
+                sub_dict = automatic_captions if track.get(
+                    'autogenerated') is True else subtitles
+                sub_dict.setdefault(lang, []).append({
+                    'url': src,
+                })
+
        download_urls = asset.get('download_urls')
        if isinstance(download_urls, dict):
            extract_formats(download_urls.get('Video'))
@ -315,23 +335,16 @@ class UdemyIE(InfoExtractor):
                extract_formats(data.get('sources'))
                if not duration:
                    duration = int_or_none(data.get('duration'))
-                tracks = data.get('tracks')
-                if isinstance(tracks, list):
-                    for track in tracks:
-                        if not isinstance(track, dict):
-                            continue
-                        if track.get('kind') != 'captions':
-                            continue
-                        src = track.get('src')
-                        if not src or not isinstance(src, compat_str):
-                            continue
-                        lang = track.get('language') or track.get(
-                            'srclang') or track.get('label')
-                        sub_dict = automatic_captions if track.get(
-                            'autogenerated') is True else subtitles
-                        sub_dict.setdefault(lang, []).append({
-                            'url': src,
-                        })
+                extract_subtitles(data.get('tracks'))
+
+            if not subtitles and not automatic_captions:
+                text_tracks = self._parse_json(
+                    self._search_regex(
+                        r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html,
+                        'text tracks', default='{}', group='data'), video_id,
+                    transform_source=lambda s: js_to_json(unescapeHTML(s)),
+                    fatal=False)
+                extract_subtitles(text_tracks)

        self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))

--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@ -3,7 +3,10 @@ from __future__ import unicode_literals
 import itertools

 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
    ExtractorError,
    int_or_none,
@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor):
                'or for violating the terms of use.',
                expected=True)

-        formats = [{
-            'format_id': f.get('type'),
-            'url': f['uri'],
-            'width': int_or_none(f.get('width')),
-            'height': int_or_none(f.get('height')),
-            'preference': 0 if f.get('type', '').endswith('clip') else 1,
-        } for f in video.get('formats', []) if f.get('uri')]
+        formats = []
+        for f in video.get('formats', []):
+            format_url = f.get('uri')
+            if not format_url or not isinstance(format_url, compat_str):
+                continue
+            format_type = f.get('type')
+            if format_type == 'dash':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+            elif format_type == 'hls':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+            else:
+                formats.append({
+                    'format_id': f.get('type'),
+                    'url': format_url,
+                    'width': int_or_none(f.get('width')),
+                    'height': int_or_none(f.get('height')),
+                    'preference': 0 if f.get('type', '').endswith(
+                        'clip') else 1,
+                })

        if not formats and video.get('complete_url'):
            formats.append({
--- a/youtube_dl/extractor/yandexdisk.py
+++ b/youtube_dl/extractor/yandexdisk.py
@ -13,9 +13,9 @@ from ..utils import (


 class YandexDiskIE(InfoExtractor):
-    _VALID_URL = r'https?://yadi\.sk/i/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'

-    _TEST = {
+    _TESTS = [{
        'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
        'md5': '33955d7ae052f15853dc41f35f17581c',
        'info_dict': {
@ -27,7 +27,10 @@ class YandexDiskIE(InfoExtractor):
            'uploader_id': '300043621',
            'view_count': int,
        },
-    }
+    }, {
+        'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
+        'only_matching': True,
+    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)