Merge branch 'master' of https://github.com/rg3/youtube-dl

2016-05-27 08:47:31 +02:00 · 2016-05-27 08:47:31 +02:00 · 91ba156b43
commit 91ba156b43
parent d6318da4e7 77b8b4e696
9 changed files with 131 additions and 97 deletions
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -319,7 +319,7 @@ class F4mFD(FragmentFD):
        doc = compat_etree_fromstring(manifest)
        formats = [(int(f.attrib.get('bitrate', -1)), f)
                   for f in self._get_unencrypted_media(doc)]
-        if requested_bitrate is None:
+        if requested_bitrate is None or len(formats) == 1:
            # get the best format
            formats = sorted(formats, key=lambda f: f[0])
            rate, media = formats[-1]
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -987,7 +987,7 @@ class InfoExtractor(object):

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                             fatal=True):
+                             fatal=True, m3u8_id=None):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
@@ -1001,11 +1001,11 @@ class InfoExtractor(object):

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
-            transform_source=transform_source, fatal=fatal)
+            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                           fatal=True):
+                           fatal=True, m3u8_id=None):
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
@@ -1029,9 +1029,26 @@ class InfoExtractor(object):
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()
+
+        bootstrap_info = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
+            'bootstrap info', default=None)
+
        for i, media_el in enumerate(media_nodes):
+            tbr = int_or_none(media_el.attrib.get('bitrate'))
+            width = int_or_none(media_el.attrib.get('width'))
+            height = int_or_none(media_el.attrib.get('height'))
+            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+            # If <bootstrapInfo> is present, the specified f4m is a
+            # stream-level manifest, and only set-level manifests may refer to
+            # external resources.  See section 11.4 and section 4 of F4M spec
+            if bootstrap_info is None:
+                media_url = None
+                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
-                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+                    media_url = media_el.attrib.get('href')
+                if media_url is None:
+                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
@@ -1041,19 +1058,37 @@ class InfoExtractor(object):
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
-                if determine_ext(manifest_url) == 'f4m':
-                    formats.extend(self._extract_f4m_formats(
+                ext = determine_ext(manifest_url)
+                if ext == 'f4m':
+                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
-                        transform_source=transform_source, fatal=fatal))
+                        transform_source=transform_source, fatal=fatal)
+                    # Sometimes stream-level manifest contains single media entry that
+                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
+                    # At the same time parent's media entry in set-level manifest may
+                    # contain it. We will copy it from parent in such cases.
+                    if len(f4m_formats) == 1:
+                        f = f4m_formats[0]
+                        f.update({
+                            'tbr': f.get('tbr') or tbr,
+                            'width': f.get('width') or width,
+                            'height': f.get('height') or height,
+                            'format_id': f.get('format_id') if not tbr else format_id,
+                        })
+                    formats.extend(f4m_formats)
+                    continue
+                elif ext == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        manifest_url, video_id, 'mp4', preference=preference,
+                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
-            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
-                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
+                'format_id': format_id,
                'url': manifest_url,
-                'ext': 'flv',
+                'ext': 'flv' if bootstrap_info else None,
                'tbr': tbr,
-                'width': int_or_none(media_el.attrib.get('width')),
-                'height': int_or_none(media_el.attrib.get('height')),
+                'width': width,
+                'height': height,
                'preference': preference,
            })
        return formats
--- a/youtube_dl/extractor/dw.py
+++ b/youtube_dl/extractor/dw.py
@@ -2,13 +2,16 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
 from ..compat import compat_urlparse


 class DWIE(InfoExtractor):
    IE_NAME = 'dw'
-    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+(?:av|e)-(?P<id>\d+)'
    _TESTS = [{
        # video
        'url': 'http://www.dw.com/en/intelligent-light/av-19112290',
@@ -31,6 +34,16 @@ class DWIE(InfoExtractor):
            'description': 'md5:bc9ca6e4e063361e21c920c53af12405',
            'upload_date': '20160311',
        }
+    }, {
+        'url': 'http://www.dw.com/en/documentaries-welcome-to-the-90s-2016-05-21/e-19220158-9798',
+        'md5': '56b6214ef463bfb9a3b71aeb886f3cf1',
+        'info_dict': {
+            'id': '19274438',
+            'ext': 'mp4',
+            'title': 'Welcome to the 90s – Hip Hop',
+            'description': 'Welcome to the 90s - The Golden Decade of Hip Hop',
+            'upload_date': '20160521',
+        },
    }]

    def _real_extract(self, url):
@@ -38,6 +51,7 @@ class DWIE(InfoExtractor):
        webpage = self._download_webpage(url, media_id)
        hidden_inputs = self._hidden_inputs(webpage)
        title = hidden_inputs['media_title']
+        media_id = hidden_inputs.get('media_id') or media_id

        if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1':
            formats = self._extract_smil_formats(
@@ -49,13 +63,20 @@ class DWIE(InfoExtractor):
        else:
            formats = [{'url': hidden_inputs['file_name']}]

+        upload_date = hidden_inputs.get('display_date')
+        if not upload_date:
+            upload_date = self._html_search_regex(
+                r'<span[^>]+class="date">([0-9.]+)\s*\|', webpage,
+                'upload date', default=None)
+            upload_date = unified_strdate(upload_date)
+
        return {
            'id': media_id,
            'title': title,
            'description': self._og_search_description(webpage),
            'thumbnail': hidden_inputs.get('preview_image'),
            'duration': int_or_none(hidden_inputs.get('file_duration')),
-            'upload_date': hidden_inputs.get('display_date'),
+            'upload_date': upload_date,
            'formats': formats,
        }

--- a/youtube_dl/extractor/eporner.py
+++ b/youtube_dl/extractor/eporner.py
@@ -24,20 +24,10 @@ class EpornerIE(InfoExtractor):
            'view_count': int,
            'age_limit': 18,
        },
-    },
+    }, {
        # New (May 2016) URL layout
-    {
        'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/',
-        'md5': '3469eeaa93b6967a34cdbdbb9d064b33',
-        'info_dict': {
-            'id': '3YRUtzMcWn0',
-            'display_id': 'Star-Wars-XXX-Parody',
-            'ext': 'mp4',
-            'title': 'Star Wars XXX Parody',
-            'duration': 361.0,
-            'view_count': int,
-            'age_limit': 18,
-        },
+        'only_matching': True,
    }]

    def _real_extract(self, url):
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -833,7 +833,10 @@ from .tvc import (
 )
 from .tvigle import TvigleIE
 from .tvland import TVLandIE
-from .tvp import TvpIE, TvpSeriesIE
+from .tvp import (
+    TVPIE,
+    TVPSeriesIE,
+)
 from .tvplay import TVPlayIE
 from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
--- a/youtube_dl/extractor/playwire.py
+++ b/youtube_dl/extractor/playwire.py
@@ -4,9 +4,8 @@ import re

 from .common import InfoExtractor
 from ..utils import (
-    xpath_text,
+    dict_get,
    float_or_none,
-    int_or_none,
 )


@@ -23,6 +22,19 @@ class PlaywireIE(InfoExtractor):
            'duration': 145.94,
        },
    }, {
+        # m3u8 in f4m
+        'url': 'http://config.playwire.com/21772/videos/v2/4840492/zeus.json',
+        'info_dict': {
+            'id': '4840492',
+            'ext': 'mp4',
+            'title': 'ITV EL SHOW FULL',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }, {
+        # Multiple resolutions while bitrates missing
        'url': 'http://cdn.playwire.com/11625/embed/85228.html',
        'only_matching': True,
    }, {
@@ -48,25 +60,10 @@ class PlaywireIE(InfoExtractor):
        thumbnail = content.get('poster')
        src = content['media']['f4m']

-        f4m = self._download_xml(src, video_id)
-        base_url = xpath_text(f4m, './{http://ns.adobe.com/f4m/1.0}baseURL', 'base url', fatal=True)
-        formats = []
-        for media in f4m.findall('./{http://ns.adobe.com/f4m/1.0}media'):
-            media_url = media.get('url')
-            if not media_url:
-                continue
-            tbr = int_or_none(media.get('bitrate'))
-            width = int_or_none(media.get('width'))
-            height = int_or_none(media.get('height'))
-            f = {
-                'url': '%s/%s' % (base_url, media.attrib['url']),
-                'tbr': tbr,
-                'width': width,
-                'height': height,
-            }
-            if not (tbr or width or height):
-                f['quality'] = 1 if '-hd.' in media_url else 0
-            formats.append(f)
+        formats = self._extract_f4m_formats(src, video_id, m3u8_id='hls')
+        for a_format in formats:
+            if not dict_get(a_format, ['tbr', 'width', 'height']):
+                a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0
        self._sort_formats(formats)

        return {
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
 from __future__ import unicode_literals

 import re
@@ -6,20 +6,13 @@ import re
 from .common import InfoExtractor


-class TvpIE(InfoExtractor):
-    IE_NAME = 'tvp.pl'
-    _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
+class TVPIE(InfoExtractor):
+    IE_NAME = 'tvp'
+    IE_DESC = 'Telewizja Polska'
+    _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)'

    _TESTS = [{
-        'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
-        'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
-        'info_dict': {
-            'id': '4278035',
-            'ext': 'wmv',
-            'title': 'Ogniem i mieczem, odc. 2',
-        },
-    }, {
-        'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
+        'url': 'http://vod.tvp.pl/194536/i-seria-odc-13',
        'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
        'info_dict': {
            'id': '194536',
@@ -36,12 +29,22 @@ class TvpIE(InfoExtractor):
        },
    }, {
        'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
-        'md5': 'c3b15ed1af288131115ff17a17c19dda',
-        'info_dict': {
-            'id': '17834272',
-            'ext': 'mp4',
-            'title': 'Na sygnale, odc. 39',
-        },
+        'only_matching': True,
+    }, {
+        'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200',
+        'only_matching': True,
+    }, {
+        'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa',
+        'only_matching': True,
+    }, {
+        'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach',
+        'only_matching': True,
+    }, {
+        'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji',
+        'only_matching': True,
    }]

    def _real_extract(self, url):
@@ -92,8 +95,8 @@ class TvpIE(InfoExtractor):
        }


-class TvpSeriesIE(InfoExtractor):
-    IE_NAME = 'tvp.pl:Series'
+class TVPSeriesIE(InfoExtractor):
+    IE_NAME = 'tvp:series'
    _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'

    _TESTS = [{
@@ -127,7 +130,7 @@ class TvpSeriesIE(InfoExtractor):
        videos_paths = re.findall(
            '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
        entries = [
-            self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key())
+            self.url_result('http://vod.tvp.pl%s' % v_path, ie=TVPIE.ie_key())
            for v_path in videos_paths]

        return {
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -1,8 +1,7 @@
 # coding: utf-8
-from __future__ import division, unicode_literals
+from __future__ import unicode_literals

 import re
-import time

 from .common import InfoExtractor
 from ..utils import (
@@ -23,7 +22,7 @@ class VLiveIE(InfoExtractor):
        'info_dict': {
            'id': '1326',
            'ext': 'mp4',
-            'title': "[V] Girl's Day's Broadcast",
+            'title': "[V LIVE] Girl's Day's Broadcast",
            'creator': "Girl's Day",
            'view_count': int,
        },
@@ -35,24 +34,11 @@ class VLiveIE(InfoExtractor):
        webpage = self._download_webpage(
            'http://www.vlive.tv/video/%s' % video_id, video_id)

-        # UTC+x - UTC+9 (KST)
-        tz = time.altzone if time.localtime().tm_isdst == 1 else time.timezone
-        tz_offset = -tz // 60 - 9 * 60
-        self._set_cookie('vlive.tv', 'timezoneOffset', '%d' % tz_offset)
-
-        status_params = self._download_json(
-            'http://www.vlive.tv/video/status?videoSeq=%s' % video_id,
-            video_id, 'Downloading JSON status',
-            headers={'Referer': url.encode('utf-8')})
-        status = status_params.get('status')
-        air_start = status_params.get('onAirStartAt', '')
-        is_live = status_params.get('isLive')
-
        video_params = self._search_regex(
-            r'vlive\.tv\.video\.ajax\.request\.handler\.init\((.+)\)',
+            r'\bvlive\.video\.init\(([^)]+)\)',
            webpage, 'video params')
-        live_params, long_video_id, key = re.split(
-            r'"\s*,\s*"', video_params)[1:4]
+        status, _, _, live_params, long_video_id, key = re.split(
+            r'"\s*,\s*"', video_params)[2:8]

        if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR':
            live_params = self._parse_json('"%s"' % live_params, video_id)
@@ -61,8 +47,6 @@ class VLiveIE(InfoExtractor):
        elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO':
            if long_video_id and key:
                return self._replay(video_id, webpage, long_video_id, key)
-            elif is_live:
-                status = 'LIVE_END'
            else:
                status = 'COMING_SOON'

@@ -70,7 +54,7 @@ class VLiveIE(InfoExtractor):
            raise ExtractorError('Uploading for replay. Please wait...',
                                 expected=True)
        elif status == 'COMING_SOON':
-            raise ExtractorError('Coming soon! %s' % air_start, expected=True)
+            raise ExtractorError('Coming soon!', expected=True)
        elif status == 'CANCELED':
            raise ExtractorError('We are sorry, '
                                 'but the live broadcast has been canceled.',
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1035,6 +1035,7 @@ def unified_strdate(date_str, day_first=True):
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
+            '%d.%m.%y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
@@ -1910,7 +1911,7 @@ def parse_age_limit(s):

 def strip_jsonp(code):
    return re.sub(
-        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)


 def js_to_json(code):