Merge branch 'master' into GoogleDrive-issue-13619

2017-07-20 15:55:52 -05:00 · 2017-07-20 15:55:52 -05:00 · 90793eef1d
commit 90793eef1d
parent 46982e74c7 0396806f67
11 changed files with 205 additions and 63 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
+version <unreleased>
+
+Extractors
+* [youku:show] Fix playlist extraction (#13248)
+ [dispeak] Recognize sevt subdomain (#13276)
+
+
 version 2017.07.15

 Core
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@ -41,6 +41,7 @@ def _make_result(formats, **kwargs):
        'id': 'testid',
        'title': 'testttitle',
        'extractor': 'testex',
+        'extractor_key': 'TestEx',
    }
    res.update(**kwargs)
    return res
@ -761,7 +762,8 @@ class TestYoutubeDL(unittest.TestCase):
                    '_type': 'url_transparent',
                    'url': 'foo2:',
                    'ie_key': 'Foo2',
-                    'title': 'foo1 title'
+                    'title': 'foo1 title',
+                    'id': 'foo1_id',
                }

        class Foo2IE(InfoExtractor):
@ -787,6 +789,9 @@ class TestYoutubeDL(unittest.TestCase):
        downloaded = ydl.downloaded_info_dicts[0]
        self.assertEqual(downloaded['url'], TEST_URL)
        self.assertEqual(downloaded['title'], 'foo1 title')
+        self.assertEqual(downloaded['id'], 'testid')
+        self.assertEqual(downloaded['extractor'], 'testex')
+        self.assertEqual(downloaded['extractor_key'], 'TestEx')


 if __name__ == '__main__':
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -860,7 +860,7 @@ class YoutubeDL(object):

            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
-            for f in ('_type', 'url', 'ie_key'):
+            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@ -13,7 +13,7 @@ from ..utils import (


 class DigitallySpeakingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+    _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'

    _TESTS = [{
        # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
        # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
        'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
        'only_matching': True,
+    }, {
+        # From http://www.gdcvault.com/play/1013700/Advanced-Material
+        'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
+        'only_matching': True,
    }]

    def _parse_mp4(self, metadata):
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@ -2,6 +2,11 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    try_get,
+    unified_timestamp,
+)


 class EggheadCourseIE(InfoExtractor):
@ -33,3 +38,47 @@ class EggheadCourseIE(InfoExtractor):
        return self.playlist_result(
            entries, playlist_id, course.get('title'),
            course.get('description'))
+
+
+class EggheadLessonIE(InfoExtractor):
+    IE_DESC = 'egghead.io lesson'
+    IE_NAME = 'egghead:lesson'
+    _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+        'info_dict': {
+            'id': 'fv5yotjxcg',
+            'ext': 'mp4',
+            'title': 'Create linear data flow with container style types (Box)',
+            'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
+            'thumbnail': r're:^https?:.*\.jpg$',
+            'timestamp': 1481296768,
+            'upload_date': '20161209',
+            'duration': 304,
+            'view_count': 0,
+            'tags': ['javascript', 'free'],
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        lesson_id = self._match_id(url)
+
+        lesson = self._download_json(
+            'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'Wistia',
+            'url': 'wistia:%s' % lesson['wistia_id'],
+            'id': lesson['wistia_id'],
+            'title': lesson.get('title'),
+            'description': lesson.get('summary'),
+            'thumbnail': lesson.get('thumb_nail'),
+            'timestamp': unified_timestamp(lesson.get('published_at')),
+            'duration': int_or_none(lesson.get('duration')),
+            'view_count': int_or_none(lesson.get('plays_count')),
+            'tags': try_get(lesson, lambda x: x['tag_list'], list),
+        }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -298,7 +298,10 @@ from .dw import (
 from .eagleplatform import EaglePlatformIE
 from .ebaumsworld import EbaumsWorldIE
 from .echomsk import EchoMskIE
-from .egghead import EggheadCourseIE
+from .egghead import (
+    EggheadCourseIE,
+    EggheadLessonIE,
+)
 from .ehow import EHowIE
 from .eighttracks import EightTracksIE
 from .einthusan import EinthusanIE
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@ -1,10 +1,14 @@
 from __future__ import unicode_literals

-import json
 import re

 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    float_or_none,
+    int_or_none,
+    unified_timestamp,
+)


 class FunnyOrDieIE(InfoExtractor):
@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
            'title': 'Heart-Shaped Box: Literal Video Version',
            'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
            'thumbnail': r're:^http:.*\.jpg$',
+            'uploader': 'DASjr',
+            'timestamp': 1317904928,
+            'upload_date': '20111006',
+            'duration': 318.3,
        },
    }, {
        'url': 'http://www.funnyordie.com/embed/e402820827',
@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
            'title': 'Please Use This Song (Jon Lajoie)',
            'description': 'Please use this to sell something.  www.jonlajoie.com',
            'thumbnail': r're:^http:.*\.jpg$',
+            'timestamp': 1398988800,
+            'upload_date': '20140502',
        },
        'params': {
            'skip_download': True,
@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
                'url': 'http://www.funnyordie.com%s' % src,
            }]

-        post_json = self._search_regex(
-            r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
-        post = json.loads(post_json)
+        timestamp = unified_timestamp(self._html_search_meta(
+            'uploadDate', webpage, 'timestamp', default=None))
+
+        uploader = self._html_search_regex(
+            r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
+            webpage, 'uploader', default=None)
+
+        title, description, thumbnail, duration = [None] * 4
+
+        medium = self._parse_json(
+            self._search_regex(
+                r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
+                default='{}'),
+            video_id, fatal=False)
+        if medium:
+            title = medium.get('title')
+            duration = float_or_none(medium.get('duration'))
+            if not timestamp:
+                timestamp = unified_timestamp(medium.get('publishDate'))
+
+        post = self._parse_json(
+            self._search_regex(
+                r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
+                default='{}'),
+            video_id, fatal=False)
+        if post:
+            if not title:
+                title = post.get('name')
+            description = post.get('description')
+            thumbnail = post.get('picture')
+
+        if not title:
+            title = self._og_search_title(webpage)
+        if not description:
+            description = self._og_search_description(webpage)
+        if not duration:
+            duration = int_or_none(self._html_search_meta(
+                ('video:duration', 'duration'), webpage, 'duration', default=False))

        return {
            'id': video_id,
-            'title': post['name'],
-            'description': post.get('description'),
-            'thumbnail': post.get('picture'),
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'timestamp': timestamp,
+            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -1569,27 +1569,6 @@ class GenericIE(InfoExtractor):
                'skip_download': True,
            },
        },
-        # Nexx iFrame embed
-        {
-            'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
-            'info_dict': {
-                'id': '161464',
-                'ext': 'mp4',
-                'title': 'Nervenkitzel Achterbahn',
-                'alt_title': 'Karussellbauer in Deutschland',
-                'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
-                'release_year': 2005,
-                'creator': 'SPIEGEL TV',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'duration': 2761,
-                'timestamp': 1394021479,
-                'upload_date': '20140305',
-            },
-            'params': {
-                'format': 'bestvideo',
-                'skip_download': True,
-            },
-        },
        # Facebook <iframe> embed
        {
            'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@ -122,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):

        },
        'playlist_count': 6,
+    }, {
+        # Nexx iFrame embed
+        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
    }]

    def _real_extract(self, url):
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@ -8,6 +8,9 @@ from ..utils import extract_attributes


 class TBSIE(TurnerBaseIE):
+    # https://github.com/rg3/youtube-dl/issues/13658
+    _WORKING = False
+
    _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
    _TESTS = [{
        'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
            'ext': 'mp4',
            'title': 'Theatrical Trailer',
            'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
-        }
+        },
+        'skip': 'TBS videos are deleted after a while',
    }, {
        'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
        'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
            'ext': 'mp4',
            'title': 'You Better Run',
            'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
-        }
+        },
+        'skip': 'TBS videos are deleted after a while',
    }]

    def _real_extract(self, url):
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@ -1,7 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import itertools
 import random
 import re
 import string
@ -14,7 +13,6 @@ from ..utils import (
    js_to_json,
    str_or_none,
    strip_jsonp,
-    urljoin,
 )


@ -222,17 +220,42 @@ class YoukuShowIE(InfoExtractor):
    _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
    IE_NAME = 'youku:show'

-    _TEST = {
+    _TESTS = [{
        'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
        'info_dict': {
            'id': 'zc7c670be07ff11e48b3f',
-            'title': '花千骨 未删减版',
+            'title': '花千骨 DVD版',
            'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
        },
        'playlist_count': 50,
-    }
+    }, {
+        # Episode number not starting from 1
+        'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
+        'info_dict': {
+            'id': 'zefbfbd70efbfbd780bef',
+            'title': '超级飞侠3',
+            'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
+        },
+        'playlist_count': 24,
+    }, {
+        # Ongoing playlist. The initial page is the last one
+        'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+        'only_matchine': True,
+    }]

-    _PAGE_SIZE = 40
+    def _extract_entries(self, playlist_data_url, show_id, note, query):
+        query['callback'] = 'cb'
+        playlist_data = self._download_json(
+            playlist_data_url, show_id, query=query, note=note,
+            transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+        drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
+                      get_element_by_class('p-drama-half-row', playlist_data))
+        if drama_list is None:
+            raise ExtractorError('No episodes found')
+        video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+        return playlist_data, [
+            self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+            for video_url in video_urls]

    def _real_extract(self, url):
        show_id = self._match_id(url)
@ -242,30 +265,29 @@ class YoukuShowIE(InfoExtractor):
        page_config = self._parse_json(self._search_regex(
            r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
            show_id, transform_source=js_to_json)
-        for idx in itertools.count(0):
-            if idx == 0:
-                playlist_data_url = 'http://list.youku.com/show/module'
-                query = {'id': page_config['showid'], 'tab': 'point'}
-            else:
-                playlist_data_url = 'http://list.youku.com/show/point'
+        first_page, initial_entries = self._extract_entries(
+            'http://list.youku.com/show/module', show_id,
+            note='Downloading initial playlist data page',
            query={
                'id': page_config['showid'],
-                    'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1),
-                }
-            query['callback'] = 'cb'
-            playlist_data = self._download_json(
-                playlist_data_url, show_id, query=query,
+                'tab': 'showInfo',
+            })
+        first_page_reload_id = self._html_search_regex(
+            r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+        # The first reload_id has the same items as first_page
+        reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+        for idx, reload_id in enumerate(reload_ids):
+            if reload_id == first_page_reload_id:
+                entries.extend(initial_entries)
+                continue
+            _, new_entries = self._extract_entries(
+                'http://list.youku.com/show/episode', show_id,
                note='Downloading playlist data page %d' % (idx + 1),
-                transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
-            video_urls = re.findall(
-                r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"',
-                playlist_data)
-            new_entries = [
-                self.url_result(urljoin(url, video_url), YoukuIE.ie_key())
-                for video_url in video_urls]
+                query={
+                    'id': page_config['showid'],
+                    'stage': reload_id,
+                })
            entries.extend(new_entries)
-            if len(new_entries) < self._PAGE_SIZE:
-                break

        desc = self._html_search_meta('description', webpage, fatal=False)
        playlist_title = desc.split(',')[0] if desc else None