From c96eca426b6b4c963fafd3f3268ea1f39b3e2857 Mon Sep 17 00:00:00 2001 From: Philip Huppert Date: Mon, 5 Oct 2015 00:41:20 +0200 Subject: [PATCH 01/25] [mixcloud] Added support for user uploads, playlists, favorites and listens. Fixes #3750 and #5272 --- youtube_dl/extractor/extractors.py | 6 +- youtube_dl/extractor/mixcloud.py | 200 ++++++++++++++++++++++++++++- 2 files changed, 203 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 04c6508f1..b06b717f5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -411,7 +411,11 @@ from .minoto import MinotoIE from .miomio import MioMioIE from .mit import TechTVMITIE, MITIE, OCWMITIE from .mitele import MiTeleIE -from .mixcloud import MixcloudIE +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE +) from .mlb import MLBIE from .mnet import MnetIE from .mpora import MporaIE diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 9638cc9e6..20e64bab5 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -3,18 +3,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from ..compat import ( + compat_urllib_parse_unquote, + compat_urllib_request +) from ..utils import ( ExtractorError, HEADRequest, NO_DEFAULT, parse_count, str_to_int, + clean_html ) class MixcloudIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/([^/]+)' + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)' IE_NAME = 'mixcloud' _TESTS = [{ @@ -115,3 +119,195 @@ class MixcloudIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, } + + +class MixcloudUserIE(InfoExtractor): + """ + Information extractor for Mixcloud users. + It can retrieve a list of a user's uploads, favorites or listens. 
+ """ + + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P[^/]+)/(?Puploads|favorites|listens)?/?$' + IE_NAME = 'mixcloud:user' + + _TESTS = [{ + 'url': 'http://www.mixcloud.com/dholbach/', + 'info_dict': { + 'id': 'dholbach/uploads', + 'title': 'Daniel Holbach (uploads)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'playlist_mincount': 11 + }, { + 'url': 'http://www.mixcloud.com/dholbach/uploads/', + 'info_dict': { + 'id': 'dholbach/uploads', + 'title': 'Daniel Holbach (uploads)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'playlist_mincount': 11 + }, { + 'url': 'http://www.mixcloud.com/dholbach/favorites/', + 'info_dict': { + 'id': 'dholbach/favorites', + 'title': 'Daniel Holbach (favorites)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'playlist_mincount': 244 + }, { + 'url': 'http://www.mixcloud.com/dholbach/listens/', + 'info_dict': { + 'id': 'dholbach/listens', + 'title': 'Daniel Holbach (listens)', + 'description': 'md5:327af72d1efeb404a8216c27240d1370', + }, + 'playlist_mincount': 846 + }] + + def _fetch_tracks(self, base_url, video_id, dl_note=None, dl_errnote=None): + # retrieve all fragments of a list of tracks with fake AJAX calls + track_urls = [] + current_page = 1 + while True: + # fake a AJAX request to retrieve a list fragment + page_url = base_url + "?page=%d&list=main&_ajax=1" % current_page + req = compat_urllib_request.Request(page_url, headers={"X-Requested-With": "XMLHttpRequest"}, method="GET") + resp = self._download_webpage(req, video_id, note=dl_note + " (page %d)" % current_page, errnote=dl_errnote) + + # extract all track URLs from fragment + urls = re.findall(r'm-play-button m-url="(?P[^"]+)"', resp) + # clean up URLs + urls = map(clean_html, urls) + # create absolute URLs + urls = map(lambda u: "https://www.mixcloud.com" + u, urls) + track_urls.extend(urls) + + # advance to next fragment, if any + if " m-next-page-url=" in resp: + current_page += 1 + else: + break + + return track_urls + + def _handle_track_urls(self, urls): + return map(lambda u: self.url_result(u, "Mixcloud"), urls) + + def _get_user_description(self, page_content): + return self._html_search_regex( + r'
<div class="description-text">.*?<p>(?P<description>.*?)</p></div></div></div>
', + page_content, + "user description", + group="description", + fatal=False, + default="") + + def _get_username(self, page_content): + return self._og_search_title(page_content) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group("user") + list_type = mobj.group("type") + + # if only a profile URL was supplied, default to download all uploads + if list_type is None: + list_type = "uploads" + + video_id = "%s/%s" % (user_id, list_type) + + # download the user's profile to retrieve some metadata + profile = self._download_webpage("https://www.mixcloud.com/%s/" % user_id, + video_id, + note="Downloading user profile", + errnote="Unable to download user profile") + + username = self._get_username(profile) + description = self._get_user_description(profile) + + # retrieve all page fragments of uploads, favorites or listens + track_urls = self._fetch_tracks( + "https://www.mixcloud.com/%s/%s/" % (user_id, list_type), + video_id, + dl_note="Downloading list of %s" % list_type, + dl_errnote="Unable to download list of %s" % list_type) + + # let MixcloudIE handle each track URL + entries = self._handle_track_urls(track_urls) + + return { + '_type': 'playlist', + 'entries': entries, + 'title': "%s (%s)" % (username, list_type), + 'id': video_id, + "description": description + } + + +class MixcloudPlaylistIE(MixcloudUserIE): + """ + Information extractor for Mixcloud playlists. + """ + + _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P[^/]+)/playlists/(?P[^/]+)/?$' + IE_NAME = 'mixcloud:playlist' + + _TESTS = [{ + 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/', + 'info_dict': { + 'id': 'RedBullThre3style/playlists/tokyo-finalists-2015', + 'title': 'National Champions 2015', + 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3', + }, + 'playlist_mincount': 16 + }, { + 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', + 'info_dict': { + 'id': 'maxvibes/playlists/jazzcat-on-ness-radio', + 'title': 'Jazzcat on Ness Radio', + 'description': 'md5:c2c51a1f1b8bb5442f2ca67c3dc4af27', + }, + 'playlist_mincount': 23 + }] + + def _get_playlist_title(self, page_content): + return self._html_search_regex( + r'(?P.*?)</span>', + page_content, + "playlist title", + group="title", + fatal=True + ) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group("user") + playlist_id = mobj.group("playlist") + video_id = "%s/playlists/%s" % (user_id, playlist_id) + + # download the playlist page to retrieve some metadata + profile = self._download_webpage(url, + user_id, + note="Downloading playlist page", + errnote="Unable to download playlist page") + + description = self._get_user_description(profile) + playlist_title = self._get_playlist_title(profile) + + # retrieve all page fragments of playlist + track_urls = self._fetch_tracks( + "https://www.mixcloud.com/%s/playlists/%s/" % (user_id, playlist_id), + video_id, + dl_note="Downloading tracklist of %s" % playlist_title, + dl_errnote="Unable to tracklist of %s" % playlist_title) + + # let MixcloudIE handle each track + entries = self._handle_track_urls(track_urls) + + return { + '_type': 'playlist', + 'entries': entries, + 'title': playlist_title, + 'id': video_id, + "description": description + } From f896e1ccefc5d946c83f91bf517801882f4184dc Mon Sep 17 00:00:00 2001 From: Philip Huppert <philip@zeilen-sprung.de> Date: Sat, 9 Apr 2016 14:30:01 +0200 Subject: [PATCH 02/25] [mixcloud] fixed some tests --- 
youtube_dl/extractor/mixcloud.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 20e64bab5..dcc4ddf25 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -25,7 +25,7 @@ class MixcloudIE(InfoExtractor): 'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/', 'info_dict': { 'id': 'dholbach-cryptkeeper', - 'ext': 'mp3', + 'ext': 'm4a', 'title': 'Cryptkeeper', 'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.', 'uploader': 'Daniel Holbach', @@ -265,7 +265,7 @@ class MixcloudPlaylistIE(MixcloudUserIE): 'info_dict': { 'id': 'maxvibes/playlists/jazzcat-on-ness-radio', 'title': 'Jazzcat on Ness Radio', - 'description': 'md5:c2c51a1f1b8bb5442f2ca67c3dc4af27', + 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263', }, 'playlist_mincount': 23 }] From dcaf00fb3eb716ea146a8e8870c7753b9acf67b8 Mon Sep 17 00:00:00 2001 From: Philip Huppert <philip@zeilen-sprung.de> Date: Sat, 9 Apr 2016 22:20:16 +0200 Subject: [PATCH 03/25] [mixcloud] support older urllib versions --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index dcc4ddf25..769b68dc3 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -171,7 +171,7 @@ class MixcloudUserIE(InfoExtractor): while True: # fake a AJAX request to retrieve a list fragment page_url = base_url + "?page=%d&list=main&_ajax=1" % current_page - req = compat_urllib_request.Request(page_url, headers={"X-Requested-With": "XMLHttpRequest"}, method="GET") + req = compat_urllib_request.Request(page_url, headers={"X-Requested-With": "XMLHttpRequest"}) resp = self._download_webpage(req, video_id, note=dl_note + " (page %d)" % current_page, errnote=dl_errnote) # extract all track URLs from fragment From 6d671695092236af75c49e8a74d19b771e1b4d79 Mon Sep 17 00:00:00 2001 From: Philip Huppert <philip@zeilen-sprung.de> Date: Sun, 10 Apr 2016 15:53:17 +0200 Subject: [PATCH 04/25] [mixcloud] improved extraction of user description --- youtube_dl/extractor/mixcloud.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 769b68dc3..30b33e7e9 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -195,12 +195,10 @@ class MixcloudUserIE(InfoExtractor): def _get_user_description(self, page_content): return self._html_search_regex( - r'<div class="description-text">.*?<p>(?P<description>.*?)</p></div></div></div>', + r'<div class="description-text">.*?<p>(.*?)</p></div></div></div>', page_content, "user description", - group="description", - fatal=False, - default="") + fatal=False) def _get_username(self, page_content): return self._og_search_title(page_content) From b8f67449ecafa76d4d925c7ad72f9c8f8338aba7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20Michaj=C5=82ow?= <kasper93@gmail.com> Date: Thu, 31 Mar 2016 20:42:55 +0200 Subject: [PATCH 05/25] [generic] Add support for LiveLeak embeds --- youtube_dl/extractor/generic.py | 18 ++++++++++++++++++ youtube_dl/extractor/liveleak.py | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 2aadd6a12..5b22b6b5e 100644 --- a/youtube_dl/extractor/generic.py +++ 
b/youtube_dl/extractor/generic.py @@ -60,6 +60,7 @@ from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .instagram import InstagramIE +from .liveleak import LiveLeakIE class GenericIE(InfoExtractor): @@ -1140,6 +1141,18 @@ class GenericIE(InfoExtractor): 'upload_date': '20160409', }, }, + # LiveLeak embed + { + 'url': 'http://www.wykop.pl/link/3088787/', + 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'info_dict': { + 'id': '874_1459135191', + 'ext': 'mp4', + 'title': 'Man shows poor quality of new apartment building', + 'description': 'The wall is like a sand pile.', + 'uploader': 'Lake8737', + } + }, ] def report_following_redirect(self, new_url): @@ -1944,6 +1957,11 @@ class GenericIE(InfoExtractor): if instagram_embed_url is not None: return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + # Look for LiveLeak embeds + liveleak_url = LiveLeakIE._extract_url(webpage) + if liveleak_url: + return self.url_result(liveleak_url, 'LiveLeak') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index 4684994e1..29fba5f30 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -53,6 +53,14 @@ class LiveLeakIE(InfoExtractor): } }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)', + webpage) + if mobj: + return 'http://www.liveleak.com/view?i=%s' % mobj.group('id') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) From b0ba11cc64656c09f349117fffa9739dcbb2541e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Wed, 13 Apr 2016 08:02:03 +0200 Subject: [PATCH 06/25] release 2016.04.13 --- .github/ISSUE_TEMPLATE.md | 6 +++--- CONTRIBUTING.md | 4 ++-- docs/supportedsites.md | 3 ++- youtube_dl/version.py | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index bf9494646..caed64e38 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.06*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.06** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.13*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. 
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.13** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v <your command line> [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.04.06 +[debug] youtube-dl version 2016.04.13 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0df6193fb..c83b8655a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -140,14 +140,14 @@ After you have ensured this site is distributing it's content legally, you can f # TODO more properties (see youtube_dl/extractor/common.py) } ``` -5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). 6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L68-L226). Add tests and code for as many as you want. 8. Keep in mind that the only mandatory fields in info dict for successful extraction process are `id`, `title` and either `url` or `formats`, i.e. these are the critical data the extraction does not make any sense without. This means that [any field](https://github.com/rg3/youtube-dl/blob/58525c94d547be1c8167d16c298bdd75506db328/youtube_dl/extractor/common.py#L138-L226) apart from aforementioned mandatory ones should be treated **as optional** and extraction should be **tolerate** to situations when sources for these fields can potentially be unavailable (even if they always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields. For example, if you have some intermediate dict `meta` that is a source of metadata and it has a key `summary` that you want to extract and put into resulting info dict as `description`, you should be ready that this key may be missing from the `meta` dict, i.e. you should extract it as `meta.get('summary')` and not `meta['summary']`. Similarly, you should pass `fatal=False` when extracting data from a webpage with `_search_regex/_html_search_regex`. 9. Check the code with [flake8](https://pypi.python.org/pypi/flake8). 10. 
When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: - $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py $ git commit -m '[yourextractor] Add new extractor' $ git push origin yourextractor diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d6ee8476b..51a6b5609 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -115,6 +115,7 @@ - **Cinemassacre** - **Clipfish** - **cliphunter** + - **ClipRs** - **Clipsyndicate** - **cloudtime**: CloudTime - **Cloudy** @@ -286,7 +287,6 @@ - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV - **Izlesene** - - **JadoreCettePub** - **JeuxVideo** - **Jove** - **jpopsuki.tv** @@ -484,6 +484,7 @@ - **Pornotube** - **PornoVoisines** - **PornoXO** + - **PressTV** - **PrimeShareTV** - **PromptFile** - **prosiebensat1**: ProSiebenSat.1 Digital diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 167b16e24..0618d9a4f 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.04.06' +__version__ = '2016.04.13' From 8334637f4ac4a1aeddc6f05131be1c42ea6761e1 Mon Sep 17 00:00:00 2001 From: "aystroganov@gmail.com" <aystroganov@gmail.com> Date: Wed, 13 Apr 2016 21:30:32 +1000 Subject: [PATCH 07/25] Make tbr field 'int' rather than 'tuple' Closes #9180. --- youtube_dl/extractor/brightcove.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index c718cf385..f0781fc27 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -340,7 +340,7 @@ class BrightcoveLegacyIE(InfoExtractor): ext = 'flv' if ext is None: ext = determine_ext(url) - tbr = int_or_none(rend.get('encodingRate'), 1000), + tbr = int_or_none(rend.get('encodingRate'), 1000) a_format = { 'format_id': 'http%s' % ('-%s' % tbr if tbr else ''), 'url': url, From f141fefab73d96a6dd2c927f56d0b86253991c02 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 14 Apr 2016 14:06:05 +0800 Subject: [PATCH 08/25] [karrierevideos] Fix extraction The server serves malformed header "Content Type: text/xml" for the XML request (it should be Content-Type but not Content Type). Python 3.x, which uses email.feedparser rejects such headers. As a result, Content-Encoding header is not parsed, so the returned content is kept not decompressed, and thus XML parsing error. 
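The fix below passes an explicit Accept-Encoding header through _download_xml so the response never needs to be decompressed. A rough standalone sketch of the same idea, using only the standard library and a placeholder URL (illustration only, not youtube-dl code):

    import urllib.request

    # Placeholder endpoint; the real playlist URL appears in the hunk below.
    req = urllib.request.Request(
        'http://example.com/player-playlist.xml.php?p=VIDEO_ID',
        headers={'Accept-Encoding': 'identity'})  # request an uncompressed body
    with urllib.request.urlopen(req) as resp:
        # Assuming the server honours the header, the body is plain XML and no
        # Content-Encoding-driven decompression step is required.
        xml_body = resp.read().decode('utf-8')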
--- youtube_dl/extractor/karrierevideos.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py index 2cb04e533..c05263e61 100644 --- a/youtube_dl/extractor/karrierevideos.py +++ b/youtube_dl/extractor/karrierevideos.py @@ -52,9 +52,12 @@ class KarriereVideosIE(InfoExtractor): video_id = self._search_regex( r'/config/video/(.+?)\.xml', webpage, 'video id') + # Server returns malformed headers + # Force Accept-Encoding: * to prevent gzipped results playlist = self._download_xml( 'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id, - video_id, transform_source=fix_xml_ampersands) + video_id, transform_source=fix_xml_ampersands, + headers={'Accept-Encoding': '*'}) NS_MAP = { 'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats' From d1c4e4ba150562fcf7aedef26646dff7425ccd73 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 14 Apr 2016 14:11:28 +0800 Subject: [PATCH 09/25] [laola1tv] Improve error detection and skip an invalid test --- youtube_dl/extractor/laola1tv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index d4fbafece..2fab38079 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -63,6 +63,7 @@ class Laola1TvIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This live stream has already finished.', }] def _real_extract(self, url): @@ -74,6 +75,9 @@ class Laola1TvIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + if 'Dieser Livestream ist bereits beendet.' in webpage: + raise ExtractorError('This live stream has already finished.', expected=True) + iframe_url = self._search_regex( r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', webpage, 'iframe url') From f5d8743e0a1fdcbfed2bea4fb87bf5aaf40c1dfa Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 14 Apr 2016 15:07:31 +0800 Subject: [PATCH 10/25] [downloader/rtsp] Print the command --- youtube_dl/downloader/rtsp.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/downloader/rtsp.py b/youtube_dl/downloader/rtsp.py index 3eb29526c..939358b2a 100644 --- a/youtube_dl/downloader/rtsp.py +++ b/youtube_dl/downloader/rtsp.py @@ -27,6 +27,8 @@ class RtspFD(FileDownloader): self.report_error('MMS or RTSP download detected but neither "mplayer" nor "mpv" could be run. Please install any.') return False + self._debug_cmd(args) + retval = subprocess.call(args) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) From 73d93f948ee71b2a07d46fdc4d446255d8ddcc9f Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 14 Apr 2016 15:08:01 +0800 Subject: [PATCH 11/25] [lecture2go] Fix extraction RTSP stream fails to download. 
Seems it's a mpv bug as direct playback works well: $ mpv --ytdl-format rtsp https://lecture2go.uni-hamburg.de/veranstaltungen/-/v/17473 --- youtube_dl/extractor/lecture2go.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/lecture2go.py b/youtube_dl/extractor/lecture2go.py index 40a3d2346..81b5d41be 100644 --- a/youtube_dl/extractor/lecture2go.py +++ b/youtube_dl/extractor/lecture2go.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + determine_protocol, parse_duration, int_or_none, ) @@ -18,10 +19,14 @@ class Lecture2GoIE(InfoExtractor): 'md5': 'ac02b570883020d208d405d5a3fd2f7f', 'info_dict': { 'id': '17473', - 'ext': 'flv', + 'ext': 'mp4', 'title': '2 - Endliche Automaten und reguläre Sprachen', 'creator': 'Frank Heitmann', 'duration': 5220, + }, + 'params': { + # m3u8 download + 'skip_download': True, } } @@ -32,14 +37,18 @@ class Lecture2GoIE(InfoExtractor): title = self._html_search_regex(r'<em[^>]+class="title">(.+)</em>', webpage, 'title') formats = [] - for url in set(re.findall(r'"src","([^"]+)"', webpage)): + for url in set(re.findall(r'var\s+playerUri\d+\s*=\s*"([^"]+)"', webpage)): ext = determine_ext(url) + protocol = determine_protocol({'url': url}) if ext == 'f4m': - formats.extend(self._extract_f4m_formats(url, video_id)) + formats.extend(self._extract_f4m_formats(url, video_id, f4m_id='hds')) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(url, video_id)) + formats.extend(self._extract_m3u8_formats(url, video_id, ext='mp4', m3u8_id='hls')) else: + if protocol == 'rtmp': + continue # XXX: currently broken formats.append({ + 'format_id': protocol, 'url': url, }) From 86475d59b1a6892dca5a8eb1ef3f05639ee3ab6a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Thu, 14 Apr 2016 15:12:59 +0800 Subject: [PATCH 12/25] [metacritic] Add a new valid test case --- youtube_dl/extractor/metacritic.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index e30320569..444ec0310 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -11,7 +11,7 @@ from ..utils import ( class MetacriticIE(InfoExtractor): _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', 'info_dict': { 'id': '3698222', @@ -20,7 +20,17 @@ class MetacriticIE(InfoExtractor): 'description': 'Take a peak behind-the-scenes to see how Sucker Punch brings smoke into the universe of inFAMOUS Second Son on the PS4.', 'duration': 221, }, - } + 'skip': 'Not providing trailers anymore', + }, { + 'url': 'http://www.metacritic.com/game/playstation-4/tales-from-the-borderlands-a-telltale-game-series/trailers/5740315', + 'info_dict': { + 'id': '5740315', + 'ext': 'mp4', + 'title': 'Tales from the Borderlands - Finale: The Vault of the Traveler', + 'description': 'In the final episode of the season, all hell breaks loose. 
Jack is now in control of Helios\' systems, and he\'s ready to reclaim his rightful place as king of Hyperion (with or without you).', + 'duration': 114, + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) From 5565be9dd9bfce361dd2c77cc0b1fc735a908b4e Mon Sep 17 00:00:00 2001 From: remitamine <remitamine@gmail.com> Date: Thu, 14 Apr 2016 08:47:55 +0100 Subject: [PATCH 13/25] [aol] relex _VALID_URL regex --- youtube_dl/extractor/aol.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index d4801a25b..24df8fe93 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -12,9 +12,10 @@ from ..utils import ( class AolIE(InfoExtractor): IE_NAME = 'on.aol.com' - _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/video/.*-)(?P<id>[^/?-]+)' + _VALID_URL = r'(?:aol-video:|https?://on\.aol\.com/.*-)(?P<id>[^/?-]+)' _TESTS = [{ + # video with 5min ID 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', 'md5': '18ef68f48740e86ae94b98da815eec42', 'info_dict': { @@ -31,6 +32,7 @@ class AolIE(InfoExtractor): 'skip_download': True, } }, { + # video with vidible ID 'url': 'http://on.aol.com/video/netflix-is-raising-rates-5707d6b8e4b090497b04f706?context=PC:homepage:PL1944:1460189336183', 'info_dict': { 'id': '5707d6b8e4b090497b04f706', @@ -45,6 +47,12 @@ class AolIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://on.aol.com/partners/abc-551438d309eab105804dbfe8/sneak-peek-was-haley-really-framed-570eaebee4b0448640a5c944', + 'only_matching': True, + }, { + 'url': 'http://on.aol.com/shows/park-bench-shw518173474-559a1b9be4b0c3bfad3357a7?context=SH:SHW518173474:PL4327:1460619712763', + 'only_matching': True, }] def _real_extract(self, url): From 404284132c7877767a5b751d2ada2e064f75bd10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Apr 2016 21:52:05 +0600 Subject: [PATCH 14/25] [arte:info] Add extractor (Closes #9182) --- youtube_dl/extractor/arte.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index f042d9163..662230a90 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -232,6 +232,21 @@ class ArteTVCreativeIE(ArteTVPlus7IE): }] +class ArteTVInfoIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:info' + _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + + _TEST = { + 'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere', + 'info_dict': { + 'id': '067528-000-A', + 'ext': 'mp4', + 'title': 'Service civique, un cache misère ?', + 'upload_date': '20160403', + }, + } + + class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)' From 9e285387260a019d7471c3bdbd52cc764c0e8700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Thu, 14 Apr 2016 21:54:41 +0600 Subject: [PATCH 15/25] [arte:creative] Improve _VALID_URL --- youtube_dl/extractor/arte.py | 5 ++++- youtube_dl/extractor/extractors.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 662230a90..a9e3266dc 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -210,7 +210,7 @@ class 
ArteTVPlus7IE(InfoExtractor): # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' - _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:magazine?/)?(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', @@ -229,6 +229,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE): 'description': 'Événement ! Quarante-cinq ans après leurs premiers succès, les légendaires Monty Python remontent sur scène.\n', 'upload_date': '20140805', } + }, { + 'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde', + 'only_matching': True, }] diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 04c6508f1..2ae9bc9a8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -46,6 +46,7 @@ from .arte import ( ArteTVPlus7IE, ArteTVCreativeIE, ArteTVConcertIE, + ArteTVInfoIE, ArteTVFutureIE, ArteTVCinemaIE, ArteTVDDCIE, From e1bf277e19ff41d0d899f544749b8d9505fb6689 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 15 Apr 2016 02:41:02 +0800 Subject: [PATCH 16/25] [tdslifeway] Add TDSLifewayIE Used by MinistryGridIE --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/tdslifeway.py | 56 ++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 youtube_dl/extractor/tdslifeway.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2ae9bc9a8..0a75a56c5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -730,6 +730,7 @@ from .sztvhu import SztvHuIE from .tagesschau import TagesschauIE from .tapely import TapelyIE from .tass import TassIE +from .tdslifeway import TDSLifewayIE from .teachertube import ( TeacherTubeIE, TeacherTubeUserIE, diff --git a/youtube_dl/extractor/tdslifeway.py b/youtube_dl/extractor/tdslifeway.py new file mode 100644 index 000000000..34c7a13fd --- /dev/null +++ b/youtube_dl/extractor/tdslifeway.py @@ -0,0 +1,56 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class TDSLifewayIE(InfoExtractor): + _VALID_URL = r'https?://tds\.lifeway\.com/v1/trainingdeliverysystem/courses/(?P<id>\d+)/index\.html' + + _TEST = { + # From http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers + 'url': 
'http://tds.lifeway.com/v1/trainingdeliverysystem/courses/3453494717001/index.html?externalRegistration=AssetId%7C34F466F1-78F3-4619-B2AB-A8EFFA55E9E9%21InstanceId%7C0%21UserId%7Caaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa&grouping=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&activity_id=http%3A%2F%2Flifeway.com%2Fvideo%2F3453494717001&content_endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2Fcontent%2F&actor=%7B%22name%22%3A%5B%22Guest%20Guest%22%5D%2C%22account%22%3A%5B%7B%22accountServiceHomePage%22%3A%22http%3A%2F%2Fscorm.lifeway.com%2F%22%2C%22accountName%22%3A%22aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa%22%7D%5D%2C%22objectType%22%3A%22Agent%22%7D&content_token=462a50b2-b6f9-4970-99b1-930882c499fb®istration=93d6ec8e-7f7b-4ed3-bbc8-a857913c0b2a&externalConfiguration=access%7CFREE%21adLength%7C-1%21assignOrgId%7C4AE36F78-299A-425D-91EF-E14A899B725F%21assignOrgParentId%7C%21courseId%7C%21isAnonymous%7Cfalse%21previewAsset%7Cfalse%21previewLength%7C-1%21previewMode%7Cfalse%21royalty%7CFREE%21sessionId%7C671422F9-8E79-48D4-9C2C-4EE6111EA1CD%21trackId%7C&auth=Basic%20OjhmZjk5MDBmLTBlYTMtNDJhYS04YjFlLWE4MWQ3NGNkOGRjYw%3D%3D&endpoint=http%3A%2F%2Ftds.lifeway.com%2Fv1%2Ftrainingdeliverysystem%2FScormEngineInterface%2FTCAPI%2F', + 'info_dict': { + 'id': '3453494717001', + 'ext': 'mp4', + 'title': 'The Gospel by Numbers', + 'thumbnail': 're:^https?://.*\.jpg', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + # XXX: A generic brightcove function? + json_data = self._download_json( + 'http://api.brightcove.com/services/library', video_id, + query={ + 'command': 'find_video_by_id', + 'video_id': video_id, + 'video_fields': 'id,name,videoStillURL,HLSURL,FLVURL', + 'media_delivery': 'http', + # token extracted from http://tds.lifeway.com/v1/trainingdeliverysystem/courses/player_test.js + 'token': 'MrrNjVSP15NGY3R0gipp-lvclofucPXKD3skFouJMjZXM3KOS2ch0g..', + }) + + formats = [] + + if 'HLSURL' in json_data: + formats.extend(self._extract_m3u8_formats( + json_data['HLSURL'], video_id, ext='mp4', m3u8_id='hls', fatal=False)) + if 'FLVURL' in json_data: + formats.append({ + 'url': json_data['FLVURL'], + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': json_data['name'], + 'thumbnail': json_data.get('videoStillURL'), + 'formats': formats, + } From 8cb57bab8e98c667c13e1da22b96d97bc37cfe78 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 15 Apr 2016 02:47:07 +0800 Subject: [PATCH 17/25] [ministrygrid] Fix extraction and modernize --- youtube_dl/extractor/ministrygrid.py | 30 +++++++++++----------------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py index 949ad11db..000989873 100644 --- a/youtube_dl/extractor/ministrygrid.py +++ b/youtube_dl/extractor/ministrygrid.py @@ -1,8 +1,5 @@ from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -20,21 +17,22 @@ class MinistryGridIE(InfoExtractor): 'id': '3453494717001', 'ext': 'mp4', 'title': 'The Gospel by Numbers', - 'description': 'Coming soon from T4G 2014!', - 'uploader': 'LifeWay Christian Resources (MG)', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = 
self._match_id(url) webpage = self._download_webpage(url, video_id) - portlets_json = self._search_regex( - r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list') - portlets = json.loads(portlets_json) + portlets = self._parse_json(self._search_regex( + r'Liferay\.Portlet\.list=(\[.+?\])', webpage, 'portlet list'), + video_id) pl_id = self._search_regex( - r'<!--\s*p_l_id - ([0-9]+)<br>', webpage, 'p_l_id') + r'getPlid:function\(\){return"(\d+)"}', webpage, 'p_l_id') for i, portlet in enumerate(portlets): portlet_url = 'http://www.ministrygrid.com/c/portal/render_portlet?p_l_id=%s&p_p_id=%s' % (pl_id, portlet) @@ -46,12 +44,8 @@ class MinistryGridIE(InfoExtractor): r'<iframe.*?src="([^"]+)"', portlet_code, 'video iframe', default=None) if video_iframe_url: - surl = smuggle_url( - video_iframe_url, {'force_videoid': video_id}) - return { - '_type': 'url', - 'id': video_id, - 'url': surl, - } + return self.url_result( + smuggle_url(video_iframe_url, {'force_videoid': video_id}), + video_id=video_id) raise ExtractorError('Could not find video iframe in any portlets') From 74b47d00c3d807f91b0c24781077cb9100403bd5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 15 Apr 2016 03:30:38 +0800 Subject: [PATCH 18/25] [xboxclips] Use http:// URL xboxclips has misconfigured certificates --- youtube_dl/extractor/xboxclips.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xboxclips.py b/youtube_dl/extractor/xboxclips.py index 236ff403b..b113ab1c4 100644 --- a/youtube_dl/extractor/xboxclips.py +++ b/youtube_dl/extractor/xboxclips.py @@ -12,7 +12,7 @@ from ..utils import ( class XboxClipsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' _TEST = { - 'url': 'https://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', + 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', From b0cf2e7c1b844e533c447572b6979ae64f7e2870 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 15 Apr 2016 03:48:23 +0800 Subject: [PATCH 19/25] [ubu] Remove extractor 1. Videos on ubu.com are now hosted on Vimeo 2. 
The duration is far from correct, and may not exist on other videos (For example http://ubu.com/film/hammons_king.html) --- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/ubu.py | 57 ------------------------------ 2 files changed, 58 deletions(-) delete mode 100644 youtube_dl/extractor/ubu.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 0a75a56c5..d9193349d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -833,7 +833,6 @@ from .twitter import ( TwitterIE, TwitterAmplifyIE, ) -from .ubu import UbuIE from .udemy import ( UdemyIE, UdemyCourseIE diff --git a/youtube_dl/extractor/ubu.py b/youtube_dl/extractor/ubu.py deleted file mode 100644 index 1d52cbc98..000000000 --- a/youtube_dl/extractor/ubu.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - qualities, -) - - -class UbuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ubu\.com/film/(?P<id>[\da-z_-]+)\.html' - _TEST = { - 'url': 'http://ubu.com/film/her_noise.html', - 'md5': '138d5652618bf0f03878978db9bef1ee', - 'info_dict': { - 'id': 'her_noise', - 'ext': 'm4v', - 'title': 'Her Noise - The Making Of (2007)', - 'duration': 3600, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_regex( - r'<title>.+?Film & Video: ([^<]+)', webpage, 'title') - - duration = int_or_none(self._html_search_regex( - r'Duration: (\d+) minutes', webpage, 'duration', fatal=False), - invscale=60) - - formats = [] - FORMAT_REGEXES = [ - ('sq', r"'flashvars'\s*,\s*'file=([^']+)'"), - ('hq', r'href="(http://ubumexico\.centro\.org\.mx/video/[^"]+)"'), - ] - preference = qualities([fid for fid, _ in FORMAT_REGEXES]) - for format_id, format_regex in FORMAT_REGEXES: - m = re.search(format_regex, webpage) - if m: - formats.append({ - 'url': m.group(1), - 'format_id': format_id, - 'preference': preference(format_id), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'duration': duration, - 'formats': formats, - } From f2159c9815fa056ca1d4ef4a6d1c31c4847b3d47 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Fri, 15 Apr 2016 04:02:23 +0800 Subject: [PATCH 20/25] [wayofthemaster] Remove extractor Now it's using YouTube embeds. 
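Such pages are covered by the generic extractor's embed detection, much like the LiveLeak embeds handled in PATCH 05/25. A hypothetical helper for YouTube iframes, sketched here only to show the pattern (the helper name and regex are illustrative, not existing youtube-dl code):

    import re

    def _extract_youtube_embed(webpage):
        # Find an embedded YouTube player iframe and return the canonical
        # watch URL so the dedicated YouTube extractor can take over.
        mobj = re.search(
            r'<iframe[^>]+src=["\'](?:https?:)?//(?:www\.)?youtube\.com/embed/(?P<id>[\w-]{11})',
            webpage)
        if mobj:
            return 'https://www.youtube.com/watch?v=%s' % mobj.group('id')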
--- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/wayofthemaster.py | 52 -------------------------- 2 files changed, 53 deletions(-) delete mode 100644 youtube_dl/extractor/wayofthemaster.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index d9193349d..3148869d9 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -917,7 +917,6 @@ from .vulture import VultureIE from .walla import WallaIE from .washingtonpost import WashingtonPostIE from .wat import WatIE -from .wayofthemaster import WayOfTheMasterIE from .wdr import ( WDRIE, WDRMobileIE, diff --git a/youtube_dl/extractor/wayofthemaster.py b/youtube_dl/extractor/wayofthemaster.py deleted file mode 100644 index af7bb8b49..000000000 --- a/youtube_dl/extractor/wayofthemaster.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor - - -class WayOfTheMasterIE(InfoExtractor): - _VALID_URL = r'https?://www\.wayofthemaster\.com/([^/?#]*/)*(?P[^/?#]+)\.s?html(?:$|[?#])' - - _TEST = { - 'url': 'http://www.wayofthemaster.com/hbks.shtml', - 'md5': '5316b57487ada8480606a93cb3d18d24', - 'info_dict': { - 'id': 'hbks', - 'ext': 'mp4', - 'title': 'Intelligent Design vs. Evolution', - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) - - title = self._search_regex( - r'(.*?)', webpage, 'page title') - - url_base = self._search_regex( - r' Date: Fri, 15 Apr 2016 04:28:54 +0800 Subject: [PATCH 21/25] [tdslifeway] Use the new Brightcove API Thanks for @remitamine's suggestion. --- youtube_dl/extractor/ministrygrid.py | 6 ++++ youtube_dl/extractor/tdslifeway.py | 41 ++++++---------------------- 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py index 000989873..e48eba3fa 100644 --- a/youtube_dl/extractor/ministrygrid.py +++ b/youtube_dl/extractor/ministrygrid.py @@ -17,11 +17,17 @@ class MinistryGridIE(InfoExtractor): 'id': '3453494717001', 'ext': 'mp4', 'title': 'The Gospel by Numbers', + 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20140410', + 'description': 'Coming soon from T4G 2014!', + 'uploader_id': '2034960640001', + 'timestamp': 1397145591, }, 'params': { # m3u8 download 'skip_download': True, }, + 'add_ie': ['TDSLifeway'], } def _real_extract(self, url): diff --git a/youtube_dl/extractor/tdslifeway.py b/youtube_dl/extractor/tdslifeway.py index 34c7a13fd..4d1f5c801 100644 --- a/youtube_dl/extractor/tdslifeway.py +++ b/youtube_dl/extractor/tdslifeway.py @@ -14,43 +14,20 @@ class TDSLifewayIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Gospel by Numbers', 'thumbnail': 're:^https?://.*\.jpg', + 'upload_date': '20140410', + 'description': 'Coming soon from T4G 2014!', + 'uploader_id': '2034960640001', + 'timestamp': 1397145591, }, 'params': { # m3u8 download 'skip_download': True, }, + 'add_ie': ['BrightcoveNew'], } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2034960640001/default_default/index.html?videoId=%s' + def _real_extract(self, url): - video_id = self._match_id(url) - - # XXX: A generic brightcove function? 
- json_data = self._download_json( - 'http://api.brightcove.com/services/library', video_id, - query={ - 'command': 'find_video_by_id', - 'video_id': video_id, - 'video_fields': 'id,name,videoStillURL,HLSURL,FLVURL', - 'media_delivery': 'http', - # token extracted from http://tds.lifeway.com/v1/trainingdeliverysystem/courses/player_test.js - 'token': 'MrrNjVSP15NGY3R0gipp-lvclofucPXKD3skFouJMjZXM3KOS2ch0g..', - }) - - formats = [] - - if 'HLSURL' in json_data: - formats.extend(self._extract_m3u8_formats( - json_data['HLSURL'], video_id, ext='mp4', m3u8_id='hls', fatal=False)) - if 'FLVURL' in json_data: - formats.append({ - 'url': json_data['FLVURL'], - }) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': json_data['name'], - 'thumbnail': json_data.get('videoStillURL'), - 'formats': formats, - } + brightcove_id = self._match_id(url) + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) From 9c250931f5e1e68a835065c0cc5fa58e3f1e4734 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Thu, 14 Apr 2016 22:19:52 +0800 Subject: [PATCH 22/25] [mixcloud] Improve and simplify mixcloud:user and mixcloud:playlist --- youtube_dl/extractor/mixcloud.py | 198 ++++++++++++------------------- 1 file changed, 76 insertions(+), 122 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 30b33e7e9..5cf42198d 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,19 +1,21 @@ from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse_unquote, - compat_urllib_request + compat_urlparse, ) from ..utils import ( + clean_html, ExtractorError, HEADRequest, + OnDemandPagedList, NO_DEFAULT, parse_count, str_to_int, - clean_html ) @@ -121,191 +123,143 @@ class MixcloudIE(InfoExtractor): } -class MixcloudUserIE(InfoExtractor): - """ - Information extractor for Mixcloud users. - It can retrieve a list of a user's uploads, favorites or listens. 
- """ +class MixcloudPlaylistBaseIE(InfoExtractor): + _PAGE_SIZE = 24 + def _fetch_tracks_page(self, path, video_id, page_name, current_page): + resp = self._download_webpage( + 'https://www.mixcloud.com/%s/' % path, video_id, + note='Download %s (page %d)' % (page_name, current_page + 1), + errnote='Unable to download %s' % page_name, + query={'page': (current_page + 1), 'list': 'main', '_ajax': '1'}, + headers={'X-Requested-With': 'XMLHttpRequest'}) + + for url in re.findall(r'm-play-button m-url="(?P[^"]+)"', resp): + yield self.url_result( + compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)), + MixcloudIE.ie_key()) + + def _get_user_description(self, page_content): + return self._html_search_regex( + r']+class="description-text"[^>]*>(.+?)', + page_content, 'user description', fatal=False) + + +class MixcloudUserIE(MixcloudPlaylistBaseIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P[^/]+)/(?Puploads|favorites|listens)?/?$' IE_NAME = 'mixcloud:user' _TESTS = [{ 'url': 'http://www.mixcloud.com/dholbach/', 'info_dict': { - 'id': 'dholbach/uploads', + 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', 'description': 'md5:327af72d1efeb404a8216c27240d1370', }, - 'playlist_mincount': 11 + 'playlist_mincount': 11, }, { 'url': 'http://www.mixcloud.com/dholbach/uploads/', 'info_dict': { - 'id': 'dholbach/uploads', + 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', 'description': 'md5:327af72d1efeb404a8216c27240d1370', }, - 'playlist_mincount': 11 + 'playlist_mincount': 11, }, { 'url': 'http://www.mixcloud.com/dholbach/favorites/', 'info_dict': { - 'id': 'dholbach/favorites', + 'id': 'dholbach_favorites', 'title': 'Daniel Holbach (favorites)', 'description': 'md5:327af72d1efeb404a8216c27240d1370', }, - 'playlist_mincount': 244 + 'params': { + 'playlist_items': '1-100', + }, + 'playlist_mincount': 100, }, { 'url': 'http://www.mixcloud.com/dholbach/listens/', 'info_dict': { - 'id': 'dholbach/listens', + 'id': 'dholbach_listens', 'title': 'Daniel Holbach (listens)', 'description': 'md5:327af72d1efeb404a8216c27240d1370', }, - 'playlist_mincount': 846 + 'params': { + 'playlist_items': '1-100', + }, + 'playlist_mincount': 100, }] - def _fetch_tracks(self, base_url, video_id, dl_note=None, dl_errnote=None): - # retrieve all fragments of a list of tracks with fake AJAX calls - track_urls = [] - current_page = 1 - while True: - # fake a AJAX request to retrieve a list fragment - page_url = base_url + "?page=%d&list=main&_ajax=1" % current_page - req = compat_urllib_request.Request(page_url, headers={"X-Requested-With": "XMLHttpRequest"}) - resp = self._download_webpage(req, video_id, note=dl_note + " (page %d)" % current_page, errnote=dl_errnote) - - # extract all track URLs from fragment - urls = re.findall(r'm-play-button m-url="(?P[^"]+)"', resp) - # clean up URLs - urls = map(clean_html, urls) - # create absolute URLs - urls = map(lambda u: "https://www.mixcloud.com" + u, urls) - track_urls.extend(urls) - - # advance to next fragment, if any - if " m-next-page-url=" in resp: - current_page += 1 - else: - break - - return track_urls - - def _handle_track_urls(self, urls): - return map(lambda u: self.url_result(u, "Mixcloud"), urls) - - def _get_user_description(self, page_content): - return self._html_search_regex( - r'
<div class="description-text">.*?<p>(.*?)</p></div></div></div>
', - page_content, - "user description", - fatal=False) - - def _get_username(self, page_content): - return self._og_search_title(page_content) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - user_id = mobj.group("user") - list_type = mobj.group("type") + user_id = mobj.group('user') + list_type = mobj.group('type') # if only a profile URL was supplied, default to download all uploads if list_type is None: - list_type = "uploads" + list_type = 'uploads' - video_id = "%s/%s" % (user_id, list_type) + video_id = '%s_%s' % (user_id, list_type) - # download the user's profile to retrieve some metadata - profile = self._download_webpage("https://www.mixcloud.com/%s/" % user_id, - video_id, - note="Downloading user profile", - errnote="Unable to download user profile") + profile = self._download_webpage( + 'https://www.mixcloud.com/%s/' % user_id, video_id, + note='Downloading user profile', + errnote='Unable to download user profile') - username = self._get_username(profile) + username = self._og_search_title(profile) description = self._get_user_description(profile) - # retrieve all page fragments of uploads, favorites or listens - track_urls = self._fetch_tracks( - "https://www.mixcloud.com/%s/%s/" % (user_id, list_type), - video_id, - dl_note="Downloading list of %s" % list_type, - dl_errnote="Unable to download list of %s" % list_type) + entries = OnDemandPagedList( + functools.partial( + self._fetch_tracks_page, + '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type), + self._PAGE_SIZE, use_cache=True) - # let MixcloudIE handle each track URL - entries = self._handle_track_urls(track_urls) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': "%s (%s)" % (username, list_type), - 'id': video_id, - "description": description - } + return self.playlist_result( + entries, video_id, '%s (%s)' % (username, list_type), description) -class MixcloudPlaylistIE(MixcloudUserIE): - """ - Information extractor for Mixcloud playlists. 
- """ - +class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P[^/]+)/playlists/(?P[^/]+)/?$' IE_NAME = 'mixcloud:playlist' _TESTS = [{ 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/', 'info_dict': { - 'id': 'RedBullThre3style/playlists/tokyo-finalists-2015', + 'id': 'RedBullThre3style_tokyo-finalists-2015', 'title': 'National Champions 2015', 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3', }, - 'playlist_mincount': 16 + 'playlist_mincount': 16, }, { 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', 'info_dict': { - 'id': 'maxvibes/playlists/jazzcat-on-ness-radio', + 'id': 'maxvibes_jazzcat-on-ness-radio', 'title': 'Jazzcat on Ness Radio', 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263', }, 'playlist_mincount': 23 }] - def _get_playlist_title(self, page_content): - return self._html_search_regex( - r'(?P.*?)</span>', - page_content, - "playlist title", - group="title", - fatal=True - ) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - user_id = mobj.group("user") - playlist_id = mobj.group("playlist") - video_id = "%s/playlists/%s" % (user_id, playlist_id) + user_id = mobj.group('user') + playlist_id = mobj.group('playlist') + video_id = '%s_%s' % (user_id, playlist_id) - # download the playlist page to retrieve some metadata - profile = self._download_webpage(url, - user_id, - note="Downloading playlist page", - errnote="Unable to download playlist page") + profile = self._download_webpage( + url, user_id, + note='Downloading playlist page', + errnote='Unable to download playlist page') description = self._get_user_description(profile) - playlist_title = self._get_playlist_title(profile) + playlist_title = self._html_search_regex( + r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>', + profile, 'playlist title') - # retrieve all page fragments of playlist - track_urls = self._fetch_tracks( - "https://www.mixcloud.com/%s/playlists/%s/" % (user_id, playlist_id), - video_id, - dl_note="Downloading tracklist of %s" % playlist_title, - dl_errnote="Unable to tracklist of %s" % playlist_title) + entries = OnDemandPagedList( + functools.partial( + self._fetch_tracks_page, + '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'), + self._PAGE_SIZE) - # let MixcloudIE handle each track - entries = self._handle_track_urls(track_urls) - - return { - '_type': 'playlist', - 'entries': entries, - 'title': playlist_title, - 'id': video_id, - "description": description - } + return self.playlist_result(entries, video_id, playlist_title, description) From dd91dfcd67a0d1db25836f734579742ff73b0c66 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 15 Apr 2016 15:42:00 +0800 Subject: [PATCH 23/25] [mixcloud] Fix extraction by decrypting play info Fixes #7521 --- youtube_dl/extractor/mixcloud.py | 43 +++++++++++++++----------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 5cf42198d..d4a4963ee 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -1,19 +1,20 @@ from __future__ import unicode_literals +import base64 import functools import re from .common import InfoExtractor from ..compat import ( + compat_chr, + compat_ord, compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( clean_html, ExtractorError, - HEADRequest, OnDemandPagedList, - NO_DEFAULT, 
parse_count, str_to_int, ) @@ -45,22 +46,22 @@ class MixcloudIE(InfoExtractor): 'description': 'md5:2b8aec6adce69f9d41724647c65875e8', 'uploader': 'Gilles Peterson Worldwide', 'uploader_id': 'gillespeterson', - 'thumbnail': 're:https?://.*/images/', + 'thumbnail': 're:https?://.*', 'view_count': int, 'like_count': int, }, }] - def _check_url(self, url, track_id, ext): - try: - # We only want to know if the request succeed - # don't download the whole file - self._request_webpage( - HEADRequest(url), track_id, - 'Trying %s URL' % ext) - return True - except ExtractorError: - return False + # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js + @staticmethod + def _decrypt_play_info(play_info): + KEY = 'pleasedontdownloadourmusictheartistswontgetpaid' + + play_info = base64.b64decode(play_info.encode('ascii')) + + return ''.join([ + compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)])) + for idx, ch in enumerate(play_info)]) def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -74,19 +75,15 @@ class MixcloudIE(InfoExtractor): r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)', webpage, 'error message', default=None) - preview_url = self._search_regex( - r'\s(?:data-preview-url|m-preview)="([^"]+)"', - webpage, 'preview url', default=None if message else NO_DEFAULT) + encrypted_play_info = self._search_regex( + r'm-play-info="([^"]+)"', webpage, 'play info') + play_info = self._parse_json( + self._decrypt_play_info(encrypted_play_info), track_id) - if message: + if message and 'stream_url' not in play_info: raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - song_url = re.sub(r'audiocdn(\d+)', r'stream\1', preview_url) - song_url = song_url.replace('/previews/', '/c/originals/') - if not self._check_url(song_url, track_id, 'mp3'): - song_url = song_url.replace('.mp3', '.m4a').replace('originals/', 'm4a/64/') - if not self._check_url(song_url, track_id, 'm4a'): - raise ExtractorError('Unable to extract track url') + song_url = play_info['stream_url'] PREFIX = ( r'm-play-on-spacebar[^>]+' From e6da9240d44774495a7ae0f2780bd42e4d2628f5 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan <yan12125@gmail.com> Date: Fri, 15 Apr 2016 17:14:17 +0800 Subject: [PATCH 24/25] [mixcloud:stream] Add new extractor Closes #7633 --- youtube_dl/extractor/extractors.py | 3 +- youtube_dl/extractor/mixcloud.py | 79 +++++++++++++++++++++++++----- 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 51c9a4719..d00445b3c 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -415,7 +415,8 @@ from .mitele import MiTeleIE from .mixcloud import ( MixcloudIE, MixcloudUserIE, - MixcloudPlaylistIE + MixcloudPlaylistIE, + MixcloudStreamIE, ) from .mlb import MLBIE from .mnet import MnetIE diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index d4a4963ee..483f6925f 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 import functools +import itertools import re from .common import InfoExtractor @@ -123,19 +124,27 @@ class MixcloudIE(InfoExtractor): class MixcloudPlaylistBaseIE(InfoExtractor): _PAGE_SIZE = 24 - def _fetch_tracks_page(self, path, video_id, page_name, current_page): - resp = self._download_webpage( - 'https://www.mixcloud.com/%s/' % path, 
From e6da9240d44774495a7ae0f2780bd42e4d2628f5 Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan <yan12125@gmail.com>
Date: Fri, 15 Apr 2016 17:14:17 +0800
Subject: [PATCH 24/25] [mixcloud:stream] Add new extractor

Closes #7633
---
 youtube_dl/extractor/extractors.py |  3 +-
 youtube_dl/extractor/mixcloud.py   | 79 +++++++++++++++++++++++++-----
 2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 51c9a4719..d00445b3c 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -415,7 +415,8 @@ from .mitele import MiTeleIE
 from .mixcloud import (
     MixcloudIE,
     MixcloudUserIE,
-    MixcloudPlaylistIE
+    MixcloudPlaylistIE,
+    MixcloudStreamIE,
 )
 from .mlb import MLBIE
 from .mnet import MnetIE
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index d4a4963ee..483f6925f 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
 
 import base64
 import functools
+import itertools
 import re
 
 from .common import InfoExtractor
@@ -123,19 +124,27 @@ class MixcloudIE(InfoExtractor):
 class MixcloudPlaylistBaseIE(InfoExtractor):
     _PAGE_SIZE = 24
 
-    def _fetch_tracks_page(self, path, video_id, page_name, current_page):
-        resp = self._download_webpage(
-            'https://www.mixcloud.com/%s/' % path, video_id,
-            note='Download %s (page %d)' % (page_name, current_page + 1),
-            errnote='Unable to download %s' % page_name,
-            query={'page': (current_page + 1), 'list': 'main', '_ajax': '1'},
-            headers={'X-Requested-With': 'XMLHttpRequest'})
-
-        for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', resp):
+    def _find_urls_in_page(self, page):
+        for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
             yield self.url_result(
                 compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
                 MixcloudIE.ie_key())
 
+    def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
+        real_page_number = real_page_number or current_page + 1
+        return self._download_webpage(
+            'https://www.mixcloud.com/%s/' % path, video_id,
+            note='Download %s (page %d)' % (page_name, current_page + 1),
+            errnote='Unable to download %s' % page_name,
+            query={'page': real_page_number, 'list': 'main', '_ajax': '1'},
+            headers={'X-Requested-With': 'XMLHttpRequest'})
+
+    def _tracks_page_func(self, page, video_id, page_name, current_page):
+        resp = self._fetch_tracks_page(page, video_id, page_name, current_page)
+
+        for item in self._find_urls_in_page(resp):
+            yield item
+
     def _get_user_description(self, page_content):
         return self._html_search_regex(
             r'<div[^>]+class="description-text"[^>]*>(.+?)</div>',
@@ -207,7 +216,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
 
         entries = OnDemandPagedList(
             functools.partial(
-                self._fetch_tracks_page,
+                self._tracks_page_func,
                 '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type),
             self._PAGE_SIZE, use_cache=True)
 
@@ -255,8 +264,56 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
 
         entries = OnDemandPagedList(
             functools.partial(
-                self._fetch_tracks_page,
+                self._tracks_page_func,
                 '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
             self._PAGE_SIZE)
 
         return self.playlist_result(entries, video_id, playlist_title, description)
+
+
+class MixcloudStreamIE(MixcloudPlaylistBaseIE):
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
+    IE_NAME = 'mixcloud:stream'
+
+    _TEST = {
+        'url': 'https://www.mixcloud.com/FirstEar/stream/',
+        'info_dict': {
+            'id': 'FirstEar',
+            'title': 'First Ear',
+            'description': 'Curators of good music\nfirstearmusic.com',
+        },
+        'playlist_mincount': 192,
+    }
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, user_id)
+
+        entries = []
+        prev_page_url = None
+
+        def _handle_page(page):
+            entries.extend(self._find_urls_in_page(page))
+            return self._search_regex(
+                r'm-next-page-url="([^"]+)"', page,
+                'next page URL', default=None)
+
+        next_page_url = _handle_page(webpage)
+
+        for idx in itertools.count(0):
+            if not next_page_url or prev_page_url == next_page_url:
+                break
+
+            prev_page_url = next_page_url
+            current_page = int(self._search_regex(
+                r'\?page=(\d+)', next_page_url, 'next page number'))
+
+            next_page_url = _handle_page(self._fetch_tracks_page(
+                '%s/stream' % user_id, user_id, 'stream', idx,
+                real_page_number=current_page))
+
+        username = self._og_search_title(webpage)
+        description = self._get_user_description(webpage)
+
+        return self.playlist_result(entries, user_id, username, description)
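
[Illustrative sketch, not part of the patch series: the MixcloudStreamIE added above pages through /<user>/stream/ by following the m-next-page-url attribute and re-requesting the page with ?page=N&list=main&_ajax=1 under an X-Requested-With: XMLHttpRequest header, collecting every m-play-button m-url entry along the way. The standalone loop below restates that flow; it uses the third-party requests package purely for brevity where the extractor goes through _download_webpage, and the function name and the max_pages guard are assumptions for the example.]

import itertools
import re

import requests


def stream_track_urls(username, max_pages=50):
    # First request fetches the stream page; later requests fetch AJAX fragments.
    base = 'https://www.mixcloud.com/%s/stream/' % username
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    page_html = requests.get(base, headers=headers).text

    urls, seen_pages = [], set()
    for _ in itertools.count():
        # Each fragment lists its cloudcasts as relative m-url attributes.
        urls.extend(
            'https://www.mixcloud.com' + u
            for u in re.findall(r'm-play-button m-url="([^"]+)"', page_html))

        # Stop when there is no next page or the page number starts repeating.
        next_url = re.search(r'm-next-page-url="([^"]+)"', page_html)
        if not next_url:
            break
        page_no = int(re.search(r'\?page=(\d+)', next_url.group(1)).group(1))
        if page_no in seen_pages or len(seen_pages) >= max_pages:
            break
        seen_pages.add(page_no)

        page_html = requests.get(
            base, headers=headers,
            params={'page': page_no, 'list': 'main', '_ajax': '1'}).text

    return urls
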
From f0ec61b52559dc6e25f66895a55e1b73e9e9f58b Mon Sep 17 00:00:00 2001
From: Yen Chi Hsuan <yan12125@gmail.com>
Date: Fri, 15 Apr 2016 20:54:12 +0800
Subject: [PATCH 25/25] [huffpost] Fix extraction

---
 youtube_dl/extractor/huffpost.py | 36 ++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/huffpost.py b/youtube_dl/extractor/huffpost.py
index a38eae421..1dc5701b2 100644
--- a/youtube_dl/extractor/huffpost.py
+++ b/youtube_dl/extractor/huffpost.py
@@ -4,6 +4,7 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
+    determine_ext,
     parse_duration,
     unified_strdate,
 )
@@ -29,7 +30,12 @@ class HuffPostIE(InfoExtractor):
             'description': 'This week on Legalese It, Mike talks to David Bosco about his new book on the ICC, "Rough Justice," he also discusses the Virginia AG\'s historic stance on gay marriage, the execution of Edgar Tamayo, the ICC\'s delay of Kenya\'s President and more. ',
             'duration': 1549,
             'upload_date': '20140124',
-        }
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'expected_warnings': ['HTTP Error 404: Not Found'],
     }
 
     def _real_extract(self, url):
@@ -45,7 +51,7 @@ class HuffPostIE(InfoExtractor):
         description = data.get('description')
 
         thumbnails = []
-        for url in data['images'].values():
+        for url in filter(None, data['images'].values()):
             m = re.match('.*-([0-9]+x[0-9]+)\.', url)
             if not m:
                 continue
@@ -54,13 +60,25 @@ class HuffPostIE(InfoExtractor):
                 'resolution': m.group(1),
             })
 
-        formats = [{
-            'format': key,
-            'format_id': key.replace('/', '.'),
-            'ext': 'mp4',
-            'url': url,
-            'vcodec': 'none' if key.startswith('audio/') else None,
-        } for key, url in data.get('sources', {}).get('live', {}).items()]
+        formats = []
+        sources = data.get('sources', {})
+        live_sources = list(sources.get('live', {}).items()) + list(sources.get('live_again', {}).items())
+        for key, url in live_sources:
+            ext = determine_ext(url)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    url, video_id, ext='mp4', m3u8_id='hls', fatal=False))
+            elif ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    url + '?hdcore=2.9.5', video_id, f4m_id='hds', fatal=False))
+            else:
+                formats.append({
+                    'format': key,
+                    'format_id': key.replace('/', '.'),
+                    'ext': 'mp4',
+                    'url': url,
+                    'vcodec': 'none' if key.startswith('audio/') else None,
+                })
 
         if not formats and data.get('fivemin_id'):
             return self.url_result('5min:%s' % data['fivemin_id'])