From 9e1ec1ddb04e57461d010ca2ee551a67b8c3de23 Mon Sep 17 00:00:00 2001 From: Jonathan Nifenecker Date: Thu, 11 Feb 2016 00:43:58 +0100 Subject: [PATCH 1/5] [madmoizelle] Add new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/madmoizelle.py | 83 +++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 youtube_dl/extractor/madmoizelle.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 18951c287..321ed3f0a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -375,6 +375,7 @@ from .lynda import ( ) from .m6 import M6IE from .macgamestore import MacGameStoreIE +from .madmoizelle import MadmoizelleIE from .mailru import MailRuIE from .makertv import MakerTVIE from .malemotion import MalemotionIE diff --git a/youtube_dl/extractor/madmoizelle.py b/youtube_dl/extractor/madmoizelle.py new file mode 100644 index 000000000..b00e8a509 --- /dev/null +++ b/youtube_dl/extractor/madmoizelle.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +<<<<<<< HEAD +from ..utils import ( + HEADRequest, +) +======= +from ..utils import HEADRequest +>>>>>>> 7a50899fde5f12ad3f44bd92141a1161a139f0ee + +from .common import InfoExtractor + + +class MadmoizelleIE(InfoExtractor): + IE_NAME = 'madmoizelle.com' + IE_DESC = 'madmoizelle JW player' + _VALID_URL = r'(https?://)?(?:www\.)?madmoizelle\.com/.+-(?P[0-9]+)#?.*' + _TESTS = [{ + # classic video from the site + 'url': 'http://www.madmoizelle.com/ukulete-episode-1-408599', + 'md5': 'e79ce7c2131cb3dfd200bea5177236fe', + 'info_dict': { + 'ext': 'mp4', + 'id': '408599', + 'title': 'Ukul’été - Épisode 1 - Plus besoin de radio !', + 'description': 'La voilà, la nouvelle saga de l’été, présentée par Marion et Waxx, plus excitante que Camping Paradis, plus addictive que Maigret, plus palpitante que Zodiac !', + } + }, { + # to test youtube redirection fallback + 'url': 'http://www.madmoizelle.com/connected-court-metrage-501199#gs.AblS7VA', + 'md5': '77928d3964eceb2fe828204db5ee714a', + 'info_dict': { + 'title': '\'Connected\' - A Sci-Fi Short Starring Pamela Anderson', + 'id': 'iWLcWHYmgpg', + 'ext': 'mp4', + 'upload_date': '20160208', + 'description': 'md5:8de82e60853651512fb923e84873f526', + 'uploader': 'Motherboard', + 'uploader_id': 'MotherboardTV', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'(.+?)', webpage, 'title') + description = self._html_search_meta('description', webpage) + + formats = [] + for format in re.finditer('jwplayer.*"(?P[0-9]+.*(?Phd|sd).+)"', webpage): + url = 'https://player.vimeo.com/external/' + format.group('url') + + # the url found point to a header redirection that we must follow + head_req = HEADRequest(url) + head_response = self._request_webpage( + head_req, video_id, + note=False, errnote='Could not send HEAD request to %s' % url, + fatal=False) + if head_response is not False: + new_url = (self.url_result(head_response.geturl()))['url'] + + formats.append({ + 'format_id': format.group('quality'), + 'url': new_url, + }) + + if not formats: + # nothing has been found with the site's extractor, + # fallback to generic with original url + return { + 'url': url, + 'ie_key': 'Generic', + '_type': 'url', + } + else: + return { + 'formats': formats, + 'id': video_id, + 'title': title, + 'description': description, + } From 04ff0c5771ef76d51816f5b960b3ccd0791d9340 Mon Sep 17 00:00:00 2001 From: Jonathan Nifenecker Date: Thu, 11 Feb 2016 09:16:32 +0100 Subject: [PATCH 2/5] merge leftover corrected for proper pull request --- youtube_dl/extractor/madmoizelle.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/youtube_dl/extractor/madmoizelle.py b/youtube_dl/extractor/madmoizelle.py index b00e8a509..f99cae881 100644 --- a/youtube_dl/extractor/madmoizelle.py +++ b/youtube_dl/extractor/madmoizelle.py @@ -2,13 +2,9 @@ from __future__ import unicode_literals import re -<<<<<<< HEAD from ..utils import ( HEADRequest, ) -======= -from ..utils import HEADRequest ->>>>>>> 7a50899fde5f12ad3f44bd92141a1161a139f0ee from .common import InfoExtractor From c883416c48d3d6bdb959a94a0affde3a6d8644bf Mon Sep 17 00:00:00 2001 From: Jonathan Nifenecker Date: Mon, 15 Feb 2016 22:19:18 +0100 Subject: [PATCH 3/5] [madmoizelle] add multiple videos support. --- youtube_dl/extractor/madmoizelle.py | 100 ++++++++++++-------- youtube_dl/extractor/wat.py | 137 ---------------------------- 2 files changed, 60 insertions(+), 177 deletions(-) delete mode 100644 youtube_dl/extractor/wat.py diff --git a/youtube_dl/extractor/madmoizelle.py b/youtube_dl/extractor/madmoizelle.py index f99cae881..1afb6a59c 100644 --- a/youtube_dl/extractor/madmoizelle.py +++ b/youtube_dl/extractor/madmoizelle.py @@ -2,31 +2,49 @@ from __future__ import unicode_literals import re -from ..utils import ( - HEADRequest, -) from .common import InfoExtractor class MadmoizelleIE(InfoExtractor): IE_NAME = 'madmoizelle.com' - IE_DESC = 'madmoizelle JW player' - _VALID_URL = r'(https?://)?(?:www\.)?madmoizelle\.com/.+-(?P[0-9]+)#?.*' + IE_DESC = 'madmoizelle.com video:article' + _VALID_URL = r'https?://(?:www\.)?madmoizelle\.com/.+-(?P[0-9]+)#?.*' _TESTS = [{ - # classic video from the site + # site's video in banner + 'url': 'http://www.madmoizelle.com/american-ultra-marion-seclin-408959#gs.nNBitsc', + 'info_dict': { + 'ext': 'mp4', + 'id': '408959', + 'title': 'Ma vie, c\'est comme dans American Ultra (Marion Seclin)', + 'description': 'md5:af3f8fc3b0668ede24feb1d50826e00c', + } + }, { + # external video in banner + 'url': 'http://www.madmoizelle.com/game-of-thrones-saison-5-podcast-383647#gs.MvCyYtg', + 'info_dict': { + 'ext': 'mp4', + 'description': 'md5:efe603274fe803d27f9cb912eb83491a', + 'title': 'REPLAY ! L\'éMymyssion - Bilan Game of Thrones saison 5', + 'id': 'hH__bkW5Hu0', + 'upload_date': '20150623', + 'uploader_id': 'madmoiZelledotcom', + 'uploader': 'madmoiZelle.com', + }, + }, { + # site's video in banner and external video in content 'url': 'http://www.madmoizelle.com/ukulete-episode-1-408599', 'md5': 'e79ce7c2131cb3dfd200bea5177236fe', 'info_dict': { 'ext': 'mp4', 'id': '408599', 'title': 'Ukul’été - Épisode 1 - Plus besoin de radio !', - 'description': 'La voilà, la nouvelle saga de l’été, présentée par Marion et Waxx, plus excitante que Camping Paradis, plus addictive que Maigret, plus palpitante que Zodiac !', + 'description': 'md5:18949f6512cbd285c7b8b536a9955f06', } }, { - # to test youtube redirection fallback + # external video in content 'url': 'http://www.madmoizelle.com/connected-court-metrage-501199#gs.AblS7VA', - 'md5': '77928d3964eceb2fe828204db5ee714a', + 'md5': '320890bbce5968e3482059a5d3770ba9', 'info_dict': { 'title': '\'Connected\' - A Sci-Fi Short Starring Pamela Anderson', 'id': 'iWLcWHYmgpg', @@ -35,45 +53,47 @@ class MadmoizelleIE(InfoExtractor): 'description': 'md5:8de82e60853651512fb923e84873f526', 'uploader': 'Motherboard', 'uploader_id': 'MotherboardTV', + 'age_limit': 18, } }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') - description = self._html_search_meta('description', webpage) - formats = [] - for format in re.finditer('jwplayer.*"(?P[0-9]+.*(?Phd|sd).+)"', webpage): - url = 'https://player.vimeo.com/external/' + format.group('url') + # all article page may contain one video in a banner form, + # and many externals videos in the article content + results = [] - # the url found point to a header redirection that we must follow - head_req = HEADRequest(url) - head_response = self._request_webpage( - head_req, video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False) - if head_response is not False: - new_url = (self.url_result(head_response.geturl()))['url'] + if '
' in webpage: + # banner video present + title = self._html_search_regex(r'(.+?)', webpage, 'title') + description = self._html_search_meta('description', webpage) + formats = [] + for format in re.finditer('jwplayer.*"(?P[0-9]+.*(?Phd|sd).+)"', webpage): + formats.append({ + 'format_id': format.group('quality'), + 'url': 'https://player.vimeo.com/external/' + format.group('url'), + }) - formats.append({ - 'format_id': format.group('quality'), - 'url': new_url, + if formats: + # site's jw player url found + results.append({ + 'formats': formats, + 'id': video_id, + 'title': title, + 'description': description, + }) + + # external video, may be any site, in any number + # fallback to generic extractor + for external_vid in re.finditer(r'.*?)".*', webpage): + results.append({ + 'url': external_vid.group('url'), + '_type': 'url', }) - if not formats: - # nothing has been found with the site's extractor, - # fallback to generic with original url - return { - 'url': url, - 'ie_key': 'Generic', - '_type': 'url', - } - else: - return { - 'formats': formats, - 'id': video_id, - 'title': title, - 'description': description, - } + return { + '_type': 'playlist', + 'entries': results, + } diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py deleted file mode 100644 index affcc52f6..000000000 --- a/youtube_dl/extractor/wat.py +++ /dev/null @@ -1,137 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import hashlib - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate, -) - - -class WatIE(InfoExtractor): - _VALID_URL = r'http://www\.wat\.tv/video/(?P.*)-(?P.*?)_.*?\.html' - IE_NAME = 'wat.tv' - _TESTS = [ - { - 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', - 'md5': 'ce70e9223945ed26a8056d413ca55dc9', - 'info_dict': { - 'id': '11713067', - 'display_id': 'soupe-figues-l-orange-aux-epices', - 'ext': 'mp4', - 'title': 'Soupe de figues à l\'orange et aux épices', - 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', - 'upload_date': '20140819', - 'duration': 120, - }, - }, - { - 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'fbc84e4378165278e743956d9c1bf16b', - 'info_dict': { - 'id': '11713075', - 'display_id': 'gregory-lemarchal-voix-ange', - 'ext': 'mp4', - 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', - 'upload_date': '20140816', - 'duration': 2910, - }, - 'skip': "Ce contenu n'est pas disponible pour l'instant.", - }, - ] - - def download_video_info(self, real_id): - # 'contentv4' is used in the website, but it also returns the related - # videos, we don't need them - info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) - return info['media'] - - def _real_extract(self, url): - def real_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - mobj = re.match(self._VALID_URL, url) - short_id = mobj.group('short_id') - display_id = mobj.group('display_id') - webpage = self._download_webpage(url, display_id or short_id) - real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') - - video_info = self.download_video_info(real_id) - - error_desc = video_info.get('error_desc') - if error_desc: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) - - geo_list = video_info.get('geoList') - country = geo_list[0] if geo_list else '' - - chapters = video_info['chapters'] - first_chapter = chapters[0] - files = video_info['files'] - first_file = files[0] - - if real_id_for_chapter(first_chapter) != real_id: - self.to_screen('Multipart video detected') - chapter_urls = [] - for chapter in chapters: - chapter_id = real_id_for_chapter(chapter) - # Yes, when we this chapter is processed by WatIE, - # it will download the info again - chapter_info = self.download_video_info(chapter_id) - chapter_urls.append(chapter_info['url']) - entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] - return self.playlist_result(entries, real_id, video_info['title']) - - upload_date = None - if 'date_diffusion' in first_chapter: - upload_date = unified_strdate(first_chapter['date_diffusion']) - # Otherwise we can continue and extract just one part, we have to use - # the short id for getting the video url - - formats = [{ - 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, - 'format_id': 'Mobile', - }] - - fmts = [('SD', 'web')] - if first_file.get('hasHD'): - fmts.append(('HD', 'webhd')) - - def compute_token(param): - timestamp = '%08x' % int(self._download_webpage( - 'http://www.wat.tv/servertime', real_id, - 'Downloading server time').split('|')[0]) - magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564' - return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp) - - for fmt in fmts: - webid = '/%s/%s' % (fmt[1], real_id) - video_url = self._download_webpage( - 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country), - real_id, - 'Downloading %s video URL' % fmt[0], - 'Failed to download %s video URL' % fmt[0], - False) - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': 'mp4', - 'format_id': fmt[0], - }) - - return { - 'id': real_id, - 'display_id': display_id, - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], - 'upload_date': upload_date, - 'duration': first_file['duration'], - 'formats': formats, - } From 7a6824b628b1750394e4e8931ce8ab9842733090 Mon Sep 17 00:00:00 2001 From: Jonathan Nifenecker Date: Tue, 22 Mar 2016 22:25:34 +0100 Subject: [PATCH 4/5] revert accidental delete. --- youtube_dl/extractor/wat.py | 137 ++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 youtube_dl/extractor/wat.py diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py new file mode 100644 index 000000000..affcc52f6 --- /dev/null +++ b/youtube_dl/extractor/wat.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import hashlib + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + unified_strdate, +) + + +class WatIE(InfoExtractor): + _VALID_URL = r'http://www\.wat\.tv/video/(?P.*)-(?P.*?)_.*?\.html' + IE_NAME = 'wat.tv' + _TESTS = [ + { + 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', + 'md5': 'ce70e9223945ed26a8056d413ca55dc9', + 'info_dict': { + 'id': '11713067', + 'display_id': 'soupe-figues-l-orange-aux-epices', + 'ext': 'mp4', + 'title': 'Soupe de figues à l\'orange et aux épices', + 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', + 'upload_date': '20140819', + 'duration': 120, + }, + }, + { + 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', + 'md5': 'fbc84e4378165278e743956d9c1bf16b', + 'info_dict': { + 'id': '11713075', + 'display_id': 'gregory-lemarchal-voix-ange', + 'ext': 'mp4', + 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', + 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', + 'upload_date': '20140816', + 'duration': 2910, + }, + 'skip': "Ce contenu n'est pas disponible pour l'instant.", + }, + ] + + def download_video_info(self, real_id): + # 'contentv4' is used in the website, but it also returns the related + # videos, we don't need them + info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) + return info['media'] + + def _real_extract(self, url): + def real_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] + mobj = re.match(self._VALID_URL, url) + short_id = mobj.group('short_id') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id or short_id) + real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id') + + video_info = self.download_video_info(real_id) + + error_desc = video_info.get('error_desc') + if error_desc: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + + geo_list = video_info.get('geoList') + country = geo_list[0] if geo_list else '' + + chapters = video_info['chapters'] + first_chapter = chapters[0] + files = video_info['files'] + first_file = files[0] + + if real_id_for_chapter(first_chapter) != real_id: + self.to_screen('Multipart video detected') + chapter_urls = [] + for chapter in chapters: + chapter_id = real_id_for_chapter(chapter) + # Yes, when we this chapter is processed by WatIE, + # it will download the info again + chapter_info = self.download_video_info(chapter_id) + chapter_urls.append(chapter_info['url']) + entries = [self.url_result(chapter_url) for chapter_url in chapter_urls] + return self.playlist_result(entries, real_id, video_info['title']) + + upload_date = None + if 'date_diffusion' in first_chapter: + upload_date = unified_strdate(first_chapter['date_diffusion']) + # Otherwise we can continue and extract just one part, we have to use + # the short id for getting the video url + + formats = [{ + 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, + 'format_id': 'Mobile', + }] + + fmts = [('SD', 'web')] + if first_file.get('hasHD'): + fmts.append(('HD', 'webhd')) + + def compute_token(param): + timestamp = '%08x' % int(self._download_webpage( + 'http://www.wat.tv/servertime', real_id, + 'Downloading server time').split('|')[0]) + magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564' + return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp) + + for fmt in fmts: + webid = '/%s/%s' % (fmt[1], real_id) + video_url = self._download_webpage( + 'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country), + real_id, + 'Downloading %s video URL' % fmt[0], + 'Failed to download %s video URL' % fmt[0], + False) + if not video_url: + continue + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'format_id': fmt[0], + }) + + return { + 'id': real_id, + 'display_id': display_id, + 'title': first_chapter['title'], + 'thumbnail': first_chapter['preview'], + 'description': first_chapter['description'], + 'view_count': video_info['views'], + 'upload_date': upload_date, + 'duration': first_file['duration'], + 'formats': formats, + } From c551678eb23484cdea8896f0cb776b6c822a68a6 Mon Sep 17 00:00:00 2001 From: Jonathan Nifenecker Date: Sat, 16 Apr 2016 23:41:09 +0200 Subject: [PATCH 5/5] add madmoizelle to extractor.py --- youtube_dl/extractor/extractors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 06b3d5e24..4bf58f2bf 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -397,6 +397,7 @@ from .lynda import ( ) from .m6 import M6IE from .macgamestore import MacGameStoreIE +from .madmoizelle import MadmoizelleIE from .mailru import MailRuIE from .makerschannel import MakersChannelIE from .makertv import MakerTVIE