diff --git a/README.md b/README.md index cf4aebf3d..a6ec9619c 100644 --- a/README.md +++ b/README.md @@ -710,12 +710,13 @@ If you want to add support for a new site, you can follow this quick list (assum webpage = self._download_webpage(url, video_id) # TODO more code goes here, for example ... - title = self._html_search_regex(r'

(.*?)

', webpage, 'title') + title = self._html_search_regex(r'

(.+?)

', webpage, 'title') return { 'id': video_id, 'title': title, 'description': self._og_search_description(webpage), + 'uploader': self._search_regex(r']+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), # TODO more properties (see youtube_dl/extractor/common.py) } ``` diff --git a/docs/supportedsites.md b/docs/supportedsites.md index dc0354095..cfa665d88 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -159,6 +159,7 @@ - **facebook** - **faz.net** - **fc2** + - **Fczenit** - **fernsehkritik.tv** - **Firstpost** - **FiveTV** @@ -281,7 +282,6 @@ - **Malemotion** - **MDR** - **media.ccc.de** - - **MegaVideoz** - **metacafe** - **Metacritic** - **Mgoon** @@ -588,7 +588,8 @@ - **twitch:stream** - **twitch:video** - **twitch:vod** - - **TwitterCard** + - **twitter** + - **twitter:card** - **Ubu** - **udemy** - **udemy:course** diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index be8d12997..938466a80 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -35,10 +35,18 @@ class TestInfoExtractor(unittest.TestCase): + + + + ''' self.assertEqual(ie._og_search_title(html), 'Foo') self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + self.assertEqual(ie._og_search_video_url(html, default=None), None) + self.assertEqual(ie._og_search_property('foobar', html), 'Foo') + self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar') + self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar') def test_html_search_meta(self): ie = self.ie diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index c889b6f15..26aadb34f 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -57,5 +57,14 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertEqual(len(entries), 100) + def test_youtube_flat_playlist_titles(self): + dl = FakeYDL() + dl.params['extract_flat'] = True + ie = YoutubePlaylistIE(dl) + result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') + self.assertIsPlaylist(result) + for entry in result['entries']: + self.assertTrue(entry.get('title')) + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index adf70d658..12977bf80 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -37,6 +37,7 @@ from .compat import ( compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, + compat_urllib_request_DataHandler, ) from .utils import ( ContentTooShortError, @@ -1967,8 +1968,9 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + data_handler = compat_urllib_request_DataHandler() opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 192e1c515..d103ab9ad 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals +import binascii import collections +import email import getpass +import io import optparse import os import re @@ -38,6 +41,11 @@ try: except ImportError: # Python 2 import urlparse as compat_urlparse +try: + import urllib.response as compat_urllib_response +except ImportError: # Python 2 + import urllib as compat_urllib_response + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -155,6 +163,40 @@ except ImportError: # Python 2 string = string.replace('+', ' ') return compat_urllib_parse_unquote(string, encoding, errors) +try: + from urllib.request import DataHandler as compat_urllib_request_DataHandler +except ImportError: # Python < 3.4 + # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py + class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler): + def data_open(self, req): + # data URLs as specified in RFC 2397. + # + # ignores POSTed data + # + # syntax: + # dataurl := "data:" [ mediatype ] [ ";base64" ] "," data + # mediatype := [ type "/" subtype ] *( ";" parameter ) + # data := *urlchar + # parameter := attribute "=" value + url = req.get_full_url() + + scheme, data = url.split(":", 1) + mediatype, data = data.split(",", 1) + + # even base64 encoded data URLs might be quoted so unquote in any case: + data = compat_urllib_parse_unquote_to_bytes(data) + if mediatype.endswith(";base64"): + data = binascii.a2b_base64(data) + mediatype = mediatype[:-7] + + if not mediatype: + mediatype = "text/plain;charset=US-ASCII" + + headers = email.message_from_string( + "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data))) + + return compat_urllib_response.addinfourl(io.BytesIO(data), headers, url) + try: compat_basestring = basestring # Python 2 except NameError: @@ -489,6 +531,8 @@ __all__ = [ 'compat_urllib_parse_unquote_to_bytes', 'compat_urllib_parse_urlparse', 'compat_urllib_request', + 'compat_urllib_request_DataHandler', + 'compat_urllib_response', 'compat_urlparse', 'compat_urlretrieve', 'compat_xml_parse_error', diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index a62d2047b..9a83a73dd 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -30,7 +30,7 @@ class HlsFD(FileDownloader): args = [ffpp.executable, '-y'] - if info_dict['http_headers']: + if info_dict['http_headers'] and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. args += [ diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 75720843c..bd6eb6ae0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -167,6 +167,7 @@ from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE from .fc2 import FC2IE +from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE from .fivemin import FiveMinIE @@ -318,7 +319,6 @@ from .macgamestore import MacGameStoreIE from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE -from .megavideoz import MegaVideozIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE @@ -690,7 +690,7 @@ from .twitch import ( TwitchBookmarksIE, TwitchStreamIE, ) -from .twitter import TwitterCardIE +from .twitter import TwitterCardIE, TwitterIE from .ubu import UbuIE from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index a27f3e748..c1ef8051d 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -10,6 +10,8 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + float_or_none, + int_or_none, ) @@ -52,11 +54,11 @@ class BandcampIE(InfoExtractor): ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, - 'url': format_url, + 'url': self._proto_relative_url(format_url, 'http:'), 'ext': ext, 'vcodec': 'none', 'acodec': ext, - 'abr': int(abr_str), + 'abr': int_or_none(abr_str), }) self._sort_formats(formats) @@ -65,7 +67,7 @@ class BandcampIE(InfoExtractor): 'id': compat_str(data['id']), 'title': data['title'], 'formats': formats, - 'duration': float(data['duration']), + 'duration': float_or_none(data.get('duration')), } else: raise ExtractorError('No free songs found') diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 68995f81e..1b3a33e4e 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -421,7 +421,7 @@ class BBCCoUkIE(InfoExtractor): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) - description = description_el.text if description_el else None + description = description_el.text if description_el is not None else None def get_programme_id(item): def get_from_attributes(item): diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b38057f2f..e6c928699 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -1,65 +1,67 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '1bff67111adb785c51d1b42959ec10e5', + 'md5': '46c384def73b33dbc581262e5ee67cef', 'info_dict': { 'id': '5416503', 'ext': 'mp4', 'title': 'Sultry Striptease', - 'description': 'md5:6db3c6177972822aaba18652ff59c773', - 'categories': list, # NSFW - 'thumbnail': 're:https?://.*\.jpg$', + 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', + 'timestamp': 1391813355, + 'upload_date': '20140207', + 'duration': 383, + 'tags': list, 'age_limit': 18, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - quality_arr = self._search_regex( - r'(?s)var\s+qualityArr\s*=\s*{\s*(.+?)\s*}', webpage, 'quality formats') - - formats = [{ - 'url': fmt[1], - 'format_id': fmt[0], - 'height': int(fmt[0][:-1]), - } for fmt in re.findall(r"'([^']+)'\s*:\s*'([^']+)'", quality_arr)] + video = self._download_json( + 'http://beeg.com/api/v1/video/%s' % video_id, video_id) + formats = [] + for format_id, video_url in video.items(): + height = self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None) + if not height: + continue + formats.append({ + 'url': self._proto_relative_url(video_url.replace('{DATA_MARKERS}', ''), 'http:'), + 'format_id': format_id, + 'height': int(height), + }) self._sort_formats(formats) - title = self._html_search_regex( - r'([^<]+)\s*-\s*beeg\.?', webpage, 'title') + title = video['title'] + video_id = video.get('id') or video_id + display_id = video.get('code') + description = video.get('desc') - description = self._html_search_regex( - r'\d+)' + _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P\d+)' _TEST = { - 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'url': 'http://www.canalc2.tv/video/12163', 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'mp4', - 'title': 'Terrasses du Numérique' + 'ext': 'flv', + 'title': 'Terrasses du Numérique', + 'duration': 122, + }, + 'params': { + 'skip_download': True, # Requires rtmpdump } } def _real_extract(self, url): - video_id = re.match(self._VALID_URL, url).group('id') - # We need to set the voir field for getting the file name - url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - file_name = self._search_regex( - r"so\.addVariable\('file','(.*?)'\);", - webpage, 'file name') - video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name + video_url = self._search_regex( + r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P.+?)\2', + webpage, 'video_url', group='file') + formats = [{'url': video_url}] + if video_url.startswith('rtmp://'): + rtmp = re.search(r'^(?Prtmp://[^/]+/(?P.+/))(?Pmp4:.+)$', video_url) + formats[0].update({ + 'url': rtmp.group('url'), + 'ext': 'flv', + 'app': rtmp.group('app'), + 'play_path': rtmp.group('play_path'), + 'page_url': url, + }) title = self._html_search_regex( - r'class="evenement8">(.*?)', webpage, 'title') + r'(?s)class="[^"]*col_description[^"]*">.*?

(.*?)

', webpage, 'title') + duration = parse_duration(self._search_regex( + r'id=["\']video_duree["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': video_id, - 'ext': 'mp4', - 'url': video_url, 'title': title, + 'duration': duration, + 'formats': formats, } diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py index 3dfc24f5b..c74553dcf 100644 --- a/youtube_dl/extractor/channel9.py +++ b/youtube_dl/extractor/channel9.py @@ -3,7 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_filesize, + qualities, +) class Channel9IE(InfoExtractor): @@ -28,7 +32,7 @@ class Channel9IE(InfoExtractor): 'title': 'Developer Kick-Off Session: Stuff We Love', 'description': 'md5:c08d72240b7c87fcecafe2692f80e35f', 'duration': 4576, - 'thumbnail': 'http://video.ch9.ms/ch9/9d51/03902f2d-fc97-4d3c-b195-0bfe15a19d51/KOS002_220.jpg', + 'thumbnail': 're:http://.*\.jpg', 'session_code': 'KOS002', 'session_day': 'Day 1', 'session_room': 'Arena 1A', @@ -44,31 +48,29 @@ class Channel9IE(InfoExtractor): 'title': 'Self-service BI with Power BI - nuclear testing', 'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b', 'duration': 1540, - 'thumbnail': 'http://video.ch9.ms/ch9/87e1/0300391f-a455-4c72-bec3-4422f19287e1/selfservicenuk_512.jpg', + 'thumbnail': 're:http://.*\.jpg', 'authors': ['Mike Wilmot'], }, + }, + { + # low quality mp4 is best + 'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'info_dict': { + 'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library', + 'ext': 'mp4', + 'title': 'Ranges for the Standard Library', + 'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d', + 'duration': 5646, + 'thumbnail': 're:http://.*\.jpg', + }, + 'params': { + 'skip_download': True, + }, } ] _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - # Sorted by quality - _known_formats = ['MP3', 'MP4', 'Mid Quality WMV', 'Mid Quality MP4', 'High Quality WMV', 'High Quality MP4'] - - def _restore_bytes(self, formatted_size): - if not formatted_size: - return 0 - m = re.match(r'^(?P\d+(?:\.\d+)?)\s+(?P[a-zA-Z]+)', formatted_size) - if not m: - return 0 - units = m.group('units') - try: - exponent = ['B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'].index(units.upper()) - except ValueError: - return 0 - size = float(m.group('size')) - return int(size * (1024 ** exponent)) - def _formats_from_html(self, html): FORMAT_REGEX = r''' (?x) @@ -78,16 +80,20 @@ class Channel9IE(InfoExtractor):

File\s+size

\s*(?P.*?)\s* )? # File size part may be missing ''' - # Extract known formats + quality = qualities(( + 'MP3', 'MP4', + 'Low Quality WMV', 'Low Quality MP4', + 'Mid Quality WMV', 'Mid Quality MP4', + 'High Quality WMV', 'High Quality MP4')) formats = [{ 'url': x.group('url'), 'format_id': x.group('quality'), 'format_note': x.group('note'), 'format': '%s (%s)' % (x.group('quality'), x.group('note')), - 'filesize': self._restore_bytes(x.group('filesize')), # File size is approximate - 'preference': self._known_formats.index(x.group('quality')), + 'filesize_approx': parse_filesize(x.group('filesize')), + 'quality': quality(x.group('quality')), 'vcodec': 'none' if x.group('note') == 'Audio only' else None, - } for x in list(re.finditer(FORMAT_REGEX, html)) if x.group('quality') in self._known_formats] + } for x in list(re.finditer(FORMAT_REGEX, html))] self._sort_formats(formats) @@ -158,7 +164,7 @@ class Channel9IE(InfoExtractor): def _extract_session_day(self, html): m = re.search(r'
  • \s*(?P[^<]+)\s*
  • ', html) - return m.group('day') if m is not None else None + return m.group('day').strip() if m is not None else None def _extract_session_room(self, html): m = re.search(r'
  • \s*(?P.+?)\s*
  • ', html) @@ -224,12 +230,12 @@ class Channel9IE(InfoExtractor): if contents is None: return contents - authors = self._extract_authors(html) + if len(contents) > 1: + raise ExtractorError('Got more than one entry') + result = contents[0] + result['authors'] = self._extract_authors(html) - for content in contents: - content['authors'] = authors - - return contents + return result def _extract_session(self, html, content_path): contents = self._extract_content(html, content_path) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0082a4c84..6169fbbeb 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -172,6 +172,7 @@ class InfoExtractor(object): view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following @@ -645,8 +646,9 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = r'(?:name|property)=[\'"]?og:%s[\'"]?' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index 4fb178165..dedb810a0 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -27,9 +27,7 @@ class CriterionIE(InfoExtractor): final_url = self._search_regex( r'so.addVariable\("videoURL", "(.+?)"\)\;', webpage, 'video url') title = self._og_search_title(webpage) - description = self._html_search_regex( - r'', - webpage, 'video description') + description = self._html_search_meta('description', webpage) thumbnail = self._search_regex( r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', webpage, 'thumbnail url') diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 95952bc29..cecd0c784 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -32,6 +32,26 @@ from ..aes import ( class CrunchyrollBaseIE(InfoExtractor): + _NETRC_MACHINE = 'crunchyroll' + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + login_url = 'https://www.crunchyroll.com/?a=formhandler' + data = urlencode_postdata({ + 'formname': 'RpcApiUser_Login', + 'name': username, + 'password': password, + }) + login_request = compat_urllib_request.Request(login_url, data) + login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + self._download_webpage(login_request, None, False, 'Wrong login info') + + def _real_initialize(self): + self._login() + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) else compat_urllib_request.Request(url_or_request)) @@ -46,10 +66,22 @@ class CrunchyrollBaseIE(InfoExtractor): return super(CrunchyrollBaseIE, self)._download_webpage( request, video_id, note, errnote, fatal, tries, timeout, encoding) + @staticmethod + def _add_skip_wall(url): + parsed_url = compat_urlparse.urlparse(url) + qs = compat_urlparse.parse_qs(parsed_url.query) + # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: + # > This content may be inappropriate for some people. + # > Are you sure you want to continue? + # since it's not disabled by default in crunchyroll account's settings. + # See https://github.com/rg3/youtube-dl/issues/7202. + qs['skip_wall'] = ['1'] + return compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + class CrunchyrollIE(CrunchyrollBaseIE): _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P[0-9]+))(?:[/?&]|$)' - _NETRC_MACHINE = 'crunchyroll' _TESTS = [{ 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', 'info_dict': { @@ -81,10 +113,13 @@ class CrunchyrollIE(CrunchyrollBaseIE): # rtmp 'skip_download': True, }, - }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', + 'only_matching': True, }] _FORMAT_IDS = { @@ -94,24 +129,6 @@ class CrunchyrollIE(CrunchyrollBaseIE): '1080': ('80', '108'), } - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - self.report_login() - login_url = 'https://www.crunchyroll.com/?a=formhandler' - data = urlencode_postdata({ - 'formname': 'RpcApiUser_Login', - 'name': username, - 'password': password, - }) - login_request = compat_urllib_request.Request(login_url, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - self._download_webpage(login_request, None, False, 'Wrong login info') - - def _real_initialize(self): - self._login() - def _decrypt_subtitles(self, data, iv, id): data = bytes_to_intlist(base64.b64decode(data.encode('utf-8'))) iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8'))) @@ -254,7 +271,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage') + webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') note_m = self._html_search_regex( r'
    (.+?)
    ', webpage, 'trailer-notice', default='') @@ -352,7 +369,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): IE_NAME = "crunchyroll:playlist" - _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P[\w\-]+))/?$' + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?(?Pcrunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P[\w\-]+))/?(?:\?|$)' _TESTS = [{ 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', @@ -361,12 +378,25 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' }, 'playlist_count': 13, + }, { + # geo-restricted (US), 18+ maturity wall, non-premium available + 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', + 'info_dict': { + 'id': 'cosplay-complex-ova', + 'title': 'Cosplay Complex OVA' + }, + 'playlist_count': 3, + 'skip': 'Georestricted', + }, { + # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 + 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', + 'only_matching': True, }] def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(url, show_id) + webpage = self._download_webpage(self._add_skip_wall(url), show_id) title = self._html_search_regex( r'(?s)]*>\s*(.*?)', webpage, 'title') diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 80a05cfee..9cd9ff17d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'uploader': 'HotWaves1012', 'age_limit': 18, } + }, + # geo-restricted, player v5 + { + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, } ] @@ -124,6 +129,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): if player_v5: player = self._parse_json(player_v5, video_id) metadata = player['metadata'] + + self._check_error(metadata) + formats = [] for quality, media_list in metadata['qualities'].items(): for media in media_list: @@ -201,9 +209,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'video info', flags=re.MULTILINE), video_id) - if info.get('error') is not None: - msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title'] - raise ExtractorError(msg, expected=True) + self._check_error(info) formats = [] for (key, format_id) in self._FORMATS: @@ -246,6 +252,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor): 'duration': info['duration'] } + def _check_error(self, info): + if info.get('error') is not None: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True) + def _get_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index e529b9b96..7bbf617d4 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -87,7 +87,7 @@ class EaglePlatformIE(InfoExtractor): m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON') formats = self._extract_m3u8_formats( m3u8_url, video_id, - 'mp4', entry_protocol='m3u8_native') + 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') mp4_url = self._get_video_url( # Secure mp4 URL is constructed according to Player.prototype.mp4 from diff --git a/youtube_dl/extractor/fczenit.py b/youtube_dl/extractor/fczenit.py new file mode 100644 index 000000000..f1f150ef2 --- /dev/null +++ b/youtube_dl/extractor/fczenit.py @@ -0,0 +1,41 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class FczenitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P[0-9]+)' + _TEST = { + 'url': 'http://fc-zenit.ru/video/gl6785/', + 'md5': '458bacc24549173fe5a5aa29174a5606', + 'info_dict': { + 'id': '6785', + 'ext': 'mp4', + 'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_title = self._html_search_regex(r'
    ([^<]+)', webpage, 'title') + + bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL') + bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw) + + formats = [{ + 'url': furl, + 'tbr': tbr, + } for furl, tbr in bitrates] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 157094e8c..2955965d9 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( - compat_str, compat_urllib_parse, + compat_parse_qs, + compat_urllib_parse_urlparse, + compat_urlparse, ) from ..utils import ( ExtractorError, + parse_duration, + replace_extension, ) @@ -28,6 +32,7 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'duration': 177, }, }, { @@ -38,9 +43,52 @@ class FiveMinIE(InfoExtractor): 'id': '518086247', 'ext': 'mp4', 'title': 'How to Make a Next-Level Fruit Salad', + 'duration': 184, }, }, ] + _ERRORS = { + 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', + 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', + 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', + 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', + 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', + } + _QUALITIES = { + 1: { + 'width': 640, + 'height': 360, + }, + 2: { + 'width': 854, + 'height': 480, + }, + 4: { + 'width': 1280, + 'height': 720, + }, + 8: { + 'width': 1920, + 'height': 1080, + }, + 16: { + 'width': 640, + 'height': 360, + }, + 32: { + 'width': 854, + 'height': 480, + }, + 64: { + 'width': 1280, + 'height': 720, + }, + 128: { + 'width': 640, + 'height': 360, + }, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -59,26 +107,36 @@ class FiveMinIE(InfoExtractor): 'https://syn.5min.com/handlers/SenseHandler.ashx?' + query, video_id) if not response['success']: - err_msg = response['errorMessage'] - if err_msg == 'ErrorVideoUserNotGeo': - msg = 'Video not available from your location' - else: - msg = 'Aol said: %s' % err_msg - raise ExtractorError(msg, expected=True, video_id=video_id) + raise ExtractorError( + '%s said: %s' % ( + self.IE_NAME, + self._ERRORS.get(response['errorMessage'], response['errorMessage'])), + expected=True) info = response['binding'][0] - second_id = compat_str(int(video_id[:-2]) + 1) formats = [] - for quality, height in [(1, 320), (2, 480), (4, 720), (8, 1080)]: - if any(r['ID'] == quality for r in info['Renditions']): + parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( + compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) + for rendition in info['Renditions']: + if rendition['RenditionType'] == 'm3u8': + formats.extend(self._extract_m3u8_formats(rendition['Url'], video_id, m3u8_id='hls')) + elif rendition['RenditionType'] == 'aac': + continue + else: + rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) + quality = self._QUALITIES.get(rendition['ID'], {}) formats.append({ - 'format_id': compat_str(quality), - 'url': 'http://avideos.5min.com/%s/%s/%s_%s.mp4' % (second_id[-3:], second_id, video_id, quality), - 'height': height, + 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), + 'url': rendition_url, + 'width': quality.get('width'), + 'height': quality.get('height'), }) + self._sort_formats(formats) return { 'id': video_id, 'title': info['Title'], + 'thumbnail': info.get('ThumbURL'), + 'duration': parse_duration(info.get('Duration')), 'formats': formats, } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 4bb574cf3..02e1e428e 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -4,8 +4,8 @@ import re import json from .common import InfoExtractor -from ..compat import ( - compat_urlparse, +from ..utils import ( + qualities, ) @@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor): descr = self._html_search_regex( r'(?s)(.*?)', webpage, 'description', fatal=False) - available_formats = re.findall( - r'case \'(?P.*?)\' :$\s+url = \'(?P.*?)\'', webpage, - flags=re.MULTILINE) + player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id + player_page = self._download_webpage( + player_url, video_id, 'Downloading player page') + # the player page contains the info for the default format, we have to + # fetch other pages for the rest of the formats + extra_formats = re.findall(r'href="(?P%s.*?)".*?>(?P.*?)<' % re.escape(player_url), player_page) + format_pages = [ + self._download_webpage( + f_url, video_id, 'Downloading info for %s format' % f_name) + for f_url, f_name in extra_formats] + format_pages.append(player_page) + + quality = qualities(['SD', '480p', '720p']) formats = [] - for f_id, f_path in available_formats: - f_path = f_path.strip() - format_page = self._download_webpage( - compat_urlparse.urljoin(url, f_path), - 'Downloading info for %s format' % f_id) + for format_page in format_pages: json_data = self._search_regex( r']+class="imdb-player-data"[^>]*?>(.*?)', format_page, 'json data', flags=re.DOTALL) info = json.loads(json_data) format_info = info['videoPlayerObject']['video'] + f_id = format_info['ffname'] formats.append({ 'format_id': f_id, 'url': format_info['videoInfoList'][0]['videoUrl'], + 'quality': quality(f_id), }) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py index 1df084d87..eef7daa29 100644 --- a/youtube_dl/extractor/jeuxvideo.py +++ b/youtube_dl/extractor/jeuxvideo.py @@ -28,7 +28,7 @@ class JeuxVideoIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group(1) webpage = self._download_webpage(url, title) - title = self._html_search_meta('name', webpage) + title = self._html_search_meta('name', webpage) or self._og_search_title(webpage) config_url = self._html_search_regex( r'data-src="(/contenu/medias/video.php.*?)"', webpage, 'config URL') diff --git a/youtube_dl/extractor/letv.py b/youtube_dl/extractor/letv.py index a28abb0f0..effd9eb92 100644 --- a/youtube_dl/extractor/letv.py +++ b/youtube_dl/extractor/letv.py @@ -9,13 +9,14 @@ from .common import InfoExtractor from ..compat import ( compat_urllib_parse, compat_urllib_request, - compat_urlparse, + compat_ord, ) from ..utils import ( determine_ext, ExtractorError, parse_iso8601, int_or_none, + encode_data_uri, ) @@ -25,15 +26,16 @@ class LetvIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.letv.com/ptv/vplay/22005890.html', - 'md5': 'cab23bd68d5a8db9be31c9a222c1e8df', + 'md5': 'edadcfe5406976f42f9f266057ee5e40', 'info_dict': { 'id': '22005890', 'ext': 'mp4', 'title': '第87届奥斯卡颁奖礼完美落幕 《鸟人》成最大赢家', - 'timestamp': 1424747397, - 'upload_date': '20150224', 'description': 'md5:a9cb175fd753e2962176b7beca21a47c', - } + }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'url': 'http://www.letv.com/ptv/vplay/1415246.html', 'info_dict': { @@ -42,16 +44,22 @@ class LetvIE(InfoExtractor): 'title': '美人天下01', 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', }, + 'params': { + 'hls_prefer_native': True, + }, }, { 'note': 'This video is available only in Mainland China, thus a proxy is needed', 'url': 'http://www.letv.com/ptv/vplay/1118082.html', - 'md5': 'f80936fbe20fb2f58648e81386ff7927', + 'md5': '2424c74948a62e5f31988438979c5ad1', 'info_dict': { 'id': '1118082', 'ext': 'mp4', 'title': '与龙共舞 完整版', 'description': 'md5:7506a5eeb1722bb9d4068f85024e3986', }, + 'params': { + 'hls_prefer_native': True, + }, 'skip': 'Only available in China', }] @@ -74,6 +82,27 @@ class LetvIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # see M3U8Encryption class in KLetvPlayer.swf + @staticmethod + def decrypt_m3u8(encrypted_data): + if encrypted_data[:5].decode('utf-8').lower() != 'vc_01': + return encrypted_data + encrypted_data = encrypted_data[5:] + + _loc4_ = bytearray() + while encrypted_data: + b = compat_ord(encrypted_data[0]) + _loc4_.extend([b // 16, b & 0x0f]) + encrypted_data = encrypted_data[1:] + idx = len(_loc4_) - 11 + _loc4_ = _loc4_[idx:] + _loc4_[:idx] + _loc7_ = bytearray() + while _loc4_: + _loc7_.append(_loc4_[0] * 16 + _loc4_[1]) + _loc4_ = _loc4_[2:] + + return bytes(_loc7_) + def _real_extract(self, url): media_id = self._match_id(url) page = self._download_webpage(url, media_id) @@ -115,23 +144,28 @@ class LetvIE(InfoExtractor): for format_id in formats: if format_id in dispatch: media_url = playurl['domain'][0] + dispatch[format_id][0] - - # Mimic what flvxz.com do - url_parts = list(compat_urlparse.urlparse(media_url)) - qs = dict(compat_urlparse.parse_qs(url_parts[4])) - qs.update({ - 'platid': '14', - 'splatid': '1401', - 'tss': 'no', - 'retry': 1 + media_url += '&' + compat_urllib_parse.urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, }) - url_parts[4] = compat_urllib_parse.urlencode(qs) - media_url = compat_urlparse.urlunparse(url_parts) + + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) + + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) + + m3u8_data = self.decrypt_m3u8(req.read()) url_info_dict = { - 'url': media_url, + 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), 'ext': determine_ext(dispatch[format_id][1]), 'format_id': format_id, + 'protocol': 'm3u8', } if format_id[-1:] == 'p': diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 378117270..5c973e75c 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -140,13 +140,14 @@ class LyndaIE(LyndaBaseIE): prioritized_streams = video_json.get('PrioritizedStreams') if prioritized_streams: - formats.extend([ - { - 'url': video_url, - 'width': int_or_none(format_id), - 'format_id': format_id, - } for format_id, video_url in prioritized_streams['0'].items() - ]) + for prioritized_stream_id, prioritized_stream in prioritized_streams.items(): + formats.extend([ + { + 'url': video_url, + 'width': int_or_none(format_id), + 'format_id': '%s-%s' % (prioritized_stream_id, format_id), + } for format_id, video_url in prioritized_stream.items() + ]) self._check_formats(formats, video_id) self._sort_formats(formats) diff --git a/youtube_dl/extractor/megavideoz.py b/youtube_dl/extractor/megavideoz.py deleted file mode 100644 index af7ff07ea..000000000 --- a/youtube_dl/extractor/megavideoz.py +++ /dev/null @@ -1,56 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - xpath_text, -) - - -class MegaVideozIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?megavideoz\.eu/video/(?P[^/]+)(?:/(?P[^/]+))?' - _TEST = { - 'url': 'http://megavideoz.eu/video/WM6UB919XMXH/SMPTE-Universal-Film-Leader', - 'info_dict': { - 'id': '48723', - 'display_id': 'SMPTE-Universal-Film-Leader', - 'ext': 'mp4', - 'title': 'SMPTE Universal Film Leader', - 'thumbnail': 're:https?://.*?\.jpg', - 'duration': 10.93, - } - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - webpage = self._download_webpage(url, display_id) - - if any(p in webpage for p in ('>Video Not Found<', '>404 Error<')): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - config = self._download_xml( - self._search_regex( - r"var\s+cnf\s*=\s*'([^']+)'", webpage, 'cnf url'), - display_id) - - video_url = xpath_text(config, './file', 'video url', fatal=True) - title = xpath_text(config, './title', 'title', fatal=True) - thumbnail = xpath_text(config, './image', 'thumbnail') - duration = float_or_none(xpath_text(config, './duration', 'duration')) - video_id = xpath_text(config, './mediaid', 'video id') or video_id - - return { - 'id': video_id, - 'display_id': display_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration - } diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index ccc88cfb1..184c7a323 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -13,7 +13,7 @@ from ..utils import ( class OdnoklassnikiIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video|web-api/video/moviePlayer)/(?P[\d-]+)' + _VALID_URL = r'https?://(?:www\.)?(?:odnoklassniki|ok)\.ru/(?:video(?:embed)?|web-api/video/moviePlayer)/(?P[\d-]+)' _TESTS = [{ # metadata in JSON 'url': 'http://ok.ru/video/20079905452', @@ -66,6 +66,9 @@ class OdnoklassnikiIE(InfoExtractor): }, { 'url': 'http://www.ok.ru/video/20648036891', 'only_matching': True, + }, { + 'url': 'http://www.ok.ru/videoembed/20648036891', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rte.py b/youtube_dl/extractor/rte.py index 04158b993..d9cfbf180 100644 --- a/youtube_dl/extractor/rte.py +++ b/youtube_dl/extractor/rte.py @@ -9,16 +9,16 @@ from ..utils import ( class RteIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P[0-9]+)' _TEST = { - 'url': 'http://www.rte.ie/player/de/show/10363114/', + 'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/', 'info_dict': { - 'id': '10363114', + 'id': '10478715', 'ext': 'mp4', - 'title': 'One News', + 'title': 'Watch iWitness online', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'The One O\'Clock News followed by Weather.', - 'duration': 436.844, + 'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.', + 'duration': 60.046, }, 'params': { 'skip_download': 'f4m fails with --test atm' diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 023911c41..3ec08b674 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -15,6 +15,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + encode_dict, ExtractorError, int_or_none, parse_duration, @@ -27,8 +28,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'http://usher.twitch.tv' - _LOGIN_URL = 'https://secure.twitch.tv/login' - _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new' + _LOGIN_URL = 'http://www.twitch.tv/login' _NETRC_MACHINE = 'twitch' def _handle_error(self, response): @@ -61,26 +61,28 @@ class TwitchBaseIE(InfoExtractor): if username is None: return - login_page = self._download_webpage( + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') login_form = self._hidden_inputs(login_page) login_form.update({ - 'login': username.encode('utf-8'), - 'password': password.encode('utf-8'), + 'username': username, + 'password': password, }) + redirect_url = handle.geturl() + post_url = self._search_regex( r']+action=(["\'])(?P.+?)\1', login_page, - 'post url', default=self._LOGIN_POST_URL, group='url') + 'post url', default=redirect_url, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + post_url = compat_urlparse.urljoin(redirect_url, post_url) request = compat_urllib_request.Request( - post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8')) - request.add_header('Referer', self._LOGIN_URL) + post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8')) + request.add_header('Referer', redirect_url) response = self._download_webpage( request, None, 'Logging in as %s' % username) @@ -238,14 +240,24 @@ class TwitchVodIE(TwitchItemBaseIE): def _real_extract(self, url): item_id = self._match_id(url) + info = self._download_info(self._ITEM_SHORTCUT, item_id) access_token = self._download_json( '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id, 'Downloading %s access token' % self._ITEM_TYPE) + formats = self._extract_m3u8_formats( - '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true' - % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']), + '%s/vod/%s?%s' % ( + self._USHER_BASE, item_id, + compat_urllib_parse.urlencode({ + 'allow_source': 'true', + 'allow_spectre': 'true', + 'player': 'twitchweb', + 'nauth': access_token['token'], + 'nauthsig': access_token['sig'], + })), item_id, 'mp4') + self._prefer_source(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1aaa06305..9d3e46b94 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -6,23 +7,51 @@ from .common import InfoExtractor from ..compat import compat_urllib_request from ..utils import ( float_or_none, - unescapeHTML, + xpath_text, + remove_end, ) class TwitterCardIE(InfoExtractor): + IE_NAME = 'twitter:card' _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P\d+)' - _TEST = { - 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': 'a74f50b310c83170319ba16de6955192', - 'info_dict': { - 'id': '560070183650213889', - 'ext': 'mp4', - 'title': 'TwitterCard', - 'thumbnail': 're:^https?://.*\.jpg$', - 'duration': 30.033, + _TESTS = [ + { + 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', + 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', + 'info_dict': { + 'id': '560070183650213889', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 30.033, + } }, - } + { + 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', + 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'info_dict': { + 'id': '623160978427936768', + 'ext': 'mp4', + 'title': 'TwitterCard', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 80.155, + }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'upload_date': '20111013', + 'uploader': 'OMG! Ubuntu!', + 'uploader_id': 'omgubuntu', + }, + } + ] def _real_extract(self, url): video_id = self._match_id(url) @@ -40,10 +69,24 @@ class TwitterCardIE(InfoExtractor): request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) - config = self._parse_json( - unescapeHTML(self._search_regex( - r'data-player-config="([^"]+)"', webpage, 'data player config')), + youtube_url = self._html_search_regex( + r']+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"', + webpage, 'youtube iframe', default=None) + if youtube_url: + return self.url_result(youtube_url, 'Youtube') + + config = self._parse_json(self._html_search_regex( + r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) + if 'playlist' not in config: + if 'vmapUrl' in config: + vmap_data = self._download_xml(config['vmapUrl'], video_id) + video_url = xpath_text(vmap_data, './/MediaFile').strip() + formats.append({ + 'url': video_url, + }) + break # same video regardless of UA + continue video_url = config['playlist'][0]['source'] @@ -70,3 +113,54 @@ class TwitterCardIE(InfoExtractor): 'duration': duration, 'formats': formats, } + + +class TwitterIE(InfoExtractor): + IE_NAME = 'twitter' + _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' + _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' + + _TEST = { + 'url': 'https://twitter.com/freethenipple/status/643211948184596480', + 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', + 'info_dict': { + 'id': '643211948184596480', + 'ext': 'mp4', + 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 12.922, + 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'uploader': 'FREE THE NIPPLE', + 'uploader_id': 'freethenipple', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user_id = mobj.group('user_id') + twid = mobj.group('id') + + webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid) + + username = remove_end(self._og_search_title(webpage), ' on Twitter') + + title = self._og_search_description(webpage).strip('').replace('\n', ' ') + + # strip 'https -_t.co_BJYgOjSeGA' junk from filenames + mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title) + title, short_url = mobj.groups() + + card_id = self._search_regex( + r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + + return { + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'uploader_id': user_id, + 'uploader': username, + 'url': card_url, + 'webpage_url': url, + 'description': '%s on Twitter: "%s %s"' % (username, title, short_url), + 'title': username + ' - ' + title, + } diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py index 078d283b2..382517a4a 100644 --- a/youtube_dl/extractor/vidme.py +++ b/youtube_dl/extractor/vidme.py @@ -93,6 +93,10 @@ class VidmeIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # nsfw, user-disabled + 'url': 'https://vid.me/dzGJ', + 'only_matching': True, }] def _real_extract(self, url): @@ -114,6 +118,12 @@ class VidmeIE(InfoExtractor): video = response['video'] + if video.get('state') == 'user-disabled': + raise ExtractorError( + 'Vidme said: This video has been suspended either due to a copyright claim, ' + 'or for violating the terms of use.', + expected=True) + formats = [{ 'format_id': f.get('type'), 'url': f['uri'], diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index 632e57fb4..7cf930d69 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -131,10 +131,11 @@ class ViewsterIE(InfoExtractor): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds')) elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', m3u8_id='hls', - fatal=False # m3u8 sometimes fail - )) + fatal=False) # m3u8 sometimes fail + if m3u8_formats: + formats.extend(m3u8_formats) else: format_id = media.get('Bitrate') f = { diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 027f47ee3..0f84656c0 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -212,7 +212,7 @@ class VimeoIE(VimeoBaseInfoExtractor): url = url.replace('http://', 'https://') password_request = compat_urllib_request.Request(url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Cookie', 'clip_v=1; vuid=%s' % vuid) + password_request.add_header('Cookie', 'clip_test2=1; vuid=%s' % vuid) password_request.add_header('Referer', url) return self._download_webpage( password_request, video_id, @@ -286,7 +286,17 @@ class VimeoIE(VimeoBaseInfoExtractor): try: try: config_url = self._html_search_regex( - r' data-config-url="(.+?)"', webpage, 'config URL') + r' data-config-url="(.+?)"', webpage, + 'config URL', default=None) + if not config_url: + # Sometimes new react-based page is served instead of old one that require + # different config URL extraction approach (see + # https://github.com/rg3/youtube-dl/pull/7209) + vimeo_clip_page_config = self._search_regex( + r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage, + 'vimeo clip page config') + config_url = self._parse_json( + vimeo_clip_page_config, video_id)['player']['config_url'] config_json = self._download_webpage(config_url, video_id) config = json.loads(config_json) except RegexNotFoundError: diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index c733a48fa..be72f3147 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,10 +1,14 @@ +# coding: utf-8 from __future__ import unicode_literals import re import itertools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + unified_strdate, +) class VineIE(InfoExtractor): @@ -17,10 +21,12 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': 'Chicken.', 'alt_title': 'Vine by Jack Dorsey', - 'description': 'Chicken.', 'upload_date': '20130519', 'uploader': 'Jack Dorsey', 'uploader_id': '76', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/MYxVapFvz2z', @@ -29,11 +35,13 @@ class VineIE(InfoExtractor): 'id': 'MYxVapFvz2z', 'ext': 'mp4', 'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', - 'alt_title': 'Vine by Luna', - 'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14', + 'alt_title': 'Vine by Mars Ruiz', 'upload_date': '20140815', - 'uploader': 'Luna', + 'uploader': 'Mars Ruiz', 'uploader_id': '1102363502380728320', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/v/bxVjBbZlPUH', @@ -43,14 +51,33 @@ class VineIE(InfoExtractor): 'ext': 'mp4', 'title': '#mw3 #ac130 #killcam #angelofdeath', 'alt_title': 'Vine by Z3k3', - 'description': '#mw3 #ac130 #killcam #angelofdeath', 'upload_date': '20130430', 'uploader': 'Z3k3', 'uploader_id': '936470460173008896', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, }, }, { 'url': 'https://vine.co/oembed/MYxVapFvz2z.json', 'only_matching': True, + }, { + 'url': 'https://vine.co/v/e192BnZnZ9V', + 'info_dict': { + 'id': 'e192BnZnZ9V', + 'ext': 'mp4', + 'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries #lovesickseason2', + 'alt_title': 'Vine by Pimry_zaa', + 'upload_date': '20150705', + 'uploader': 'Pimry_zaa', + 'uploader_id': '1135760698325307392', + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -65,25 +92,26 @@ class VineIE(InfoExtractor): formats = [{ 'format_id': '%(format)s-%(rate)s' % f, - 'vcodec': f['format'], - 'quality': f['rate'], + 'vcodec': f.get('format'), + 'quality': f.get('rate'), 'url': f['videoUrl'], - } for f in data['videoUrls']] + } for f in data['videoUrls'] if f.get('videoUrl')] self._sort_formats(formats) + username = data.get('username') + return { 'id': video_id, - 'title': self._og_search_title(webpage), - 'alt_title': self._og_search_description(webpage, default=None), - 'description': data['description'], - 'thumbnail': data['thumbnailUrl'], - 'upload_date': unified_strdate(data['created']), - 'uploader': data['username'], - 'uploader_id': data['userIdStr'], - 'like_count': data['likes']['count'], - 'comment_count': data['comments']['count'], - 'repost_count': data['reposts']['count'], + 'title': data.get('description') or self._og_search_title(webpage), + 'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None), + 'thumbnail': data.get('thumbnailUrl'), + 'upload_date': unified_strdate(data.get('created')), + 'uploader': username, + 'uploader_id': data.get('userIdStr'), + 'like_count': int_or_none(data.get('likes', {}).get('count')), + 'comment_count': int_or_none(data.get('comments', {}).get('count')), + 'repost_count': int_or_none(data.get('reposts', {}).get('count')), 'formats': formats, } diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index 4098e4629..08dc81f3a 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -46,6 +46,12 @@ class YandexMusicTrackIE(InfoExtractor): % (data['host'], key, data['ts'] + data['path'], storage[1])) def _get_track_info(self, track): + thumbnail = None + cover_uri = track.get('albums', [{}])[0].get('coverUri') + if cover_uri: + thumbnail = cover_uri.replace('%%', 'orig') + if not thumbnail.startswith('http'): + thumbnail = 'http://' + thumbnail return { 'id': track['id'], 'ext': 'mp3', @@ -53,6 +59,7 @@ class YandexMusicTrackIE(InfoExtractor): 'title': '%s - %s' % (track['artists'][0]['name'], track['title']), 'filesize': int_or_none(track.get('fileSize')), 'duration': float_or_none(track.get('durationMs'), 1000), + 'thumbnail': thumbnail, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index b252e36e1..08e821362 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return +class YoutubePlaylistBaseInfoExtractor(InfoExtractor): + # Extract the video ids from the playlist pages + def _entries(self, page, playlist_id): + more_widget_html = content_html = page + for page_num in itertools.count(1): + for video_id, video_title in self.extract_videos_from_page(content_html): + yield self.url_result( + video_id, 'Youtube', video_id=video_id, + video_title=video_title) + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break + more_widget_html = more['load_more_widget_html'] + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + for mobj in re.finditer(self._VIDEO_RE, page): + # The link with index 0 is not the first video of the playlist (not sure if still actual) + if 'index' in mobj.groupdict() and mobj.group('id') == '0': + continue + video_id = mobj.group('id') + video_title = unescapeHTML(mobj.group('title')) + if video_title: + video_title = video_title.strip() + try: + idx = ids_in_page.index(video_id) + if video_title and not titles_in_page[idx]: + titles_in_page[idx] = video_title + except ValueError: + ids_in_page.append(video_id) + titles_in_page.append(video_title) + return zip(ids_in_page, titles_in_page) + + class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com' _VALID_URL = r"""(?x)^ @@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubePlaylistIE(YoutubeBaseInfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: (?:https?://)? @@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.report_warning('Youtube gives an alert message: ' + match) - # Extract the video ids from the playlist pages - def _entries(): - more_widget_html = content_html = page - for page_num in itertools.count(1): - matches = re.finditer(self._VIDEO_RE, content_html) - # We remove the duplicates and the link with index 0 - # (it's not the first video of the playlist) - new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') - for vid_id in new_ids: - yield self.url_result(vid_id, 'Youtube', video_id=vid_id) - - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape) - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] - playlist_title = self._html_search_regex( r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', page, 'title') - return self.playlist_result(_entries(), playlist_id, playlist_title) + return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) def _real_extract(self, url): # Extract playlist id @@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self._extract_playlist(playlist_id) -class YoutubeChannelIE(InfoExtractor): +class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' + _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' _TESTS = [{ 'note': 'paginated channel', @@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor): } }] - @staticmethod - def extract_videos_from_page(page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page): - video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - return zip(ids_in_page, titles_in_page) - def _real_extract(self, url): channel_id = self._match_id(url) @@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) - def _entries(): - more_widget_html = content_html = channel_page - for pagenum in itertools.count(1): - - for video_id, video_title in self.extract_videos_from_page(content_html): - yield self.url_result( - video_id, 'Youtube', video_id=video_id, - video_title=video_title) - - mobj = re.search( - r'data-uix-load-more-href="/?(?P<more>[^"]+)"', - more_widget_html) - if not mobj: - break - - more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), channel_id, - 'Downloading page #%s' % (pagenum + 1), - transform_source=uppercase_escape) - content_html = more['content_html'] - more_widget_html = more['load_more_widget_html'] - - return self.playlist_result(_entries(), channel_id) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 98f15177b..a795f56b3 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, unified_strdate, OnDemandPagedList, + xpath_text, ) @@ -19,13 +20,11 @@ def extract_from_xml_url(ie, video_id, xml_url): errnote='Failed to download video info') title = doc.find('.//information/title').text - description = doc.find('.//information/detail').text - duration = int(doc.find('.//details/lengthSec').text) - uploader_node = doc.find('.//details/originChannelTitle') - uploader = None if uploader_node is None else uploader_node.text - uploader_id_node = doc.find('.//details/originChannelId') - uploader_id = None if uploader_id_node is None else uploader_id_node.text - upload_date = unified_strdate(doc.find('.//details/airtime').text) + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) def xml_to_format(fnode): video_url = fnode.find('url').text @@ -40,15 +39,14 @@ def extract_from_xml_url(ie, video_id, xml_url): ext = format_m.group('container') proto = format_m.group('proto').lower() - quality = fnode.find('./quality').text - abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr_node = fnode.find('./videoBitrate') - vbr = None if vbr_node is None else int(vbr_node.text) // 1000 + quality = xpath_text(fnode, './quality', 'quality') + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - width_node = fnode.find('./width') - width = None if width_node is None else int_or_none(width_node.text) - height_node = fnode.find('./height') - height = None if height_node is None else int_or_none(height_node.text) + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) format_note = '' if not format_note: @@ -64,12 +62,31 @@ def extract_from_xml_url(ie, video_id, xml_url): 'vbr': vbr, 'width': width, 'height': height, - 'filesize': int_or_none(fnode.find('./filesize').text), + 'filesize': filesize, 'format_note': format_note, 'protocol': proto, '_available': is_available, } + def xml_to_thumbnails(fnode): + thumbnails = [] + for node in fnode: + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + if 'key' in node.attrib: + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + return thumbnails + + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + format_nodes = doc.findall('.//formitaeten/formitaet') formats = list(filter( lambda f: f['_available'], @@ -81,6 +98,7 @@ def extract_from_xml_url(ie, video_id, xml_url): 'title': title, 'description': description, 'duration': duration, + 'thumbnails': thumbnails, 'uploader': uploader, 'uploader_id': uploader_id, 'upload_date': upload_date, diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1dc3153fd..db5b3698e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals +import base64 import calendar import codecs import contextlib @@ -1371,7 +1372,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): v = getattr(v, get_attr, None) if v == '': v = None - return default if v is None else (int(v) * invscale // scale) + if v is None: + return default + try: + return int(v) * invscale // scale + except ValueError: + return default def str_or_none(v, default=None): @@ -1387,7 +1393,12 @@ def str_to_int(int_str): def float_or_none(v, scale=1, invscale=1, default=None): - return default if v is None else (float(v) * invscale / scale) + if v is None: + return default + try: + return float(v) * invscale / scale + except ValueError: + return default def parse_duration(s): @@ -1785,6 +1796,10 @@ def urlhandle_detect_ext(url_handle): return mimetype2ext(getheader('Content-Type')) +def encode_data_uri(data, mime_type): + return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii')) + + def age_restricted(content_limit, age_limit): """ Returns True iff the content should be blocked """ diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 0908e963d..660b0050b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.10.12' +__version__ = '2015.10.18'