Gilles Habran 2016-05-06 08:43:44 +02:00
commit d9d23ba68a
11 changed files with 248 additions and 74 deletions

Makefile

@@ -1,7 +1,7 @@
 all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
 
 clean:
-	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
+	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
 	find . -name "*.pyc" -delete
 	find . -name "*.class" -delete

test/test_utils.py

@@ -139,8 +139,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual('yes_no', sanitize_filename('yes? no', restricted=True))
         self.assertEqual('this_-_that', sanitize_filename('this: that', restricted=True))
 
-        tests = 'a\xe4b\u4e2d\u56fd\u7684c'
-        self.assertEqual(sanitize_filename(tests, restricted=True), 'a_b_c')
+        tests = 'aäb\u4e2d\u56fd\u7684c'
+        self.assertEqual(sanitize_filename(tests, restricted=True), 'aab_c')
         self.assertTrue(sanitize_filename('\xf6', restricted=True) != '')  # No empty filename
 
         forbidden = '"\0\\/&!: \'\t\n()[]{}$;`^,#'
@@ -155,6 +155,10 @@ class TestUtil(unittest.TestCase):
         self.assertTrue(sanitize_filename('-', restricted=True) != '')
         self.assertTrue(sanitize_filename(':', restricted=True) != '')
 
+        self.assertEqual(sanitize_filename(
+            'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', restricted=True),
+            'AAAAAAAECEEEEIIIIDNOOOOOOUUUUYPssaaaaaaaeceeeeiiiionoooooouuuuypy')
+
     def test_sanitize_ids(self):
         self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')
         self.assertEqual(sanitize_filename('_BD_eEpuzXw', is_id=True), '_BD_eEpuzXw')

youtube_dl/extractor/dailymail.py

@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+        'md5': '2f639d446394f53f3a33658b518b6615',
+        'info_dict': {
+            'id': '1288527',
+            'ext': 'mp4',
+            'title': 'Turn any video into an impressionist masterpiece',
+            'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._parse_json(self._search_regex(
+            r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+        title = video_data['title']
+
+        video_sources = self._download_json(video_data.get(
+            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+        formats = []
+        for rendition in video_sources['renditions']:
+            rendition_url = rendition.get('url')
+            if not rendition_url:
+                continue
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+            container = rendition.get('videoContainer')
+            is_hls = container == 'M2TS'
+            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+            formats.append({
+                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+                'url': rendition_url,
+                'width': int_or_none(rendition.get('frameWidth')),
+                'height': int_or_none(rendition.get('frameHeight')),
+                'tbr': tbr,
+                'vcodec': rendition.get('videoCodec'),
+                'container': container,
+                'protocol': protocol,
+                'ext': 'mp4' if is_hls else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('descr'),
+            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+            'formats': formats,
+        }
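For readers skimming the new extractor, a minimal sketch of what the rendition loop above does, run against an invented video-sources payload: the field names (renditions, url, encodingRate, videoContainer) mirror what the code reads, but the values and URLs are hypothetical and the protocol detection is simplified to plain HTTP.

# Hypothetical payload, shaped only after the fields the extractor reads.
sample_sources = {
    'renditions': [
        {'url': 'http://example.com/video-1500.mp4', 'encodingRate': 1500000,
         'videoContainer': 'MP4', 'frameWidth': 1280, 'frameHeight': 720},
        {'url': 'http://example.com/master.m3u8', 'encodingRate': 800000,
         'videoContainer': 'M2TS'},
    ],
}

for rendition in sample_sources['renditions']:
    is_hls = rendition.get('videoContainer') == 'M2TS'  # same HLS test as above
    tbr = rendition.get('encodingRate', 0) // 1000
    # Reproduces the format_id scheme: 'http-1500', 'hls-800', ...
    print(('hls' if is_hls else 'http') + ('-%d' % tbr if tbr else ''))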

youtube_dl/extractor/extractors.py

@@ -157,6 +157,7 @@ from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
 from .cultureunplugged import CultureUnpluggedIE
 from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
 from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
@@ -560,7 +561,10 @@ from .parliamentliveuk import ParliamentLiveUKIE
 from .patreon import PatreonIE
 from .pbs import PBSIE
 from .people import PeopleIE
-from .periscope import PeriscopeIE
+from .periscope import (
+    PeriscopeIE,
+    PeriscopeUserIE,
+)
 from .philharmoniedeparis import PhilharmonieDeParisIE
 from .phoenix import PhoenixIE
 from .photobucket import PhotobucketIE

youtube_dl/extractor/fczenit.py

@@ -1,20 +1,19 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 
 
 class FczenitIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://fc-zenit.ru/video/gl6785/',
-        'md5': '458bacc24549173fe5a5aa29174a5606',
+        'url': 'http://fc-zenit.ru/video/41044/',
+        'md5': '0e3fab421b455e970fa1aa3891e57df0',
         'info_dict': {
-            'id': '6785',
+            'id': '41044',
             'ext': 'mp4',
-            'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+            'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
         },
     }
@@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+        video_title = self._html_search_regex(
+            r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
 
-        bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
-        bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+        video_items = self._parse_json(self._search_regex(
+            r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+            video_id)
+
+        def merge_dicts(*dicts):
+            ret = {}
+            for a_dict in dicts:
+                ret.update(a_dict)
+            return ret
 
         formats = [{
-            'url': furl,
-            'tbr': tbr,
-        } for furl, tbr in bitrates]
+            'url': compat_urlparse.urljoin(url, video_url),
+            'tbr': int(tbr),
+        } for tbr, video_url in merge_dicts(*video_items).items()]
 
         self._sort_formats(formats)

youtube_dl/extractor/kuwo.py

@@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor):
         'info_dict': {
             'id': '86375',
             'title': '八十年代精选',
+            'description': '这些都是属于八十年代的回忆!',
         },
         'playlist_mincount': 24,
     }

youtube_dl/extractor/periscope.py

@@ -7,6 +7,7 @@ from ..utils import parse_iso8601
 
 class PeriscopeIE(InfoExtractor):
     IE_DESC = 'Periscope'
+    IE_NAME = 'periscope'
     _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
     # Alive example URLs can be found here http://onperiscope.com/
     _TESTS = [{
@@ -79,3 +80,39 @@ class PeriscopeIE(InfoExtractor):
             'thumbnails': thumbnails,
             'formats': formats,
         }
+
+
+class PeriscopeUserIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.periscope\.tv/(?P<id>[^/]+)/?$'
+    IE_DESC = 'Periscope user videos'
+    IE_NAME = 'periscope:user'
+
+    _TEST = {
+        'url': 'https://www.periscope.tv/LularoeHusbandMike/',
+        'info_dict': {
+            'id': 'LularoeHusbandMike',
+            'title': 'LULAROE HUSBAND MIKE',
+        },
+        # Periscope only shows videos in the last 24 hours, so it's possible to
+        # get 0 videos
+        'playlist_mincount': 0,
+    }
+
+    def _real_extract(self, url):
+        user_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, user_id)
+
+        broadcast_data = self._parse_json(self._html_search_meta(
+            'broadcast-data', webpage, default='{}'), user_id)
+        username = broadcast_data.get('user', {}).get('display_name')
+        user_broadcasts = self._parse_json(
+            self._html_search_meta('user-broadcasts', webpage, default='{}'),
+            user_id)
+
+        entries = [
+            self.url_result(
+                'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id']))
+            for broadcast in user_broadcasts.get('broadcasts', [])]
+
+        return self.playlist_result(entries, user_id, username)
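As a rough illustration of the parsing step in _real_extract above: the extractor reads two JSON blobs out of meta tags named broadcast-data and user-broadcasts. A self-contained sketch against an invented page snippet (only the meta names match the code; the display name and broadcast ID are made up):

import json
import re

page = ('<meta name="broadcast-data" content=\'{"user": {"display_name": "LULAROE HUSBAND MIKE"}}\'>'
        '<meta name="user-broadcasts" content=\'{"broadcasts": [{"id": "1zqKVWybqeDGB"}]}\'>')

def search_meta(name, webpage):
    # Simplified stand-in for _html_search_meta + _parse_json.
    mobj = re.search(r'<meta name="%s" content=\'([^\']+)\'' % name, webpage)
    return json.loads(mobj.group(1)) if mobj else {}

username = search_meta('broadcast-data', page).get('user', {}).get('display_name')
broadcasts = search_meta('user-broadcasts', page).get('broadcasts', [])
print(username, ['https://www.periscope.tv/LularoeHusbandMike/%s' % b['id'] for b in broadcasts])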

youtube_dl/extractor/redtube.py

@@ -1,7 +1,12 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    unified_strdate,
+)
 
 
 class RedTubeIE(InfoExtractor):
@@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):
             'id': '66418',
             'ext': 'mp4',
             'title': 'Sucked on a toilet',
+            'upload_date': '20120831',
+            'duration': 596,
+            'view_count': int,
             'age_limit': 18,
         }
     }
@@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor):
         if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
             raise ExtractorError('Video %s has been removed' % video_id, expected=True)
 
-        video_url = self._html_search_regex(
-            r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
-        video_title = self._html_search_regex(
-            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
-            webpage, 'title')
-        video_thumbnail = self._og_search_thumbnail(webpage)
+        title = self._html_search_regex(
+            (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+             r'videoTitle\s*:\s*(["\'])(?P<title>)\1'),
+            webpage, 'title', group='title')
+
+        formats = []
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+            video_id, fatal=False)
+        if sources and isinstance(sources, dict):
+            for format_id, format_url in sources.items():
+                if format_url:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'height': int_or_none(format_id),
+                    })
+        else:
+            video_url = self._html_search_regex(
+                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+            formats.append({'url': video_url})
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+            webpage, 'upload date', fatal=False))
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+            webpage, 'view count', fatal=False))
 
         # No self-labeling, but they describe themselves as
         # "Home of Videos Porno"
@@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'url': video_url,
             'ext': 'mp4',
-            'title': video_title,
-            'thumbnail': video_thumbnail,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
             'age_limit': age_limit,
+            'formats': formats,
         }

youtube_dl/extractor/udemy.py

@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_urlparse,
 )
@@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor):
         if enroll_url:
             webpage = self._download_webpage(
                 combine_url(base_url, enroll_url),
-                course_id, 'Enrolling in the course')
+                course_id, 'Enrolling in the course',
+                headers={'Referer': base_url})
             if '>You have enrolled in' in webpage:
                 self.to_screen('%s: Successfully enrolled in the course' % course_id)
 
     def _download_lecture(self, course_id, lecture_id):
         return self._download_json(
-            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
-                course_id, lecture_id, compat_urllib_parse_urlencode({
-                    'fields[lecture]': 'title,description,view_html,asset',
-                    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
-                })),
-            lecture_id, 'Downloading lecture JSON')
+            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+            % (course_id, lecture_id),
+            lecture_id, 'Downloading lecture JSON', query={
+                'fields[lecture]': 'title,description,view_html,asset',
+                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+            })
 
     def _handle_error(self, response):
         if not isinstance(response, dict):
@@ -155,13 +155,13 @@ class UdemyIE(InfoExtractor):
             'password': password,
         })
 
-        request = sanitized_Request(
-            self._LOGIN_URL, urlencode_postdata(login_form))
-        request.add_header('Referer', self._ORIGIN_URL)
-        request.add_header('Origin', self._ORIGIN_URL)
-
         response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+            self._LOGIN_URL, None, 'Logging in as %s' % username,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Referer': self._ORIGIN_URL,
+                'Origin': self._ORIGIN_URL,
+            })
 
         if not is_logged(response):
             error = self._html_search_regex(

youtube_dl/extractor/yandexmusic.py

@@ -10,8 +10,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     float_or_none,
-    sanitized_Request,
-    urlencode_postdata,
 )
@@ -177,7 +175,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
 class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
     IE_NAME = 'yandexmusic:playlist'
     IE_DESC = 'Яндекс.Музыка - Плейлист'
-    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
@@ -196,47 +194,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
             'id': '1036',
             'title': 'Музыка 90-х',
         },
-        'playlist_count': 310,
+        'playlist_mincount': 300,
         'skip': 'Travis CI servers blocked by YandexMusic',
     }]
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        mu = self._parse_json(
-            self._search_regex(
-                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
-            playlist_id)
-
-        playlist = mu['pageData']['playlist']
-        tracks, track_ids = playlist['tracks'], playlist['trackIds']
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        user = mobj.group('user')
+        playlist_id = mobj.group('id')
+
+        playlist = self._download_json(
+            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+            playlist_id, 'Downloading missing tracks JSON',
+            fatal=False,
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+                'X-Retpath-Y': url,
+            },
+            query={
+                'owner': user,
+                'kinds': playlist_id,
+                'light': 'true',
+                'lang': tld,
+                'external-domain': 'music.yandex.%s' % tld,
+                'overembed': 'false',
+            })['playlist']
+
+        tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
 
-        # tracks dictionary shipped with webpage is limited to 150 tracks,
+        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
         # missing tracks should be retrieved manually.
         if len(tracks) < len(track_ids):
-            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
-            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
-            request = sanitized_Request(
-                'https://music.yandex.ru/handlers/track-entries.jsx',
-                urlencode_postdata({
-                    'entries': ','.join(missing_track_ids),
-                    'lang': mu.get('settings', {}).get('lang', 'en'),
-                    'external-domain': 'music.yandex.ru',
-                    'overembed': 'false',
-                    'sign': mu.get('authData', {}).get('user', {}).get('sign'),
-                    'strict': 'true',
-                }))
-            request.add_header('Referer', url)
-            request.add_header('X-Requested-With', 'XMLHttpRequest')
-
+            present_track_ids = set([
+                compat_str(track['id'])
+                for track in tracks if track.get('id')])
+            missing_track_ids = [
+                track_id for track_id in track_ids
+                if track_id not in present_track_ids]
             missing_tracks = self._download_json(
-                request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+                'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+                playlist_id, 'Downloading missing tracks JSON',
+                fatal=False,
+                headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                },
+                query={
+                    'entries': ','.join(missing_track_ids),
+                    'lang': tld,
+                    'external-domain': 'music.yandex.%s' % tld,
+                    'overembed': 'false',
+                    'strict': 'true',
+                })
             if missing_tracks:
                 tracks.extend(missing_tracks)
 
         return self.playlist_result(
             self._build_playlist(tracks),
             compat_str(playlist_id),
-            playlist['title'], playlist.get('description'))
+            playlist.get('title'), playlist.get('description'))

youtube_dl/utils.py

@@ -14,8 +14,8 @@ import email.utils
 import errno
 import functools
 import gzip
-import itertools
 import io
+import itertools
 import json
 import locale
 import math
@@ -24,8 +24,8 @@ import os
 import pipes
 import platform
 import re
-import ssl
 import socket
+import ssl
 import struct
 import subprocess
 import sys
@@ -89,6 +89,11 @@ KNOWN_EXTENSIONS = (
     'wav',
     'f4f', 'f4m', 'm3u8', 'smil')
 
+# needed for sanitizing filenames in restricted mode
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
+
 
 def preferredencoding():
     """Get preferred encoding.
@@ -365,6 +370,8 @@ def sanitize_filename(s, restricted=False, is_id=False):
     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
     """
     def replace_insane(char):
+        if restricted and char in ACCENT_CHARS:
+            return ACCENT_CHARS[char]
         if char == '?' or ord(char) < 32 or ord(char) == 127:
             return ''
         elif char == '"':
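Taken together with the new test in test/test_utils.py, the behavior change is easy to check from a Python shell, assuming a youtube_dl checkout that includes this commit is on the path:

from youtube_dl.utils import sanitize_filename

# Accented Latin characters now transliterate via ACCENT_CHARS instead of
# collapsing to '_'; characters with no mapping still become '_'.
print(sanitize_filename('a\xe4b\u4e2d\u56fd\u7684c', restricted=True))  # 'aab_c'
print(sanitize_filename('\xd1o\xf1o', restricted=True))  # 'Nono'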