Merge pull request #4 from rg3/master

update 5th may
This commit is contained in:
dntt1 2016-05-05 22:24:18 +05:30
commit 87f6d15c5c
8 changed files with 192 additions and 69 deletions

Makefile

@@ -1,7 +1,7 @@
 all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
 
 clean:
-	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
+	rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
 	find . -name "*.pyc" -delete
 	find . -name "*.class" -delete

youtube_dl/extractor/dailymail.py (new file)

@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    determine_protocol,
+)
+
+
+class DailyMailIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html',
+        'md5': '2f639d446394f53f3a33658b518b6615',
+        'info_dict': {
+            'id': '1288527',
+            'ext': 'mp4',
+            'title': 'Turn any video into an impressionist masterpiece',
+            'description': 'md5:88ddbcb504367987b2708bb38677c9d2',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        video_data = self._parse_json(self._search_regex(
+            r"data-opts='({.+?})'", webpage, 'video data'), video_id)
+        title = video_data['title']
+        video_sources = self._download_json(video_data.get(
+            'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+        formats = []
+        for rendition in video_sources['renditions']:
+            rendition_url = rendition.get('url')
+            if not rendition_url:
+                continue
+            tbr = int_or_none(rendition.get('encodingRate'), 1000)
+            container = rendition.get('videoContainer')
+            is_hls = container == 'M2TS'
+            protocol = 'm3u8_native' if is_hls else determine_protocol({'url': rendition_url})
+            formats.append({
+                'format_id': ('hls' if is_hls else protocol) + ('-%d' % tbr if tbr else ''),
+                'url': rendition_url,
+                'width': int_or_none(rendition.get('frameWidth')),
+                'height': int_or_none(rendition.get('frameHeight')),
+                'tbr': tbr,
+                'vcodec': rendition.get('videoCodec'),
+                'container': container,
+                'protocol': protocol,
+                'ext': 'mp4' if is_hls else None,
+            })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': video_data.get('descr'),
+            'thumbnail': video_data.get('poster') or video_data.get('thumbnail'),
+            'formats': formats,
+        }
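
A quick way to smoke-test the new extractor is through youtube-dl's public Python API; below is a minimal sketch (metadata-only probe, URL taken from the _TEST above; output fields follow the code in this file):

from __future__ import unicode_literals

import youtube_dl

# Metadata-only probe; no media is downloaded.
with youtube_dl.YoutubeDL({'skip_download': True}) as ydl:
    info = ydl.extract_info(
        'http://www.dailymail.co.uk/video/sciencetech/video-1288527/'
        'Turn-video-impressionist-masterpiece.html',
        download=False)

print(info['title'])
for f in info['formats']:
    # format_id comes out as e.g. 'hls-1024' or 'http-693' per the logic above
    print(f['format_id'])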

youtube_dl/extractor/extractors.py

@@ -157,6 +157,7 @@ from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
 from .cultureunplugged import CultureUnpluggedIE
 from .cwtv import CWTVIE
+from .dailymail import DailyMailIE
 from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,

youtube_dl/extractor/fczenit.py

@@ -1,20 +1,19 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 
 
 class FczenitIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/gl(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?fc-zenit\.ru/video/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://fc-zenit.ru/video/gl6785/',
-        'md5': '458bacc24549173fe5a5aa29174a5606',
+        'url': 'http://fc-zenit.ru/video/41044/',
+        'md5': '0e3fab421b455e970fa1aa3891e57df0',
         'info_dict': {
-            'id': '6785',
+            'id': '41044',
             'ext': 'mp4',
-            'title': '«Зенит-ТВ»: как Олег Шатов играл против «Урала»',
+            'title': 'Так пишется история: казанский разгром ЦСКА на «Зенит-ТВ»',
         },
     }
@@ -22,15 +21,23 @@ class FczenitIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
-        video_title = self._html_search_regex(r'<div class=\"photoalbum__title\">([^<]+)', webpage, 'title')
+        video_title = self._html_search_regex(
+            r'<[^>]+class=\"photoalbum__title\">([^<]+)', webpage, 'title')
 
-        bitrates_raw = self._html_search_regex(r'bitrates:.*\n(.*)\]', webpage, 'video URL')
-        bitrates = re.findall(r'url:.?\'(.+?)\'.*?bitrate:.?([0-9]{3}?)', bitrates_raw)
+        video_items = self._parse_json(self._search_regex(
+            r'arrPath\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, 'video items'),
+            video_id)
+
+        def merge_dicts(*dicts):
+            ret = {}
+            for a_dict in dicts:
+                ret.update(a_dict)
+            return ret
 
         formats = [{
-            'url': furl,
-            'tbr': tbr,
-        } for furl, tbr in bitrates]
+            'url': compat_urlparse.urljoin(url, video_url),
+            'tbr': int(tbr),
+        } for tbr, video_url in merge_dicts(*video_items).items()]
 
         self._sort_formats(formats)
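
The rewrite keys off arrPath, a JSON-escaped array embedded in the page whose entries are one-key {bitrate: path} objects; merge_dicts collapses them into a single mapping (later entries win on duplicate keys). A self-contained sketch with made-up sample data (the payload shape and URLs are assumptions for illustration):

from __future__ import unicode_literals

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:
    from urlparse import urljoin  # Python 2; what compat_urlparse provides

def merge_dicts(*dicts):
    # Same helper as in the diff: fold all dicts into one.
    ret = {}
    for a_dict in dicts:
        ret.update(a_dict)
    return ret

# Assumed shape of the parsed arrPath payload.
video_items = [{'700': '/video/41044/700.mp4'},
               {'1500': '/video/41044/1500.mp4'}]
page_url = 'http://fc-zenit.ru/video/41044/'

formats = [{
    'url': urljoin(page_url, video_url),  # relative paths become absolute
    'tbr': int(tbr),
} for tbr, video_url in merge_dicts(*video_items).items()]

print(sorted(f['tbr'] for f in formats))  # [700, 1500]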

youtube_dl/extractor/kuwo.py

@@ -266,6 +266,7 @@ class KuwoCategoryIE(InfoExtractor):
         'info_dict': {
             'id': '86375',
             'title': '八十年代精选',
+            'description': '这些都是属于八十年代的回忆!',
         },
         'playlist_mincount': 24,
     }

youtube_dl/extractor/redtube.py

@@ -1,7 +1,12 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    str_to_int,
+    unified_strdate,
+)
 
 
 class RedTubeIE(InfoExtractor):
@@ -13,6 +18,9 @@ class RedTubeIE(InfoExtractor):
             'id': '66418',
             'ext': 'mp4',
             'title': 'Sucked on a toilet',
+            'upload_date': '20120831',
+            'duration': 596,
+            'view_count': int,
             'age_limit': 18,
         }
     }
@@ -24,12 +32,39 @@ class RedTubeIE(InfoExtractor):
         if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
             raise ExtractorError('Video %s has been removed' % video_id, expected=True)
 
-        video_url = self._html_search_regex(
-            r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
-        video_title = self._html_search_regex(
-            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
-            webpage, 'title')
-        video_thumbnail = self._og_search_thumbnail(webpage)
+        title = self._html_search_regex(
+            (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>',
+             r'videoTitle\s*:\s*(["\'])(?P<title>)\1'),
+            webpage, 'title', group='title')
+
+        formats = []
+        sources = self._parse_json(
+            self._search_regex(
+                r'sources\s*:\s*({.+?})', webpage, 'source', default='{}'),
+            video_id, fatal=False)
+        if sources and isinstance(sources, dict):
+            for format_id, format_url in sources.items():
+                if format_url:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'height': int_or_none(format_id),
+                    })
+        else:
+            video_url = self._html_search_regex(
+                r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
+            formats.append({'url': video_url})
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+        upload_date = unified_strdate(self._search_regex(
+            r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
+            webpage, 'upload date', fatal=False))
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+        view_count = str_to_int(self._search_regex(
+            r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
+            webpage, 'view count', fatal=False))
 
         # No self-labeling, but they describe themselves as
         # "Home of Videos Porno"
@@ -37,9 +72,12 @@ class RedTubeIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'url': video_url,
             'ext': 'mp4',
-            'title': video_title,
-            'thumbnail': video_thumbnail,
+            'title': title,
+            'thumbnail': thumbnail,
+            'upload_date': upload_date,
+            'duration': duration,
+            'view_count': view_count,
             'age_limit': age_limit,
+            'formats': formats,
         }
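
Format extraction now prefers an inline sources JSON object (height label mapped to URL) and only falls back to the old <source> tag scrape. A self-contained sketch of that mapping on an assumed payload, reusing the same youtube_dl.utils helpers this file imports (sample values are made up):

from __future__ import unicode_literals

from youtube_dl.utils import int_or_none, str_to_int, unified_strdate

# Assumed shape of the inline `sources` object.
sources = {
    '480': 'http://example.com/v/480.mp4',
    '720': 'http://example.com/v/720.mp4',
}

formats = [{
    'url': format_url,
    'format_id': format_id,
    # '480' -> 480; a non-numeric label would just yield height=None
    'height': int_or_none(format_id),
} for format_id, format_url in sources.items() if format_url]

print(sorted(f['height'] for f in formats))   # [480, 720]
print(str_to_int('1,234,567'))                # 1234567, as for view_count
print(unified_strdate('August 31, 2012'))     # '20120831', as for upload_date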

youtube_dl/extractor/udemy.py

@@ -5,7 +5,6 @@ import re
 from .common import InfoExtractor
 from ..compat import (
     compat_HTTPError,
-    compat_urllib_parse_urlencode,
     compat_urllib_request,
     compat_urlparse,
 )
@@ -84,18 +83,19 @@ class UdemyIE(InfoExtractor):
         if enroll_url:
             webpage = self._download_webpage(
                 combine_url(base_url, enroll_url),
-                course_id, 'Enrolling in the course')
+                course_id, 'Enrolling in the course',
+                headers={'Referer': base_url})
             if '>You have enrolled in' in webpage:
                 self.to_screen('%s: Successfully enrolled in the course' % course_id)
 
     def _download_lecture(self, course_id, lecture_id):
         return self._download_json(
-            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?%s' % (
-                course_id, lecture_id, compat_urllib_parse_urlencode({
-                    'fields[lecture]': 'title,description,view_html,asset',
-                    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
-                })),
-            lecture_id, 'Downloading lecture JSON')
+            'https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
+            % (course_id, lecture_id),
+            lecture_id, 'Downloading lecture JSON', query={
+                'fields[lecture]': 'title,description,view_html,asset',
+                'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
+            })
 
     def _handle_error(self, response):
         if not isinstance(response, dict):
@@ -155,13 +155,13 @@ class UdemyIE(InfoExtractor):
             'password': password,
         })
 
-        request = sanitized_Request(
-            self._LOGIN_URL, urlencode_postdata(login_form))
-        request.add_header('Referer', self._ORIGIN_URL)
-        request.add_header('Origin', self._ORIGIN_URL)
-
         response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
+            self._LOGIN_URL, None, 'Logging in as %s' % username,
+            data=urlencode_postdata(login_form),
+            headers={
+                'Referer': self._ORIGIN_URL,
+                'Origin': self._ORIGIN_URL,
+            })
 
         if not is_logged(response):
             error = self._html_search_regex(
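
Both changes here swap hand-rolled requests for _download_webpage/_download_json keyword arguments: headers= replaces sanitized_Request plus add_header, data= marks the request as a POST, and query= replaces manual compat_urllib_parse_urlencode. A sketch of the query equivalence with placeholder IDs (1234/5678 are made up):

from __future__ import unicode_literals

try:
    from urllib.parse import urlencode  # Python 3
except ImportError:
    from urllib import urlencode  # Python 2; compat_urllib_parse_urlencode wraps this

query = {
    'fields[lecture]': 'title,description,view_html,asset',
    'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data',
}
base = ('https://www.udemy.com/api-2.0/users/me/subscribed-courses/%s/lectures/%s?'
        % (1234, 5678))

# What the old code appended by hand, and what query= now appends internally.
print(base + urlencode(query))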

youtube_dl/extractor/yandexmusic.py

@@ -10,8 +10,6 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     float_or_none,
-    sanitized_Request,
-    urlencode_postdata,
 )
@@ -177,7 +175,7 @@ class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
 class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
     IE_NAME = 'yandexmusic:playlist'
     IE_DESC = 'Яндекс.Музыка - Плейлист'
-    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/users/[^/]+/playlists/(?P<id>\d+)'
+    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
@@ -196,47 +194,64 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
             'id': '1036',
             'title': 'Музыка 90-х',
         },
-        'playlist_count': 310,
+        'playlist_mincount': 300,
         'skip': 'Travis CI servers blocked by YandexMusic',
     }]
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, playlist_id)
-
-        mu = self._parse_json(
-            self._search_regex(
-                r'var\s+Mu\s*=\s*({.+?});\s*</script>', webpage, 'player'),
-            playlist_id)
-
-        playlist = mu['pageData']['playlist']
-        tracks, track_ids = playlist['tracks'], playlist['trackIds']
+        mobj = re.match(self._VALID_URL, url)
+        tld = mobj.group('tld')
+        user = mobj.group('user')
+        playlist_id = mobj.group('id')
+
+        playlist = self._download_json(
+            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
+            playlist_id, 'Downloading missing tracks JSON',
+            fatal=False,
+            headers={
+                'Referer': url,
+                'X-Requested-With': 'XMLHttpRequest',
+                'X-Retpath-Y': url,
+            },
+            query={
+                'owner': user,
+                'kinds': playlist_id,
+                'light': 'true',
+                'lang': tld,
+                'external-domain': 'music.yandex.%s' % tld,
+                'overembed': 'false',
+            })['playlist']
+
+        tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
 
-        # tracks dictionary shipped with webpage is limited to 150 tracks,
+        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
         # missing tracks should be retrieved manually.
         if len(tracks) < len(track_ids):
-            present_track_ids = set([compat_str(track['id']) for track in tracks if track.get('id')])
-            missing_track_ids = set(map(compat_str, track_ids)) - set(present_track_ids)
-            request = sanitized_Request(
-                'https://music.yandex.ru/handlers/track-entries.jsx',
-                urlencode_postdata({
-                    'entries': ','.join(missing_track_ids),
-                    'lang': mu.get('settings', {}).get('lang', 'en'),
-                    'external-domain': 'music.yandex.ru',
-                    'overembed': 'false',
-                    'sign': mu.get('authData', {}).get('user', {}).get('sign'),
-                    'strict': 'true',
-                }))
-            request.add_header('Referer', url)
-            request.add_header('X-Requested-With', 'XMLHttpRequest')
-
+            present_track_ids = set([
+                compat_str(track['id'])
+                for track in tracks if track.get('id')])
+            missing_track_ids = [
+                track_id for track_id in track_ids
+                if track_id not in present_track_ids]
             missing_tracks = self._download_json(
-                request, playlist_id, 'Downloading missing tracks JSON', fatal=False)
+                'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
+                playlist_id, 'Downloading missing tracks JSON',
+                fatal=False,
+                headers={
+                    'Referer': url,
+                    'X-Requested-With': 'XMLHttpRequest',
+                },
+                query={
+                    'entries': ','.join(missing_track_ids),
+                    'lang': tld,
+                    'external-domain': 'music.yandex.%s' % tld,
+                    'overembed': 'false',
+                    'strict': 'true',
+                })
             if missing_tracks:
                 tracks.extend(missing_tracks)
 
         return self.playlist_result(
             self._build_playlist(tracks),
             compat_str(playlist_id),
-            playlist['title'], playlist.get('description'))
+            playlist.get('title'), playlist.get('description'))
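
Beyond moving to _download_json with headers=/query=, the missing-track computation changed from a set difference to a list comprehension, so the IDs sent to track-entries.jsx keep the original playlist order. (One caveat: map() returns a one-shot iterator on Python 3, so the len(track_ids) call above relies on Python 2 list semantics.) A minimal sketch with made-up IDs:

from __future__ import unicode_literals

track_ids = ['10', '20', '30', '40']   # full ordering from playlist['trackIds']
present_track_ids = {'20', '40'}       # the <=150 tracks shipped inline

# Old approach: set difference, playlist order lost.
old = set(track_ids) - present_track_ids

# New approach: order-preserving filter.
new = [t for t in track_ids if t not in present_track_ids]

print(sorted(old))  # ['10', '30'], but iteration order was arbitrary
print(new)          # ['10', '30'] in original playlist order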