commit faaac9b31e
Gilles Habran 2016-05-02 11:59:25 +02:00
20 changed files with 908 additions and 344 deletions

View File

@@ -6,8 +6,8 @@
 ---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.04.24*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.05.01*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.04.24**
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.05.01**
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.04.24
+[debug] youtube-dl version 2016.05.01
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}

View File

@@ -168,3 +168,4 @@ José Joaquín Atria
 Viťas Strádal
 Kagami Hiiragi
 Philip Huppert
+blahgeek

View File

@@ -338,7 +338,6 @@
 - **mailru**: Видео@Mail.Ru
 - **MakersChannel**
 - **MakerTV**
-- **Malemotion**
 - **MatchTV**
 - **MDR**: MDR.DE and KiKA
 - **media.ccc.de**
@@ -375,8 +374,8 @@
 - **mtvservices:embedded**
 - **MuenchenTV**: münchen.tv
 - **MusicPlayOn**
-- **muzu.tv**
 - **Mwave**
+- **MwaveMeetGreet**
 - **MySpace**
 - **MySpace:album**
 - **MySpass**
@@ -554,7 +553,6 @@
 - **SenateISVP**
 - **ServingSys**
 - **Sexu**
-- **SexyKarma**: Sexy Karma and Watch Indian Porn
 - **Shahid**
 - **Shared**: shared.sx and vivo.sx
 - **ShareSix**
@@ -567,8 +565,6 @@
 - **smotri:broadcast**: Smotri.com broadcasts
 - **smotri:community**: Smotri.com community videos
 - **smotri:user**: Smotri.com user videos
-- **SnagFilms**
-- **SnagFilmsEmbed**
 - **Snotr**
 - **Sohu**
 - **soundcloud**
@@ -610,6 +606,7 @@
 - **Syfy**
 - **SztvHu**
 - **Tagesschau**
+- **tagesschau:player**
 - **Tapely**
 - **Tass**
 - **TDSLifeway**
@@ -725,6 +722,8 @@
 - **Vidzi**
 - **vier**
 - **vier:videos**
+- **ViewLift**
+- **ViewLiftEmbed**
 - **Viewster**
 - **Viidea**
 - **viki**
@@ -756,6 +755,7 @@
 - **Walla**
 - **WashingtonPost**
 - **wat.tv**
+- **WatchIndianPorn**: Watch Indian Porn
 - **WDR**
 - **wdr:mobile**
 - **WDRMaus**: Sendung mit der Maus
@@ -775,6 +775,10 @@
 - **XFileShare**: XFileShare based sites: GorillaVid.in, daclips.in, movpod.in, fastvideo.in, realvid.net, filehoot.com and vidto.me
 - **XHamster**
 - **XHamsterEmbed**
+- **xiami:album**: 虾米音乐 - 专辑
+- **xiami:artist**: 虾米音乐 - 歌手
+- **xiami:collection**: 虾米音乐 - 精选集
+- **xiami:song**: 虾米音乐
 - **XMinus**
 - **XNXX**
 - **Xstream**

View File

@@ -1,13 +1,9 @@
 from __future__ import unicode_literals
-import re
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
-    parse_duration,
-    qualities,
-    unified_strdate,
+    parse_iso8601,
 )
@@ -19,14 +15,14 @@ class CCCIE(InfoExtractor):
         'url': 'https://media.ccc.de/v/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor#video',
         'md5': '3a1eda8f3a29515d27f5adb967d7e740',
         'info_dict': {
-            'id': '30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor',
+            'id': '1839',
             'ext': 'mp4',
             'title': 'Introduction to Processor Design',
-            'description': 'md5:80be298773966f66d56cb11260b879af',
+            'description': 'md5:df55f6d073d4ceae55aae6f2fd98a0ac',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'view_count': int,
             'upload_date': '20131228',
-            'duration': 3660,
+            'timestamp': 1388188800,
+            'duration': 3710,
         }
     }, {
         'url': 'https://media.ccc.de/v/32c3-7368-shopshifting#download',
@@ -34,79 +30,48 @@ class CCCIE(InfoExtractor):
     }]
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        if self._downloader.params.get('prefer_free_formats'):
-            preference = qualities(['mp3', 'opus', 'mp4-lq', 'webm-lq', 'h264-sd', 'mp4-sd', 'webm-sd', 'mp4', 'webm', 'mp4-hd', 'h264-hd', 'webm-hd'])
-        else:
-            preference = qualities(['opus', 'mp3', 'webm-lq', 'mp4-lq', 'webm-sd', 'h264-sd', 'mp4-sd', 'webm', 'mp4', 'webm-hd', 'mp4-hd', 'h264-hd'])
-        title = self._html_search_regex(
-            r'(?s)<h1>(.*?)</h1>', webpage, 'title')
-        description = self._html_search_regex(
-            r'(?s)<h3>About</h3>(.+?)<h3>',
-            webpage, 'description', fatal=False)
-        upload_date = unified_strdate(self._html_search_regex(
-            r"(?s)<span[^>]+class='[^']*fa-calendar-o'[^>]*>(.+?)</span>",
-            webpage, 'upload date', fatal=False))
-        view_count = int_or_none(self._html_search_regex(
-            r"(?s)<span class='[^']*fa-eye'></span>(.*?)</li>",
-            webpage, 'view count', fatal=False))
-        duration = parse_duration(self._html_search_regex(
-            r'(?s)<span[^>]+class=(["\']).*?fa-clock-o.*?\1[^>]*></span>(?P<duration>.+?)</li',
-            webpage, 'duration', fatal=False, group='duration'))
-        matches = re.finditer(r'''(?xs)
-            <(?:span|div)\s+class='label\s+filetype'>(?P<format>[^<]*)</(?:span|div)>\s*
-            <(?:span|div)\s+class='label\s+filetype'>(?P<lang>[^<]*)</(?:span|div)>\s*
-            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
-            (?:
-                .*?
-                <a\s+(?:download\s+)?href='(?P<torrent_url>[^']+\.torrent)'
-            )?''', webpage)
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        event_id = self._search_regex("data-id='(\d+)'", webpage, 'event id')
+        event_data = self._download_json('https://media.ccc.de/public/events/%s' % event_id, event_id)
         formats = []
-        for m in matches:
-            format = m.group('format')
-            format_id = self._search_regex(
-                r'.*/([a-z0-9_-]+)/[^/]*$',
-                m.group('http_url'), 'format id', default=None)
-            if format_id:
-                format_id = m.group('lang') + '-' + format_id
-            vcodec = 'h264' if 'h264' in format_id else (
-                'none' if format_id in ('mp3', 'opus') else None
+        for recording in event_data.get('recordings', []):
+            recording_url = recording.get('recording_url')
+            if not recording_url:
+                continue
+            language = recording.get('language')
+            folder = recording.get('folder')
+            format_id = None
+            if language:
+                format_id = language
+            if folder:
+                if language:
+                    format_id += '-' + folder
+                else:
+                    format_id = folder
+            vcodec = 'h264' if 'h264' in folder else (
+                'none' if folder in ('mp3', 'opus') else None
             )
             formats.append({
                 'format_id': format_id,
-                'format': format,
-                'language': m.group('lang'),
-                'url': m.group('http_url'),
+                'url': recording_url,
+                'width': int_or_none(recording.get('width')),
+                'height': int_or_none(recording.get('height')),
+                'filesize': int_or_none(recording.get('size'), invscale=1024 * 1024),
+                'language': language,
                 'vcodec': vcodec,
-                'preference': preference(format_id),
             })
-            if m.group('torrent_url'):
-                formats.append({
-                    'format_id': 'torrent-%s' % (format if format_id is None else format_id),
-                    'format': '%s (torrent)' % format,
-                    'proto': 'torrent',
-                    'format_note': '(unsupported; will just download the .torrent file)',
-                    'vcodec': vcodec,
-                    'preference': -100 + preference(format_id),
-                    'url': m.group('torrent_url'),
-                })
         self._sort_formats(formats)
-        thumbnail = self._html_search_regex(
-            r"<video.*?poster='([^']+)'", webpage, 'thumbnail', fatal=False)
         return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'view_count': view_count,
-            'upload_date': upload_date,
-            'duration': duration,
+            'id': event_id,
+            'display_id': display_id,
+            'title': event_data['title'],
+            'description': event_data.get('description'),
+            'thumbnail': event_data.get('thumb_url'),
+            'timestamp': parse_iso8601(event_data.get('date')),
+            'duration': int_or_none(event_data.get('length')),
+            'tags': event_data.get('tags'),
             'formats': formats,
         }
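Note on the rewritten CCC extractor: metadata and recordings now come from the public events API instead of scraped HTML. A minimal standalone sketch of that flow (standard library only; the data-id attribute, API URL and field names are taken from the diff above, everything else is illustrative):

    import json
    import re
    from urllib.request import urlopen

    def fetch_ccc_event(talk_url):
        # the talk page embeds the numeric event id as data-id='...'
        webpage = urlopen(talk_url).read().decode('utf-8')
        event_id = re.search(r"data-id='(\d+)'", webpage).group(1)
        # the public API exposes title, description, date, length and recordings
        event_data = json.load(urlopen('https://media.ccc.de/public/events/%s' % event_id))
        recordings = [{
            'url': rec.get('recording_url'),
            'language': rec.get('language'),
            'folder': rec.get('folder'),
            'width': rec.get('width'),
            'height': rec.get('height'),
        } for rec in event_data.get('recordings', []) if rec.get('recording_url')]
        return event_data['title'], recordings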

View File

@@ -1142,7 +1142,7 @@ class InfoExtractor(object):
                 # Bandwidth of live streams may differ over time thus making
                 # format_id unpredictable. So it's better to keep provided
                 # format_id intact.
-                if last_media_name and not live:
+                if not live:
                     format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                 f = {
                     'format_id': '-'.join(format_id),

View File

@@ -307,14 +307,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
             'video_uploader', fatal=False)
         available_fmts = []
-        for a, fmt in re.findall(r'(<a[^>]+token="showmedia\.([0-9]{3,4})p"[^>]+>.*?</a>)', webpage):
+        for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
             attrs = extract_attributes(a)
             href = attrs.get('href')
             if href and '/freetrial' in href:
                 continue
             available_fmts.append(fmt)
         if not available_fmts:
-            available_fmts = re.findall(r'token="showmedia\.([0-9]{3,4})p"', webpage)
+            for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+                available_fmts = re.findall(p, webpage)
+                if available_fmts:
+                    break
         video_encode_ids = []
         formats = []
         for fmt in available_fmts:
@@ -364,6 +367,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
                 'ext': 'flv',
             })
             formats.append(format_info)
+        self._sort_formats(formats)
         metadata = self._download_xml(
             'http://www.crunchyroll.com/xml', video_id,

View File

@@ -12,39 +12,46 @@ class DFBIE(InfoExtractor):
     _TEST = {
         'url': 'http://tv.dfb.de/video/u-19-em-stimmen-zum-spiel-gegen-russland/11633/',
-        # The md5 is different each time
+        'md5': 'ac0f98a52a330f700b4b3034ad240649',
         'info_dict': {
             'id': '11633',
             'display_id': 'u-19-em-stimmen-zum-spiel-gegen-russland',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'U 19-EM: Stimmen zum Spiel gegen Russland',
             'upload_date': '20150714',
         },
     }
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
-        display_id = mobj.group('display_id')
-        webpage = self._download_webpage(url, display_id)
+        display_id, video_id = re.match(self._VALID_URL, url).groups()
         player_info = self._download_xml(
             'http://tv.dfb.de/server/hd_video.php?play=%s' % video_id,
             display_id)
         video_info = player_info.find('video')
-        f4m_info = self._download_xml(
-            self._proto_relative_url(video_info.find('url').text.strip()), display_id)
-        token_el = f4m_info.find('token')
-        manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth'] + '&hdcore=3.2.0'
-        formats = self._extract_f4m_formats(manifest_url, display_id)
+        stream_access_url = self._proto_relative_url(video_info.find('url').text.strip())
+        formats = []
+        # see http://tv.dfb.de/player/js/ajax.js for the method to extract m3u8 formats
+        for sa_url in (stream_access_url, stream_access_url + '&area=&format=iphone'):
+            stream_access_info = self._download_xml(sa_url, display_id)
+            token_el = stream_access_info.find('token')
+            manifest_url = token_el.attrib['url'] + '?' + 'hdnea=' + token_el.attrib['auth']
+            if '.f4m' in manifest_url:
+                formats.extend(self._extract_f4m_formats(
+                    manifest_url + '&hdcore=3.2.0',
+                    display_id, f4m_id='hds', fatal=False))
+            else:
+                formats.extend(self._extract_m3u8_formats(
+                    manifest_url, display_id, 'mp4',
+                    'm3u8_native', m3u8_id='hls', fatal=False))
         self._sort_formats(formats)
         return {
             'id': video_id,
             'display_id': display_id,
             'title': video_info.find('title').text,
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': 'http://tv.dfb.de/images/%s_640x360.jpg' % video_id,
             'upload_date': unified_strdate(video_info.find('time_date').text),
             'formats': formats,
         }
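The DFB change queries the stream-access service twice (plain and with `&area=&format=iphone`) and picks the manifest type from the signed URL's extension. A small sketch of that dispatch; the token values and URLs below are hypothetical:

    def signed_manifest(token_url, token_auth):
        # the signed manifest is the token URL plus the hdnea auth parameter
        return token_url + '?hdnea=' + token_auth

    def classify_manifest(manifest_url):
        # .f4m means an HDS manifest (which additionally needs &hdcore=3.2.0);
        # anything else is handled as an HLS (m3u8) playlist
        if '.f4m' in manifest_url:
            return 'hds', manifest_url + '&hdcore=3.2.0'
        return 'hls', manifest_url

    print(classify_manifest(signed_manifest(
        'http://example.invalid/stream/manifest.f4m', 'exp=123~acl=*~hmac=abc')))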

View File

@@ -33,6 +33,7 @@ class DiscoveryIE(InfoExtractor):
             'duration': 156,
             'timestamp': 1302032462,
             'upload_date': '20110405',
+            'uploader_id': '103207',
         },
         'params': {
             'skip_download': True,  # requires ffmpeg
@@ -54,7 +55,11 @@ class DiscoveryIE(InfoExtractor):
             'upload_date': '20140725',
             'timestamp': 1406246400,
             'duration': 116,
+            'uploader_id': '103207',
         },
+        'params': {
+            'skip_download': True,  # requires ffmpeg
+        }
     }]
     def _real_extract(self, url):
@@ -66,13 +71,19 @@ class DiscoveryIE(InfoExtractor):
         entries = []
         for idx, video_info in enumerate(info['playlist']):
-            formats = self._extract_m3u8_formats(
-                video_info['src'], display_id, 'mp4', 'm3u8_native', m3u8_id='hls',
-                note='Download m3u8 information for video %d' % (idx + 1))
-            self._sort_formats(formats)
+            subtitles = {}
+            caption_url = video_info.get('captionsUrl')
+            if caption_url:
+                subtitles = {
+                    'en': [{
+                        'url': caption_url,
+                    }]
+                }
             entries.append({
+                '_type': 'url_transparent',
+                'url': 'http://players.brightcove.net/103207/default_default/index.html?videoId=ref:%s' % video_info['referenceId'],
                 'id': compat_str(video_info['id']),
-                'formats': formats,
                 'title': video_info['title'],
                 'description': video_info.get('description'),
                 'duration': parse_duration(video_info.get('video_length')),
@@ -80,6 +91,7 @@ class DiscoveryIE(InfoExtractor):
                 'thumbnail': video_info.get('thumbnailURL'),
                 'alt_title': video_info.get('secondary_title'),
                 'timestamp': parse_iso8601(video_info.get('publishedDate')),
+                'subtitles': subtitles,
             })
         return self.playlist_result(entries, display_id, video_title)
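Each Discovery playlist entry is now a url_transparent result: format extraction is delegated to the Brightcove player URL while Discovery's own metadata is kept. A reduced sketch of such an entry (the input dict is hypothetical):

    def make_entry(video_info):
        # url_transparent: formats come from the referenced Brightcove page,
        # the fields below override whatever Brightcove reports
        entry = {
            '_type': 'url_transparent',
            'url': ('http://players.brightcove.net/103207/default_default/'
                    'index.html?videoId=ref:%s' % video_info['referenceId']),
            'id': str(video_info['id']),
            'title': video_info['title'],
            'description': video_info.get('description'),
        }
        caption_url = video_info.get('captionsUrl')
        if caption_url:
            entry['subtitles'] = {'en': [{'url': caption_url}]}
        return entry

    print(make_entry({'id': 1, 'referenceId': 'abc123', 'title': 'Sample episode'}))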

View File

@@ -724,7 +724,10 @@ from .svt import (
 from .swrmediathek import SWRMediathekIE
 from .syfy import SyfyIE
 from .sztvhu import SztvHuIE
-from .tagesschau import TagesschauIE
+from .tagesschau import (
+    TagesschauPlayerIE,
+    TagesschauIE,
+)
 from .tapely import TapelyIE
 from .tass import TassIE
 from .tdslifeway import TDSLifewayIE
@@ -846,7 +849,10 @@ from .veehd import VeeHDIE
 from .veoh import VeohIE
 from .vessel import VesselIE
 from .vesti import VestiIE
-from .vevo import VevoIE
+from .vevo import (
+    VevoIE,
+    VevoPlaylistIE,
+)
 from .vgtv import (
     BTArticleIE,
     BTVestlendingenIE,
@@ -941,6 +947,12 @@ from .xhamster import (
     XHamsterIE,
     XHamsterEmbedIE,
 )
+from .xiami import (
+    XiamiSongIE,
+    XiamiAlbumIE,
+    XiamiArtistIE,
+    XiamiCollectionIE
+)
 from .xminus import XMinusIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE

View File

@@ -2,6 +2,10 @@
 from __future__ import unicode_literals
 from .common import InfoExtractor
+from ..compat import (
+    compat_HTTPError,
+    compat_urllib_parse_unquote_plus,
+)
 from ..utils import (
     clean_html,
     determine_ext,
@@ -27,6 +31,7 @@ class FunimationIE(InfoExtractor):
             'description': 'md5:1769f43cd5fc130ace8fd87232207892',
             'thumbnail': 're:https?://.*\.jpg',
         },
+        'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
     }, {
         'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
         'info_dict': {
@@ -37,6 +42,7 @@ class FunimationIE(InfoExtractor):
             'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
             'thumbnail': 're:https?://.*\.jpg',
         },
+        'skip': 'Access without user interaction is forbidden by CloudFlare',
     }, {
         'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
         'info_dict': {
@@ -47,8 +53,36 @@ class FunimationIE(InfoExtractor):
             'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
             'thumbnail': 're:https?://.*\.(?:jpg|png)',
         },
+        'skip': 'Access without user interaction is forbidden by CloudFlare',
     }]
+    _LOGIN_URL = 'http://www.funimation.com/login'
+    def _download_webpage(self, *args, **kwargs):
+        try:
+            return super(FunimationIE, self)._download_webpage(*args, **kwargs)
+        except ExtractorError as ee:
+            if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
+                response = ee.cause.read()
+                if b'>Please complete the security check to access<' in response:
+                    raise ExtractorError(
+                        'Access to funimation.com is blocked by CloudFlare. '
+                        'Please browse to http://www.funimation.com/, solve '
+                        'the reCAPTCHA, export browser cookies to a text file,'
+                        ' and then try again with --cookies YOUR_COOKIE_FILE.',
+                        expected=True)
+            raise
+    def _extract_cloudflare_session_ua(self, url):
+        ci_session_cookie = self._get_cookies(url).get('ci_session')
+        if ci_session_cookie:
+            ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
+            # ci_session is a string serialized by PHP function serialize()
+            # This case is simple enough to use regular expressions only
+            return self._search_regex(
+                r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
+                default=None)
     def _login(self):
         (username, password) = self._get_login_info()
         if username is None:
@@ -57,8 +91,11 @@ class FunimationIE(InfoExtractor):
             'email_field': username,
             'password_field': password,
         })
-        login_request = sanitized_Request('http://www.funimation.com/login', data, headers={
-            'User-Agent': 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0',
+        user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
+        if not user_agent:
+            user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
+        login_request = sanitized_Request(self._LOGIN_URL, data, headers={
+            'User-Agent': user_agent,
             'Content-Type': 'application/x-www-form-urlencoded'
         })
         login_page = self._download_webpage(
@@ -103,11 +140,16 @@ class FunimationIE(InfoExtractor):
             ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'),
         )
+        user_agent = self._extract_cloudflare_session_ua(url)
+        if user_agent:
+            USER_AGENTS = ((None, user_agent),)
         for kind, user_agent in USER_AGENTS:
             request = sanitized_Request(url)
             request.add_header('User-Agent', user_agent)
             webpage = self._download_webpage(
-                request, display_id, 'Downloading %s webpage' % kind)
+                request, display_id,
+                'Downloading %s webpage' % kind if kind else 'Downloading webpage')
             playlist = self._parse_json(
                 self._search_regex(
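The CloudFlare workaround above reuses the User-Agent stored in the ci_session cookie, a URL-quoted, PHP-serialized string, and pulls it out with one regular expression instead of a full unserializer. A self-contained sketch against a hypothetical cookie value:

    import re
    from urllib.parse import unquote_plus

    def user_agent_from_ci_session(cookie_value):
        # ci_session comes from PHP serialize(); the entry looks like
        #   s:10:"user_agent";s:NN:"...";  so a regex is enough here
        ci_session = unquote_plus(cookie_value)
        m = re.search(r'"user_agent";s:\d+:"([^"]+)"', ci_session)
        return m.group(1) if m else None

    # hypothetical cookie payload, for illustration only
    sample = ('a:2:{s:10:"user_agent";s:22:"Mozilla/5.0+(X11)+Test";'
              's:9:"last_seen";i:1462000000;}')
    print(user_agent_from_ci_session(sample))  # -> Mozilla/5.0 (X11) Test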

View File

@@ -196,7 +196,7 @@ class PBSIE(InfoExtractor):
     _TESTS = [
         {
             'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/',
-            'md5': 'ce1888486f0908d555a8093cac9a7362',
+            'md5': '173dc391afd361fa72eab5d3d918968d',
             'info_dict': {
                 'id': '2365006249',
                 'ext': 'mp4',
@@ -204,13 +204,10 @@ class PBSIE(InfoExtractor):
                 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071',
                 'duration': 3190,
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/',
-            'md5': '143c98aa54a346738a3d78f54c925321',
+            'md5': '6f722cb3c3982186d34b0f13374499c7',
             'info_dict': {
                 'id': '2365297690',
                 'ext': 'mp4',
@@ -218,9 +215,6 @@ class PBSIE(InfoExtractor):
                 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9',
                 'duration': 5050,
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            }
         },
         {
             'url': 'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/',
@@ -244,9 +238,6 @@ class PBSIE(InfoExtractor):
                 'duration': 6559,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html',
@@ -262,9 +253,6 @@ class PBSIE(InfoExtractor):
                 'upload_date': '20140122',
                 'age_limit': 10,
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/',
@@ -290,6 +278,7 @@ class PBSIE(InfoExtractor):
         },
         {
             'url': 'http://www.pbs.org/video/2365245528/',
+            'md5': '115223d41bd55cda8ae5cd5ed4e11497',
             'info_dict': {
                 'id': '2365245528',
                 'display_id': '2365245528',
@@ -299,15 +288,13 @@ class PBSIE(InfoExtractor):
                 'duration': 6851,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             # Video embedded in iframe containing angle brackets as attribute's value (e.g.
             # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
             # https://github.com/rg3/youtube-dl/issues/7059)
             'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
+            'md5': '84ced42850d78f1d4650297356e95e6f',
             'info_dict': {
                 'id': '2365546844',
                 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
@@ -317,9 +304,6 @@ class PBSIE(InfoExtractor):
                 'duration': 1480,
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             # Frontline video embedded via flp2012.js
@@ -340,6 +324,7 @@ class PBSIE(InfoExtractor):
         {
             # Serves hd only via wigget/partnerplayer page
             'url': 'http://www.pbs.org/video/2365641075/',
+            'md5': 'acfd4c400b48149a44861cb16dd305cf',
             'info_dict': {
                 'id': '2365641075',
                 'ext': 'mp4',
@@ -348,9 +333,6 @@ class PBSIE(InfoExtractor):
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'formats': 'mincount:8',
             },
-            'params': {
-                'skip_download': True,  # requires ffmpeg
-            },
         },
         {
             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
@@ -494,6 +476,7 @@ class PBSIE(InfoExtractor):
             info = video_info
         formats = []
+        http_url = None
         for num, redirect in enumerate(redirects):
             redirect_id = redirect.get('eeid')
@@ -514,13 +497,32 @@ class PBSIE(InfoExtractor):
             if determine_ext(format_url) == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
-                    format_url, display_id, 'mp4', preference=1, m3u8_id='hls'))
+                    format_url, display_id, 'mp4', m3u8_id='hls', fatal=False))
             else:
                 formats.append({
                     'url': format_url,
                     'format_id': redirect_id,
                 })
+                if re.search(r'^https?://.*(?:\d+k|baseline)', format_url):
+                    http_url = format_url
         self._remove_duplicate_formats(formats)
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+                # extract only the formats that we know that they will be available as http format.
+                # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
+                if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': re.sub(r'\d+k|baseline', bitrate, http_url),
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                formats.append(f)
         self._sort_formats(formats)
         rating_str = info.get('rating')
@@ -535,6 +537,19 @@ class PBSIE(InfoExtractor):
                 'ext': 'ttml',
                 'url': closed_captions_url,
             }]
+            mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url)
+            if mobj:
+                ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1)
+                ttml_caption_id = int(ttml_caption_id)
+                subtitles['en'].extend([{
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
+                    'ext': 'srt',
+                }, {
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
+                    'ext': 'vtt',
+                }])
         # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
         # Try turning it to 'program - title' naming scheme if possible
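Two of the PBS additions above follow fixed URL conventions: progressive HTTP copies of the HLS renditions exist only at the bitrates listed in the diff's comment (400k/800k/1200k/2500k) and are reached by splicing the bitrate token into a known HTTP URL, and the TTML caption URL maps to SRT/VTT siblings whose numeric id is offset by 1 and 2. A small sketch of both substitutions on hypothetical URLs:

    import re

    KNOWN_HTTP_BITRATES = ('400k', '800k', '1200k', '2500k')

    def http_variant(http_url, m3u8_url):
        # take the bitrate token from the HLS rendition URL and, if PBS serves
        # that bitrate progressively, splice it into the known HTTP URL
        m = re.search(r'(\d+k)', m3u8_url)
        if not m or m.group(1) not in KNOWN_HTTP_BITRATES:
            return None
        return re.sub(r'\d+k|baseline', m.group(1), http_url)

    def caption_variants(ttml_url):
        # .../1234_Encoded.dfxp -> .../1235_Encoded.srt and .../1236_Encoded.vtt
        m = re.search(r'/(\d+)_Encoded\.dfxp', ttml_url)
        if not m:
            return []
        suffix, cid = m.group(0), int(m.group(1))
        return [
            ttml_url.replace(suffix, '/%d_Encoded.srt' % (cid + 1)),
            ttml_url.replace(suffix, '/%d_Encoded.vtt' % (cid + 2)),
        ]

    # hypothetical URLs, for illustration only
    print(http_variant('http://example.invalid/clip-800k.mp4',
                       'http://example.invalid/hls/clip-1200k.m3u8'))
    print(caption_variants('http://example.invalid/captions/1234_Encoded.dfxp'))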

View File

@@ -20,18 +20,19 @@ class RtlNlIE(InfoExtractor):
         (?P<id>[0-9a-f-]+)'''
     _TESTS = [{
-        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/6e4203a6-0a5e-3596-8424-c599a59e0677',
-        'md5': 'cc16baa36a6c169391f0764fa6b16654',
+        'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
+        'md5': '473d1946c1fdd050b2c0161a4b13c373',
         'info_dict': {
-            'id': '6e4203a6-0a5e-3596-8424-c599a59e0677',
+            'id': '82b1aad1-4a14-3d7b-b554-b0aed1b2c416',
             'ext': 'mp4',
-            'title': 'RTL Nieuws - Laat',
-            'description': 'md5:6b61f66510c8889923b11f2778c72dc5',
-            'timestamp': 1408051800,
-            'upload_date': '20140814',
-            'duration': 576.880,
+            'title': 'RTL Nieuws',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'timestamp': 1461951000,
+            'upload_date': '20160429',
+            'duration': 1167.96,
         },
     }, {
+        # best format available a3t
         'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed/autoplay=false',
         'md5': 'dea7474214af1271d91ef332fb8be7ea',
         'info_dict': {
@@ -39,18 +40,19 @@ class RtlNlIE(InfoExtractor):
             'ext': 'mp4',
             'timestamp': 1424039400,
             'title': 'RTL Nieuws - Nieuwe beelden Kopenhagen: chaos direct na aanslag',
-            'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
+            'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=84ae5571-ac25-4225-ae0c-ef8d9efb2aed$',
             'upload_date': '20150215',
             'description': 'Er zijn nieuwe beelden vrijgegeven die vlak na de aanslag in Kopenhagen zijn gemaakt. Op de video is goed te zien hoe omstanders zich bekommeren om één van de slachtoffers, terwijl de eerste agenten ter plaatse komen.',
         }
     }, {
         # empty synopsis and missing episodes (see https://github.com/rg3/youtube-dl/issues/6275)
+        # best format available nettv
         'url': 'http://www.rtl.nl/system/videoplayer/derden/rtlnieuws/video_embed.html#uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a/autoplay=false',
         'info_dict': {
             'id': 'f536aac0-1dc3-4314-920e-3bd1c5b3811a',
             'ext': 'mp4',
             'title': 'RTL Nieuws - Meer beelden van overval juwelier',
-            'thumbnail': 're:^https?://screenshots\.rtl\.nl/system/thumb/sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
+            'thumbnail': 're:^https?://screenshots\.rtl\.nl/(?:[^/]+/)*sz=[0-9]+x[0-9]+/uuid=f536aac0-1dc3-4314-920e-3bd1c5b3811a$',
             'timestamp': 1437233400,
             'upload_date': '20150718',
             'duration': 30.474,
@@ -94,22 +96,46 @@ class RtlNlIE(InfoExtractor):
         videopath = material['videopath']
         m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath
-        formats = self._extract_m3u8_formats(m3u8_url, uuid, ext='mp4')
+        formats = self._extract_m3u8_formats(
+            m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
         video_urlpart = videopath.split('/adaptive/')[1][:-5]
         PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4'
-        formats.extend([
-            {
-                'url': PG_URL_TEMPLATE % ('a2m', video_urlpart),
-                'format_id': 'pg-sd',
-            },
-            {
-                'url': PG_URL_TEMPLATE % ('a3m', video_urlpart),
-                'format_id': 'pg-hd',
-                'quality': 0,
-            }
-        ])
+        PG_FORMATS = (
+            ('a2t', 512, 288),
+            ('a3t', 704, 400),
+            ('nettv', 1280, 720),
+        )
+        def pg_format(format_id, width, height):
+            return {
+                'url': PG_URL_TEMPLATE % (format_id, video_urlpart),
+                'format_id': 'pg-%s' % format_id,
+                'protocol': 'http',
+                'width': width,
+                'height': height,
+            }
+        if not formats:
+            formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS]
+        else:
+            pg_formats = []
+            for format_id, width, height in PG_FORMATS:
+                try:
+                    # Find hls format with the same width and height corresponding
+                    # to progressive format and copy metadata from it.
+                    f = next(f for f in formats if f.get('height') == height)
+                    # hls formats may have invalid width
+                    f['width'] = width
+                    f_copy = f.copy()
+                    f_copy.update(pg_format(format_id, width, height))
+                    pg_formats.append(f_copy)
+                except StopIteration:
+                    # Missing hls format does mean that no progressive format with
+                    # such width and height exists either.
+                    pass
+            formats.extend(pg_formats)
         self._sort_formats(formats)
         thumbnails = []
View File

@@ -4,42 +4,178 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
-from ..utils import parse_filesize
+from ..utils import (
+    determine_ext,
+    js_to_json,
+    parse_iso8601,
+    parse_filesize,
+)
+class TagesschauPlayerIE(InfoExtractor):
+    IE_NAME = 'tagesschau:player'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
+    _TESTS = [{
+        'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
+        'md5': '8d09548d5c15debad38bee3a4d15ca21',
+        'info_dict': {
+            'id': '179517',
+            'ext': 'mp4',
+            'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
+            'thumbnail': 're:^https?:.*\.jpg$',
+            'formats': 'mincount:6',
+        },
+    }, {
+        'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
+        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+        'info_dict': {
+            'id': '29417',
+            'ext': 'mp3',
+            'title': 'Trabi - Bye, bye Rennpappe',
+            'thumbnail': 're:^https?:.*\.jpg$',
+            'formats': 'mincount:2',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
+        'only_matching': True,
+    }]
+    _FORMATS = {
+        'xs': {'quality': 0},
+        's': {'width': 320, 'height': 180, 'quality': 1},
+        'm': {'width': 512, 'height': 288, 'quality': 2},
+        'l': {'width': 960, 'height': 540, 'quality': 3},
+        'xl': {'width': 1280, 'height': 720, 'quality': 4},
+        'xxl': {'quality': 5},
+    }
+    def _extract_via_api(self, kind, video_id):
+        info = self._download_json(
+            'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
+            video_id)
+        title = info['headline']
+        formats = []
+        for media in info['mediadata']:
+            for format_id, format_url in media.items():
+                if determine_ext(format_url) == 'm3u8':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls'))
+                else:
+                    formats.append({
+                        'url': format_url,
+                        'format_id': format_id,
+                        'vcodec': 'none' if kind == 'audio' else None,
+                    })
+        self._sort_formats(formats)
+        timestamp = parse_iso8601(info.get('date'))
+        return {
+            'id': video_id,
+            'title': title,
+            'timestamp': timestamp,
+            'formats': formats,
+        }
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        # kind = mobj.group('kind').lower()
+        # if kind == 'video':
+        #     return self._extract_via_api(kind, video_id)
+        # JSON api does not provide some audio formats (e.g. ogg) thus
+        # extracting audio via webpage
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage).strip()
+        formats = []
+        for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
+            media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
+            if not media:
+                continue
+            src = media.get('src')
+            if not src:
+                return
+            quality = media.get('quality')
+            kind = media.get('type', '').split('/')[0]
+            ext = determine_ext(src)
+            f = {
+                'url': src,
+                'format_id': '%s_%s' % (quality, ext) if quality else ext,
+                'ext': ext,
+                'vcodec': 'none' if kind == 'audio' else None,
+            }
+            f.update(self._FORMATS.get(quality, {}))
+            formats.append(f)
+        self._sort_formats(formats)
+        thumbnail = self._og_search_thumbnail(webpage)
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'formats': formats,
+        }
 class TagesschauIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:[^/]+/)*?[^/#?]+?(?P<id>-?[0-9]+)(?:~_[^/#?]+?)?\.html'
+    _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
     _TESTS = [{
         'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
-        'md5': '917a228bc7df7850783bc47979673a09',
+        'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
         'info_dict': {
-            'id': '102143',
+            'id': 'video-102143',
             'ext': 'mp4',
             'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
-            'description': 'md5:171feccd9d9b3dd54d05d501568f6359',
+            'description': '18.07.2015 20:10 Uhr',
             'thumbnail': 're:^https?:.*\.jpg$',
         },
     }, {
         'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
         'md5': '3c54c1f6243d279b706bde660ceec633',
         'info_dict': {
-            'id': '5727',
+            'id': 'ts-5727',
             'ext': 'mp4',
-            'description': 'md5:695c01bfd98b7e313c501386327aea59',
             'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
+            'description': 'md5:695c01bfd98b7e313c501386327aea59',
             'thumbnail': 're:^https?:.*\.jpg$',
         },
     }, {
-        'url': 'http://www.tagesschau.de/multimedia/politikimradio/audio-18407.html',
-        'md5': 'aef45de271c4bf0a5db834aa40bf774c',
+        # exclusive audio
+        'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
+        'md5': '76e6eec6ebd40740671cf0a2c88617e5',
         'info_dict': {
-            'id': '18407',
+            'id': 'audio-29417',
             'ext': 'mp3',
-            'title': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
-            'description': 'Flüchtlingsdebatte: Hitzig, aber wenig hilfreich',
+            'title': 'Trabi - Bye, bye Rennpappe',
+            'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
             'thumbnail': 're:^https?:.*\.jpg$',
         },
+    }, {
+        # audio in article
+        'url': 'http://www.tagesschau.de/inland/bnd-303.html',
+        'md5': 'e0916c623e85fc1d2b26b78f299d3958',
+        'info_dict': {
+            'id': 'bnd-303',
+            'ext': 'mp3',
+            'title': 'Viele Baustellen für neuen BND-Chef',
+            'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
+            'thumbnail': 're:^https?:.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
+        'info_dict': {
+            'id': 'afd-parteitag-135',
+            'title': 'Möchtegern-Underdog mit Machtanspruch',
+        },
+        'playlist_count': 2,
     }, {
         'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
         'only_matching': True,
@@ -61,88 +197,108 @@ class TagesschauIE(InfoExtractor):
     }, {
         'url': 'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html',
         'only_matching': True,
+    }, {
+        'url': 'http://www.tagesschau.de/100sekunden/index.html',
+        'only_matching': True,
+    }, {
+        # playlist article with collapsing sections
+        'url': 'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html',
+        'only_matching': True,
     }]
-    _FORMATS = {
-        's': {'width': 256, 'height': 144, 'quality': 1},
-        'm': {'width': 512, 'height': 288, 'quality': 2},
-        'l': {'width': 960, 'height': 544, 'quality': 3},
-    }
+    @classmethod
+    def suitable(cls, url):
+        return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
+    def _extract_formats(self, download_text, media_kind):
+        links = re.finditer(
+            r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
+            download_text)
+        formats = []
+        for l in links:
+            link_url = l.group('url')
+            if not link_url:
+                continue
+            format_id = self._search_regex(
+                r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
+                default=determine_ext(link_url))
+            format = {
+                'format_id': format_id,
+                'url': l.group('url'),
+                'format_name': l.group('name'),
+            }
+            title = l.group('title')
+            if title:
+                if media_kind.lower() == 'video':
+                    m = re.match(
+                        r'''(?x)
+                            Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
+                            (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
+                            (?P<vbr>[0-9]+)kbps&\#10;
+                            Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
+                            Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
+                        title)
+                    if m:
+                        format.update({
+                            'format_note': m.group('audio_desc'),
+                            'vcodec': m.group('vcodec'),
+                            'width': int(m.group('width')),
+                            'height': int(m.group('height')),
+                            'abr': int(m.group('abr')),
+                            'vbr': int(m.group('vbr')),
+                            'filesize_approx': parse_filesize(m.group('filesize_approx')),
+                        })
+                else:
+                    m = re.match(
+                        r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
+                        title)
+                    if m:
+                        format.update({
+                            'format_note': '%s, %s' % (m.group('format'), m.group('note')),
+                            'vcodec': 'none',
+                            'abr': int(m.group('abr')),
+                        })
+            formats.append(format)
+        self._sort_formats(formats)
+        return formats
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id') or mobj.group('path')
         display_id = video_id.lstrip('-')
         webpage = self._download_webpage(url, display_id)
-        player_url = self._html_search_meta(
-            'twitter:player', webpage, 'player URL', default=None)
-        if player_url:
-            playerpage = self._download_webpage(
-                player_url, display_id, 'Downloading player page')
-            formats = []
-            for media in re.finditer(
-                    r'''(?x)
-                        (?P<q_url>["\'])(?P<url>http://media.+?)(?P=q_url)
-                        ,\s*type:(?P<q_type>["\'])(?P<type>video|audio)/(?P<ext>.+?)(?P=q_type)
-                        (?:,\s*quality:(?P<q_quality>["\'])(?P<quality>.+?)(?P=q_quality))?
-                    ''', playerpage):
-                url = media.group('url')
-                type_ = media.group('type')
-                ext = media.group('ext')
-                res = media.group('quality')
-                f = {
-                    'format_id': '%s_%s' % (res, ext) if res else ext,
-                    'url': url,
-                    'ext': ext,
-                    'vcodec': 'none' if type_ == 'audio' else None,
-                }
-                f.update(self._FORMATS.get(res, {}))
-                formats.append(f)
-            thumbnail = self._og_search_thumbnail(playerpage)
-            title = self._og_search_title(webpage).strip()
-            description = self._og_search_description(webpage).strip()
-        else:
-            download_text = self._search_regex(
-                r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
-                webpage, 'download links')
-            links = re.finditer(
-                r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
-                download_text)
-            formats = []
-            for l in links:
-                format_id = self._search_regex(
-                    r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
-                format = {
-                    'format_id': format_id,
-                    'url': l.group('url'),
-                    'format_name': l.group('name'),
-                }
-                m = re.match(
-                    r'''(?x)
-                        Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
-                        (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
-                        (?P<vbr>[0-9]+)kbps&\#10;
-                        Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
-                        Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
-                    l.group('title'))
-                if m:
-                    format.update({
-                        'format_note': m.group('audio_desc'),
-                        'vcodec': m.group('vcodec'),
-                        'width': int(m.group('width')),
-                        'height': int(m.group('height')),
-                        'abr': int(m.group('abr')),
-                        'vbr': int(m.group('vbr')),
-                        'filesize_approx': parse_filesize(m.group('filesize_approx')),
-                    })
-                formats.append(format)
-            thumbnail = self._og_search_thumbnail(webpage)
-            description = self._html_search_regex(
-                r'(?s)<p class="teasertext">(.*?)</p>',
-                webpage, 'description', default=None)
-            title = self._html_search_regex(
-                r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
+        title = self._html_search_regex(
+            r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
+            webpage, 'title', default=None) or self._og_search_title(webpage)
+        DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
+        webpage_type = self._og_search_property('type', webpage, default=None)
+        if webpage_type == 'website':  # Article
+            entries = []
+            for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
+                    r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
+                    webpage), 1):
+                entries.append({
+                    'id': '%s-%d' % (display_id, num),
+                    'title': '%s' % entry_title,
+                    'formats': self._extract_formats(download_text, media_kind),
+                })
+            if len(entries) > 1:
+                return self.playlist_result(entries, display_id, title)
+            formats = entries[0]['formats']
+        else:  # Assume single video
+            download_text = self._search_regex(
+                DOWNLOAD_REGEX, webpage, 'download links', group='links')
+            media_kind = self._search_regex(
+                DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
+            formats = self._extract_formats(download_text, media_kind)
+        thumbnail = self._og_search_thumbnail(webpage)
+        description = self._html_search_regex(
+            r'(?s)<p class="teasertext">(.*?)</p>',
+            webpage, 'description', default=None)
         self._sort_formats(formats)
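The refactored _extract_formats still mines the download buttons' title attribute for codec, resolution, bitrates and approximate file size; the attribute is an HTML-escaped multi-line string. A sketch of that parse, reusing the regex from the diff on a hypothetical title value:

    import re

    VIDEO_TITLE_RE = r'''(?x)
        Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
        (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
        (?P<vbr>[0-9]+)kbps&\#10;
        Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
        Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)'''

    # hypothetical title attribute as it would appear in the page source
    title = ('Video: H.264&#10;960x544px&#10;1847kbps&#10;'
             'Audio: 189kbps, AAC&#10;Gr&ouml;&szlig;e: 189,62 MB')

    m = re.match(VIDEO_TITLE_RE, title)
    if m:
        print(m.group('vcodec'), m.group('width'), m.group('height'),
              m.group('vbr'), m.group('abr'), m.group('filesize_approx'))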

View File

@@ -27,7 +27,7 @@ class TEDIE(InfoExtractor):
     '''
     _TESTS = [{
         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
-        'md5': 'fc94ac279feebbce69f21c0c6ee82810',
+        'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
         'info_dict': {
             'id': '102',
             'ext': 'mp4',
@@ -37,21 +37,26 @@ class TEDIE(InfoExtractor):
                             'consciousness, but that half the time our brains are '
                             'actively fooling us.'),
             'uploader': 'Dan Dennett',
-            'width': 854,
+            'width': 853,
             'duration': 1308,
         }
     }, {
         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
-        'md5': '226f4fb9c62380d11b7995efa4c87994',
+        'md5': 'b899ac15e345fb39534d913f7606082b',
         'info_dict': {
-            'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
+            'id': 'tSVI8ta_P4w',
             'ext': 'mp4',
             'title': 'Vishal Sikka: The beauty and power of algorithms',
             'thumbnail': 're:^https?://.+\.jpg',
-            'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
-        }
+            'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
+            'upload_date': '20140122',
+            'uploader_id': 'TEDInstitute',
+            'uploader': 'TED Institute',
+        },
+        'add_ie': ['Youtube'],
     }, {
         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
+        'md5': '71b3ab2f4233012dce09d515c9c39ce2',
         'info_dict': {
             'id': '1972',
             'ext': 'mp4',
@@ -102,9 +107,9 @@ class TEDIE(InfoExtractor):
     }]
     _NATIVE_FORMATS = {
-        'low': {'preference': 1, 'width': 320, 'height': 180},
-        'medium': {'preference': 2, 'width': 512, 'height': 288},
-        'high': {'preference': 3, 'width': 854, 'height': 480},
+        'low': {'width': 320, 'height': 180},
+        'medium': {'width': 512, 'height': 288},
+        'high': {'width': 854, 'height': 480},
     }
     def _extract_info(self, webpage):
@@ -171,15 +176,21 @@ class TEDIE(InfoExtractor):
             if finfo:
                 f.update(finfo)
+        http_url = None
         for format_id, resources in talk_info['resources'].items():
             if format_id == 'h264':
                 for resource in resources:
+                    h264_url = resource.get('file')
+                    if not h264_url:
+                        continue
                     bitrate = int_or_none(resource.get('bitrate'))
                     formats.append({
-                        'url': resource['file'],
+                        'url': h264_url,
                        'format_id': '%s-%sk' % (format_id, bitrate),
                        'tbr': bitrate,
                     })
+                    if re.search('\d+k', h264_url):
+                        http_url = h264_url
             elif format_id == 'rtmp':
                 streamer = talk_info.get('streamer')
                 if not streamer:
@@ -195,16 +206,24 @@ class TEDIE(InfoExtractor):
                         'tbr': int_or_none(resource.get('bitrate')),
                     })
             elif format_id == 'hls':
-                hls_formats = self._extract_m3u8_formats(
-                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id)
-                for f in hls_formats:
-                    if f.get('format_id') == 'hls-meta':
-                        continue
-                    if not f.get('height'):
-                        f['vcodec'] = 'none'
-                    else:
-                        f['acodec'] = 'none'
-                formats.extend(hls_formats)
+                formats.extend(self._extract_m3u8_formats(
+                    resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
+        m3u8_formats = list(filter(
+            lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+            formats))
+        if http_url:
+            for m3u8_format in m3u8_formats:
+                bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
+                if not bitrate:
+                    continue
+                f = m3u8_format.copy()
+                f.update({
+                    'url': re.sub(r'\d+k', bitrate, http_url),
+                    'format_id': m3u8_format['format_id'].replace('hls', 'http'),
+                    'protocol': 'http',
+                })
+                formats.append(f)
         audio_download = talk_info.get('audioDownload')
         if audio_download:
@@ -212,7 +231,6 @@ class TEDIE(InfoExtractor):
                 'url': audio_download,
                 'format_id': 'audio',
                 'vcodec': 'none',
-                'preference': -0.5,
             })
         self._sort_formats(formats)
@@ -254,7 +272,11 @@ class TEDIE(InfoExtractor):
         config_json = self._html_search_regex(
             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
-            webpage, 'config')
+            webpage, 'config', default=None)
+        if not config_json:
+            embed_url = self._search_regex(
+                r"<iframe[^>]+class='pages-video-embed__video__object'[^>]+src='([^']+)'", webpage, 'embed url')
+            return self.url_result(self._proto_relative_url(embed_url))
         config = json.loads(config_json)['config']
         video_url = config['video']['url']
         thumbnail = config.get('image', {}).get('url')

View File

@ -3,7 +3,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_etree_fromstring from ..compat import (
compat_etree_fromstring,
compat_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
int_or_none, int_or_none,
@ -12,13 +15,22 @@ from ..utils import (
) )
class VevoIE(InfoExtractor): class VevoBaseIE(InfoExtractor):
def _extract_json(self, webpage, video_id, item):
return self._parse_json(
self._search_regex(
r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>',
webpage, 'initial store'),
video_id)['default'][item]
class VevoIE(VevoBaseIE):
''' '''
Accepts urls from vevo.com or in the format 'vevo:{id}' Accepts urls from vevo.com or in the format 'vevo:{id}'
(currently used by MTVIE and MySpaceIE) (currently used by MTVIE and MySpaceIE)
''' '''
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
(?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?|
https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://cache\.vevo\.com/m/html/embed\.html\?video=|
https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=|
vevo:) vevo:)
@ -30,11 +42,15 @@ class VevoIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'GB1101300280', 'id': 'GB1101300280',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Somebody to Die For', 'title': 'Hurts - Somebody to Die For',
'timestamp': 1372057200,
'upload_date': '20130624', 'upload_date': '20130624',
'uploader': 'Hurts', 'uploader': 'Hurts',
'timestamp': 1372057200, 'track': 'Somebody to Die For',
'artist': 'Hurts',
'genre': 'Pop',
}, },
'expected_warnings': ['Unable to download SMIL file'],
}, { }, {
'note': 'v3 SMIL format', 'note': 'v3 SMIL format',
'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923',
@ -42,23 +58,31 @@ class VevoIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'USUV71302923', 'id': 'USUV71302923',
'ext': 'mp4', 'ext': 'mp4',
'title': 'I Wish I Could Break Your Heart', 'title': 'Cassadee Pope - I Wish I Could Break Your Heart',
'timestamp': 1392796919,
'upload_date': '20140219', 'upload_date': '20140219',
'uploader': 'Cassadee Pope', 'uploader': 'Cassadee Pope',
'timestamp': 1392796919, 'track': 'I Wish I Could Break Your Heart',
'artist': 'Cassadee Pope',
'genre': 'Country',
}, },
'expected_warnings': ['Unable to download SMIL file'],
}, { }, {
'note': 'Age-limited video', 'note': 'Age-limited video',
'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282',
'info_dict': { 'info_dict': {
'id': 'USRV81300282', 'id': 'USRV81300282',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Tunnel Vision (Explicit)', 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
'upload_date': '20130703',
'age_limit': 18, 'age_limit': 18,
'uploader': 'Justin Timberlake',
'timestamp': 1372888800, 'timestamp': 1372888800,
'upload_date': '20130703',
'uploader': 'Justin Timberlake',
'track': 'Tunnel Vision (Explicit)',
'artist': 'Justin Timberlake',
'genre': 'Pop',
}, },
'expected_warnings': ['Unable to download SMIL file'],
}, { }, {
'note': 'No video_info', 'note': 'No video_info',
'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000',
@@ -66,12 +90,32 @@ class VevoIE(InfoExtractor):
'info_dict': { 'info_dict': {
'id': 'USUV71503000', 'id': 'USUV71503000',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Till I Die', 'title': 'K Camp - Till I Die',
'upload_date': '20151207',
'age_limit': 18, 'age_limit': 18,
'uploader': 'K Camp',
'timestamp': 1449468000, 'timestamp': 1449468000,
'upload_date': '20151207',
'uploader': 'K Camp',
'track': 'Till I Die',
'artist': 'K Camp',
'genre': 'Rap/Hip-Hop',
}, },
}, {
'note': 'Only available via webpage',
'url': 'http://www.vevo.com/watch/GBUV71600656',
'md5': '67e79210613865b66a47c33baa5e37fe',
'info_dict': {
'id': 'GBUV71600656',
'ext': 'mp4',
'title': 'ABC - Viva Love',
'age_limit': 0,
'timestamp': 1461830400,
'upload_date': '20160428',
'uploader': 'ABC',
'track': 'Viva Love',
'artist': 'ABC',
'genre': 'Pop',
},
'expected_warnings': ['Failed to download video versions info'],
}] }]
_SMIL_BASE_URL = 'http://smil.lvl3.vevo.com' _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com'
_SOURCE_TYPES = { _SOURCE_TYPES = {
@@ -146,8 +190,8 @@ class VevoIE(InfoExtractor):
auth_info = self._parse_json(webpage, video_id) auth_info = self._parse_json(webpage, video_id)
self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
def _call_api(self, path, video_id, note, errnote, fatal=True): def _call_api(self, path, *args, **kwargs):
return self._download_json(self._api_url_template % path, video_id, note, errnote) return self._download_json(self._api_url_template % path, *args, **kwargs)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
@@ -157,9 +201,11 @@ class VevoIE(InfoExtractor):
json_url, video_id, 'Downloading video info', 'Unable to download info') json_url, video_id, 'Downloading video info', 'Unable to download info')
video_info = response.get('video') or {} video_info = response.get('video') or {}
video_versions = video_info.get('videoVersions') video_versions = video_info.get('videoVersions')
artist = None
featured_artist = None
uploader = None uploader = None
timestamp = None
view_count = None view_count = None
timestamp = None
formats = [] formats = []
if not video_info: if not video_info:
@@ -183,12 +229,19 @@ class VevoIE(InfoExtractor):
video_versions = self._call_api( video_versions = self._call_api(
'video/%s/streams' % video_id, video_id, 'video/%s/streams' % video_id, video_id,
'Downloading video versions info', 'Downloading video versions info',
'Failed to download video versions info') 'Failed to download video versions info',
fatal=False)
# Some videos are only available via webpage (e.g.
# https://github.com/rg3/youtube-dl/issues/9366)
if not video_versions:
webpage = self._download_webpage(url, video_id)
video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0]
timestamp = parse_iso8601(video_info.get('releaseDate')) timestamp = parse_iso8601(video_info.get('releaseDate'))
artists = video_info.get('artists') artists = video_info.get('artists')
if artists: if artists:
uploader = artists[0]['name'] artist = uploader = artists[0]['name']
view_count = int_or_none(video_info.get('views', {}).get('total')) view_count = int_or_none(video_info.get('views', {}).get('total'))
for video_version in video_versions: for video_version in video_versions:
@@ -241,7 +294,11 @@ class VevoIE(InfoExtractor):
scale=1000) scale=1000)
artists = video_info.get('mainArtists') artists = video_info.get('mainArtists')
if artists: if artists:
uploader = artists[0]['artistName'] artist = uploader = artists[0]['artistName']
featured_artists = video_info.get('featuredArtists')
if featured_artists:
featured_artist = featured_artists[0]['artistName']
smil_parsed = False smil_parsed = False
for video_version in video_info['videoVersions']: for video_version in video_info['videoVersions']:
@@ -278,7 +335,11 @@ class VevoIE(InfoExtractor):
smil_parsed = True smil_parsed = True
self._sort_formats(formats) self._sort_formats(formats)
title = video_info['title'] track = video_info['title']
if featured_artist:
artist = '%s ft. %s' % (artist, featured_artist)
title = '%s - %s' % (artist, track) if artist else track
genre = video_info.get('genres', [None])[0]
is_explicit = video_info.get('isExplicit') is_explicit = video_info.get('isExplicit')
if is_explicit is True: if is_explicit is True:
@@ -300,4 +361,75 @@ class VevoIE(InfoExtractor):
'duration': duration, 'duration': duration,
'view_count': view_count, 'view_count': view_count,
'age_limit': age_limit, 'age_limit': age_limit,
'track': track,
'artist': uploader,
'genre': genre,
} }
class VevoPlaylistIE(VevoBaseIE):
_VALID_URL = r'https?://www\.vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29',
'info_dict': {
'id': 'dadbf4e7-b99f-4184-9670-6f0e547b6a29',
'title': 'Best-Of: Birdman',
},
'playlist_count': 10,
}, {
'url': 'http://www.vevo.com/watch/genre/rock',
'info_dict': {
'id': 'rock',
'title': 'Rock',
},
'playlist_count': 20,
}, {
'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29?index=0',
'md5': '32dcdfddddf9ec6917fc88ca26d36282',
'info_dict': {
'id': 'USCMV1100073',
'ext': 'mp4',
'title': 'Birdman - Y.U. MAD',
'timestamp': 1323417600,
'upload_date': '20111209',
'uploader': 'Birdman',
'track': 'Y.U. MAD',
'artist': 'Birdman',
'genre': 'Rap/Hip-Hop',
},
'expected_warnings': ['Unable to download SMIL file'],
}, {
'url': 'http://www.vevo.com/watch/genre/rock?index=0',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
playlist_kind = mobj.group('kind')
webpage = self._download_webpage(url, playlist_id)
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
index = qs.get('index', [None])[0]
if index:
video_id = self._search_regex(
r'<meta[^>]+content=(["\'])vevo://video/(?P<id>.+?)\1[^>]*>',
webpage, 'video id', default=None, group='id')
if video_id:
return self.url_result('vevo:%s' % video_id, VevoIE.ie_key())
playlists = self._extract_json(webpage, playlist_id, '%ss' % playlist_kind)
playlist = (list(playlists.values())[0]
if playlist_kind == 'playlist' else playlists[playlist_id])
entries = [
self.url_result('vevo:%s' % src, VevoIE.ie_key())
for src in playlist['isrcs']]
return self.playlist_result(
entries, playlist.get('playlistId') or playlist_id,
playlist.get('name'), playlist.get('description'))
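Both the new webpage fallback in VevoIE and VevoPlaylistIE go through the _extract_json helper, which reads the page's window.__INITIAL_STORE__ blob. A self-contained sketch of that parsing step, using an invented page fragment of the same shape (only the regex is taken from the code above):

```
# Sketch only: the page fragment is invented, but the regex is the one the
# _extract_json helper above applies to the real vevo.com markup.
import json
import re

page = ('<script>window.__INITIAL_STORE__ = '
        '{"default": {"streams": {}, "playlists": {}}}; </script>')
store = json.loads(re.search(
    r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*</script>', page).group(1))
print(store['default']['streams'])    # item read by the VevoIE webpage fallback
print(store['default']['playlists'])  # item read by VevoPlaylistIE ('playlists' or 'genres')
```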

View File

@@ -43,7 +43,7 @@ class VLiveIE(InfoExtractor):
status_params = self._download_json( status_params = self._download_json(
'http://www.vlive.tv/video/status?videoSeq=%s' % video_id, 'http://www.vlive.tv/video/status?videoSeq=%s' % video_id,
video_id, 'Downloading JSON status', video_id, 'Downloading JSON status',
headers={'Referer': url}) headers={'Referer': url.encode('utf-8')})
status = status_params.get('status') status = status_params.get('status')
air_start = status_params.get('onAirStartAt', '') air_start = status_params.get('onAirStartAt', '')
is_live = status_params.get('isLive') is_live = status_params.get('isLive')

View File

@@ -4,16 +4,22 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
float_or_none,
unified_strdate, unified_strdate,
) )
class WSJIE(InfoExtractor): class WSJIE(InfoExtractor):
_VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)' _VALID_URL = r'''(?x)https?://
(?:
video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
(?:www\.)?wsj\.com/video/[^/]+/
)
(?P<id>[a-zA-Z0-9-]+)'''
IE_DESC = 'Wall Street Journal' IE_DESC = 'Wall Street Journal'
_TEST = { _TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
'md5': '9747d7a6ebc2f4df64b981e1dde9efa9', 'md5': 'e230a5bb249075e40793b655a54a02e4',
'info_dict': { 'info_dict': {
'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', 'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
'ext': 'mp4', 'ext': 'mp4',
@@ -24,65 +30,60 @@ class WSJIE(InfoExtractor):
'duration': 90, 'duration': 90,
'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo', 'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
}, },
} }, {
'url': 'http://www.wsj.com/video/can-alphabet-build-a-smarter-city/359DDAA8-9AC1-489C-82E6-0429C1E430E0.html',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
bitrates = [128, 174, 264, 320, 464, 664, 1264]
api_url = ( api_url = (
'http://video-api.wsj.com/api-video/find_all_videos.asp?' 'http://video-api.wsj.com/api-video/find_all_videos.asp?'
'type=guid&count=1&query=%s&' 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,' 'thumbnailList,author,description,name,duration,videoURL,'
'author,description,name,linkURL,videoStillURL,duration,videoURL,' 'titletag,formattedCreationDate,keywords,editor' % video_id)
'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
'allthingsd-subsection,sm-section,sm-subsection,provider,'
'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
'emailURL,emailPartnerID,showName,omnitureProgramName,'
'omnitureVideoFormat,linkRelativeURL,touchCastID,'
'omniturePublishDate,%s') % (
video_id, ','.join('video%dkMP4Url' % br for br in bitrates))
info = self._download_json(api_url, video_id)['items'][0] info = self._download_json(api_url, video_id)['items'][0]
# Thumbnails are conveniently in the correct format already
thumbnails = info.get('thumbnailList')
creator = info.get('author')
uploader_id = info.get('editor')
categories = info.get('keywords')
duration = int_or_none(info.get('duration'))
upload_date = unified_strdate(
info.get('formattedCreationDate'), day_first=False)
title = info.get('name', info.get('titletag')) title = info.get('name', info.get('titletag'))
formats = [{ formats = []
'format_id': 'f4m',
'format_note': 'f4m (meta URL)', f4m_url = info.get('videoURL')
'url': info['videoURL'], if f4m_url:
}] formats.extend(self._extract_f4m_formats(
if info.get('hls'): f4m_url, video_id, f4m_id='hds', fatal=False))
m3u8_url = info.get('hls')
if m3u8_url:
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
info['hls'], video_id, ext='mp4', info['hls'], video_id, ext='mp4',
preference=0, entry_protocol='m3u8_native')) entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
for br in bitrates:
field = 'video%dkMP4Url' % br for v in info.get('videoMP4List', []):
if info.get(field): mp4_url = v.get('url')
formats.append({ if not mp4_url:
'format_id': 'mp4-%d' % br, continue
'container': 'mp4', tbr = int_or_none(v.get('bitrate'))
'tbr': br, formats.append({
'url': info[field], 'url': mp4_url,
}) 'format_id': 'http' + ('-%d' % tbr if tbr else ''),
'tbr': tbr,
'width': int_or_none(v.get('width')),
'height': int_or_none(v.get('height')),
'fps': float_or_none(v.get('fps')),
})
self._sort_formats(formats) self._sort_formats(formats)
return { return {
'id': video_id, 'id': video_id,
'formats': formats, 'formats': formats,
'thumbnails': thumbnails, # Thumbnails are conveniently in the correct format already
'creator': creator, 'thumbnails': info.get('thumbnailList'),
'uploader_id': uploader_id, 'creator': info.get('author'),
'duration': duration, 'uploader_id': info.get('editor'),
'upload_date': upload_date, 'duration': int_or_none(info.get('duration')),
'upload_date': unified_strdate(info.get(
'formattedCreationDate'), day_first=False),
'title': title, 'title': title,
'categories': categories, 'categories': info.get('keywords'),
} }
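The rewritten extractor builds HTTP formats from the videoMP4List entries returned by find_all_videos.asp rather than probing a fixed bitrate list. A hedged sketch of that mapping; the field names follow the code above, while the entry values are made up:

```
# Hypothetical videoMP4List entry: field names as read by the extractor
# above, values invented.
v = {'url': 'http://m.wsj.net/video/sample_664k.mp4', 'bitrate': 664,
     'width': 1280, 'height': 720, 'fps': 25.0}

tbr = v.get('bitrate')
fmt = {
    'url': v['url'],
    'format_id': 'http' + ('-%d' % tbr if tbr else ''),
    'tbr': tbr,
    'width': v.get('width'),
    'height': v.get('height'),
    'fps': v.get('fps'),
}
print(fmt['format_id'])  # -> http-664
```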

View File

@@ -0,0 +1,158 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import int_or_none
class XiamiBaseIE(InfoExtractor):
_API_BASE_URL = 'http://www.xiami.com/song/playlist/cat/json/id'
def _extract_track(self, track, track_id=None):
title = track['title']
track_url = self._decrypt(track['location'])
subtitles = {}
lyrics_url = track.get('lyric_url') or track.get('lyric')
if lyrics_url and lyrics_url.startswith('http'):
subtitles['origin'] = [{'url': lyrics_url}]
return {
'id': track.get('song_id') or track_id,
'url': track_url,
'title': title,
'thumbnail': track.get('pic') or track.get('album_pic'),
'duration': int_or_none(track.get('length')),
'creator': track.get('artist', '').split(';')[0],
'track': title,
'album': track.get('album_name'),
'artist': track.get('artist'),
'subtitles': subtitles,
}
def _extract_tracks(self, item_id, typ=None):
playlist = self._download_json(
'%s/%s%s' % (self._API_BASE_URL, item_id, '/type/%s' % typ if typ else ''), item_id)
return [
self._extract_track(track, item_id)
for track in playlist['data']['trackList']]
@staticmethod
def _decrypt(origin):
n = int(origin[0])
origin = origin[1:]
short_lenth = len(origin) // n
long_num = len(origin) - short_lenth * n
l = tuple()
for i in range(0, n):
length = short_lenth
if i < long_num:
length += 1
l += (origin[0:length], )
origin = origin[length:]
ans = ''
for i in range(0, short_lenth + 1):
for j in range(0, n):
if len(l[j]) > i:
ans += l[j][i]
return compat_urllib_parse_unquote(ans).replace('^', '0')
class XiamiSongIE(XiamiBaseIE):
IE_NAME = 'xiami:song'
IE_DESC = '虾米音乐'
_VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.xiami.com/song/1775610518',
'md5': '521dd6bea40fd5c9c69f913c232cb57e',
'info_dict': {
'id': '1775610518',
'ext': 'mp3',
'title': 'Woman',
'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
'duration': 265,
'creator': 'HONNE',
'track': 'Woman',
'album': 'Woman',
'artist': 'HONNE',
'subtitles': {
'origin': [{
'ext': 'lrc',
}],
},
}
}, {
'url': 'http://www.xiami.com/song/1775256504',
'md5': '932a3abd45c6aa2b1fdbe028fcb4c4fc',
'info_dict': {
'id': '1775256504',
'ext': 'mp3',
'title': '悟空',
'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg',
'duration': 200,
'creator': '戴荃',
'track': '悟空',
'album': '悟空',
'artist': '戴荃',
'subtitles': {
'origin': [{
'ext': 'lrc',
}],
},
}
}]
def _real_extract(self, url):
return self._extract_tracks(self._match_id(url))[0]
class XiamiPlaylistBaseIE(XiamiBaseIE):
def _real_extract(self, url):
item_id = self._match_id(url)
return self.playlist_result(self._extract_tracks(item_id, self._TYPE), item_id)
class XiamiAlbumIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:album'
IE_DESC = '虾米音乐 - 专辑'
_VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[0-9]+)'
_TYPE = '1'
_TESTS = [{
'url': 'http://www.xiami.com/album/2100300444',
'info_dict': {
'id': '2100300444',
},
'playlist_count': 10,
}, {
'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9',
'only_matching': True,
}]
class XiamiArtistIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:artist'
IE_DESC = '虾米音乐 - 歌手'
_VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[0-9]+)'
_TYPE = '2'
_TEST = {
'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp',
'info_dict': {
'id': '2132',
},
'playlist_count': 20,
}
class XiamiCollectionIE(XiamiPlaylistBaseIE):
IE_NAME = 'xiami:collection'
IE_DESC = '虾米音乐 - 精选集'
_VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[0-9]+)'
_TYPE = '3'
_TEST = {
'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr',
'info_dict': {
'id': '156527391',
},
'playlist_mincount': 29,
}
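The _decrypt helper in XiamiBaseIE above undoes a simple row/column scrambling of the playlist's location field. A toy, standalone re-implementation (not the extractor itself) run on a made-up payload; real values are percent-encoded URLs in which '^' stands for '0':

```
# Toy sketch of the same scheme: the first character is the number of rows,
# the remainder was written out row by row, and reading it back column by
# column restores the original string.
def toy_decrypt(origin):
    n = int(origin[0])
    body = origin[1:]
    short, extra = divmod(len(body), n)
    rows, pos = [], 0
    for i in range(n):
        length = short + 1 if i < extra else short
        rows.append(body[pos:pos + length])
        pos += length
    return ''.join(row[i] for i in range(short + 1) for row in rows if len(row) > i)

print(toy_decrypt('3adgbehcf'))  # -> abcdefgh
```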

View File

@@ -389,23 +389,30 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
class FFmpegMetadataPP(FFmpegPostProcessor): class FFmpegMetadataPP(FFmpegPostProcessor):
def run(self, info): def run(self, info):
metadata = {} metadata = {}
if info.get('title') is not None:
metadata['title'] = info['title'] def add(meta_list, info_list=None):
if info.get('upload_date') is not None: if not info_list:
metadata['date'] = info['upload_date'] info_list = meta_list
if info.get('artist') is not None: if not isinstance(meta_list, (list, tuple)):
metadata['artist'] = info['artist'] meta_list = (meta_list,)
elif info.get('uploader') is not None: if not isinstance(info_list, (list, tuple)):
metadata['artist'] = info['uploader'] info_list = (info_list,)
elif info.get('uploader_id') is not None: for info_f in info_list:
metadata['artist'] = info['uploader_id'] if info.get(info_f) is not None:
if info.get('description') is not None: for meta_f in meta_list:
metadata['description'] = info['description'] metadata[meta_f] = info[info_f]
metadata['comment'] = info['description'] break
if info.get('webpage_url') is not None:
metadata['purl'] = info['webpage_url'] add('title', ('track', 'title'))
if info.get('album') is not None: add('date', 'upload_date')
metadata['album'] = info['album'] add(('description', 'comment'), 'description')
add('purl', 'webpage_url')
add('track', 'track_number')
add('artist', ('artist', 'creator', 'uploader', 'uploader_id'))
add('genre')
add('album')
add('album_artist')
add('disc', 'disc_number')
if not metadata: if not metadata:
self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add') self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
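The refactored FFmpegMetadataPP fills its metadata dict through a small add() helper that takes the first available info field for each metadata key. A self-contained sketch of that behavior; the helper body mirrors the diff above, while the sample info dict is invented:

```
# Standalone sketch: build_metadata() stands in for the postprocessor;
# the add() body mirrors the diff above, the sample info dict is invented.
def build_metadata(info):
    metadata = {}

    def add(meta_list, info_list=None):
        if not info_list:
            info_list = meta_list
        if not isinstance(meta_list, (list, tuple)):
            meta_list = (meta_list,)
        if not isinstance(info_list, (list, tuple)):
            info_list = (info_list,)
        for info_f in info_list:
            if info.get(info_f) is not None:
                for meta_f in meta_list:
                    metadata[meta_f] = info[info_f]
                break

    add('title', ('track', 'title'))
    add('artist', ('artist', 'creator', 'uploader', 'uploader_id'))
    add('genre')
    return metadata

print(build_metadata({'title': 'Clip', 'track': 'Song',
                      'uploader': 'Someone', 'genre': 'Pop'}))
# -> {'title': 'Song', 'artist': 'Someone', 'genre': 'Pop'}
```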

View File

@@ -1,3 +1,3 @@
from __future__ import unicode_literals from __future__ import unicode_literals
__version__ = '2016.04.24' __version__ = '2016.05.01'