This commit is contained in:
Gilles Habran 2016-05-20 08:08:25 +02:00
commit d12939e16b
12 changed files with 326 additions and 116 deletions

View File

@ -1,7 +1,7 @@
all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
clean: clean:
rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi *.mkv *.webm *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
find . -name "*.pyc" -delete find . -name "*.pyc" -delete
find . -name "*.class" -delete find . -name "*.class" -delete

View File

@ -50,6 +50,8 @@ from youtube_dl.utils import (
sanitize_path, sanitize_path,
prepend_extension, prepend_extension,
replace_extension, replace_extension,
remove_start,
remove_end,
remove_quotes, remove_quotes,
shell_quote, shell_quote,
smuggle_url, smuggle_url,
@ -215,6 +217,16 @@ class TestUtil(unittest.TestCase):
self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc', 'temp'), '.abc.temp')
self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp') self.assertEqual(replace_extension('.abc.ext', 'temp'), '.abc.temp')
def test_remove_start(self):
self.assertEqual(remove_start(None, 'A - '), None)
self.assertEqual(remove_start('A - B', 'A - '), 'B')
self.assertEqual(remove_start('B - A', 'A - '), 'B - A')
def test_remove_end(self):
self.assertEqual(remove_end(None, ' - B'), None)
self.assertEqual(remove_end('A - B', ' - B'), 'A')
self.assertEqual(remove_end('B - A', ' - B'), 'B - A')
def test_remove_quotes(self): def test_remove_quotes(self):
self.assertEqual(remove_quotes(None), None) self.assertEqual(remove_quotes(None), None)
self.assertEqual(remove_quotes('"'), '"') self.assertEqual(remove_quotes('"'), '"')

View File

@ -0,0 +1,135 @@
# coding: utf-8
from __future__ import unicode_literals
import calendar
import re
import time
from .amp import AMPIE
from .common import InfoExtractor
from ..compat import compat_urlparse
class AbcNewsVideoIE(AMPIE):
IE_NAME = 'abcnews:video'
_VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
_TESTS = [{
'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
'info_dict': {
'id': '20411932',
'ext': 'mp4',
'display_id': 'week-exclusive-irans-foreign-minister-zarif',
'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
'duration': 180,
'thumbnail': 're:^https?://.*\.jpg$',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
video_id = mobj.group('id')
info_dict = self._extract_feed_info(
'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
info_dict.update({
'id': video_id,
'display_id': display_id,
})
return info_dict
class AbcNewsIE(InfoExtractor):
IE_NAME = 'abcnews'
_VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
_TESTS = [{
'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
'info_dict': {
'id': '10498713',
'ext': 'flv',
'display_id': 'dramatic-video-rare-death-job-america',
'title': 'Occupational Hazards',
'description': 'Nightline investigates the dangers that lurk at various jobs.',
'thumbnail': 're:^https?://.*\.jpg$',
'upload_date': '20100428',
'timestamp': 1272412800,
},
'add_ie': ['AbcNewsVideo'],
}, {
'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
'info_dict': {
'id': '39125818',
'ext': 'mp4',
'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
'title': 'Justin Timberlake Drops Hints For Secret Single',
'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
'upload_date': '20160515',
'timestamp': 1463329500,
},
'params': {
# m3u8 download
'skip_download': True,
# The embedded YouTube video is blocked due to copyright issues
'playlist_items': '1',
},
'add_ie': ['AbcNewsVideo'],
}, {
'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
full_video_url = compat_urlparse.urljoin(url, video_url)
youtube_url = self._html_search_regex(
r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
webpage, 'YouTube URL', default=None)
timestamp = None
date_str = self._html_search_regex(
r'<span[^>]+class="timestamp">([^<]+)</span>',
webpage, 'timestamp', fatal=False)
if date_str:
tz_offset = 0
if date_str.endswith(' ET'): # Eastern Time
tz_offset = -5
date_str = date_str[:-3]
date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
for date_format in date_formats:
try:
timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
except ValueError:
continue
if timestamp is not None:
timestamp -= tz_offset * 3600
entry = {
'_type': 'url_transparent',
'ie_key': AbcNewsVideoIE.ie_key(),
'url': full_video_url,
'id': video_id,
'display_id': display_id,
'timestamp': timestamp,
}
if youtube_url:
entries = [entry, self.url_result(youtube_url, 'Youtube')]
return self.playlist_result(entries)
return entry

View File

@ -52,7 +52,7 @@ class AMPIE(InfoExtractor):
for media_data in media_content: for media_data in media_content:
media = media_data['@attributes'] media = media_data['@attributes']
media_type = media['type'] media_type = media['type']
if media_type == 'video/f4m': if media_type in ('video/f4m', 'application/f4m+xml'):
formats.extend(self._extract_f4m_formats( formats.extend(self._extract_f4m_formats(
media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
video_id, f4m_id='hds', fatal=False)) video_id, f4m_id='hds', fatal=False))
@ -61,7 +61,7 @@ class AMPIE(InfoExtractor):
media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
else: else:
formats.append({ formats.append({
'format_id': media_data['media-category']['@attributes']['label'], 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
'url': media['url'], 'url': media['url'],
'tbr': int_or_none(media.get('bitrate')), 'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')), 'filesize': int_or_none(media.get('fileSize')),

View File

@ -4,7 +4,10 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import js_to_json from ..utils import (
js_to_json,
smuggle_url,
)
class CBCIE(InfoExtractor): class CBCIE(InfoExtractor):
@ -12,57 +15,54 @@ class CBCIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
# with mediaId # with mediaId
'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs', 'url': 'http://www.cbc.ca/22minutes/videos/clips-season-23/don-cherry-play-offs',
'md5': '97e24d09672fc4cf56256d6faa6c25bc',
'info_dict': { 'info_dict': {
'id': '2682904050', 'id': '2682904050',
'ext': 'flv', 'ext': 'mp4',
'title': 'Don Cherry All-Stars', 'title': 'Don Cherry All-Stars',
'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guys got heart.', 'description': 'Don Cherry has a bee in his bonnet about AHL player John Scott because that guys got heart.',
'timestamp': 1454475540, 'timestamp': 1454463000,
'upload_date': '20160203', 'upload_date': '20160203',
}, 'uploader': 'CBCC-NEW',
'params': {
# rtmp download
'skip_download': True,
}, },
}, { }, {
# with clipId # with clipId
'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'url': 'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live',
'info_dict': { 'info_dict': {
'id': '2487345465', 'id': '2487345465',
'ext': 'flv', 'ext': 'mp4',
'title': 'Robin Williams freestyles on 90 Minutes Live', 'title': 'Robin Williams freestyles on 90 Minutes Live',
'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.', 'description': 'Wacky American comedian Robin Williams shows off his infamous "freestyle" comedic talents while being interviewed on CBC\'s 90 Minutes Live.',
'upload_date': '19700101', 'upload_date': '19780210',
'uploader': 'CBCC-NEW', 'uploader': 'CBCC-NEW',
}, 'timestamp': 255977160,
'params': {
# rtmp download
'skip_download': True,
}, },
}, { }, {
# multiple iframes # multiple iframes
'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot',
'playlist': [{ 'playlist': [{
'md5': '377572d0b49c4ce0c9ad77470e0b96b4',
'info_dict': { 'info_dict': {
'id': '2680832926', 'id': '2680832926',
'ext': 'flv', 'ext': 'mp4',
'title': 'An Eagle\'s-Eye View Off Burrard Bridge', 'title': 'An Eagle\'s-Eye View Off Burrard Bridge',
'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.', 'description': 'Hercules the eagle flies from Vancouver\'s Burrard Bridge down to a nearby park with a mini-camera strapped to his back.',
'upload_date': '19700101', 'upload_date': '20160201',
'timestamp': 1454342820,
'uploader': 'CBCC-NEW',
}, },
}, { }, {
'md5': '415a0e3f586113894174dfb31aa5bb1a',
'info_dict': { 'info_dict': {
'id': '2658915080', 'id': '2658915080',
'ext': 'flv', 'ext': 'mp4',
'title': 'Fly like an eagle!', 'title': 'Fly like an eagle!',
'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower', 'description': 'Eagle equipped with a mini camera flies from the world\'s tallest tower',
'upload_date': '19700101', 'upload_date': '20150315',
'timestamp': 1426443984,
'uploader': 'CBCC-NEW',
}, },
}], }],
'params': {
# rtmp download
'skip_download': True,
},
}] }]
@classmethod @classmethod
@ -95,20 +95,23 @@ class CBCPlayerIE(InfoExtractor):
'url': 'http://www.cbc.ca/player/play/2683190193', 'url': 'http://www.cbc.ca/player/play/2683190193',
'info_dict': { 'info_dict': {
'id': '2683190193', 'id': '2683190193',
'ext': 'flv', 'ext': 'mp4',
'title': 'Gerry Runs a Sweat Shop', 'title': 'Gerry Runs a Sweat Shop',
'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0', 'description': 'md5:b457e1c01e8ff408d9d801c1c2cd29b0',
'timestamp': 1455067800, 'timestamp': 1455071400,
'upload_date': '20160210', 'upload_date': '20160210',
}, 'uploader': 'CBCC-NEW',
'params': {
# rtmp download
'skip_download': True,
}, },
} }
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
return self.url_result( return {
'http://feed.theplatform.com/f/ExhSPC/vms_5akSXx4Ng_Zn?byGuid=%s' % video_id, '_type': 'url_transparent',
'ThePlatformFeed', video_id) 'ie_key': 'ThePlatform',
'url': smuggle_url(
'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/%s?mbr=true' % video_id, {
'force_smil_url': True
}),
'id': video_id,
}

View File

@ -3,6 +3,10 @@ from __future__ import unicode_literals
from .abc import ABCIE from .abc import ABCIE
from .abc7news import Abc7NewsIE from .abc7news import Abc7NewsIE
from .abcnews import (
AbcNewsIE,
AbcNewsVideoIE,
)
from .academicearth import AcademicEarthCourseIE from .academicearth import AcademicEarthCourseIE
from .acast import ( from .acast import (
ACastIE, ACastIE,
@ -238,6 +242,7 @@ from .fktv import FKTVIE
from .flickr import FlickrIE from .flickr import FlickrIE
from .folketinget import FolketingetIE from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE from .footyroom import FootyRoomIE
from .formula1 import Formula1IE
from .fourtube import FourTubeIE from .fourtube import FourTubeIE
from .fox import FOXIE from .fox import FOXIE
from .foxgay import FoxgayIE from .foxgay import FoxgayIE
@ -365,6 +370,7 @@ from .kuwo import (
) )
from .la7 import LA7IE from .la7 import LA7IE
from .laola1tv import Laola1TvIE from .laola1tv import Laola1TvIE
from .learnr import LearnrIE
from .lecture2go import Lecture2GoIE from .lecture2go import Lecture2GoIE
from .lemonde import LemondeIE from .lemonde import LemondeIE
from .leeco import ( from .leeco import (

View File

@ -0,0 +1,25 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class Formula1IE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
_TEST = {
'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
'md5': '8c79e54be72078b26b89e0e111c0502b',
'info_dict': {
'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
'ext': 'flv',
'title': 'Race highlights - Spain 2016',
}
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
ooyala_embed_code = self._search_regex(
r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
return self.url_result(
'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)

View File

@ -0,0 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class LearnrIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript',
'md5': '3719fdf0a68397f49899e82c308a89de',
'info_dict': {
'id': '51624',
'ext': 'mp4',
'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript',
'description': 'md5:b36dbfa92350176cdf12b4d388485503',
'uploader': 'LearnCode.academy',
'uploader_id': 'learncodeacademy',
'upload_date': '20131021',
},
'add_ie': ['Youtube'],
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
return {
'_type': 'url_transparent',
'url': self._search_regex(
r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'),
'id': video_id,
}

View File

@ -1,19 +1,18 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
month_by_name,
int_or_none, int_or_none,
remove_end,
unified_strdate,
) )
class NDTVIE(InfoExtractor): class NDTVIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)' _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P<id>\d+)'
_TEST = { _TEST = {
'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710', 'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710',
'md5': '39f992dbe5fb531c395d8bbedb1e5e88', 'md5': '39f992dbe5fb531c395d8bbedb1e5e88',
'info_dict': { 'info_dict': {
'id': '300710', 'id': '300710',
@ -22,7 +21,7 @@ class NDTVIE(InfoExtractor):
'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02', 'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02',
'upload_date': '20131208', 'upload_date': '20131208',
'duration': 1327, 'duration': 1327,
'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg', 'thumbnail': 're:https?://.*\.jpg',
}, },
} }
@ -30,36 +29,19 @@ class NDTVIE(InfoExtractor):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
title = remove_end(self._og_search_title(webpage), ' - NDTV')
filename = self._search_regex( filename = self._search_regex(
r"__filename='([^']+)'", webpage, 'video filename') r"__filename='([^']+)'", webpage, 'video filename')
video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename
filename)
duration = int_or_none(self._search_regex( duration = int_or_none(self._search_regex(
r"__duration='([^']+)'", webpage, 'duration', fatal=False)) r"__duration='([^']+)'", webpage, 'duration', fatal=False))
date_m = re.search(r'''(?x) upload_date = unified_strdate(self._html_search_meta(
<p\s+class="vod_dateline">\s* 'publish-date', webpage, 'upload date', fatal=False))
Published\s+On:\s*
(?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+)
''', webpage)
upload_date = None
if date_m is not None: description = remove_end(self._og_search_description(webpage), ' (Read more)')
month = month_by_name(date_m.group('monthname'))
if month is not None:
upload_date = '%s%02d%02d' % (
date_m.group('year'), month, int(date_m.group('day')))
description = self._og_search_description(webpage)
READ_MORE = ' (Read more)'
if description.endswith(READ_MORE):
description = description[:-len(READ_MORE)]
title = self._og_search_title(webpage)
TITLE_SUFFIX = ' - NDTV'
if title.endswith(TITLE_SUFFIX):
title = title[:-len(TITLE_SUFFIX)]
return { return {
'id': video_id, 'id': video_id,

View File

@ -2,8 +2,12 @@ from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
sanitized_Request, clean_html,
determine_ext,
int_or_none,
qualities,
urlencode_postdata, urlencode_postdata,
xpath_text,
) )
@ -16,12 +20,12 @@ class NFBIE(InfoExtractor):
'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny', 'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
'info_dict': { 'info_dict': {
'id': 'qallunaat_why_white_people_are_funny', 'id': 'qallunaat_why_white_people_are_funny',
'ext': 'mp4', 'ext': 'flv',
'title': 'Qallunaat! Why White People Are Funny ', 'title': 'Qallunaat! Why White People Are Funny ',
'description': 'md5:836d8aff55e087d04d9f6df554d4e038', 'description': 'md5:6b8e32dde3abf91e58857b174916620c',
'duration': 3128, 'duration': 3128,
'creator': 'Mark Sandiford',
'uploader': 'Mark Sandiford', 'uploader': 'Mark Sandiford',
'uploader_id': 'mark-sandiford',
}, },
'params': { 'params': {
# rtmp download # rtmp download
@ -31,65 +35,78 @@ class NFBIE(InfoExtractor):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
page = self._download_webpage(
'https://www.nfb.ca/film/%s' % video_id, video_id,
'Downloading film page')
uploader_id = self._html_search_regex(r'<a class="director-link" href="/explore-all-directors/([^/]+)/"', config = self._download_xml(
page, 'director id', fatal=False)
uploader = self._html_search_regex(r'<em class="director-name" itemprop="name">([^<]+)</em>',
page, 'director name', fatal=False)
request = sanitized_Request(
'https://www.nfb.ca/film/%s/player_config' % video_id, 'https://www.nfb.ca/film/%s/player_config' % video_id,
urlencode_postdata({'getConfig': 'true'})) video_id, 'Downloading player config XML',
request.add_header('Content-Type', 'application/x-www-form-urlencoded') data=urlencode_postdata({'getConfig': 'true'}),
request.add_header('X-NFB-Referer', 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf') headers={
'Content-Type': 'application/x-www-form-urlencoded',
'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
})
config = self._download_xml(request, video_id, 'Downloading player config XML') title, description, thumbnail, duration, uploader, author = [None] * 6
thumbnails, formats = [[]] * 2
title = None subtitles = {}
description = None
thumbnail = None
duration = None
formats = []
def extract_thumbnail(media):
thumbnails = {}
for asset in media.findall('assets/asset'):
thumbnails[asset.get('quality')] = asset.find('default/url').text
if not thumbnails:
return None
if 'high' in thumbnails:
return thumbnails['high']
return list(thumbnails.values())[0]
for media in config.findall('./player/stream/media'): for media in config.findall('./player/stream/media'):
if media.get('type') == 'posterImage': if media.get('type') == 'posterImage':
thumbnail = extract_thumbnail(media) quality_key = qualities(('low', 'high'))
elif media.get('type') == 'video': thumbnails = []
duration = int(media.get('duration'))
title = media.find('title').text
description = media.find('description').text
# It seems assets always go from lower to better quality, so no need to sort
for asset in media.findall('assets/asset'): for asset in media.findall('assets/asset'):
for x in asset: asset_url = xpath_text(asset, 'default/url', default=None)
if not asset_url:
continue
quality = asset.get('quality')
thumbnails.append({
'url': asset_url,
'id': quality,
'preference': quality_key(quality),
})
elif media.get('type') == 'video':
title = xpath_text(media, 'title', fatal=True)
for asset in media.findall('assets/asset'):
quality = asset.get('quality')
height = int_or_none(self._search_regex(
r'^(\d+)[pP]$', quality or '', 'height', default=None))
for node in asset:
streamer = xpath_text(node, 'streamerURI', default=None)
if not streamer:
continue
play_path = xpath_text(node, 'url', default=None)
if not play_path:
continue
formats.append({ formats.append({
'url': x.find('streamerURI').text, 'url': streamer,
'app': x.find('streamerURI').text.split('/', 3)[3], 'app': streamer.split('/', 3)[3],
'play_path': x.find('url').text, 'play_path': play_path,
'rtmp_live': False, 'rtmp_live': False,
'ext': 'mp4', 'ext': 'flv',
'format_id': '%s-%s' % (x.tag, asset.get('quality')), 'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
'height': height,
}) })
self._sort_formats(formats)
description = clean_html(xpath_text(media, 'description'))
uploader = xpath_text(media, 'author')
duration = int_or_none(media.get('duration'))
for subtitle in media.findall('./subtitles/subtitle'):
subtitle_url = xpath_text(subtitle, 'url', default=None)
if not subtitle_url:
continue
lang = xpath_text(subtitle, 'lang', default='en')
subtitles.setdefault(lang, []).append({
'url': subtitle_url,
'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
})
return { return {
'id': video_id, 'id': video_id,
'title': title, 'title': title,
'description': description, 'description': description,
'thumbnail': thumbnail, 'thumbnails': thumbnails,
'duration': duration, 'duration': duration,
'creator': uploader,
'uploader': uploader, 'uploader': uploader,
'uploader_id': uploader_id,
'formats': formats, 'formats': formats,
'subtitles': subtitles,
} }

View File

@ -47,7 +47,8 @@ class TwentyFourVideoIE(InfoExtractor):
title = self._og_search_title(webpage) title = self._og_search_title(webpage)
description = self._html_search_regex( description = self._html_search_regex(
r'<span itemprop="description">([^<]+)</span>', webpage, 'description', fatal=False) r'<(p|span)[^>]+itemprop="description"[^>]*>(?P<description>[^<]+)</\1>',
webpage, 'description', fatal=False, group='description')
thumbnail = self._og_search_thumbnail(webpage) thumbnail = self._og_search_thumbnail(webpage)
duration = int_or_none(self._og_search_property( duration = int_or_none(self._og_search_property(
'duration', webpage, 'duration', fatal=False)) 'duration', webpage, 'duration', fatal=False))

View File

@ -1549,15 +1549,11 @@ def setproctitle(title):
def remove_start(s, start): def remove_start(s, start):
if s.startswith(start): return s[len(start):] if s is not None and s.startswith(start) else s
return s[len(start):]
return s
def remove_end(s, end): def remove_end(s, end):
if s.endswith(end): return s[:-len(end)] if s is not None and s.endswith(end) else s
return s[:-len(end)]
return s
def remove_quotes(s): def remove_quotes(s):