Merge pull request #10 from rg3/master

update 24 may
This commit is contained in:
siddht1 2016-05-24 23:50:42 +05:30
commit d030e9ca0b
26 changed files with 600 additions and 263 deletions

View File

@ -172,3 +172,4 @@ blahgeek
Kevin Deldycke Kevin Deldycke
inondle inondle
Tomáš Čech Tomáš Čech
Déstin Reed

View File

@ -693,6 +693,10 @@ hash -r
Again, from then on you'll be able to update with `sudo youtube-dl -U`. Again, from then on you'll be able to update with `sudo youtube-dl -U`.
### youtube-dl is extremely slow to start on Windows
Add a file exclusion for `youtube-dl.exe` in Windows Defender settings.
### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists ### I'm getting an error `Unable to extract OpenGraph title` on YouTube playlists
YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos. YouTube changed their playlist format in March 2014 and later on, so you'll need at least youtube-dl 2014.07.25 to download all YouTube videos.

View File

@ -103,6 +103,12 @@ class TestCompat(unittest.TestCase):
self.assertTrue(isinstance(doc.find('chinese').text, compat_str)) self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str)) self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
def test_compat_etree_fromstring_doctype(self):
xml = '''<?xml version="1.0"?>
<!DOCTYPE smil PUBLIC "-//W3C//DTD SMIL 2.0//EN" "http://www.w3.org/2001/SMIL20/SMIL20.dtd">
<smil xmlns="http://www.w3.org/2001/SMIL20/Language"></smil>'''
compat_etree_fromstring(xml)
def test_struct_unpack(self): def test_struct_unpack(self):
self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,)) self.assertEqual(compat_struct_unpack('!B', b'\x00'), (0,))

View File

@ -245,13 +245,20 @@ try:
except ImportError: # Python 2.6 except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error from xml.parsers.expat import ExpatError as compat_xml_parse_error
etree = xml.etree.ElementTree
class _TreeBuilder(etree.TreeBuilder):
def doctype(self, name, pubid, system):
pass
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
compat_etree_fromstring = xml.etree.ElementTree.fromstring def compat_etree_fromstring(text):
return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder()))
else: else:
# python 2.x tries to encode unicode strings with ascii (see the # python 2.x tries to encode unicode strings with ascii (see the
# XMLParser._fixtext method) # XMLParser._fixtext method)
etree = xml.etree.ElementTree
try: try:
_etree_iter = etree.Element.iter _etree_iter = etree.Element.iter
except AttributeError: # Python <=2.6 except AttributeError: # Python <=2.6
@ -265,7 +272,7 @@ else:
# 2.7 source # 2.7 source
def _XML(text, parser=None): def _XML(text, parser=None):
if not parser: if not parser:
parser = etree.XMLParser(target=etree.TreeBuilder()) parser = etree.XMLParser(target=_TreeBuilder())
parser.feed(text) parser.feed(text)
return parser.close() return parser.close()
@ -277,7 +284,7 @@ else:
return el return el
def compat_etree_fromstring(text): def compat_etree_fromstring(text):
doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) doc = _XML(text, parser=etree.XMLParser(target=_TreeBuilder(element_factory=_element_factory)))
for el in _etree_iter(doc): for el in _etree_iter(doc):
if el.text is not None and isinstance(el.text, bytes): if el.text is not None and isinstance(el.text, bytes):
el.text = el.text.decode('utf-8') el.text = el.text.decode('utf-8')

View File

@ -29,7 +29,7 @@ class BandcampIE(InfoExtractor):
'_skip': 'There is a limit of 200 free downloads / month for the test song' '_skip': 'There is a limit of 200 free downloads / month for the test song'
}, { }, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle', 'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '2b68e5851514c20efdff2afc5603b8b4', 'md5': '73d0b3171568232574e45652f8720b5c',
'info_dict': { 'info_dict': {
'id': '2650410135', 'id': '2650410135',
'ext': 'mp3', 'ext': 'mp3',
@ -48,6 +48,10 @@ class BandcampIE(InfoExtractor):
if m_trackinfo: if m_trackinfo:
json_code = m_trackinfo.group(1) json_code = m_trackinfo.group(1)
data = json.loads(json_code)[0] data = json.loads(json_code)[0]
track_id = compat_str(data['id'])
if not data.get('file'):
raise ExtractorError('Not streamable', video_id=track_id, expected=True)
formats = [] formats = []
for format_id, format_url in data['file'].items(): for format_id, format_url in data['file'].items():
@ -64,7 +68,7 @@ class BandcampIE(InfoExtractor):
self._sort_formats(formats) self._sort_formats(formats)
return { return {
'id': compat_str(data['id']), 'id': track_id,
'title': data['title'], 'title': data['title'],
'formats': formats, 'formats': formats,
'duration': float_or_none(data.get('duration')), 'duration': float_or_none(data.get('duration')),

View File

@ -11,6 +11,7 @@ class BYUtvIE(InfoExtractor):
_VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)' _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)'
_TEST = { _TEST = {
'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5', 'url': 'http://www.byutv.org/watch/6587b9a3-89d2-42a6-a7f7-fd2f81840a7d/studio-c-season-5-episode-5',
'md5': '05850eb8c749e2ee05ad5a1c34668493',
'info_dict': { 'info_dict': {
'id': 'studio-c-season-5-episode-5', 'id': 'studio-c-season-5-episode-5',
'ext': 'mp4', 'ext': 'mp4',
@ -21,7 +22,8 @@ class BYUtvIE(InfoExtractor):
}, },
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
'add_ie': ['Ooyala'],
} }
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -1,5 +1,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .theplatform import ThePlatformIE from .theplatform import ThePlatformIE
from ..utils import ( from ..utils import (
xpath_text, xpath_text,
@ -21,7 +23,7 @@ class CBSBaseIE(ThePlatformIE):
class CBSIE(CBSBaseIE): class CBSIE(CBSBaseIE):
_VALID_URL = r'https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<id>[^/]+)' _VALID_URL = r'(?:cbs:(?P<content_id>\w+)|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/(?:video|artist)|colbertlateshow\.com/(?:video|podcasts))/[^/]+/(?P<display_id>[^/]+))'
_TESTS = [{ _TESTS = [{
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
@ -66,11 +68,12 @@ class CBSIE(CBSBaseIE):
TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) content_id, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id) if not content_id:
content_id = self._search_regex( webpage = self._download_webpage(url, display_id)
[r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"], content_id = self._search_regex(
webpage, 'content id') [r"video\.settings\.content_id\s*=\s*'([^']+)';", r"cbsplayer\.contentId\s*=\s*'([^']+)';"],
webpage, 'content id')
items_data = self._download_xml( items_data = self._download_xml(
'http://can.cbs.com/thunder/player/videoPlayerService.php', 'http://can.cbs.com/thunder/player/videoPlayerService.php',
content_id, query={'partner': 'cbs', 'contentId': content_id}) content_id, query={'partner': 'cbs', 'contentId': content_id})

View File

@ -8,6 +8,7 @@ class ESPNIE(InfoExtractor):
_VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)' _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://espn.go.com/video/clip?id=10365079', 'url': 'http://espn.go.com/video/clip?id=10365079',
'md5': '60e5d097a523e767d06479335d1bdc58',
'info_dict': { 'info_dict': {
'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG', 'id': 'FkYWtmazr6Ed8xmvILvKLWjd4QvYZpzG',
'ext': 'mp4', 'ext': 'mp4',
@ -15,21 +16,22 @@ class ESPNIE(InfoExtractor):
'description': None, 'description': None,
}, },
'params': { 'params': {
# m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['OoyalaExternal'],
}, { }, {
# intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
'url': 'http://espn.go.com/video/clip?id=2743663', 'url': 'http://espn.go.com/video/clip?id=2743663',
'md5': 'f4ac89b59afc7e2d7dbb049523df6768',
'info_dict': { 'info_dict': {
'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg', 'id': '50NDFkeTqRHB0nXBOK-RGdSG5YQPuxHg',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Must-See Moments: Best of the MLS season', 'title': 'Must-See Moments: Best of the MLS season',
}, },
'params': { 'params': {
# m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['OoyalaExternal'],
}, { }, {
'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079', 'url': 'https://espn.go.com/video/iframe/twitter/?cms=espn&id=10365079',
'only_matching': True, 'only_matching': True,

View File

@ -617,6 +617,10 @@ from .qqmusic import (
QQMusicPlaylistIE, QQMusicPlaylistIE,
) )
from .r7 import R7IE from .r7 import R7IE
from .radiocanada import (
RadioCanadaIE,
RadioCanadaAudioVideoIE,
)
from .radiode import RadioDeIE from .radiode import RadioDeIE
from .radiojavan import RadioJavanIE from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE from .radiobremen import RadioBremenIE
@ -630,6 +634,7 @@ from .rds import RDSIE
from .redtube import RedTubeIE from .redtube import RedTubeIE
from .regiotv import RegioTVIE from .regiotv import RegioTVIE
from .restudy import RestudyIE from .restudy import RestudyIE
from .reuters import ReutersIE
from .reverbnation import ReverbNationIE from .reverbnation import ReverbNationIE
from .revision3 import Revision3IE from .revision3 import Revision3IE
from .rice import RICEIE from .rice import RICEIE
@ -941,7 +946,10 @@ from .vube import VubeIE
from .vuclip import VuClipIE from .vuclip import VuClipIE
from .vulture import VultureIE from .vulture import VultureIE
from .walla import WallaIE from .walla import WallaIE
from .washingtonpost import WashingtonPostIE from .washingtonpost import (
WashingtonPostIE,
WashingtonPostArticleIE,
)
from .wat import WatIE from .wat import WatIE
from .watchindianporn import WatchIndianPornIE from .watchindianporn import WatchIndianPornIE
from .wdr import ( from .wdr import (

View File

@ -13,7 +13,8 @@ class Formula1IE(InfoExtractor):
'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV', 'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
'ext': 'flv', 'ext': 'flv',
'title': 'Race highlights - Spain 2016', 'title': 'Race highlights - Spain 2016',
} },
'add_ie': ['Ooyala'],
} }
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -784,6 +784,19 @@ class GenericIE(InfoExtractor):
'title': 'Rosetta #CometLanding webcast HL 10', 'title': 'Rosetta #CometLanding webcast HL 10',
} }
}, },
# Another Livestream embed, without 'new.' in URL
{
'url': 'https://www.freespeech.org/',
'info_dict': {
'id': '123537347',
'ext': 'mp4',
'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
},
'params': {
# Live stream
'skip_download': True,
},
},
# LazyYT # LazyYT
{ {
'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
@ -1194,6 +1207,16 @@ class GenericIE(InfoExtractor):
'uploader': 'Lake8737', 'uploader': 'Lake8737',
} }
}, },
# Duplicated embedded video URLs
{
'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
'info_dict': {
'id': '149298443_480_16c25b74_2',
'ext': 'mp4',
'title': 'vs. Blue Orange Spring Game',
'uploader': 'www.hudl.com',
},
},
] ]
def report_following_redirect(self, new_url): def report_following_redirect(self, new_url):
@ -1868,7 +1891,7 @@ class GenericIE(InfoExtractor):
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
mobj = re.search( mobj = re.search(
r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"', r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"',
webpage) webpage)
if mobj is not None: if mobj is not None:
return self.url_result(mobj.group('url'), 'Livestream') return self.url_result(mobj.group('url'), 'Livestream')
@ -2111,7 +2134,7 @@ class GenericIE(InfoExtractor):
raise UnsupportedError(url) raise UnsupportedError(url)
entries = [] entries = []
for video_url in found: for video_url in orderedSet(found):
video_url = unescapeHTML(video_url) video_url = unescapeHTML(video_url)
video_url = video_url.replace('\\/', '/') video_url = video_url.replace('\\/', '/')
video_url = compat_urlparse.urljoin(url, video_url) video_url = compat_urlparse.urljoin(url, video_url)

View File

@ -14,6 +14,7 @@ class GrouponIE(InfoExtractor):
'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors', 'description': 'Studio kept at 105 degrees and 40% humidity with anti-microbial and anti-slip Flotex flooring; certified instructors',
}, },
'playlist': [{ 'playlist': [{
'md5': '42428ce8a00585f9bc36e49226eae7a1',
'info_dict': { 'info_dict': {
'id': 'fk6OhWpXgIQ', 'id': 'fk6OhWpXgIQ',
'ext': 'mp4', 'ext': 'mp4',
@ -24,10 +25,11 @@ class GrouponIE(InfoExtractor):
'uploader_id': 'groupon', 'uploader_id': 'groupon',
'uploader': 'Groupon', 'uploader': 'Groupon',
}, },
'add_ie': ['Youtube'],
}], }],
'params': { 'params': {
'skip_download': True, 'skip_download': True,
} },
} }
_PROVIDERS = { _PROVIDERS = {

View File

@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
_TEST = { _TEST = {
'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', 'url': 'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
'md5': '8b743df908c42f60cf6496586c7f12c3', 'md5': '7d45932269a288149483144f01b99789',
'info_dict': { 'info_dict': {
'id': '390161', 'id': '390161',
'ext': 'mp4', 'ext': 'mp4',
@ -19,9 +19,9 @@ class HowcastIE(InfoExtractor):
'duration': 56.823, 'duration': 56.823,
}, },
'params': { 'params': {
# m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['Ooyala'],
} }
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -7,48 +7,53 @@ from .common import InfoExtractor
from ..compat import compat_urlparse from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
int_or_none,
remove_end,
unified_strdate,
ExtractorError, ExtractorError,
int_or_none,
parse_iso8601,
remove_end,
) )
class LifeNewsIE(InfoExtractor): class LifeNewsIE(InfoExtractor):
IE_NAME = 'lifenews' IE_NAME = 'life'
IE_DESC = 'LIFE | NEWS' IE_DESC = 'Life.ru'
_VALID_URL = r'https?://lifenews\.ru/(?:mobile/)?(?P<section>news|video)/(?P<id>\d+)' _VALID_URL = r'https?://life\.ru/t/[^/]+/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
# single video embedded via video/source # single video embedded via video/source
'url': 'http://lifenews.ru/news/98736', 'url': 'https://life.ru/t/новости/98736',
'md5': '77c95eaefaca216e32a76a343ad89d23', 'md5': '77c95eaefaca216e32a76a343ad89d23',
'info_dict': { 'info_dict': {
'id': '98736', 'id': '98736',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Мужчина нашел дома архив оборонного завода', 'title': 'Мужчина нашел дома архив оборонного завода',
'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26', 'description': 'md5:3b06b1b39b5e2bea548e403d99b8bf26',
'timestamp': 1344154740,
'upload_date': '20120805', 'upload_date': '20120805',
'view_count': int,
} }
}, { }, {
# single video embedded via iframe # single video embedded via iframe
'url': 'http://lifenews.ru/news/152125', 'url': 'https://life.ru/t/новости/152125',
'md5': '77d19a6f0886cd76bdbf44b4d971a273', 'md5': '77d19a6f0886cd76bdbf44b4d971a273',
'info_dict': { 'info_dict': {
'id': '152125', 'id': '152125',
'ext': 'mp4', 'ext': 'mp4',
'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ', 'title': 'В Сети появилось видео захвата «Правым сектором» колхозных полей ',
'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ', 'description': 'Жители двух поселков Днепропетровской области не простили радикалам угрозу лишения плодородных земель и пошли в лобовую. ',
'timestamp': 1427961840,
'upload_date': '20150402', 'upload_date': '20150402',
'view_count': int,
} }
}, { }, {
# two videos embedded via iframe # two videos embedded via iframe
'url': 'http://lifenews.ru/news/153461', 'url': 'https://life.ru/t/новости/153461',
'info_dict': { 'info_dict': {
'id': '153461', 'id': '153461',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
'upload_date': '20150505', 'timestamp': 1430825520,
'view_count': int,
}, },
'playlist': [{ 'playlist': [{
'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795', 'md5': '9b6ef8bc0ffa25aebc8bdb40d89ab795',
@ -57,6 +62,7 @@ class LifeNewsIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 1)',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
'timestamp': 1430825520,
'upload_date': '20150505', 'upload_date': '20150505',
}, },
}, { }, {
@ -66,22 +72,25 @@ class LifeNewsIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)', 'title': 'В Москве спасли потерявшегося медвежонка, который спрятался на дереве (Видео 2)',
'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.', 'description': 'Маленький хищник не смог найти дорогу домой и обрел временное убежище на тополе недалеко от жилого массива, пока его не нашла соседская собака.',
'timestamp': 1430825520,
'upload_date': '20150505', 'upload_date': '20150505',
}, },
}], }],
}, { }, {
'url': 'http://lifenews.ru/video/13035', 'url': 'https://life.ru/t/новости/213035',
'only_matching': True,
}, {
'url': 'https://life.ru/t/%D0%BD%D0%BE%D0%B2%D0%BE%D1%81%D1%82%D0%B8/153461',
'only_matching': True,
}, {
'url': 'https://life.ru/t/новости/411489/manuel_vals_nazval_frantsiiu_tsieliu_nomier_odin_dlia_ighil',
'only_matching': True, 'only_matching': True,
}] }]
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) video_id = self._match_id(url)
video_id = mobj.group('id')
section = mobj.group('section')
webpage = self._download_webpage( webpage = self._download_webpage(url, video_id)
'http://lifenews.ru/%s/%s' % (section, video_id),
video_id, 'Downloading page')
video_urls = re.findall( video_urls = re.findall(
r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage)
@ -95,26 +104,22 @@ class LifeNewsIE(InfoExtractor):
title = remove_end( title = remove_end(
self._og_search_title(webpage), self._og_search_title(webpage),
' - Первый по срочным новостям — LIFE | NEWS') ' - Life.ru')
description = self._og_search_description(webpage) description = self._og_search_description(webpage)
view_count = self._html_search_regex( view_count = self._html_search_regex(
r'<div class=\'views\'>\s*(\d+)\s*</div>', webpage, 'view count', fatal=False) r'<div[^>]+class=(["\']).*?\bhits-count\b.*?\1[^>]*>\s*(?P<value>\d+)\s*</div>',
comment_count = self._html_search_regex( webpage, 'view count', fatal=False, group='value')
r'=\'commentCount\'[^>]*>\s*(\d+)\s*<',
webpage, 'comment count', fatal=False)
upload_date = self._html_search_regex( timestamp = parse_iso8601(self._search_regex(
r'<time[^>]*datetime=\'([^\']+)\'', webpage, 'upload date', fatal=False) r'<time[^>]+datetime=(["\'])(?P<value>.+?)\1',
if upload_date is not None: webpage, 'upload date', fatal=False, group='value'))
upload_date = unified_strdate(upload_date)
common_info = { common_info = {
'description': description, 'description': description,
'view_count': int_or_none(view_count), 'view_count': int_or_none(view_count),
'comment_count': int_or_none(comment_count), 'timestamp': timestamp,
'upload_date': upload_date,
} }
def make_entry(video_id, video_url, index=None): def make_entry(video_id, video_url, index=None):
@ -183,7 +188,8 @@ class LifeEmbedIE(InfoExtractor):
ext = determine_ext(video_url) ext = determine_ext(video_url)
if ext == 'm3u8': if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats( formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', m3u8_id='m3u8')) video_url, video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id='m3u8'))
else: else:
formats.append({ formats.append({
'url': video_url, 'url': video_url,

View File

@ -150,7 +150,7 @@ class LivestreamIE(InfoExtractor):
} }
def _extract_stream_info(self, stream_info): def _extract_stream_info(self, stream_info):
broadcast_id = stream_info['broadcast_id'] broadcast_id = compat_str(stream_info['broadcast_id'])
is_live = stream_info.get('is_live') is_live = stream_info.get('is_live')
formats = [] formats = []

View File

@ -8,6 +8,7 @@ from ..utils import (
float_or_none, float_or_none,
ExtractorError, ExtractorError,
unsmuggle_url, unsmuggle_url,
determine_ext,
) )
from ..compat import compat_urllib_parse_urlencode from ..compat import compat_urllib_parse_urlencode
@ -15,71 +16,80 @@ from ..compat import compat_urllib_parse_urlencode
class OoyalaBaseIE(InfoExtractor): class OoyalaBaseIE(InfoExtractor):
_PLAYER_BASE = 'http://player.ooyala.com/' _PLAYER_BASE = 'http://player.ooyala.com/'
_CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
_AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v1/authorization/embed_code/%s/%s?' _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?'
def _extract(self, content_tree_url, video_id, domain='example.org'): def _extract(self, content_tree_url, video_id, domain='example.org'):
content_tree = self._download_json(content_tree_url, video_id)['content_tree'] content_tree = self._download_json(content_tree_url, video_id)['content_tree']
metadata = content_tree[list(content_tree)[0]] metadata = content_tree[list(content_tree)[0]]
embed_code = metadata['embed_code'] embed_code = metadata['embed_code']
pcode = metadata.get('asset_pcode') or embed_code pcode = metadata.get('asset_pcode') or embed_code
video_info = { title = metadata['title']
'id': embed_code,
'title': metadata['title'], auth_data = self._download_json(
'description': metadata.get('description'), self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) +
'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'), compat_urllib_parse_urlencode({
'duration': float_or_none(metadata.get('duration'), 1000), 'domain': domain,
} 'supportedFormats': 'mp4,rtmp,m3u8,hds',
}), video_id)
cur_auth_data = auth_data['authorization_data'][embed_code]
urls = [] urls = []
formats = [] formats = []
for supported_format in ('mp4', 'm3u8', 'hds', 'rtmp'): if cur_auth_data['authorized']:
auth_data = self._download_json( for stream in cur_auth_data['streams']:
self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + s_url = base64.b64decode(
compat_urllib_parse_urlencode({ stream['url']['data'].encode('ascii')).decode('utf-8')
'domain': domain, if s_url in urls:
'supportedFormats': supported_format continue
}), urls.append(s_url)
video_id, 'Downloading %s JSON' % supported_format) ext = determine_ext(s_url, None)
delivery_type = stream['delivery_type']
cur_auth_data = auth_data['authorization_data'][embed_code] if delivery_type == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
if cur_auth_data['authorized']: s_url, embed_code, 'mp4', 'm3u8_native',
for stream in cur_auth_data['streams']: m3u8_id='hls', fatal=False))
url = base64.b64decode( elif delivery_type == 'hds' or ext == 'f4m':
stream['url']['data'].encode('ascii')).decode('utf-8') formats.extend(self._extract_f4m_formats(
if url in urls: s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
continue elif ext == 'smil':
urls.append(url) formats.extend(self._extract_smil_formats(
delivery_type = stream['delivery_type'] s_url, embed_code, fatal=False))
if delivery_type == 'hls' or '.m3u8' in url: else:
formats.extend(self._extract_m3u8_formats( formats.append({
url, embed_code, 'mp4', 'm3u8_native', 'url': s_url,
m3u8_id='hls', fatal=False)) 'ext': ext or stream.get('delivery_type'),
elif delivery_type == 'hds' or '.f4m' in url: 'vcodec': stream.get('video_codec'),
formats.extend(self._extract_f4m_formats( 'format_id': delivery_type,
url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) 'width': int_or_none(stream.get('width')),
elif '.smil' in url: 'height': int_or_none(stream.get('height')),
formats.extend(self._extract_smil_formats( 'abr': int_or_none(stream.get('audio_bitrate')),
url, embed_code, fatal=False)) 'vbr': int_or_none(stream.get('video_bitrate')),
else: 'fps': float_or_none(stream.get('framerate')),
formats.append({ })
'url': url, else:
'ext': stream.get('delivery_type'), raise ExtractorError('%s said: %s' % (
'vcodec': stream.get('video_codec'), self.IE_NAME, cur_auth_data['message']), expected=True)
'format_id': delivery_type,
'width': int_or_none(stream.get('width')),
'height': int_or_none(stream.get('height')),
'abr': int_or_none(stream.get('audio_bitrate')),
'vbr': int_or_none(stream.get('video_bitrate')),
'fps': float_or_none(stream.get('framerate')),
})
else:
raise ExtractorError('%s said: %s' % (
self.IE_NAME, cur_auth_data['message']), expected=True)
self._sort_formats(formats) self._sort_formats(formats)
video_info['formats'] = formats subtitles = {}
return video_info for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items():
sub_url = sub.get('url')
if not sub_url:
continue
subtitles[lang] = [{
'url': sub_url,
}]
return {
'id': embed_code,
'title': title,
'description': metadata.get('description'),
'thumbnail': metadata.get('thumbnail_image') or metadata.get('promo_image'),
'duration': float_or_none(metadata.get('duration'), 1000),
'subtitles': subtitles,
'formats': formats,
}
class OoyalaIE(OoyalaBaseIE): class OoyalaIE(OoyalaBaseIE):

View File

@ -0,0 +1,130 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
xpath_text,
find_xpath_attr,
determine_ext,
int_or_none,
unified_strdate,
xpath_element,
ExtractorError,
)
class RadioCanadaIE(InfoExtractor):
    """Extractor for media referenced by an app code and a numeric media id.

    Matches either a ``radiocanada:<app_code>:<id>`` pseudo-URL or an
    ``ici.radio-canada.ca/widgets/mediaconsole/<app_code>/<id>`` URL.
    """
    IE_NAME = 'radiocanada'
    _VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
        'info_dict': {
            'id': '7184272',
            'ext': 'flv',
            'title': 'Le parcours du tireur capté sur vidéo',
            'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
            'upload_date': '20141023',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the media identified by ``url``.

        The validation service is queried for the stream URL and the
        per-bitrate variants; the metaMedia service is queried for the
        descriptive metadata.

        Raises:
            ExtractorError: (expected) when the validation service answers
                with the literal URL ``'null'``; the service's ``message``
                element is included in the error text.
        """
        app_code, video_id = re.match(self._VALID_URL, url).groups()

        formats = []
        # TODO: extract m3u8 and f4m formats
        # m3u8 formats can be extracted using the ipad device_type, but a 403
        # error code is returned when ffmpeg tries to download the segments
        # f4m formats can be extracted using the flashhd device_type, but they
        # produce an unplayable file
        for device_type in ('flash',):
            v_data = self._download_xml(
                'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx',
                video_id, note='Downloading %s XML' % device_type, query={
                    'appCode': app_code,
                    'idMedia': video_id,
                    'connectionType': 'broadband',
                    'multibitrate': 'true',
                    'deviceType': device_type,
                    # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction
                    'paysJ391wsHjbOJwvCs26toz': 'CA',
                    'bypasslock': 'NZt5K62gRqfc',
                })
            v_url = xpath_text(v_data, 'url')
            if not v_url:
                continue
            if v_url == 'null':
                raise ExtractorError('%s said: %s' % (
                    self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
            ext = determine_ext(v_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    v_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
            elif ext == 'f4m':
                formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False))
            else:
                bitrates = xpath_element(v_data, 'bitrates')
                # NOTE(review): xpath_element is called without fatal=True, so
                # it presumably yields None when <bitrates> is absent; skip
                # instead of failing with AttributeError on .findall.
                if bitrates is None:
                    continue
                for url_e in bitrates.findall('url'):
                    tbr = int_or_none(url_e.get('bitrate'))
                    if not tbr:
                        continue
                    formats.append({
                        'format_id': 'rtmp-%d' % tbr,
                        # Swap the bitrate embedded in the file name of v_url
                        # for the bitrate of this variant.
                        'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url),
                        'ext': 'flv',
                        'protocol': 'rtmp',
                        'width': int_or_none(url_e.get('width')),
                        'height': int_or_none(url_e.get('height')),
                        'tbr': tbr,
                    })
        self._sort_formats(formats)

        metadata = self._download_xml(
            'http://api.radio-canada.ca/metaMedia/v1/index.ashx',
            video_id, note='Downloading metadata XML', query={
                'appCode': app_code,
                'idMedia': video_id,
            })

        def get_meta(name):
            """Return the text of the <Meta name=...> element, or None if absent."""
            el = find_xpath_attr(metadata, './/Meta', 'name', name)
            return el.text if el is not None else None

        return {
            'id': video_id,
            'title': get_meta('Title'),
            'description': get_meta('Description') or get_meta('ShortDescription'),
            'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
            'duration': int_or_none(get_meta('length')),
            'series': get_meta('Emission'),
            # Look the values up in the metadata; passing the bare key names
            # to int_or_none always produced None.
            'season_number': int_or_none(get_meta('SrcSaison')),
            'episode_number': int_or_none(get_meta('SrcEpisode')),
            'upload_date': unified_strdate(get_meta('Date')),
            'formats': formats,
        }
class RadioCanadaAudioVideoIE(InfoExtractor):
    # Front-end for ici.radio-canada.ca/audio-video/media-<id> pages: it only
    # rewrites the URL into the 'radiocanada:medianet:<id>' pseudo-URL form.

    # Previously a bare string literal stood here, which Python treats as the
    # class docstring; the extractor name was therefore never assigned.
    IE_NAME = 'radiocanada:audiovideo'
    _VALID_URL = r'https?://ici\.radio-canada\.ca/audio-video/media-(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam',
        'info_dict': {
            'id': '7527184',
            'ext': 'flv',
            'title': 'Barack Obama au Vietnam',
            'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam',
            'upload_date': '20160523',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Return a url_result pointing at 'radiocanada:medianet:<id>'."""
        return self.url_result('radiocanada:medianet:%s' % self._match_id(url))

View File

@ -0,0 +1,69 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
js_to_json,
int_or_none,
unescapeHTML,
)
class ReutersIE(InfoExtractor):
    """Extractor for reuters.com URLs carrying a ``videoId`` query parameter.

    The player iframe is downloaded, the argument of
    ``Reuters.yovideo.drawPlayer(...)`` is pulled out of it, and the ids found
    in its ``flv`` value are used to query the yospace ``mas`` endpoint for
    the list of available renditions.
    """
    _VALID_URL = r'https?://(?:www\.)?reuters\.com/.*?\?.*?videoId=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.reuters.com/video/2016/05/20/san-francisco-police-chief-resigns?videoId=368575562',
        'md5': '8015113643a0b12838f160b0b81cc2ee',
        'info_dict': {
            'id': '368575562',
            'ext': 'mp4',
            'title': 'San Francisco police chief resigns',
        }
    }

    def _real_extract(self, url):
        """Build the info dict (id, title, thumbnail, duration, formats) for *url*."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://www.reuters.com/assets/iframe/yovideo?videoId=%s' % video_id, video_id)
        # The player configuration is kept as text (after js_to_json) and
        # queried field by field with regexes rather than parsed as a whole.
        video_data = js_to_json(self._search_regex(
            r'(?s)Reuters\.yovideo\.drawPlayer\(({.*?})\);',
            webpage, 'video data'))

        def get_json_value(key, fatal=False):
            # Raw string: '\s' is not a valid escape in a normal string
            # literal and triggers a warning on current Python versions.
            return self._search_regex(r'"%s"\s*:\s*"([^"]+)"' % key, video_data, key, fatal=fatal)

        title = unescapeHTML(get_json_value('title', fatal=True))
        # The two numeric ids needed by the mas endpoint are embedded in the
        # 'flv' value as ",/<mmid>?f=<fid>".
        mmid, fid = re.search(r',/(\d+)\?f=(\d+)', get_json_value('flv', fatal=True)).groups()

        mas_data = self._download_json(
            'http://mas-e.cds1.yospace.com/mas/%s/%s?trans=json' % (mmid, fid),
            video_id, transform_source=js_to_json)
        formats = []
        for f in mas_data:
            f_url = f.get('url')
            if not f_url:
                continue
            method = f.get('method')
            if method == 'hls':
                formats.extend(self._extract_m3u8_formats(
                    f_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
            else:
                container = f.get('container')
                # 'mobile' entries are labelled 3gp and carry no container;
                # every other entry uses its container as ext and format id.
                ext = '3gp' if method == 'mobile' else container
                formats.append({
                    'format_id': ext,
                    'url': f_url,
                    'ext': ext,
                    'container': container if method != 'mobile' else None,
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': get_json_value('thumb'),
            'duration': int_or_none(get_json_value('seconds')),
            'formats': formats,
        }

View File

@ -11,6 +11,7 @@ class TeachingChannelIE(InfoExtractor):
_TEST = { _TEST = {
'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
'md5': '3d6361864d7cac20b57c8784da17166f',
'info_dict': { 'info_dict': {
'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
'ext': 'mp4', 'ext': 'mp4',
@ -19,9 +20,9 @@ class TeachingChannelIE(InfoExtractor):
'duration': 422.255, 'duration': 422.255,
}, },
'params': { 'params': {
# m3u8 download
'skip_download': True, 'skip_download': True,
}, },
'add_ie': ['Ooyala'],
} }
def _real_extract(self, url): def _real_extract(self, url):

View File

@ -6,7 +6,7 @@ from .common import InfoExtractor
class TF1IE(InfoExtractor): class TF1IE(InfoExtractor):
"""TF1 uses the wat.tv player.""" """TF1 uses the wat.tv player."""
_VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|www\.tfou)\.fr/(?:[^/]+/)*(?P<id>.+?)\.html' _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
'info_dict': { 'info_dict': {
@ -48,6 +48,6 @@ class TF1IE(InfoExtractor):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
wat_id = self._html_search_regex( wat_id = self._html_search_regex(
r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:#.*?)?\1', r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})(?:.*?)?\1',
webpage, 'wat id', group='id') webpage, 'wat id', group='id')
return self.url_result('wat:%s' % wat_id, 'Wat') return self.url_result('wat:%s' % wat_id, 'Wat')

View File

@ -37,6 +37,7 @@ class VeohIE(InfoExtractor):
'uploader': 'afp-news', 'uploader': 'afp-news',
'duration': 123, 'duration': 123,
}, },
'skip': 'This video has been deleted.',
}, },
{ {
'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX', 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',

View File

@ -11,12 +11,14 @@ class ViceIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
'md5': 'e9d77741f9e42ba583e683cd170660f7',
'info_dict': { 'info_dict': {
'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
'ext': 'flv', 'ext': 'flv',
'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
'duration': 725.983, 'duration': 725.983,
}, },
'add_ie': ['Ooyala'],
}, { }, {
'url': 'http://www.vice.com/video/how-to-hack-a-car', 'url': 'http://www.vice.com/video/how-to-hack-a-car',
'md5': '6fb2989a3fed069fb8eab3401fc2d3c9', 'md5': '6fb2989a3fed069fb8eab3401fc2d3c9',
@ -29,6 +31,7 @@ class ViceIE(InfoExtractor):
'uploader': 'Motherboard', 'uploader': 'Motherboard',
'upload_date': '20140529', 'upload_date': '20140529',
}, },
'add_ie': ['Youtube'],
}, { }, {
'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
'only_matching': True, 'only_matching': True,

View File

@ -15,7 +15,8 @@ class VoxMediaIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Google\'s new material design direction', 'title': 'Google\'s new material design direction',
'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2', 'description': 'md5:2f44f74c4d14a1f800ea73e1c6832ad2',
} },
'add_ie': ['Ooyala'],
}, { }, {
# data-ooyala-id # data-ooyala-id
'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet',
@ -25,7 +26,8 @@ class VoxMediaIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'The Nexus 6: hands-on with Google\'s phablet', 'title': 'The Nexus 6: hands-on with Google\'s phablet',
'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af', 'description': 'md5:87a51fe95ff8cea8b5bdb9ac7ae6a6af',
} },
'add_ie': ['Ooyala'],
}, { }, {
# volume embed # volume embed
'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill',
@ -35,7 +37,8 @@ class VoxMediaIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'The new frontier of LGBTQ civil rights, explained', 'title': 'The new frontier of LGBTQ civil rights, explained',
'description': 'md5:0dc58e94a465cbe91d02950f770eb93f', 'description': 'md5:0dc58e94a465cbe91d02950f770eb93f',
} },
'add_ie': ['Ooyala'],
}, { }, {
# youtube embed # youtube embed
'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance', 'url': 'http://www.vox.com/2016/3/24/11291692/robot-dance',
@ -48,7 +51,8 @@ class VoxMediaIE(InfoExtractor):
'upload_date': '20160324', 'upload_date': '20160324',
'uploader_id': 'voxdotcom', 'uploader_id': 'voxdotcom',
'uploader': 'Vox', 'uploader': 'Vox',
} },
'add_ie': ['Youtube'],
}, { }, {
# SBN.VideoLinkset.entryGroup multiple ooyala embeds # SBN.VideoLinkset.entryGroup multiple ooyala embeds
'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@ -117,7 +121,7 @@ class VoxMediaIE(InfoExtractor):
volume_webpage = self._download_webpage( volume_webpage = self._download_webpage(
'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid) 'http://volume.vox-cdn.com/embed/%s' % volume_uuid, volume_uuid)
video_data = self._parse_json(self._search_regex( video_data = self._parse_json(self._search_regex(
r'Volume\.createVideo\(({.+})\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid) r'Volume\.createVideo\(({.+})\s*,\s*{.*}\s*,\s*\[.*\]\s*,\s*{.*}\);', volume_webpage, 'video data'), volume_uuid)
for provider_video_type in ('ooyala', 'youtube'): for provider_video_type in ('ooyala', 'youtube'):
provider_video_id = video_data.get('%s_id' % provider_video_type) provider_video_id = video_data.get('%s_id' % provider_video_type)
if provider_video_id: if provider_video_id:

View File

@ -11,7 +11,96 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor): class WashingtonPostIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])' IE_NAME = 'washingtonpost'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_TEST = {
'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
'info_dict': {
'id': '480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'ext': 'mp4',
'title': 'Egypt finds belongings, debris from plane crash',
'description': 'md5:a17ceee432f215a5371388c1f680bd86',
'upload_date': '20160520',
'uploader': 'Reuters',
'timestamp': 1463778452,
},
}
def _real_extract(self, url):
    """Build the info dict for a single Washington Post video.

    The ``videojson`` endpoint is queried with the id matched from *url*;
    the ``contentConfig`` of the first returned item supplies the title,
    metadata and the list of streams that are turned into formats.
    """
    video_id = self._match_id(url)
    video_data = self._download_json(
        'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id,
        video_id, transform_source=strip_jsonp)[0]['contentConfig']
    title = video_data['title']

    urls = []  # stream URLs already handled, used to drop duplicates
    formats = []
    for s in video_data.get('streams', []):
        s_url = s.get('url')
        if not s_url or s_url in urls:
            continue
        urls.append(s_url)
        video_type = s.get('type')
        if video_type == 'smil':
            continue
        elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url):
            m3u8_formats = self._extract_m3u8_formats(
                s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
            for m3u8_format in m3u8_formats:
                width = m3u8_format.get('width')
                if not width:
                    continue
                # The video bitrate is looked up in the variant URL as the
                # number following "<width>_<height>_".
                vbr = self._search_regex(
                    r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None)
                if vbr:
                    m3u8_format.update({
                        'vbr': int_or_none(vbr),
                    })
            formats.extend(m3u8_formats)
        else:
            width = int_or_none(s.get('width'))
            vbr = int_or_none(s.get('bitrate'))
            has_width = width != 0
            if width:
                format_id = '%s-%d' % (video_type, width)
                # The bitrate may be absent or unparsable; '%d' % None would
                # raise TypeError, so the suffix is added only when known.
                if vbr is not None:
                    format_id += '-%d' % vbr
            else:
                format_id = video_type
            formats.append({
                'format_id': format_id,
                'vbr': vbr if has_width else None,
                'width': width,
                'height': int_or_none(s.get('height')),
                'acodec': s.get('audioCodec'),
                'vcodec': s.get('videoCodec') if has_width else 'none',
                'filesize': int_or_none(s.get('fileSize')),
                'url': s_url,
                'ext': 'mp4',
                'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None,
            })
    source_media_url = video_data.get('sourceMediaURL')
    if source_media_url:
        formats.append({
            'format_id': 'source_media',
            'url': source_media_url,
        })
    self._sort_formats(
        formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id'))

    return {
        'id': video_id,
        'title': title,
        'description': video_data.get('blurb'),
        'uploader': video_data.get('credits', {}).get('source'),
        'formats': formats,
        'duration': int_or_none(video_data.get('videoDuration'), 100),
        'timestamp': int_or_none(
            video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000),
    }
class WashingtonPostArticleIE(InfoExtractor):
IE_NAME = 'washingtonpost:article'
_VALID_URL = r'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/', 'url': 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/',
'info_dict': { 'info_dict': {
@ -63,6 +152,10 @@ class WashingtonPostIE(InfoExtractor):
}] }]
}] }]
@classmethod
def suitable(cls, url):
    """Decline any URL that WashingtonPostIE accepts; otherwise defer to the base check."""
    if WashingtonPostIE.suitable(url):
        return False
    return super(WashingtonPostArticleIE, cls).suitable(url)
def _real_extract(self, url): def _real_extract(self, url):
page_id = self._match_id(url) page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id) webpage = self._download_webpage(url, page_id)
@ -74,54 +167,7 @@ class WashingtonPostIE(InfoExtractor):
<div\s+class="posttv-video-embed[^>]*?data-uuid=| <div\s+class="posttv-video-embed[^>]*?data-uuid=|
data-video-uuid= data-video-uuid=
)"([^"]+)"''', webpage) )"([^"]+)"''', webpage)
entries = [] entries = [self.url_result('washingtonpost:%s' % uuid, 'WashingtonPost', uuid) for uuid in uuids]
for i, uuid in enumerate(uuids, start=1):
vinfo_all = self._download_json(
'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % uuid,
page_id,
transform_source=strip_jsonp,
note='Downloading information of video %d/%d' % (i, len(uuids))
)
vinfo = vinfo_all[0]['contentConfig']
uploader = vinfo.get('credits', {}).get('source')
timestamp = int_or_none(
vinfo.get('dateConfig', {}).get('dateFirstPublished'), 1000)
formats = [{
'format_id': (
'%s-%s-%s' % (s.get('type'), s.get('width'), s.get('bitrate'))
if s.get('width')
else s.get('type')),
'vbr': s.get('bitrate') if s.get('width') != 0 else None,
'width': s.get('width'),
'height': s.get('height'),
'acodec': s.get('audioCodec'),
'vcodec': s.get('videoCodec') if s.get('width') != 0 else 'none',
'filesize': s.get('fileSize'),
'url': s.get('url'),
'ext': 'mp4',
'preference': -100 if s.get('type') == 'smil' else None,
'protocol': {
'MP4': 'http',
'F4F': 'f4m',
}.get(s.get('type')),
} for s in vinfo.get('streams', [])]
source_media_url = vinfo.get('sourceMediaURL')
if source_media_url:
formats.append({
'format_id': 'source_media',
'url': source_media_url,
})
self._sort_formats(formats)
entries.append({
'id': uuid,
'title': vinfo['title'],
'description': vinfo.get('blurb'),
'uploader': uploader,
'formats': formats,
'duration': int_or_none(vinfo.get('videoDuration'), 100),
'timestamp': timestamp,
})
return { return {
'_type': 'playlist', '_type': 'playlist',

View File

@ -2,25 +2,26 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import hashlib
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
ExtractorError, ExtractorError,
unified_strdate, unified_strdate,
HEADRequest,
float_or_none,
) )
class WatIE(InfoExtractor): class WatIE(InfoExtractor):
_VALID_URL = r'(?:wat:(?P<real_id>\d{8})|https?://www\.wat\.tv/video/(?P<display_id>.*)-(?P<short_id>.*?)_.*?\.html)' _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)'
IE_NAME = 'wat.tv' IE_NAME = 'wat.tv'
_TESTS = [ _TESTS = [
{ {
'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html',
'md5': 'ce70e9223945ed26a8056d413ca55dc9', 'md5': '83d882d9de5c9d97f0bb2c6273cde56a',
'info_dict': { 'info_dict': {
'id': '11713067', 'id': '11713067',
'display_id': 'soupe-figues-l-orange-aux-epices',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Soupe de figues à l\'orange et aux épices', 'title': 'Soupe de figues à l\'orange et aux épices',
'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.',
@ -33,7 +34,6 @@ class WatIE(InfoExtractor):
'md5': 'fbc84e4378165278e743956d9c1bf16b', 'md5': 'fbc84e4378165278e743956d9c1bf16b',
'info_dict': { 'info_dict': {
'id': '11713075', 'id': '11713075',
'display_id': 'gregory-lemarchal-voix-ange',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)',
'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3',
@ -44,96 +44,85 @@ class WatIE(InfoExtractor):
}, },
] ]
def download_video_info(self, real_id): def _real_extract(self, url):
video_id = self._match_id(url)
video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36))
# 'contentv4' is used in the website, but it also returns the related # 'contentv4' is used in the website, but it also returns the related
# videos, we don't need them # videos, we don't need them
info = self._download_json('http://www.wat.tv/interface/contentv3/' + real_id, real_id) video_info = self._download_json(
return info['media'] 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media']
def _real_extract(self, url):
def real_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('display_id')
real_id = mobj.group('real_id')
if not real_id:
short_id = mobj.group('short_id')
webpage = self._download_webpage(url, display_id or short_id)
real_id = self._search_regex(r'xtpage = ".*-(.*?)";', webpage, 'real id')
video_info = self.download_video_info(real_id)
error_desc = video_info.get('error_desc') error_desc = video_info.get('error_desc')
if error_desc: if error_desc:
raise ExtractorError( raise ExtractorError(
'%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True)
geo_list = video_info.get('geoList')
country = geo_list[0] if geo_list else ''
chapters = video_info['chapters'] chapters = video_info['chapters']
first_chapter = chapters[0] first_chapter = chapters[0]
files = video_info['files']
first_file = files[0]
if real_id_for_chapter(first_chapter) != real_id: def video_id_for_chapter(chapter):
return chapter['tc_start'].split('-')[0]
if video_id_for_chapter(first_chapter) != video_id:
self.to_screen('Multipart video detected') self.to_screen('Multipart video detected')
chapter_urls = [] entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters]
for chapter in chapters: return self.playlist_result(entries, video_id, video_info['title'])
chapter_id = real_id_for_chapter(chapter)
# Yes, when we this chapter is processed by WatIE,
# it will download the info again
chapter_info = self.download_video_info(chapter_id)
chapter_urls.append(chapter_info['url'])
entries = [self.url_result(chapter_url) for chapter_url in chapter_urls]
return self.playlist_result(entries, real_id, video_info['title'])
upload_date = None
if 'date_diffusion' in first_chapter:
upload_date = unified_strdate(first_chapter['date_diffusion'])
# Otherwise we can continue and extract just one part, we have to use # Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url # the video id for getting the video url
formats = [{ date_diffusion = first_chapter.get('date_diffusion')
'url': 'http://wat.tv/get/android5/%s.mp4' % real_id, upload_date = unified_strdate(date_diffusion) if date_diffusion else None
'format_id': 'Mobile',
}]
fmts = [('SD', 'web')] def extract_url(path_template, url_type):
if first_file.get('hasHD'): req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id)
fmts.append(('HD', 'webhd')) head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type)
red_url = head.geturl()
if req_url == red_url:
raise ExtractorError(
'%s said: Sorry, this video is not available from your country.' % self.IE_NAME,
expected=True)
return red_url
def compute_token(param): m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8')
timestamp = '%08x' % int(self._download_webpage( http_url = extract_url('android5/%s.mp4', 'http')
'http://www.wat.tv/servertime', real_id,
'Downloading server time').split('|')[0])
magic = '9b673b13fa4682ed14c3cfa5af5310274b514c4133e9b3a81e6e3aba009l2564'
return '%s/%s' % (hashlib.md5((magic + param + timestamp).encode('ascii')).hexdigest(), timestamp)
for fmt in fmts: formats = []
webid = '/%s/%s' % (fmt[1], real_id) m3u8_formats = self._extract_m3u8_formats(
video_url = self._download_webpage( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
'http://www.wat.tv/get%s?token=%s&getURL=1&country=%s' % (webid, compute_token(webid), country), formats.extend(m3u8_formats)
real_id, formats.extend(self._extract_f4m_formats(
'Downloading %s video URL' % fmt[0], m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'),
'Failed to download %s video URL' % fmt[0], video_id, f4m_id='hds', fatal=False))
False) for m3u8_format in m3u8_formats:
if not video_url: mobj = re.search(
r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url'])
if not mobj:
continue continue
formats.append({ abr, vbr = mobj.groups()
'url': video_url, abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
'ext': 'mp4', m3u8_format.update({
'format_id': fmt[0], 'vbr': vbr,
'abr': abr,
}) })
if not vbr or not abr:
continue
f = m3u8_format.copy()
f.update({
'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url),
'format_id': f['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
formats.append(f)
self._sort_formats(formats)
return { return {
'id': real_id, 'id': video_id,
'display_id': display_id,
'title': first_chapter['title'], 'title': first_chapter['title'],
'thumbnail': first_chapter['preview'], 'thumbnail': first_chapter['preview'],
'description': first_chapter['description'], 'description': first_chapter['description'],
'view_count': video_info['views'], 'view_count': video_info['views'],
'upload_date': upload_date, 'upload_date': upload_date,
'duration': first_file['duration'], 'duration': video_info['files'][0]['duration'],
'formats': formats, 'formats': formats,
} }

View File

@ -12,37 +12,52 @@ from ..utils import (
class XHamsterIE(InfoExtractor): class XHamsterIE(InfoExtractor):
_VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.+?)\.html(?:\?.*)?' _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?'
_TESTS = [ _TESTS = [{
{ 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', 'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': { 'info_dict': {
'id': '1509445', 'id': '1509445',
'ext': 'mp4', 'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait', 'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014', 'upload_date': '20121014',
'uploader': 'Ruseful2011', 'uploader': 'Ruseful2011',
'duration': 893.52, 'duration': 893.52,
'age_limit': 18, 'age_limit': 18,
}
}, },
{ }, {
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'info_dict': { 'info_dict': {
'id': '2221348', 'id': '2221348',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Britney Spears Sexy Booty', 'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914', 'upload_date': '20130914',
'uploader': 'jojo747400', 'uploader': 'jojo747400',
'duration': 200.48, 'duration': 200.48,
'age_limit': 18, 'age_limit': 18,
}
}, },
{ 'params': {
'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', 'skip_download': True,
'only_matching': True,
}, },
] }, {
# empty seo
'url': 'http://xhamster.com/movies/5667973/.html',
'info_dict': {
'id': '5667973',
'ext': 'mp4',
'title': '....',
'upload_date': '20160208',
'uploader': 'parejafree',
'duration': 72.0,
'age_limit': 18,
},
'params': {
'skip_download': True,
},
}, {
'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html',
'only_matching': True,
}]
def _real_extract(self, url): def _real_extract(self, url):
def extract_video_url(webpage, name): def extract_video_url(webpage, name):
@ -170,7 +185,7 @@ class XHamsterEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
video_url = self._search_regex( video_url = self._search_regex(
r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, r'href="(https?://xhamster\.com/movies/%s/[^"]*\.html[^"]*)"' % video_id,
webpage, 'xhamster url', default=None) webpage, 'xhamster url', default=None)
if not video_url: if not video_url: