Merge pull request #3 from rg3/master

update
siddht1 · 2016-03-19 19:25:00 +05:30 · commit e58df5787a
27 changed files with 620 additions and 154 deletions


@@ -81,6 +81,7 @@
 - **BokeCC**
 - **Bpb**: Bundeszentrale für politische Bildung
 - **BR**: Bayerischer Rundfunk Mediathek
+- **BravoTV**
 - **Break**
 - **brightcove:legacy**
 - **brightcove:new**
@@ -499,6 +500,7 @@
 - **Restudy**
 - **ReverbNation**
 - **Revision3**
+- **RICE**
 - **RingTV**
 - **RottenTomatoes**
 - **Roxwel**
@@ -617,6 +619,7 @@
 - **ThePlatform**
 - **ThePlatformFeed**
 - **TheSixtyOne**
+- **TheStar**
 - **ThisAmericanLife**
 - **ThisAV**
 - **THVideo**
@@ -650,6 +653,7 @@
 - **tv.dfb.de**
 - **TV2**
 - **TV2Article**
+- **TV3**
 - **TV4**: tv4.se and tv4play.se
 - **TVC**
 - **TVCArticle**


@@ -222,6 +222,11 @@ class TestFormatSelection(unittest.TestCase):
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['format_id'], 'dash-video-low')

+        ydl = YDL({'format': 'bestvideo[format_id^=dash][format_id$=low]'})
+        ydl.process_ie_result(info_dict.copy())
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(downloaded['format_id'], 'dash-video-low')
+
         formats = [
             {'format_id': 'vid-vcodec-dot', 'ext': 'mp4', 'preference': 1, 'vcodec': 'avc1.123456', 'acodec': 'none', 'url': TEST_URL},
         ]


@@ -28,6 +28,7 @@ from youtube_dl.utils import (
     encodeFilename,
     escape_rfc3986,
     escape_url,
+    extract_attributes,
     ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
@@ -77,6 +78,7 @@ from youtube_dl.utils import (
     cli_bool_option,
 )
 from youtube_dl.compat import (
+    compat_chr,
     compat_etree_fromstring,
     compat_urlparse,
     compat_parse_qs,
@@ -629,6 +631,44 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{"abc": "def",}')
         self.assertEqual(json.loads(on), {'abc': 'def'})

+    def test_extract_attributes(self):
+        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
+        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
+        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
+        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
+        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
+        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
+        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
+        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x >'), {'x': None})
+        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
+        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
+        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
+        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
+        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
+        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
+        self.assertEqual(extract_attributes('<e x="décompose&#769;">'), {'x': 'décompose\u0301'})
+        # "Narrow" Python builds don't support unicode code points outside BMP.
+        try:
+            compat_chr(0x10000)
+            supports_outside_bmp = True
+        except ValueError:
+            supports_outside_bmp = False
+        if supports_outside_bmp:
+            self.assertEqual(extract_attributes('<e x="Smile &#128512;!">'), {'x': 'Smile \U0001f600!'})
+
     def test_clean_html(self):
         self.assertEqual(clean_html('a:\nb'), 'a: b')
         self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
@@ -662,6 +702,8 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(parse_count('1.000'), 1000)
         self.assertEqual(parse_count('1.1k'), 1100)
         self.assertEqual(parse_count('1.1kk'), 1100000)
+        self.assertEqual(parse_count('1.1kk '), 1100000)
+        self.assertEqual(parse_count('1.1kk views'), 1100000)

     def test_version_tuple(self):
         self.assertEqual(version_tuple('1'), (1,))


@@ -905,7 +905,7 @@ class YoutubeDL(object):
             '*=': lambda attr, value: value in attr,
         }
         str_operator_rex = re.compile(r'''(?x)
-            \s*(?P<key>ext|acodec|vcodec|container|protocol)
+            \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
             \s*(?P<value>[a-zA-Z0-9._-]+)
             \s*$
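A minimal usage sketch (self-contained, not part of the diff; the URL is hypothetical): with format_id added to the string-operator keys, ^= (prefix), $= (suffix) and *= (substring) filters now apply to format IDs, as the new test above exercises.

    from youtube_dl import YoutubeDL

    ydl = YoutubeDL({'format': 'bestvideo[format_id^=dash][format_id$=low]'})
    # ydl.download(['https://example.com/video'])  # hypothetical URL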


@@ -77,6 +77,11 @@ try:
 except ImportError:  # Python 2
     from urllib import urlretrieve as compat_urlretrieve

+try:
+    from html.parser import HTMLParser as compat_HTMLParser
+except ImportError:  # Python 2
+    from HTMLParser import HTMLParser as compat_HTMLParser
+
 try:
     from subprocess import DEVNULL
@@ -251,6 +256,16 @@ else:
         el.text = el.text.decode('utf-8')
         return doc

+if sys.version_info < (2, 7):
+    # Here comes the crazy part: In 2.6, if the xpath is a unicode,
+    # .//node does not match if a node is a direct child of . !
+    def compat_xpath(xpath):
+        if isinstance(xpath, compat_str):
+            xpath = xpath.encode('ascii')
+        return xpath
+else:
+    compat_xpath = lambda xpath: xpath
+
 try:
     from urllib.parse import parse_qs as compat_parse_qs
 except ImportError:  # Python 2
@@ -543,6 +558,7 @@ else:
     from tokenize import generate_tokens as compat_tokenize_tokenize

 __all__ = [
+    'compat_HTMLParser',
     'compat_HTTPError',
     'compat_basestring',
     'compat_chr',
@@ -579,6 +595,7 @@ __all__ = [
     'compat_urlparse',
     'compat_urlretrieve',
     'compat_xml_parse_error',
+    'compat_xpath',
     'shlex_quote',
     'subprocess_check_output',
     'workaround_optparse_bug9161',
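A minimal sketch of the new compat_xpath helper (self-contained, not part of the diff): on Python 2.6 it byte-encodes unicode XPath expressions to work around the ElementTree quirk noted in the comment above; on later versions it is a no-op.

    import xml.etree.ElementTree as etree
    from youtube_dl.compat import compat_xpath

    doc = etree.fromstring('<root><a><b>1</b></a></root>')
    print(doc.find(compat_xpath('.//b')).text)  # -> 1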


@@ -81,6 +81,7 @@ from .bloomberg import BloombergIE
 from .bokecc import BokeCCIE
 from .bpb import BpbIE
 from .br import BRIE
+from .bravotv import BravoTVIE
 from .breakcom import BreakIE
 from .brightcove import (
     BrightcoveLegacyIE,
@@ -135,6 +136,7 @@ from .collegerama import CollegeRamaIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
 from .comcarcoff import ComCarCoffIE
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
+from .commonprotocols import RtmpIE
 from .condenast import CondeNastIE
 from .cracked import CrackedIE
 from .crackle import CrackleIE
@@ -282,6 +284,7 @@ from .goshgay import GoshgayIE
 from .gputechconf import GPUTechConfIE
 from .groupon import GrouponIE
 from .hark import HarkIE
+from .hbo import HBOIE
 from .hearthisat import HearThisAtIE
 from .heise import HeiseIE
 from .hellporno import HellPornoIE
@@ -784,6 +787,7 @@ from .tv2 import (
     TV2IE,
     TV2ArticleIE,
 )
+from .tv3 import TV3IE
 from .tv4 import TV4IE
 from .tvc import (
     TVCIE,


@@ -18,7 +18,7 @@ class AnimeOnDemandIE(InfoExtractor):
     _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
     _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
     _NETRC_MACHINE = 'animeondemand'
-    _TEST = {
+    _TESTS = [{
         'url': 'https://www.anime-on-demand.de/anime/161',
         'info_dict': {
             'id': '161',
@@ -26,7 +26,15 @@ class AnimeOnDemandIE(InfoExtractor):
             'description': 'md5:6681ce3c07c7189d255ac6ab23812d31',
         },
         'playlist_mincount': 4,
-    }
+    }, {
+        # Film wording is used instead of Episode
+        'url': 'https://www.anime-on-demand.de/anime/39',
+        'only_matching': True,
+    }, {
+        # Episodes without titles
+        'url': 'https://www.anime-on-demand.de/anime/162',
+        'only_matching': True,
+    }]

     def _login(self):
         (username, password) = self._get_login_info()
@@ -91,14 +99,22 @@ class AnimeOnDemandIE(InfoExtractor):
         entries = []

-        for episode_html in re.findall(r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage):
-            m = re.search(
-                r'class="episodebox-title"[^>]+title="Episode (?P<number>\d+) - (?P<title>.+?)"', episode_html)
-            if not m:
+        for num, episode_html in enumerate(re.findall(
+                r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1):
+            episodebox_title = self._search_regex(
+                (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1',
+                 r'class="episodebox-title"[^>]+>(?P<title>.+?)<'),
+                episode_html, 'episodebox title', default=None, group='title')
+            if not episodebox_title:
                 continue

-            episode_number = int(m.group('number'))
-            episode_title = m.group('title')
+            episode_number = int(self._search_regex(
+                r'(?:Episode|Film)\s*(\d+)',
+                episodebox_title, 'episode number', default=num))
+            episode_title = self._search_regex(
+                r'(?:Episode|Film)\s*\d+\s*-\s*(.+)',
+                episodebox_title, 'episode title', default=None)
+
             video_id = 'episode-%d' % episode_number

             common_info = {


@@ -0,0 +1,28 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import smuggle_url
+
+
+class BravoTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+videos/(?P<id>[^/?]+)'
+    _TEST = {
+        'url': 'http://www.bravotv.com/last-chance-kitchen/season-5/videos/lck-ep-12-fishy-finale',
+        'md5': 'd60cdf68904e854fac669bd26cccf801',
+        'info_dict': {
+            'id': 'LitrBdX64qLn',
+            'ext': 'mp4',
+            'title': 'Last Chance Kitchen Returns',
+            'description': 'S13: Last Chance Kitchen Returns for Top Chef Season 13',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        account_pid = self._search_regex(r'"account_pid"\s*:\s*"([^"]+)"', webpage, 'account pid')
+        release_pid = self._search_regex(r'"release_pid"\s*:\s*"([^"]+)"', webpage, 'release pid')
+        return self.url_result(smuggle_url(
+            'http://link.theplatform.com/s/%s/%s?mbr=true&switch=progressive' % (account_pid, release_pid),
+            {'force_smil_url': True}), 'ThePlatform', release_pid)
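A minimal sketch of smuggle_url as used above (self-contained, not part of the diff; the path s/x/y is hypothetical): it stashes extra data in the URL fragment, and the receiving extractor recovers it with unsmuggle_url.

    from youtube_dl.utils import smuggle_url, unsmuggle_url

    u = smuggle_url('http://link.theplatform.com/s/x/y?mbr=true', {'force_smil_url': True})
    print(unsmuggle_url(u))
    # -> ('http://link.theplatform.com/s/x/y?mbr=true', {'force_smil_url': True})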


@@ -9,7 +9,6 @@ from ..compat import (
     compat_etree_fromstring,
     compat_parse_qs,
     compat_str,
-    compat_urllib_parse,
     compat_urllib_parse_urlparse,
     compat_urlparse,
     compat_xml_parse_error,
@@ -24,16 +23,16 @@ from ..utils import (
     js_to_json,
     int_or_none,
     parse_iso8601,
-    sanitized_Request,
     unescapeHTML,
     unsmuggle_url,
+    update_url_query,
 )


 class BrightcoveLegacyIE(InfoExtractor):
     IE_NAME = 'brightcove:legacy'
     _VALID_URL = r'(?:https?://.*brightcove\.com/(services|viewer).*?\?|brightcove:)(?P<query>.*)'
-    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+    _FEDERATED_URL = 'http://c.brightcove.com/services/viewer/htmlFederated'

     _TESTS = [
         {
@@ -156,7 +155,7 @@ class BrightcoveLegacyIE(InfoExtractor):
             # Not all pages define this value
             if playerKey is not None:
                 params['playerKey'] = playerKey
-            # The three fields hold the id of the video
+            # These fields hold the id of the video
             videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') or find_param('@videoList')
             if videoPlayer is not None:
                 params['@videoPlayer'] = videoPlayer
@@ -185,8 +184,7 @@ class BrightcoveLegacyIE(InfoExtractor):
     @classmethod
     def _make_brightcove_url(cls, params):
-        data = compat_urllib_parse.urlencode(params)
-        return cls._FEDERATED_URL_TEMPLATE % data
+        return update_url_query(cls._FEDERATED_URL, params)

     @classmethod
     def _extract_brightcove_url(cls, webpage):
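A minimal sketch of update_url_query (self-contained, not part of the diff; the playerKey value is hypothetical): it serializes a dict into the URL's query string, replacing the manual urlencode plus '?%s' template removed above.

    from youtube_dl.utils import update_url_query

    print(update_url_query(
        'http://c.brightcove.com/services/viewer/htmlFederated',
        {'playerKey': 'abc'}))
    # -> http://c.brightcove.com/services/viewer/htmlFederated?playerKey=abc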
@@ -240,7 +238,7 @@ class BrightcoveLegacyIE(InfoExtractor):
             # We set the original url as the default 'Referer' header
             referer = smuggled_data.get('Referer', url)
             return self._get_video_info(
-                videoPlayer[0], query_str, query, referer=referer)
+                videoPlayer[0], query, referer=referer)
         elif 'playerKey' in query:
             player_key = query['playerKey']
             return self._get_playlist_info(player_key[0])
@@ -249,15 +247,14 @@ class BrightcoveLegacyIE(InfoExtractor):
                 'Cannot find playerKey= variable. Did you forget quotes in a shell invocation?',
                 expected=True)

-    def _get_video_info(self, video_id, query_str, query, referer=None):
-        request_url = self._FEDERATED_URL_TEMPLATE % query_str
-        req = sanitized_Request(request_url)
+    def _get_video_info(self, video_id, query, referer=None):
+        headers = {}
         linkBase = query.get('linkBaseURL')
         if linkBase is not None:
             referer = linkBase[0]
         if referer is not None:
-            req.add_header('Referer', referer)
-        webpage = self._download_webpage(req, video_id)
+            headers['Referer'] = referer
+        webpage = self._download_webpage(self._FEDERATED_URL, video_id, headers=headers, query=query)

         error_msg = self._html_search_regex(
             r"<h1>We're sorry.</h1>([\s\n]*<p>.*?</p>)+", webpage,
@@ -415,8 +412,8 @@ class BrightcoveNewIE(InfoExtractor):
         # Look for iframe embeds [1]
         for _, url in re.findall(
-                r'<iframe[^>]+src=(["\'])((?:https?:)//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
-            entries.append(url)
+                r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
+            entries.append(url if url.startswith('http') else 'http:' + url)

         # Look for embed_in_page embeds [2]
         for video_id, account_id, player_id, embed in re.findall(
@@ -459,12 +456,11 @@ class BrightcoveNewIE(InfoExtractor):
             r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
             webpage, 'policy key', group='pk')

-        req = sanitized_Request(
-            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s'
-            % (account_id, video_id),
-            headers={'Accept': 'application/json;pk=%s' % policy_key})
+        api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id)
         try:
-            json_data = self._download_json(req, video_id)
+            json_data = self._download_json(api_url, video_id, headers={
+                'Accept': 'application/json;pk=%s' % policy_key
+            })
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                 json_data = self._parse_json(e.cause.read().decode(), video_id)
@@ -482,8 +478,7 @@ class BrightcoveNewIE(InfoExtractor):
                 if not src:
                     continue
                 formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
+                    src, video_id, 'mp4', m3u8_id='hls', fatal=False))
             elif source_type == 'application/dash+xml':
                 if not src:
                     continue


@@ -78,7 +78,7 @@ class CBSNewsIE(ThePlatformIE):
             pid = item.get('media' + format_id)
             if not pid:
                 continue
-            release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?format=SMIL&mbr=true' % pid
+            release_url = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' % pid
             tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % pid)
             formats.extend(tp_formats)
             subtitles = self._merge_subtitles(subtitles, tp_subtitles)


@@ -60,7 +60,7 @@ class CNETIE(ThePlatformIE):
         for (fkey, vid) in vdata['files'].items():
             if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']:
                 continue
-            release_url = 'http://link.theplatform.com/s/kYEXFC/%s?format=SMIL&mbr=true' % vid
+            release_url = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' % vid
             if fkey == 'hds':
                 release_url += '&manifest=f4m'
             tp_formats, tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey)


@@ -0,0 +1,36 @@
+from __future__ import unicode_literals
+
+import os
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_urlparse,
+)
+from ..utils import url_basename
+
+
+class RtmpIE(InfoExtractor):
+    IE_DESC = False  # Do not list
+    _VALID_URL = r'(?i)rtmp[est]?://.+'
+
+    _TESTS = [{
+        'url': 'rtmp://cp44293.edgefcs.net/ondemand?auth=daEcTdydfdqcsb8cZcDbAaCbhamacbbawaS-bw7dBb-bWG-GqpGFqCpNCnGoyL&aifp=v001&slist=public/unsecure/audio/2c97899446428e4301471a8cb72b4b97--audio--pmg-20110908-0900a_flv_aac_med_int.mp4',
+        'only_matching': True,
+    }, {
+        'url': 'rtmp://edge.live.hitbox.tv/live/dimak',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
+        title = compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': [{
+                'url': url,
+                'ext': 'flv',
+                'format_id': compat_urlparse.urlparse(url).scheme,
+            }],
+        }
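A minimal usage sketch (not part of the diff; the URL is taken from the tests above): with RtmpIE registered, bare rtmp:// URLs can be handed straight to YoutubeDL.

    from youtube_dl import YoutubeDL

    with YoutubeDL({'skip_download': True}) as ydl:
        ydl.download(['rtmp://edge.live.hitbox.tv/live/dimak'])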


@@ -54,7 +54,7 @@ class CrunchyrollBaseIE(InfoExtractor):
     def _real_initialize(self):
         self._login()

-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
+    def _download_webpage(self, url_or_request, *args, **kwargs):
         request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
                    else sanitized_Request(url_or_request))
         # Accept-Language must be set explicitly to accept any language to avoid issues
@@ -65,8 +65,7 @@ class CrunchyrollBaseIE(InfoExtractor):
         # Crunchyroll to not work in georestriction cases in some browsers that don't place
         # the locale lang first in header. However allowing any language seems to workaround the issue.
         request.add_header('Accept-Language', '*')
-        return super(CrunchyrollBaseIE, self)._download_webpage(
-            request, video_id, note, errnote, fatal, tries, timeout, encoding)
+        return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs)

     @staticmethod
     def _add_skip_wall(url):


@@ -239,6 +239,35 @@ class GenericIE(InfoExtractor):
                 'format': 'bestvideo',
             },
         },
+        # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8
+        {
+            'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8',
+            'info_dict': {
+                'id': 'content',
+                'ext': 'mp4',
+                'title': 'content',
+                'formats': 'mincount:8',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
+        },
+        # m3u8 served with Content-Type: text/plain
+        {
+            'url': 'http://www.nacentapps.com/m3u8/index.m3u8',
+            'info_dict': {
+                'id': 'index',
+                'ext': 'mp4',
+                'title': 'index',
+                'upload_date': '20140720',
+                'formats': 'mincount:11',
+            },
+            'params': {
+                # m3u8 downloads
+                'skip_download': True,
+            }
+        },
         # google redirect
         {
             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@ -1245,14 +1274,13 @@ class GenericIE(InfoExtractor):
         info_dict = {
             'id': video_id,
             'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
+            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
         }

         # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '')
-        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
+        content_type = head_response.headers.get('Content-Type', '').lower()
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
         if m:
-            upload_date = unified_strdate(
-                head_response.headers.get('Last-Modified'))
             format_id = m.group('format_id')
             if format_id.endswith('mpegurl'):
                 formats = self._extract_m3u8_formats(url, video_id, 'mp4')
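A minimal sketch of why the Content-Type match was tightened (self-contained, not part of the diff): '[^;\s]+' stops the format_id before any ';charset=...' parameter that the old '.+$' would have swallowed.

    import re

    content_type = 'audio/x-mpegURL; charset=utf-8'.lower()
    m = re.match(
        r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)',
        content_type)
    print(m.group('format_id'))  # -> x-mpegurl, without the charset parameter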
@@ -1264,11 +1292,8 @@ class GenericIE(InfoExtractor):
                     'url': url,
                     'vcodec': 'none' if m.group('type') == 'audio' else None
                 }]
-            info_dict.update({
-                'direct': True,
-                'formats': formats,
-                'upload_date': upload_date,
-            })
+            info_dict['direct'] = True
+            info_dict['formats'] = formats
             return info_dict

         if not self._downloader.params.get('test', False) and not is_intentional:
@@ -1289,18 +1314,21 @@ class GenericIE(InfoExtractor):
         request.add_header('Accept-Encoding', '*')
         full_response = self._request_webpage(request, video_id)

+        first_bytes = full_response.read(512)
+
+        # Is it an M3U playlist?
+        if first_bytes.startswith(b'#EXTM3U'):
+            info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4')
+            return info_dict
+
         # Maybe it's a direct link to a video?
         # Be careful not to download the whole thing!
-        first_bytes = full_response.read(512)
         if not is_html(first_bytes):
             self._downloader.report_warning(
                 'URL could be a direct video link, returning it as such.')
-            upload_date = unified_strdate(
-                head_response.headers.get('Last-Modified'))
             info_dict.update({
                 'direct': True,
                 'url': url,
-                'upload_date': upload_date,
             })
             return info_dict

youtube_dl/extractor/hbo.py (new file, 122 lines)

@@ -0,0 +1,122 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    xpath_element,
+    int_or_none,
+    parse_duration,
+)
+
+
+class HBOIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?hbo\.com/video/video\.html\?.*vid=(?P<id>[0-9]+)'
+    _TEST = {
+        'url': 'http://www.hbo.com/video/video.html?autoplay=true&g=u&vid=1437839',
+        'md5': '1c33253f0c7782142c993c0ba62a8753',
+        'info_dict': {
+            'id': '1437839',
+            'ext': 'mp4',
+            'title': 'Ep. 64 Clip: Encryption',
+        }
+    }
+    _FORMATS_INFO = {
+        '1920': {
+            'width': 1280,
+            'height': 720,
+        },
+        '640': {
+            'width': 768,
+            'height': 432,
+        },
+        'highwifi': {
+            'width': 640,
+            'height': 360,
+        },
+        'high3g': {
+            'width': 640,
+            'height': 360,
+        },
+        'medwifi': {
+            'width': 400,
+            'height': 224,
+        },
+        'med3g': {
+            'width': 400,
+            'height': 224,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_xml(
+            'http://render.lv3.hbo.com/data/content/global/videos/data/%s.xml' % video_id, video_id)
+        title = xpath_text(video_data, 'title', 'title', True)
+
+        formats = []
+        for source in xpath_element(video_data, 'videos', 'sources', True):
+            if source.tag == 'size':
+                path = xpath_text(source, './/path')
+                if not path:
+                    continue
+                width = source.attrib.get('width')
+                format_info = self._FORMATS_INFO.get(width, {})
+                height = format_info.get('height')
+                fmt = {
+                    'url': path,
+                    'format_id': 'http%s' % ('-%dp' % height if height else ''),
+                    'width': format_info.get('width'),
+                    'height': height,
+                }
+                rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', path)
+                if rtmp:
+                    fmt.update({
+                        'url': rtmp.group('url'),
+                        'play_path': rtmp.group('playpath'),
+                        'app': rtmp.group('app'),
+                        'ext': 'flv',
+                        'format_id': fmt['format_id'].replace('http', 'rtmp'),
+                    })
+                formats.append(fmt)
+            else:
+                video_url = source.text
+                if not video_url:
+                    continue
+                if source.tag == 'tarball':
+                    formats.extend(self._extract_m3u8_formats(
+                        video_url.replace('.tar', '/base_index_w8.m3u8'),
+                        video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+                else:
+                    format_info = self._FORMATS_INFO.get(source.tag, {})
+                    formats.append({
+                        'format_id': 'http-%s' % source.tag,
+                        'url': video_url,
+                        'width': format_info.get('width'),
+                        'height': format_info.get('height'),
+                    })
+        self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
+
+        thumbnails = []
+        card_sizes = xpath_element(video_data, 'titleCardSizes')
+        if card_sizes is not None:
+            for size in card_sizes:
+                path = xpath_text(size, 'path')
+                if not path:
+                    continue
+                width = int_or_none(size.get('width'))
+                thumbnails.append({
+                    'id': width,
+                    'url': path,
+                    'width': width,
+                })
+
+        return {
+            'id': video_id,
+            'title': title,
+            'duration': parse_duration(xpath_element(video_data, 'duration/tv14')),
+            'formats': formats,
+            'thumbnails': thumbnails,
+        }


@@ -48,7 +48,7 @@ class NationalGeographicIE(InfoExtractor):
         theplatform_id = url_basename(content.attrib.get('url'))

         return self.url_result(smuggle_url(
-            'http://link.theplatform.com/s/ngs/%s?format=SMIL&formats=MPEG4&manifest=f4m' % theplatform_id,
+            'http://link.theplatform.com/s/ngs/%s?formats=MPEG4&manifest=f4m' % theplatform_id,
            # For some reason, the normal links don't work and we must force
            # the use of f4m
            {'force_smil_url': True}))


@@ -3,13 +3,16 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from .theplatform import ThePlatformIE
 from ..utils import (
-    ExtractorError,
     find_xpath_attr,
     lowercase_escape,
     smuggle_url,
     unescapeHTML,
+    update_url_query,
+    int_or_none,
+    HEADRequest,
+    parse_iso8601,
 )
@@ -131,10 +134,10 @@ class NBCSportsIE(InfoExtractor):
             NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')


-class NBCNewsIE(InfoExtractor):
+class NBCNewsIE(ThePlatformIE):
     _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
         (?:video/.+?/(?P<id>\d+)|
-            (?:watch|feature|nightly-news)/[^/]+/(?P<title>.+))
+            ([^/]+/)*(?P<display_id>[^/?]+))
         '''

     _TESTS = [
@@ -149,15 +152,14 @@ class NBCNewsIE(InfoExtractor):
             },
         },
         {
-            'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236',
-            'md5': 'b2421750c9f260783721d898f4c42063',
+            'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
+            'md5': 'af1adfa51312291a017720403826bb64',
             'info_dict': {
-                'id': 'I1wpAI_zmhsQ',
+                'id': '269389891880',
                 'ext': 'mp4',
                 'title': 'How Twitter Reacted To The Snowden Interview',
                 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
             },
-            'add_ie': ['ThePlatform'],
         },
         {
             'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156',
@@ -168,17 +170,29 @@ class NBCNewsIE(InfoExtractor):
                 'title': 'FULL EPISODE: Family Business',
                 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04',
             },
+            'skip': 'This page is unavailable.',
         },
         {
             'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
-            'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d',
+            'md5': '73135a2e0ef819107bbb55a5a9b2a802',
             'info_dict': {
-                'id': 'sekXqyTVnmN3',
+                'id': '394064451844',
                 'ext': 'mp4',
                 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
                 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
             },
         },
+        {
+            'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
+            'md5': 'a49e173825e5fcd15c13fc297fced39d',
+            'info_dict': {
+                'id': '529953347624',
+                'ext': 'mp4',
+                'title': 'Volkswagen U.S. Chief: We \'Totally Screwed Up\'',
+                'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
+            },
+            'expected_warnings': ['http-6000 is not available']
+        },
         {
             'url': 'http://www.nbcnews.com/watch/dateline/full-episode--deadly-betrayal-386250819952',
             'only_matching': True,
@@ -202,49 +216,80 @@ class NBCNewsIE(InfoExtractor):
             }
         else:
             # "feature" and "nightly-news" pages use theplatform.com
-            title = mobj.group('title')
-            webpage = self._download_webpage(url, title)
+            display_id = mobj.group('display_id')
+            webpage = self._download_webpage(url, display_id)
+            info = None
             bootstrap_json = self._search_regex(
-                r'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
-                webpage, 'bootstrap json', flags=re.MULTILINE)
-            bootstrap = self._parse_json(bootstrap_json, video_id)
-            info = bootstrap['results'][0]['video']
-            mpxid = info['mpxId']
-
-            base_urls = [
-                info['fallbackPlaylistUrl'],
-                info['associatedPlaylistUrl'],
-            ]
-
-            for base_url in base_urls:
-                if not base_url:
-                    continue
-                playlist_url = base_url + '?form=MPXNBCNewsAPI'
-
-                try:
-                    all_videos = self._download_json(playlist_url, title)
-                except ExtractorError as ee:
-                    if isinstance(ee.cause, compat_HTTPError):
-                        continue
-                    raise
-
-                if not all_videos or 'videos' not in all_videos:
-                    continue
-
-                try:
-                    info = next(v for v in all_videos['videos'] if v['mpxId'] == mpxid)
-                    break
-                except StopIteration:
-                    continue
-
-            if info is None:
-                raise ExtractorError('Could not find video in playlists')
+                r'(?m)var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$',
+                webpage, 'bootstrap json', default=None)
+            if bootstrap_json:
+                bootstrap = self._parse_json(bootstrap_json, display_id)
+                info = bootstrap['results'][0]['video']
+            else:
+                player_instance_json = self._search_regex(
+                    r'videoObj\s*:\s*({.+})', webpage, 'player instance')
+                info = self._parse_json(player_instance_json, display_id)
+            video_id = info['mpxId']
+            title = info['title']
+
+            subtitles = {}
+            caption_links = info.get('captionLinks')
+            if caption_links:
+                for (sub_key, sub_ext) in (('smpte-tt', 'ttml'), ('web-vtt', 'vtt'), ('srt', 'srt')):
+                    sub_url = caption_links.get(sub_key)
+                    if sub_url:
+                        subtitles.setdefault('en', []).append({
+                            'url': sub_url,
+                            'ext': sub_ext,
+                        })
+
+            formats = []
+            for video_asset in info['videoAssets']:
+                video_url = video_asset.get('publicUrl')
+                if not video_url:
+                    continue
+                container = video_asset.get('format')
+                asset_type = video_asset.get('assetType') or ''
+                if container == 'ISM' or asset_type == 'FireTV-Once':
+                    continue
+                elif asset_type == 'OnceURL':
+                    tp_formats, tp_subtitles = self._extract_theplatform_smil(
+                        video_url, video_id)
+                    formats.extend(tp_formats)
+                    subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+                else:
+                    tbr = int_or_none(video_asset.get('bitRate'), 1000)
+                    format_id = 'http%s' % ('-%d' % tbr if tbr else '')
+                    video_url = update_url_query(
+                        video_url, {'format': 'redirect'})
+                    # resolve the url so that we can check availability and detect the correct extension
+                    head = self._request_webpage(
+                        HEADRequest(video_url), video_id,
+                        'Checking %s url' % format_id,
+                        '%s is not available' % format_id,
+                        fatal=False)
+                    if head:
+                        video_url = head.geturl()
+                        formats.append({
+                            'format_id': format_id,
+                            'url': video_url,
+                            'width': int_or_none(video_asset.get('width')),
+                            'height': int_or_none(video_asset.get('height')),
+                            'tbr': tbr,
+                            'container': video_asset.get('format'),
+                        })
+            self._sort_formats(formats)

             return {
-                '_type': 'url',
-                # We get the best quality video
-                'url': info['videoAssets'][-1]['publicUrl'],
-                'ie_key': 'ThePlatform',
+                'id': video_id,
+                'title': title,
+                'description': info.get('description'),
+                'thumbnail': info.get('thumbnail'),
+                'duration': int_or_none(info.get('duration')),
+                'timestamp': parse_iso8601(info.get('pubDate')),
+                'formats': formats,
+                'subtitles': subtitles,
             }
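A minimal sketch of HEADRequest (self-contained, not part of the diff; the URL is hypothetical): it is a urllib request whose HTTP method is HEAD, letting the extractor above follow redirects and probe availability without downloading the media body.

    from youtube_dl.utils import HEADRequest

    req = HEADRequest('http://example.com/video?format=redirect')
    print(req.get_method())  # -> HEAD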


@@ -2,7 +2,10 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+    compat_urllib_parse_unquote,
+    compat_xpath,
+)
 from ..utils import (
     int_or_none,
     find_xpath_attr,
@@ -47,7 +50,7 @@ class NozIE(InfoExtractor):
         duration = int_or_none(xpath_text(
             doc, './/article/movie/file/duration'))
         formats = []
-        for qnode in doc.findall('.//article/movie/file/qualities/qual'):
+        for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')):
             http_url_ele = find_xpath_attr(
                 qnode, './html_urls/video_url', 'format', 'video/mp4')
             http_url = http_url_ele.text if http_url_ele is not None else None


@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class OnceIE(InfoExtractor):
+    _VALID_URL = r'https?://once\.unicornmedia\.com/now/[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)'
+    ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8'
+    PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4'
+
+    def _extract_once_formats(self, url):
+        domain_id, application_id, media_item_id = re.match(
+            OnceIE._VALID_URL, url).groups()
+        formats = self._extract_m3u8_formats(
+            self.ADAPTIVE_URL_TEMPLATE % (
+                domain_id, application_id, media_item_id),
+            media_item_id, 'mp4', m3u8_id='hls', fatal=False)
+        progressive_formats = []
+        for adaptive_format in formats:
+            rendition_id = self._search_regex(
+                r'/now/media/playlist/[^/]+/[^/]+/([^/]+)',
+                adaptive_format['url'], 'rendition id', default=None)
+            if rendition_id:
+                progressive_format = adaptive_format.copy()
+                progressive_format.update({
+                    'url': self.PROGRESSIVE_URL_TEMPLATE % (
+                        domain_id, application_id, rendition_id, media_item_id),
+                    'format_id': adaptive_format['format_id'].replace(
+                        'hls', 'http'),
+                    'protocol': 'http',
+                })
+                progressive_formats.append(progressive_format)
+        self._check_formats(progressive_formats, media_item_id)
+        formats.extend(progressive_formats)
+        return formats


@@ -2,6 +2,10 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
+from ..utils import (
+    smuggle_url,
+    ExtractorError,
+)


 class SBSIE(InfoExtractor):
@@ -31,21 +35,28 @@ class SBSIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
+        player_params = self._download_json(
+            'http://www.sbs.com.au/api/video_pdkvars/id/%s?form=json' % video_id, video_id)

-        webpage = self._download_webpage(
-            'http://www.sbs.com.au/ondemand/video/single/%s?context=web' % video_id, video_id)
-
-        player_params = self._parse_json(
-            self._search_regex(
-                r'(?s)var\s+playerParams\s*=\s*({.+?});', webpage, 'playerParams'),
-            video_id)
+        error = player_params.get('error')
+        if error:
+            error_message = 'Sorry, The video you are looking for does not exist.'
+            video_data = error.get('results') or {}
+            error_code = error.get('errorCode')
+            if error_code == 'ComingSoon':
+                error_message = '%s is not yet available.' % video_data.get('title', '')
+            elif error_code in ('Forbidden', 'intranetAccessOnly'):
+                error_message = 'Sorry, This video cannot be accessed via this website'
+            elif error_code == 'Expired':
+                error_message = 'Sorry, %s is no longer available.' % video_data.get('title', '')
+            raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True)

         urls = player_params['releaseUrls']
-        theplatform_url = (urls.get('progressive') or urls.get('standard') or
-                           urls.get('html') or player_params['relatedItemsURL'])
+        theplatform_url = (urls.get('progressive') or urls.get('html') or
+                           urls.get('standard') or player_params['relatedItemsURL'])

         return {
             '_type': 'url_transparent',
             'id': video_id,
-            'url': theplatform_url,
+            'url': smuggle_url(theplatform_url, {'force_smil_url': True}),
         }


@@ -8,13 +8,12 @@ import binascii
 import hashlib


-from .common import InfoExtractor
+from .once import OnceIE
 from ..compat import (
     compat_parse_qs,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
-    determine_ext,
     ExtractorError,
     float_or_none,
     int_or_none,
@@ -29,26 +28,27 @@ default_ns = 'http://www.w3.org/2005/SMIL21/Language'
 _x = lambda p: xpath_with_ns(p, {'smil': default_ns})


-class ThePlatformBaseIE(InfoExtractor):
+class ThePlatformBaseIE(OnceIE):
     def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
-        meta = self._download_xml(smil_url, video_id, note=note)
-        error_element = find_xpath_attr(
-            meta, _x('.//smil:ref'), 'src',
-            'http://link.theplatform.com/s/errorFiles/Unavailable.mp4')
-        if error_element is not None:
+        meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'})
+        error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
+        if error_element is not None and error_element.attrib['src'].startswith(
+                'http://link.theplatform.com/s/errorFiles/Unavailable.'):
             raise ExtractorError(error_element.attrib['abstract'], expected=True)

-        formats = self._parse_smil_formats(
+        smil_formats = self._parse_smil_formats(
             meta, smil_url, video_id, namespace=default_ns,
             # the parameters are from syfy.com, other sites may use others,
             # they also work for nbc.com
             f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
             transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))

-        for _format in formats:
-            ext = determine_ext(_format['url'])
-            if ext == 'once':
-                _format['ext'] = 'mp4'
+        formats = []
+        for _format in smil_formats:
+            if OnceIE.suitable(_format['url']):
+                formats.extend(self._extract_once_formats(_format['url']))
+            else:
+                formats.append(_format)

         self._sort_formats(formats)
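A minimal sketch (self-contained, not part of the diff; the URL is the Once test URL from generic.py above): InfoExtractor.suitable() matches a URL against the extractor's _VALID_URL, which is how ThePlatform now routes Once (unicornmedia) stream URLs to _extract_once_formats.

    from youtube_dl.extractor.once import OnceIE

    url = ('http://once.unicornmedia.com/now/master/playlist/'
           'bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/'
           '77a785f3-5188-4806-b788-0893a61634ed/'
           '93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8')
    print(OnceIE.suitable(url))  # -> True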
@@ -125,7 +125,7 @@ class ThePlatformIE(ThePlatformBaseIE):
         'only_matching': True,
     }, {
         'url': 'http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701',
-        'md5': '734f3790fb5fc4903da391beeebc4836',
+        'md5': 'fb96bb3d85118930a5b055783a3bd992',
         'info_dict': {
             'id': 'tdy_or_siri_150701',
             'ext': 'mp4',
@@ -135,7 +135,6 @@ class ThePlatformIE(ThePlatformBaseIE):
             'thumbnail': 're:^https?://.*\.jpg$',
             'timestamp': 1435752600,
             'upload_date': '20150701',
-            'categories': ['Today/Shows/Orange Room', 'Today/Sections/Money', 'Today/Topics/Tech', "Today/Topics/Editor's picks"],
         },
     }, {
         # From http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1
@@ -213,7 +212,7 @@ class ThePlatformIE(ThePlatformBaseIE):
                 webpage, 'smil url', group='url')
             path = self._search_regex(
                 r'link\.theplatform\.com/s/((?:[^/?#&]+/)+[^/?#&]+)', smil_url, 'path')
-            smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4&format=SMIL'
+            smil_url += '?' if '?' not in smil_url else '&' + 'formats=m3u,mpeg4'
         elif mobj.group('config'):
             config_url = url + '&form=json'
             config_url = config_url.replace('swf/', 'config/')
@@ -223,9 +222,9 @@ class ThePlatformIE(ThePlatformBaseIE):
                 release_url = config['releaseUrl']
             else:
                 release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
-            smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m'
+            smil_url = release_url + '&formats=MPEG4&manifest=f4m'
         else:
-            smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
+            smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path

         sig = smuggled_data.get('sig')
         if sig:
@@ -250,7 +249,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
     _TEST = {
         # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
        'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
-        'md5': '22d2b84f058d3586efcd99e57d59d314',
+        'md5': '6e32495b5073ab414471b615c5ded394',
         'info_dict': {
             'id': 'n_hardball_5biden_140207',
             'ext': 'mp4',
@@ -280,7 +279,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
         first_video_id = None
         duration = None
         for item in entry['media$content']:
-            smil_url = item['plfile$url'] + '&format=SMIL&mbr=true'
+            smil_url = item['plfile$url'] + '&mbr=true'
             cur_video_id = ThePlatformIE._match_id(smil_url)
             if first_video_id is None:
                 first_video_id = cur_video_id


@@ -0,0 +1,33 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class TV3IE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?tv3\.co\.nz/(?P<id>[^/]+)/tabid/\d+/articleID/\d+/MCat/\d+/Default\.aspx'
+    _TEST = {
+        'url': 'http://www.tv3.co.nz/MOTORSPORT-SRS-SsangYong-Hampton-Downs-Round-3/tabid/3692/articleID/121615/MCat/2915/Default.aspx',
+        'info_dict': {
+            'id': '4659127992001',
+            'ext': 'mp4',
+            'title': 'CRC Motorsport: SRS SsangYong Hampton Downs Round 3 - S2015 Ep3',
+            'description': 'SsangYong Racing Series returns for Round 3 with drivers from New Zealand and Australia taking to the grid at Hampton Downs raceway.',
+            'uploader_id': '3812193411001',
+            'upload_date': '20151213',
+            'timestamp': 1449975272,
+        },
+        'expected_warnings': [
+            'Failed to download MPD manifest',
+        ],
+        'params': {
+            'skip_download': True,
+        },
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/3812193411001/default_default/index.html?videoId=%s'
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        brightcove_id = self._search_regex(r'<param\s*name="@videoPlayer"\s*value="(\d+)"', webpage, 'brightcove id')
+        return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)


@@ -144,7 +144,8 @@ class UdemyIE(InfoExtractor):
         webpage = self._download_webpage(url, lecture_id)

         course_id = self._search_regex(
-            r'data-course-id=["\'](\d+)', webpage, 'course id')
+            (r'data-course-id=["\'](\d+)', r'&quot;id&quot;\s*:\s*(\d+)'),
+            webpage, 'course id')

         try:
             lecture = self._download_lecture(course_id, lecture_id)


@@ -4,6 +4,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     sanitized_Request,
+    int_or_none,
 )
@@ -18,6 +19,9 @@ class WistiaIE(InfoExtractor):
             'id': 'sh7fpupwlt',
             'ext': 'mov',
             'title': 'Being Resourceful',
+            'description': 'a Clients From Hell Video Series video from worldwidewebhosting',
+            'upload_date': '20131204',
+            'timestamp': 1386185018,
             'duration': 117,
         },
     }
@@ -32,35 +36,43 @@ class WistiaIE(InfoExtractor):
             raise ExtractorError('Error while getting the playlist',
                                  expected=True)
         data = data_json['media']
+        title = data['name']

         formats = []
         thumbnails = []
         for a in data['assets']:
+            astatus = a.get('status')
             atype = a.get('type')
-            if atype == 'still':
+            if (astatus is not None and astatus != 2) or atype == 'preview':
+                continue
+            elif atype in ('still', 'still_image'):
                 thumbnails.append({
                     'url': a['url'],
                     'resolution': '%dx%d' % (a['width'], a['height']),
                 })
-                continue
-            if atype == 'preview':
-                continue
-            formats.append({
-                'format_id': atype,
-                'url': a['url'],
-                'width': a['width'],
-                'height': a['height'],
-                'filesize': a['size'],
-                'ext': a['ext'],
-                'preference': 1 if atype == 'original' else None,
-            })
+            else:
+                formats.append({
+                    'format_id': atype,
+                    'url': a['url'],
+                    'tbr': int_or_none(a.get('bitrate')),
+                    'vbr': int_or_none(a.get('opt_vbitrate')),
+                    'width': int_or_none(a.get('width')),
+                    'height': int_or_none(a.get('height')),
+                    'filesize': int_or_none(a.get('size')),
+                    'vcodec': a.get('codec'),
+                    'container': a.get('container'),
+                    'ext': a.get('ext'),
+                    'preference': 1 if atype == 'original' else None,
+                })

         self._sort_formats(formats)

         return {
             'id': video_id,
-            'title': data['name'],
+            'title': title,
+            'description': data.get('seoDescription'),
             'formats': formats,
             'thumbnails': thumbnails,
-            'duration': data.get('duration'),
+            'duration': int_or_none(data.get('duration')),
+            'timestamp': int_or_none(data.get('createdAt')),
         }


@@ -309,6 +309,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'acodec': 'vorbis', 'abr': 192, 'vcodec': 'vp8', 'preference': -20},

         # Apple HTTP Live Streaming
+        '91': {'ext': 'mp4', 'height': 144, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 48, 'vcodec': 'h264', 'preference': -10},
         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},
         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264', 'preference': -10},


@@ -35,6 +35,7 @@ import xml.etree.ElementTree
 import zlib

 from .compat import (
+    compat_HTMLParser,
     compat_basestring,
     compat_chr,
     compat_etree_fromstring,
@@ -49,6 +50,7 @@ from .compat import (
     compat_urllib_parse_urlparse,
     compat_urllib_request,
     compat_urlparse,
+    compat_xpath,
     shlex_quote,
 )
@@ -164,12 +166,7 @@ if sys.version_info >= (2, 7):
         return node.find(expr)
 else:
     def find_xpath_attr(node, xpath, key, val=None):
-        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
-        # .//node does not match if a node is a direct child of . !
-        if isinstance(xpath, compat_str):
-            xpath = xpath.encode('ascii')
-
-        for f in node.findall(xpath):
+        for f in node.findall(compat_xpath(xpath)):
             if key not in f.attrib:
                 continue
             if val is None or f.attrib.get(key) == val:
@@ -194,9 +191,7 @@ def xpath_with_ns(path, ns_map):
 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
     def _find_xpath(xpath):
-        if sys.version_info < (2, 7):  # Crazy 2.6
-            xpath = xpath.encode('ascii')
-        return node.find(xpath)
+        return node.find(compat_xpath(xpath))

     if isinstance(xpath, (str, compat_str)):
         n = _find_xpath(xpath)
@@ -273,6 +268,38 @@ def get_element_by_attribute(attribute, value, html):
     return unescapeHTML(res)


+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser to gather the attributes for a single element"""
+    def __init__(self):
+        self.attrs = {}
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        self.attrs = dict(attrs)
+
+
+def extract_attributes(html_element):
+    """Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    Decode and return a dictionary of attributes.
+    {
+        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': '\''
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    parser.feed(html_element)
+    parser.close()
+    return parser.attrs
+
+
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""
@@ -1319,7 +1346,7 @@ def format_bytes(bytes):
 def lookup_unit_table(unit_table, s):
     units_re = '|'.join(re.escape(u) for u in unit_table)
     m = re.match(
-        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
     if not m:
         return None
     num_str = m.group('num').replace(',', '.')
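A minimal sketch of what the added r'\b' buys (self-contained, not part of the diff): the unit must now end at a word boundary, so 'k' cannot half-match inside '1.1kk', and trailing text such as ' views' is tolerated, matching the new parse_count tests above.

    from youtube_dl.utils import parse_count

    print(parse_count('1.1kk views'))  # -> 1100000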


@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2016.03.14'
+__version__ = '2016.03.18'