diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index a78413518..30cc27c7b 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.05*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.05** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.01.22*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.01.22** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2017.01.05 +[debug] youtube-dl version 2017.01.22 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f50f52841..d606eab0e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -124,7 +124,7 @@ After you have ensured this site is distributing its content legally, you can fo 'id': '42', 'ext': 'mp4', 'title': 'Video title goes here', - 'thumbnail': 
're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', # TODO more properties, either as: # * A value # * MD5 checksum; start the string with md5: diff --git a/ChangeLog b/ChangeLog index 2d2e22af9..beea17e54 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,9 +1,92 @@ -version +version 2017.01.22 + +Extractors ++ [pornflip] Add support for pornflip.com (#11556, #11795) +* [chaturbate] Fix extraction (#11797, #11802) ++ [azmedien] Add support for AZ Medien sites (#11784, #11785) ++ [nextmedia] Support redirected URLs ++ [vimeo:channel] Extract videos' titles for playlist entries (#11796) ++ [youtube] Extract episode metadata (#9695, #11774) ++ [cspan] Support Ustream embedded videos (#11547) ++ [1tv] Add support for HLS videos (#11786) +* [uol] Fix extraction (#11770) +* [mtv] Relax triforce feed regular expression (#11766) + + +version 2017.01.18 + +Extractors +* [bilibili] Fix extraction (#11077) ++ [canalplus] Add fallback for video id (#11764) +* [20min] Fix extraction (#11683, #11751) +* [imdb] Extend URL regular expression (#11744) ++ [naver] Add support for tv.naver.com links (#11743) + + +version 2017.01.16 + +Core +* [options] Apply custom config to final composite configuration (#11741) +* [YoutubeDL] Improve protocol auto determining (#11720) + +Extractors +* [xiami] Relax URL regular expressions +* [xiami] Improve track metadata extraction (#11699) ++ [limelight] Check hand-make direct HTTP links ++ [limelight] Add support for direct HTTP links at video.llnw.net (#11737) ++ [brightcove] Recognize another player ID pattern (#11688) ++ [niconico] Support login via cookies (#7968) +* [yourupload] Fix extraction (#11601) ++ [beam:live] Add support for beam.pro live streams (#10702, #11596) +* [vevo] Improve geo restriction detection ++ [dramafever] Add support for URLs with language code (#11714) +* [cbc] Improve playlist support (#11704) + + +version 2017.01.14 + +Core ++ [common] Add ability to customize akamai manifest host ++ [utils] Add more date 
formats + +Extractors +- [mtv] Eliminate _transform_rtmp_url +* [mtv] Generalize triforce mgid extraction ++ [cmt] Add support for full episodes and video clips (#11623) ++ [mitele] Extract DASH formats ++ [ooyala] Add support for videos with embedToken (#11684) +* [mixcloud] Fix extraction (#11674) +* [openload] Fix extraction (#10408) +* [tv4] Improve extraction (#11698) +* [freesound] Fix and improve extraction (#11602) ++ [nick] Add support for beta.nick.com (#11655) +* [mtv,cc] Use HLS by default with native HLS downloader (#11641) +* [mtv] Fix non-HLS extraction + + +version 2017.01.10 + +Extractors +* [youtube] Fix extraction (#11663, #11664) ++ [inc] Add support for inc.com (#11277, #11647) ++ [youtube] Add itag 212 (#11575) ++ [egghead:course] Add support for egghead.io courses + + +version 2017.01.08 Core * Fix "invalid escape sequence" errors under Python 3.6 (#11581) Extractors ++ [hitrecord] Add support for hitrecord.org (#10867, #11626) +- [videott] Remove extractor +* [swrmediathek] Improve extraction +- [sharesix] Remove extractor +- [aol:features] Remove extractor +* [sendtonews] Improve info extraction +* [3sat,phoenix] Fix extraction (#11619) +* [comedycentral/mtv] Add support for HLS videos (#11600) * [discoverygo] Fix JSON data parsing (#11219, #11522) diff --git a/README.md b/README.md index 905c1b73f..4f677d0cc 100644 --- a/README.md +++ b/README.md @@ -374,7 +374,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo avprobe) --audio-format FORMAT Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; - "best" by default + "best" by default; No effect without -x --audio-quality QUALITY Specify ffmpeg/avconv audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K @@ -841,7 +841,7 @@ Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`. 
In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox). -Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows, `LF` (`\n`) for Linux and `CR` (`\r`) for Mac OS. `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. +Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, Mac OS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format. Passing cookies to youtube-dl is a good way to workaround login when a particular extractor does not implement it explicitly. Another use case is working around [CAPTCHA](https://en.wikipedia.org/wiki/CAPTCHA) some websites require you to solve in particular cases in order to get access (e.g. YouTube, CloudFlare). 
diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 0e301e8f3..b906d443a 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -74,6 +74,8 @@ - **awaan:live** - **awaan:season** - **awaan:video** + - **AZMedien**: AZ Medien videos + - **AZMedienShow**: AZ Medien shows - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -86,6 +88,7 @@ - **bbc.co.uk:article**: BBC articles - **bbc.co.uk:iplayer:playlist** - **bbc.co.uk:playlist** + - **Beam:live** - **Beatport** - **Beeg** - **BehindKink** @@ -214,6 +217,7 @@ - **EaglePlatform** - **EbaumsWorld** - **EchoMsk** + - **egghead:course**: egghead.io course - **eHow** - **Einthusan** - **eitb.tv** @@ -240,7 +244,6 @@ - **fc2** - **fc2:embed** - **Fczenit** - - **features.aol.com** - **fernsehkritik.tv** - **Firstpost** - **FiveTV** @@ -304,6 +307,7 @@ - **history:topic**: History.com Topic - **hitbox** - **hitbox:live** + - **HitRecord** - **HornBunny** - **HotNewHipHop** - **HotStar** @@ -321,6 +325,7 @@ - **Imgur** - **ImgurAlbum** - **Ina** + - **Inc** - **Indavideo** - **IndavideoEmbed** - **InfoQ** @@ -569,6 +574,7 @@ - **PolskieRadio** - **PolskieRadioCategory** - **PornCom** + - **PornFlip** - **PornHd** - **PornHub**: PornHub and Thumbzilla - **PornHubPlaylist** @@ -650,7 +656,6 @@ - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** - - **ScreenJunkies** - **Seeker** - **SenateISVP** - **SendtoNews** @@ -658,7 +663,6 @@ - **Sexu** - **Shahid** - **Shared**: shared.sx - - **ShareSix** - **ShowRoomLive** - **Sina** - **SixPlay** @@ -845,7 +849,6 @@ - **videomore:season** - **videomore:video** - **VideoPremium** - - **VideoTt**: video.tt - Your True Tube (Currently broken) - **videoweed**: VideoWeed - **Vidio** - **vidme** diff --git a/test/test_utils.py b/test/test_utils.py index 3092db5c1..e99bf794e 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -295,6 +295,9 @@ class TestUtil(unittest.TestCase): 
self.assertEqual(unified_strdate('27.02.2016 17:30'), '20160227') self.assertEqual(unified_strdate('UNKNOWN DATE FORMAT'), None) self.assertEqual(unified_strdate('Feb 7, 2016 at 6:35 pm'), '20160207') + self.assertEqual(unified_strdate('July 15th, 2013'), '20130715') + self.assertEqual(unified_strdate('September 1st, 2013'), '20130901') + self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902') def test_unified_timestamps(self): self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5d654f55f..41d9a63ee 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1363,7 +1363,7 @@ class YoutubeDL(object): format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) - if 'protocol' not in format: + if format.get('protocol') is None: format['protocol'] = determine_protocol(format) # Add HTTP headers, so that external programs can use them from the # json output diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py new file mode 100644 index 000000000..a89f71c20 --- /dev/null +++ b/youtube_dl/extractor/azmedien.py @@ -0,0 +1,145 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + get_element_by_class, + strip_or_none, +) + + +class AZMedienBaseIE(InfoExtractor): + def _kaltura_video(self, partner_id, entry_id): + return self.url_result( + 'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(), + video_id=entry_id) + + +class AZMedienIE(AZMedienBaseIE): + IE_DESC = 'AZ Medien videos' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?: + telezueri\.ch| + telebaern\.tv| + telem1\.ch + )/ + [0-9]+-show-[^/\#]+ + (?: + /[0-9]+-episode-[^/\#]+ + (?: + /[0-9]+-segment-(?:[^/\#]+\#)?| + \# + )| + \# + ) + (?P<id>[^\#]+) + ''' + + _TESTS = [{ + # URL with 'segment' + 'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', + 'info_dict': { + 'id': '1_2444peh4', + 'ext': 'mov', + 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', + 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', + 'uploader_id': 'TeleZ?ri', + 'upload_date': '20161218', + 'timestamp': 1482084490, + }, + 'params': { + 'skip_download': True, + }, + }, { + # URL with 'segment' and fragment: + 'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger', + 'only_matching': True + }, { + # URL with 'episode' and fragment: + 'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz', + 'only_matching': True + }, { + # URL with 'show' and fragment: + 'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + partner_id = self._search_regex( + r']+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)', + webpage, 'kaltura partner id') + entry_id = self._html_search_regex( + r']+data-id=(["\'])(?P<id>(?:(?!\1).)+)\1[^>]+data-slug=["\']%s' + % re.escape(video_id), webpage, 'kaltura entry id', group='id') + + return self._kaltura_video(partner_id, entry_id) + + +class AZMedienShowIE(AZMedienBaseIE): + IE_DESC = 'AZ Medien shows' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?: + telezueri\.ch| + telebaern\.tv| + telem1\.ch + )/ + (?P<id>[0-9]+-show-[^/\#]+ + (?: + /[0-9]+-episode-[^/\#]+ + )? + )$ + ''' + + _TESTS = [{ + # URL with 'episode' + 'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016', + 'info_dict': { + 'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016', + 'title': 'News - Donnerstag, 15. Dezember 2016', + }, + 'playlist_count': 9, + }, { + # URL with 'show' only + 'url': 'http://www.telezueri.ch/86-show-talktaeglich', + 'only_matching': True + }] + + def _real_extract(self, url): + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) + + entries = [] + + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id', default=None) + + if partner_id: + entries = [ + self._kaltura_video(partner_id, m.group('id')) + for m in re.finditer( + r'data-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage)] + + if not entries: + entries = [ + self.url_result(m.group('url'), ie=AZMedienIE.ie_key()) + for m in re.finditer( + r']+data-real=(["\'])(?P<url>http.+?)\1', webpage)] + + title = self._search_regex( + r'episodeShareTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + webpage, 'title', + default=strip_or_none(get_element_by_class( + 'title-block-cell', webpage)), group='title') + + return self.playlist_result(entries, show_id, title) diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py new file mode 100644 index 000000000..f3a9e3278 --- /dev/null +++ b/youtube_dl/extractor/beampro.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + compat_str, + int_or_none, + parse_iso8601, + try_get, +) + + +class BeamProLiveIE(InfoExtractor): + IE_NAME = 'Beam:live' + _VALID_URL = r'https?://(?:\w+\.)?beam\.pro/(?P<id>[^/?#&]+)' + _RATINGS = {'family': 0, 'teen': 13, 
'18+': 18} + _TEST = { + 'url': 'http://www.beam.pro/niterhayven', + 'info_dict': { + 'id': '261562', + 'ext': 'mp4', + 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', + 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', + 'thumbnail': r're:https://.*\.jpg$', + 'timestamp': 1483477281, + 'upload_date': '20170103', + 'uploader': 'niterhayven', + 'uploader_id': '373396', + 'age_limit': 18, + 'is_live': True, + 'view_count': int, + }, + 'skip': 'niterhayven is offline', + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + channel_name = self._match_id(url) + + chan = self._download_json( + 'https://beam.pro/api/v1/channels/%s' % channel_name, channel_name) + + if chan.get('online') is False: + raise ExtractorError( + '{0} is offline'.format(channel_name), expected=True) + + channel_id = chan['id'] + + formats = self._extract_m3u8_formats( + 'https://beam.pro/api/v1/channels/%s/manifest.m3u8' % channel_id, + channel_name, ext='mp4', m3u8_id='hls', fatal=False) + self._sort_formats(formats) + + user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) + + return { + 'id': compat_str(chan.get('id') or channel_name), + 'title': self._live_title(chan.get('name') or channel_name), + 'description': clean_html(chan.get('description')), + 'thumbnail': try_get(chan, lambda x: x['thumbnail']['url'], compat_str), + 'timestamp': parse_iso8601(chan.get('updatedAt')), + 'uploader': chan.get('token') or try_get( + chan, lambda x: x['user']['username'], compat_str), + 'uploader_id': compat_str(user_id) if user_id else None, + 'age_limit': self._RATINGS.get(chan.get('audience')), + 'is_live': True, + 'view_count': int_or_none(chan.get('viewersTotal')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 5051934ef..85ea5e6ee 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -34,8 +34,8 @@ class BiliBiliIE(InfoExtractor): }, } 
- _APP_KEY = '6f90a59ac58a4123' - _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326' + _APP_KEY = '84956560bc028eb7' + _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index aa2923ccf..5c6e99da1 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -179,7 +179,7 @@ class BrightcoveLegacyIE(InfoExtractor): params = {} - playerID = find_param('playerID') + playerID = find_param('playerID') or find_param('playerId') if playerID is None: raise ExtractorError('Cannot find player ID') params['playerID'] = playerID @@ -204,7 +204,7 @@ class BrightcoveLegacyIE(InfoExtractor): # // build Brightcove <object /> XML # } m = re.search( - r'''(?x)customBC.\createVideo\( + r'''(?x)customBC\.createVideo\( .*? # skipping width and height ["\'](?P<playerID>\d+)["\']\s*,\s* # playerID ["\'](?P<playerKey>AQ[^"\']{48})[^"\']*["\']\s*,\s* # playerKey begins with AQ and is 50 characters diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 10cf165bc..b3f76a7b1 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -107,7 +107,7 @@ class CanalplusIE(InfoExtractor): [r'<canal:player[^>]+?videoId=(["\'])(?P<id>\d+)', r'id=["\']canal_video_player(?P<id>\d+)', r'data-video=["\'](?P<id>\d+)'], - webpage, 'video id', group='id') + webpage, 'video id', default=mobj.group('vid'), group='id') info_url = self._VIDEO_INFO_TEMPLATE % (site_id, video_id) video_data = self._download_json(info_url, video_id, 'Downloading video JSON') diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 7c76ceac8..a291685bf 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -90,36 +90,49 @@ class CBCIE(InfoExtractor): }, }], 'skip': 'Geo-restricted to Canada', + }, { + # multiple CBC.APP.Caffeine.initInstance(...) 
+ 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', + 'info_dict': { + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'id': 'dog-indoor-exercise-winter-1.3928238', + }, + 'playlist_mincount': 6, }] @classmethod def suitable(cls, url): return False if CBCPlayerIE.suitable(url) else super(CBCIE, cls).suitable(url) + def _extract_player_init(self, player_init, display_id): + player_info = self._parse_json(player_init, display_id, js_to_json) + media_id = player_info.get('mediaId') + if not media_id: + clip_id = player_info['clipId'] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] + return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - player_init = self._search_regex( - r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage, 'player init', - default=None) - if player_init: - player_info = self._parse_json(player_init, display_id, js_to_json) - media_id = player_info.get('mediaId') - if not media_id: - clip_id = player_info['clipId'] - feed = self._download_json( - 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, - clip_id, fatal=False) - if feed: - media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) - if not media_id: - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - 
clip_id)['entries'][0]['id'].split('/')[-1] - return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) - else: - entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)] - return self.playlist_result(entries) + entries = [ + self._extract_player_init(player_init, display_id) + for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)] + entries.extend([ + self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) + for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)]) + return self.playlist_result( + entries, display_id, + self._og_search_title(webpage, fatal=False), + self._og_search_description(webpage)) class CBCPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 29a8820d5..8fbc91c1f 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ExtractorError @@ -31,30 +33,35 @@ class ChaturbateIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m3u8_url = self._search_regex( - r'src=(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage, - 'playlist', default=None, group='url') + m3u8_formats = [(m.group('id').lower(), m.group('url')) for m in re.finditer( + r'hlsSource(?P<id>.+?)\s*=\s*(?P<q>["\'])(?P<url>http.+?)(?P=q)', webpage)] - if not m3u8_url: + if not m3u8_formats: error = self._search_regex( [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'], webpage, 'error', group='error', default=None) if not error: - if any(p not in webpage for p in ( + if any(p in webpage for p in ( self._ROOM_OFFLINE, 'offline_tipping', 'tip_offline')): error = self._ROOM_OFFLINE if error: raise 
ExtractorError(error, expected=True) raise ExtractorError('Unable to find stream URL') - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') + formats = [] + for m3u8_id, m3u8_url in m3u8_formats: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, ext='mp4', + # ffmpeg skips segments for fast m3u8 + preference=-10 if m3u8_id == 'fast' else None, + m3u8_id=m3u8_id, fatal=False, live=True)) self._sort_formats(formats) return { 'id': video_id, 'title': self._live_title(video_id), - 'thumbnail': 'https://cdn-s.highwebmedia.com/uHK3McUtGCG3SMFcd4ZJsRv8/roomimage/%s.jpg' % video_id, + 'thumbnail': 'https://roomimg.stream.highwebmedia.com/ri/%s.jpg' % video_id, 'age_limit': self._rta_search(webpage), 'is_live': True, 'formats': formats, diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index f35df143a..4815b34be 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -19,6 +19,7 @@ class ChirbitIE(InfoExtractor): 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', 'duration': 306, + 'uploader': 'Gerryaudio', }, 'params': { 'skip_download': True, @@ -54,6 +55,9 @@ class ChirbitIE(InfoExtractor): duration = parse_duration(self._search_regex( r'class=["\']c-length["\'][^>]*>([^<]+)', webpage, 'duration', fatal=False)) + uploader = self._search_regex( + r'id=["\']chirbit-username["\'][^>]*>([^<]+)', + webpage, 'uploader', fatal=False) return { 'id': audio_id, @@ -61,6 +65,7 @@ class ChirbitIE(InfoExtractor): 'title': title, 'description': description, 'duration': duration, + 'uploader': uploader, } diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index 7d3e9b0c9..f6b794fb3 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,13 +1,11 @@ from __future__ import unicode_literals from .mtv import MTVIE -from ..utils import ExtractorError class CMTIE(MTVIE): IE_NAME = 'cmt.com' - _VALID_URL 
= r'https?://(?:www\.)?cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P<videoid>\d+)' - _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|full-episodes|video-clips)/(?P<id>[^/]+)' _TESTS = [{ 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', @@ -33,17 +31,24 @@ class CMTIE(MTVIE): }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/full-episodes/537qb3/nashville-the-wayfaring-stranger-season-5-ep-501', + 'only_matching': True, + }, { + 'url': 'http://www.cmt.com/video-clips/t9e4ci/nashville-juliette-in-2-minutes', + 'only_matching': True, }] - @classmethod - def _transform_rtmp_url(cls, rtmp_video_url): - if 'error_not_available.swf' in rtmp_video_url: - raise ExtractorError( - '%s said: video is not available' % cls.IE_NAME, expected=True) - - return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) - def _extract_mgid(self, webpage): - return self._search_regex( + mgid = self._search_regex( r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', - webpage, 'mgid', group='mgid') + webpage, 'mgid', group='mgid', default=None) + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + mgid = self._extract_mgid(webpage) + return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 8bd589774..4cac29415 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -48,17 +48,8 @@ class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, 
playlist_id) - - feed_json = self._search_regex(r'var triforceManifestFeed\s*=\s*(\{.+?\});\n', webpage, 'triforce feeed') - feed = self._parse_json(feed_json, playlist_id) - zones = feed['manifest']['zones'] - - video_zone = zones['t2_lc_promo1'] - feed = self._download_json(video_zone['feed'], playlist_id) - mgid = feed['result']['data']['id'] - - videos_info = self._get_videos_info(mgid, use_hls=True) - + mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') + videos_info = self._get_videos_info(mgid) return videos_info @@ -94,12 +85,6 @@ class ToshIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @classmethod - def _transform_rtmp_url(cls, rtmp_video_url): - new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url) - new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm') - return new_urls - class ComedyCentralTVIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6fa7c334e..dce8c7d0d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1967,10 +1967,13 @@ class InfoExtractor(object): entries.append(media_info) return entries - def _extract_akamai_formats(self, manifest_url, video_id): + def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + hds_host = hosts.get('hds') + if hds_host: + f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) if 'hdcore=' not in f4m_url: f4m_url += ('&' if '?' 
in f4m_url else '?') + hdcore_sign f4m_formats = self._extract_f4m_formats( @@ -1978,7 +1981,10 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) - m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + hls_host = hosts.get('hls') + if hls_host: + m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7e5d4f227..d4576160b 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, ) from .senateisvp import SenateISVPIE +from .ustream import UstreamIE class CSpanIE(InfoExtractor): @@ -22,14 +23,13 @@ class CSpanIE(InfoExtractor): 'md5': '94b29a4f131ff03d23471dd6f60b6a1d', 'info_dict': { 'id': '315139', - 'ext': 'mp4', 'title': 'Attorney General Eric Holder on Voting Rights Act Decision', - 'description': 'Attorney General Eric Holder speaks to reporters following the Supreme Court decision in [Shelby County v. 
Holder], in which the court ruled that the preclearance provisions of the Voting Rights Act could not be enforced.', }, + 'playlist_mincount': 2, 'skip': 'Regularly fails on travis, for unknown reasons', }, { 'url': 'http://www.c-span.org/video/?c4486943/cspan-international-health-care-models', - 'md5': '8e5fbfabe6ad0f89f3012a7943c1287b', + # md5 is unstable 'info_dict': { 'id': 'c4486943', 'ext': 'mp4', @@ -38,14 +38,11 @@ class CSpanIE(InfoExtractor): } }, { 'url': 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall', - 'md5': '2ae5051559169baadba13fc35345ae74', 'info_dict': { 'id': '342759', - 'ext': 'mp4', 'title': 'General Motors Ignition Switch Recall', - 'duration': 14848, - 'description': 'md5:118081aedd24bf1d3b68b3803344e7f3' }, + 'playlist_mincount': 6, }, { # Video from senate.gov 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', @@ -57,12 +54,30 @@ class CSpanIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 downloads } + }, { + # Ustream embedded video + 'url': 'https://www.c-span.org/video/?114917-1/armed-services', + 'info_dict': { + 'id': '58428542', + 'ext': 'flv', + 'title': 'USHR07 Armed Services Committee', + 'description': 'hsas00-2118-20150204-1000et-07\n\n\nUSHR07 Armed Services Committee', + 'timestamp': 1423060374, + 'upload_date': '20150204', + 'uploader': 'HouseCommittee', + 'uploader_id': '12987475', + }, }] def _real_extract(self, url): video_id = self._match_id(url) video_type = None webpage = self._download_webpage(url, video_id) + + ustream_url = UstreamIE._extract_url(webpage) + if ustream_url: + return self.url_result(ustream_url, UstreamIE.ie_key()) + # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py 
index 1edd8e7bd..bcd9fe2a0 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -66,7 +66,7 @@ class DramaFeverBaseIE(AMPIE): class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' - _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P<id>[0-9]+/[0-9]+)(?:/|$)' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { @@ -103,6 +103,9 @@ class DramaFeverIE(DramaFeverBaseIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://www.dramafever.com/zh-cn/drama/4972/15/Doctor_Romantic/', + 'only_matching': True, }] def _real_extract(self, url): @@ -148,7 +151,7 @@ class DramaFeverIE(DramaFeverBaseIE): class DramaFeverSeriesIE(DramaFeverBaseIE): IE_NAME = 'dramafever:series' - _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' + _VALID_URL = r'https?://(?:www\.)?dramafever\.com/(?:[^/]+/)?drama/(?P<id>[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' _TESTS = [{ 'url': 'http://www.dramafever.com/drama/4512/Cooking_with_Shin/', 'info_dict': { diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py new file mode 100644 index 000000000..db921465e --- /dev/null +++ b/youtube_dl/extractor/egghead.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class EggheadCourseIE(InfoExtractor): + IE_DESC = 'egghead.io course' + IE_NAME = 'egghead:course' + _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)' + _TEST = { + 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', + 'playlist_count': 29, + 'info_dict': { + 'id': 'professor-frisby-introduces-composable-functional-javascript', + 'title': 'Professor Frisby Introduces Composable Functional JavaScript', + 'description': 're:(?s)^This 
course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', + }, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title') + ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list') + + found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul) + entries = [self.url_result(m) for m in found] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ed9a133ea..cfddf5b92 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -77,6 +77,10 @@ from .awaan import ( AWAANLiveIE, AWAANSeasonIE, ) +from .azmedien import ( + AZMedienIE, + AZMedienShowIE, +) from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE @@ -88,6 +92,7 @@ from .bbc import ( BBCCoUkPlaylistIE, BBCIE, ) +from .beampro import BeamProLiveIE from .beeg import BeegIE from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE @@ -252,6 +257,7 @@ from .dw import ( from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE +from .egghead import EggheadCourseIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .einthusan import EinthusanIE @@ -366,6 +372,7 @@ from .hgtv import ( ) from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE @@ -393,6 +400,7 @@ from .imgur import ( ImgurAlbumIE, ) from .ina import InaIE 
+from .inc import IncIE from .indavideo import ( IndavideoIE, IndavideoEmbedIE, @@ -716,6 +724,7 @@ from .polskieradio import ( ) from .porn91 import Porn91IE from .porncom import PornComIE +from .pornflip import PornFlipIE from .pornhd import PornHdIE from .pornhub import ( PornHubIE, diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index c6fb67057..081c71842 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -86,18 +86,43 @@ class FirstTVIE(InfoExtractor): title = item['title'] quality = qualities(QUALITIES) formats = [] + path = None for f in item.get('mbr', []): src = f.get('src') if not src or not isinstance(src, compat_str): continue tbr = int_or_none(self._search_regex( r'_(\d{3,})\.mp4', src, 'tbr', default=None)) + if not path: + path = self._search_regex( + r'//[^/]+/(.+?)_\d+\.mp4', src, + 'm3u8 path', default=None) formats.append({ 'url': src, 'format_id': f.get('name'), 'tbr': tbr, - 'quality': quality(f.get('name')), + 'source_preference': quality(f.get('name')), }) + # m3u8 URL format is reverse engineered from [1] (search for + # master.m3u8). dashEdges (that is currently balancer-vod.1tv.ru) + # is taken from [2]. + # 1. http://static.1tv.ru/player/eump1tv-current/eump-1tv.all.min.js?rnd=9097422834:formatted + # 2. 
http://static.1tv.ru/player/eump1tv-config/config-main.js?rnd=9097422834 + if not path and len(formats) == 1: + path = self._search_regex( + r'//[^/]+/(.+?$)', formats[0]['url'], + 'm3u8 path', default=None) + if path: + if len(formats) == 1: + m3u8_path = ',' + else: + tbrs = [compat_str(t) for t in sorted(f['tbr'] for f in formats)] + m3u8_path = '_,%s,%s' % (','.join(tbrs), '.mp4') + formats.extend(self._extract_m3u8_formats( + 'http://balancer-vod.1tv.ru/%s%s.urlset/master.m3u8' + % (path, m3u8_path), + display_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) self._sort_formats(formats) thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py index 1902a2393..b7be40f1b 100644 --- a/youtube_dl/extractor/flipagram.py +++ b/youtube_dl/extractor/flipagram.py @@ -81,7 +81,7 @@ class FlipagramIE(InfoExtractor): 'filesize': int_or_none(cover.get('size')), } for cover in flipagram.get('covers', []) if cover.get('url')] - # Note that this only retrieves comments that are initally loaded. + # Note that this only retrieves comments that are initially loaded. # For videos with large amounts of comments, most won't be retrieved. 
comments = [] for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): diff --git a/youtube_dl/extractor/freesound.py b/youtube_dl/extractor/freesound.py index 5ff62af2a..138b6bc58 100644 --- a/youtube_dl/extractor/freesound.py +++ b/youtube_dl/extractor/freesound.py @@ -3,10 +3,16 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + float_or_none, + get_element_by_class, + get_element_by_id, + unified_strdate, +) class FreesoundIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/[^/]+/sounds/(?P<id>[^/]+)' _TEST = { 'url': 'http://www.freesound.org/people/miklovan/sounds/194503/', 'md5': '12280ceb42c81f19a515c745eae07650', @@ -14,26 +20,60 @@ class FreesoundIE(InfoExtractor): 'id': '194503', 'ext': 'mp3', 'title': 'gulls in the city.wav', - 'uploader': 'miklovan', 'description': 'the sounds of seagulls in the city', + 'duration': 130.233, + 'uploader': 'miklovan', + 'upload_date': '20130715', + 'tags': list, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - music_id = mobj.group('id') - webpage = self._download_webpage(url, music_id) - title = self._html_search_regex( - r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>', - webpage, 'music title', flags=re.DOTALL) + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + audio_url = self._og_search_property('audio', webpage, 'song url') + title = self._og_search_property('audio:title', webpage, 'song title') + description = self._html_search_regex( - r'<div id="sound_description">(.*?)</div>', webpage, 'description', - fatal=False, flags=re.DOTALL) + r'(?s)id=["\']sound_description["\'][^>]*>(.+?)</div>', + webpage, 'description', fatal=False) + + duration = float_or_none( + get_element_by_class('duration', webpage), scale=1000) + + upload_date = 
unified_strdate(get_element_by_id('sound_date', webpage)) + uploader = self._og_search_property( + 'audio:artist', webpage, 'uploader', fatal=False) + + channels = self._html_search_regex( + r'Channels</dt><dd>(.+?)</dd>', webpage, + 'channels info', fatal=False) + + tags_str = get_element_by_class('tags', webpage) + tags = re.findall(r'<a[^>]+>([^<]+)', tags_str) if tags_str else None + + audio_urls = [audio_url] + + LQ_FORMAT = '-lq.mp3' + if LQ_FORMAT in audio_url: + audio_urls.append(audio_url.replace(LQ_FORMAT, '-hq.mp3')) + + formats = [{ + 'url': format_url, + 'format_note': channels, + 'quality': quality, + } for quality, format_url in enumerate(audio_urls)] + self._sort_formats(formats) return { - 'id': music_id, + 'id': audio_id, 'title': title, - 'url': self._og_search_property('audio', webpage, 'music url'), - 'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'), 'description': description, + 'duration': duration, + 'uploader': uploader, + 'upload_date': upload_date, + 'tags': tags, + 'formats': formats, } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 86dc79307..40201f311 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -78,6 +78,8 @@ from .vbox7 import Vbox7IE from .dbtv import DBTVIE from .piksel import PikselIE from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE class GenericIE(InfoExtractor): @@ -422,6 +424,26 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, + { + # Brightcove with alternative playerID key + 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', + 'info_dict': { + 'id': 'nmeth.2062_SV1', + 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', + }, + 
'playlist': [{ + 'info_dict': { + 'id': '2228375078001', + 'ext': 'mp4', + 'title': 'nmeth.2062-sv1', + 'description': 'nmeth.2062-sv1', + 'timestamp': 1363357591, + 'upload_date': '20130315', + 'uploader': 'Nature Publishing Group', + 'uploader_id': '1964492299001', + }, + }], + }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -567,17 +589,6 @@ class GenericIE(InfoExtractor): 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, - # Embedded Ustream video - { - 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', - 'md5': '27b99cdb639c9b12a79bca876a073417', - 'info_dict': { - 'id': '45734260', - 'ext': 'flv', - 'uploader': 'AU SPA: The NSA and Privacy', - 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman' - } - }, # nowvideo embed hidden behind percent encoding { 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', @@ -1448,6 +1459,20 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 2, }, + { + # 20 minuten embed + 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TwentyMinutenIE.ie_key()], + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1939,7 +1964,14 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) + embed_token = self._search_regex( + r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', + webpage, 'ooyala embed token', default=None) + return 
OoyalaIE._build_url_result(smuggle_url( + mobj.group('ec'), { + 'domain': url, + 'embed_token': embed_token, + })) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -2070,10 +2102,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'TED') # Look for embedded Ustream videos - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ustream') + ustream_url = UstreamIE._extract_url(webpage) + if ustream_url: + return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player mobj = re.search( @@ -2394,6 +2425,12 @@ class GenericIE(InfoExtractor): if videa_urls: return _playlist_from_matches(videa_urls, ie=VideaIE.ie_key()) + # Look for 20 minuten embeds + twentymin_urls = TwentyMinutenIE._extract_urls(webpage) + if twentymin_urls: + return _playlist_from_matches( + twentymin_urls, ie=TwentyMinutenIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/hitrecord.py b/youtube_dl/extractor/hitrecord.py new file mode 100644 index 000000000..01a6946d0 --- /dev/null +++ b/youtube_dl/extractor/hitrecord.py @@ -0,0 +1,68 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + float_or_none, + int_or_none, + try_get, +) + + +class HitRecordIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hitrecord\.org/records/(?P<id>\d+)' + _TEST = { + 'url': 'https://hitrecord.org/records/2954362', + 'md5': 'fe1cdc2023bce0bbb95c39c57426aa71', + 'info_dict': { + 'id': '2954362', + 'ext': 'mp4', + 'title': 'A Very Different World (HITRECORD x ACLU)', + 'description': 'md5:e62defaffab5075a5277736bead95a3d', + 'duration': 
139.327, + 'timestamp': 1471557582, + 'upload_date': '20160818', + 'uploader': 'Zuzi.C12', + 'uploader_id': '362811', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'tags': list, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'https://hitrecord.org/api/web/records/%s' % video_id, video_id) + + title = video['title'] + video_url = video['source_url']['mp4_url'] + + tags = None + tags_list = try_get(video, lambda x: x['tags'], list) + if tags_list: + tags = [ + t['text'] + for t in tags_list + if isinstance(t, dict) and t.get('text') and + isinstance(t['text'], compat_str)] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': clean_html(video.get('body')), + 'duration': float_or_none(video.get('duration'), 1000), + 'timestamp': int_or_none(video.get('created_at_i')), + 'uploader': try_get( + video, lambda x: x['user']['username'], compat_str), + 'uploader_id': try_get( + video, lambda x: compat_str(x['user']['id'])), + 'view_count': int_or_none(video.get('total_views_count')), + 'like_count': int_or_none(video.get('hearts_count')), + 'comment_count': int_or_none(video.get('comments_count')), + 'tags': tags, + } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f0fc8d49a..f95c00c73 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -13,7 +13,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-)vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -32,6 +32,9 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897', 'only_matching': True, + }, { + 'url': 
'http://www.imdb.com/videoplayer/vi1562949145', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/inc.py b/youtube_dl/extractor/inc.py new file mode 100644 index 000000000..241ec83c4 --- /dev/null +++ b/youtube_dl/extractor/inc.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .kaltura import KalturaIE + + +class IncIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?inc\.com/(?:[^/]+/)+(?P<id>[^.]+).html' + _TESTS = [{ + 'url': 'http://www.inc.com/tip-sheet/bill-gates-says-these-5-books-will-make-you-smarter.html', + 'md5': '7416739c9c16438c09fa35619d6ba5cb', + 'info_dict': { + 'id': '1_wqig47aq', + 'ext': 'mov', + 'title': 'Bill Gates Says These 5 Books Will Make You Smarter', + 'description': 'md5:bea7ff6cce100886fc1995acb743237e', + 'timestamp': 1474414430, + 'upload_date': '20160920', + 'uploader_id': 'video@inc.com', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.inc.com/video/david-whitford/founders-forum-tripadvisor-steve-kaufer-most-enjoyable-moment-for-entrepreneur.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + partner_id = self._search_regex( + r'var\s+_?bizo_data_partner_id\s*=\s*["\'](\d+)', webpage, 'partner id') + + kaltura_id = self._parse_json(self._search_regex( + r'pageInfo\.videos\s*=\s*\[(.+)\];', webpage, 'kaltura id'), + display_id)['vid_kaltura_id'] + + return self.url_result( + 'kaltura:%s:%s' % (partner_id, kaltura_id), KalturaIE.ie_key()) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 905a0e85f..e635f3c4d 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -59,14 +59,26 @@ class LimelightBaseIE(InfoExtractor): format_id = 'rtmp' if stream.get('videoBitRate'): format_id += '-%d' % int_or_none(stream['videoBitRate']) - 
http_url = 'http://cpl.delvenetworks.com/' + rtmp.group('playpath')[4:] - urls.append(http_url) - http_fmt = fmt.copy() - http_fmt.update({ - 'url': http_url, - 'format_id': format_id.replace('rtmp', 'http'), - }) - formats.append(http_fmt) + http_format_id = format_id.replace('rtmp', 'http') + + CDN_HOSTS = ( + ('delvenetworks.com', 'cpl.delvenetworks.com'), + ('video.llnw.net', 's2.content.video.llnw.net'), + ) + for cdn_host, http_host in CDN_HOSTS: + if cdn_host not in rtmp.group('host').lower(): + continue + http_url = 'http://%s/%s' % (http_host, rtmp.group('playpath')[4:]) + urls.append(http_url) + if self._is_valid_url(http_url, video_id, http_format_id): + http_fmt = fmt.copy() + http_fmt.update({ + 'url': http_url, + 'format_id': http_format_id, + }) + formats.append(http_fmt) + break + fmt.update({ 'url': rtmp.group('url'), 'play_path': rtmp.group('playpath'), diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 8984d3b8d..79e0b8ada 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -190,7 +190,7 @@ class MiTeleIE(InfoExtractor): return { '_type': 'url_transparent', # for some reason only HLS is supported - 'url': smuggle_url('ooyala:' + embedCode, {'supportedformats': 'm3u8'}), + 'url': smuggle_url('ooyala:' + embedCode, {'supportedformats': 'm3u8,dash'}), 'id': video_id, 'title': title, 'description': description, diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index 4ba2310fd..a24b3165a 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -16,7 +16,6 @@ from ..utils import ( clean_html, ExtractorError, OnDemandPagedList, - parse_count, str_to_int, ) @@ -36,7 +35,6 @@ class MixcloudIE(InfoExtractor): 'uploader_id': 'dholbach', 'thumbnail': r're:https?://.*\.jpg', 'view_count': int, - 'like_count': int, }, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', @@ -49,7 +47,6 @@ class 
MixcloudIE(InfoExtractor): 'uploader_id': 'gillespeterson', 'thumbnail': 're:https?://.*', 'view_count': int, - 'like_count': int, }, }, { 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', @@ -89,26 +86,18 @@ class MixcloudIE(InfoExtractor): song_url = play_info['stream_url'] - PREFIX = ( - r'm-play-on-spacebar[^>]+' - r'(?:\s+[a-zA-Z0-9-]+(?:="[^"]+")?)*?\s+') - title = self._html_search_regex( - PREFIX + r'm-title="([^"]+)"', webpage, 'title') + title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title') thumbnail = self._proto_relative_url(self._html_search_regex( - PREFIX + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', - fatal=False)) + r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False)) uploader = self._html_search_regex( - PREFIX + r'm-owner-name="([^"]+)"', - webpage, 'uploader', fatal=False) + r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False) uploader_id = self._search_regex( r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False) description = self._og_search_description(webpage) - like_count = parse_count(self._search_regex( - r'\bbutton-favorite[^>]+>.*?<span[^>]+class=["\']toggle-number[^>]+>\s*([^<]+)', - webpage, 'like count', default=None)) view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', - r'/listeners/?">([0-9,.]+)</a>'], + r'/listeners/?">([0-9,.]+)</a>', + r'm-tooltip=["\']([\d,.]+) plays'], webpage, 'play count', default=None)) return { @@ -120,7 +109,6 @@ class MixcloudIE(InfoExtractor): 'uploader': uploader, 'uploader_id': uploader_id, 'view_count': view_count, - 'like_count': like_count, } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index e1f1f8fa4..e48ea2481 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -13,11 +13,11 @@ from ..utils import ( fix_xml_ampersands, float_or_none, HEADRequest, - NO_DEFAULT, RegexNotFoundError, 
sanitized_Request, strip_or_none, timeconvert, + try_get, unescapeHTML, update_url_query, url_basename, @@ -42,15 +42,6 @@ class MTVServicesInfoExtractor(InfoExtractor): # Remove the templates, like &device={device} return re.sub(r'&[^=]*?={.*?}(?=(&|$))', '', url) - # This was originally implemented for ComedyCentral, but it also works here - @classmethod - def _transform_rtmp_url(cls, rtmp_video_url): - m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url) - if not m: - return {'rtmp': rtmp_video_url} - base = 'http://viacommtvstrmfs.fplive.net/' - return {'http': base + m.group('finalid')} - def _get_feed_url(self, uri): return self._FEED_URL @@ -88,24 +79,31 @@ class MTVServicesInfoExtractor(InfoExtractor): formats = [] for rendition in mdoc.findall('.//rendition'): - if rendition.attrib['method'] == 'hls': + if rendition.get('method') == 'hls': hls_url = rendition.find('./src').text - formats.extend(self._extract_m3u8_formats(hls_url, video_id, ext='mp4')) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id='hls')) else: # fms try: _, _, ext = rendition.attrib['type'].partition('/') rtmp_video_url = rendition.find('./src').text + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % self.IE_NAME, + expected=True) if rtmp_video_url.endswith('siteunavail.png'): continue - new_urls = self._transform_rtmp_url(rtmp_video_url) formats.extend([{ - 'ext': 'flv' if new_url.startswith('rtmp') else ext, - 'url': new_url, - 'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])), + 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, + 'url': rtmp_video_url, + 'format_id': '-'.join(filter(None, [ + 'rtmp' if rtmp_video_url.startswith('rtmp') else None, + rendition.get('bitrate')])), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), - } for kind, new_url in new_urls.items()]) + }]) 
except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) @@ -123,7 +121,7 @@ class MTVServicesInfoExtractor(InfoExtractor): } for typographic in transcript.findall('./typographic')] return subtitles - def _get_video_info(self, itemdoc, use_hls): + def _get_video_info(self, itemdoc, use_hls=True): uri = itemdoc.find('guid').text video_id = self._id_from_uri(uri) self.report_extraction(video_id) @@ -193,13 +191,13 @@ class MTVServicesInfoExtractor(InfoExtractor): data['lang'] = self._LANG return data - def _get_videos_info(self, uri, use_hls=False): + def _get_videos_info(self, uri, use_hls=True): video_id = self._id_from_uri(uri) feed_url = self._get_feed_url(uri) info_url = update_url_query(feed_url, self._get_feed_query(uri)) return self._get_videos_info_from_url(info_url, video_id, use_hls) - def _get_videos_info_from_url(self, url, video_id, use_hls): + def _get_videos_info_from_url(self, url, video_id, use_hls=True): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) @@ -211,7 +209,28 @@ class MTVServicesInfoExtractor(InfoExtractor): [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')], playlist_title=title, playlist_description=description) - def _extract_mgid(self, webpage, default=NO_DEFAULT): + def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None): + triforce_feed = self._parse_json(self._search_regex( + r'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n', webpage, + 'triforce feed', default='{}'), video_id, fatal=False) + + data_zone = self._search_regex( + r'data-zone=(["\'])(?P<zone>.+?_lc_promo.*?)\1', webpage, + 'data zone', default=data_zone, group='zone') + + feed_url = try_get( + triforce_feed, lambda x: x['manifest']['zones'][data_zone]['feed'], + compat_str) + if not feed_url: + return + + feed = self._download_json(feed_url, video_id, fatal=False) + if not feed: + return + + return try_get(feed, lambda x: 
x['result']['data']['id'], compat_str) + + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -231,7 +250,11 @@ class MTVServicesInfoExtractor(InfoExtractor): sm4_embed = self._html_search_meta( 'sm4:video:embed', webpage, 'sm4 embed', default='') mgid = self._search_regex( - r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=default) + r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) + + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + return mgid def _real_extract(self, url): diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 055070ff5..e8131333f 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -12,10 +12,10 @@ from ..utils import ( class NaverIE(InfoExtractor): - _VALID_URL = r'https?://(?:m\.)?tvcast\.naver\.com/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:m\.)?tv(?:cast)?\.naver\.com/v/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://tvcast.naver.com/v/81652', + 'url': 'http://tv.naver.com/v/81652', 'info_dict': { 'id': '81652', 'ext': 'mp4', @@ -24,7 +24,7 @@ class NaverIE(InfoExtractor): 'upload_date': '20130903', }, }, { - 'url': 'http://tvcast.naver.com/v/395837', + 'url': 'http://tv.naver.com/v/395837', 'md5': '638ed4c12012c458fefcddfd01f173cd', 'info_dict': { 'id': '395837', @@ -34,6 +34,9 @@ class NaverIE(InfoExtractor): 'upload_date': '20150519', }, 'skip': 'Georestricted', + }, { + 'url': 'http://tvcast.naver.com/v/81652', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index c900f232a..626ed8b49 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import parse_iso8601 @@ -30,6 +31,12 @@ class 
NextMediaIE(InfoExtractor): return self._extract_from_nextmedia_page(news_id, url, page) def _extract_from_nextmedia_page(self, news_id, url, page): + redirection_url = self._search_regex( + r'window\.location\.href\s*=\s*([\'"])(?P<url>(?!\1).+)\1', + page, 'redirection URL', default=None, group='url') + if redirection_url: + return self.url_result(compat_urlparse.urljoin(url, redirection_url)) + title = self._fetch_title(page) video_url = self._search_regex(self._URL_PATTERN, page, 'video url') @@ -93,7 +100,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/(?:animation|appledaily|enews|realtimenews|actionnews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/[^/]+/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', @@ -157,6 +164,10 @@ class AppleDailyIE(NextMediaIE): }, { 'url': 'http://www.appledaily.com.tw/actionnews/appledaily/7/20161003/960588/', 'only_matching': True, + }, { + # Redirected from http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694 + 'url': 'http://ent.appledaily.com.tw/section/article/headline/20150128/36354694', + 'only_matching': True, }] _URL_PATTERN = r'\{url: \'(.+)\'\}' diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 7672845bf..08a75929e 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -10,7 +10,7 @@ from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): # None of videos on the website are still alive? 
IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', @@ -57,6 +57,9 @@ class NickIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', 'only_matching': True, + }, { + 'url': 'http://beta.nick.com/nicky-ricky-dicky-and-dawn/videos/nicky-ricky-dicky-dawn-301-full-episode/', + 'only_matching': True, }] def _get_feed_query(self, uri): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index a104e33f8..8baac23e4 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -7,7 +7,6 @@ import datetime from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, ) from ..utils import ( @@ -40,6 +39,7 @@ class NiconicoIE(InfoExtractor): 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, }, + 'skip': 'Requires an account', }, { # File downloaded with and without credentials are different, so omit # the md5 field @@ -55,6 +55,7 @@ class NiconicoIE(InfoExtractor): 'timestamp': 1304065916, 'duration': 209, }, + 'skip': 'Requires an account', }, { # 'video exists but is marked as "deleted" # md5 is unstable @@ -65,9 +66,10 @@ class NiconicoIE(InfoExtractor): 'description': 'deleted', 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>', 'upload_date': '20071224', - 'timestamp': 1198527840, # timestamp field has different value if logged in + 'timestamp': int, # timestamp field has different value if logged in 'duration': 304, }, + 'skip': 'Requires an account', }, { 'url': 'http://www.nicovideo.jp/watch/so22543406', 
'info_dict': { @@ -79,13 +81,12 @@ class NiconicoIE(InfoExtractor): 'upload_date': '20140104', 'uploader': 'アニメロチャンネル', 'uploader_id': '312', - } + }, + 'skip': 'The viewing period of the video you were searching for has expired.', }] _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' - # Determine whether the downloader used authentication to download video - _AUTHENTICATED = False def _real_initialize(self): self._login() @@ -109,8 +110,6 @@ class NiconicoIE(InfoExtractor): if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False - # Successful login - self._AUTHENTICATED = True return True def _real_extract(self, url): @@ -128,35 +127,19 @@ class NiconicoIE(InfoExtractor): 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, note='Downloading video info page') - if self._AUTHENTICATED: - # Get flv info - flv_info_webpage = self._download_webpage( - 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', - video_id, 'Downloading flv info') - else: - # Get external player info - ext_player_info = self._download_webpage( - 'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id) - thumb_play_key = self._search_regex( - r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey') - - # Get flv info - flv_info_data = compat_urllib_parse_urlencode({ - 'k': thumb_play_key, - 'v': video_id - }) - flv_info_request = sanitized_Request( - 'http://ext.nicovideo.jp/thumb_watch', flv_info_data, - {'Content-Type': 'application/x-www-form-urlencoded'}) - flv_info_webpage = self._download_webpage( - flv_info_request, video_id, - note='Downloading flv info', errnote='Unable to download flv info') + # Get flv info + flv_info_webpage = self._download_webpage( + 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', + video_id, 'Downloading flv info') 
flv_info = compat_urlparse.parse_qs(flv_info_webpage) if 'url' not in flv_info: if 'deleted' in flv_info: raise ExtractorError('The video has been deleted.', expected=True) + elif 'closed' in flv_info: + raise ExtractorError('Niconico videos now require logging in', + expected=True) else: raise ExtractorError('Unable to find video URL') diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py index c2807d0f6..84be2b1e3 100644 --- a/youtube_dl/extractor/ooyala.py +++ b/youtube_dl/extractor/ooyala.py @@ -18,7 +18,7 @@ class OoyalaBaseIE(InfoExtractor): _CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/' _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?' - def _extract(self, content_tree_url, video_id, domain='example.org', supportedformats=None): + def _extract(self, content_tree_url, video_id, domain='example.org', supportedformats=None, embed_token=None): content_tree = self._download_json(content_tree_url, video_id)['content_tree'] metadata = content_tree[list(content_tree)[0]] embed_code = metadata['embed_code'] @@ -29,7 +29,8 @@ class OoyalaBaseIE(InfoExtractor): self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code) + compat_urllib_parse_urlencode({ 'domain': domain, - 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds', + 'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth', + 'embedToken': embed_token, }), video_id) cur_auth_data = auth_data['authorization_data'][embed_code] @@ -52,6 +53,12 @@ class OoyalaBaseIE(InfoExtractor): elif delivery_type == 'hds' or ext == 'f4m': formats.extend(self._extract_f4m_formats( s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False)) + elif delivery_type == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + s_url, embed_code, mpd_id='dash', fatal=False)) + elif delivery_type == 'smooth': + self._extract_ism_formats( + s_url, embed_code, ism_id='mss', fatal=False) elif ext == 'smil': 
formats.extend(self._extract_smil_formats( s_url, embed_code, fatal=False)) @@ -146,8 +153,9 @@ class OoyalaIE(OoyalaBaseIE): embed_code = self._match_id(url) domain = smuggled_data.get('domain') supportedformats = smuggled_data.get('supportedformats') + embed_token = smuggled_data.get('embed_token') content_tree_url = self._CONTENT_TREE_BASE + 'embed_code/%s/%s' % (embed_code, embed_code) - return self._extract(content_tree_url, embed_code, domain, supportedformats) + return self._extract(content_tree_url, embed_code, domain, supportedformats, embed_token) class OoyalaExternalIE(OoyalaBaseIE): diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 2ce9f3826..3d4ad7dca 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -64,16 +64,17 @@ class OpenloadIE(InfoExtractor): raise ExtractorError('File not found', expected=True) ol_id = self._search_regex( - '<span[^>]+id="[a-zA-Z0-9]+x"[^>]*>([0-9]+)</span>', + '<span[^>]+id="[^"]+"[^>]*>([0-9]+)</span>', webpage, 'openload ID') - first_two_chars = int(float(ol_id[0:][:2])) + first_three_chars = int(float(ol_id[0:][:3])) + fifth_char = int(float(ol_id[3:5])) urlcode = '' - num = 2 + num = 5 while num < len(ol_id): - urlcode += compat_chr(int(float(ol_id[num:][:3])) - - first_two_chars * int(float(ol_id[num + 3:][:2]))) + urlcode += compat_chr(int(float(ol_id[num:][:3])) + + first_three_chars - fifth_char * int(float(ol_id[num + 3:][:2]))) num += 5 video_url = 'https://openload.co/stream/' + urlcode diff --git a/youtube_dl/extractor/pornflip.py b/youtube_dl/extractor/pornflip.py new file mode 100644 index 000000000..a4a5d390e --- /dev/null +++ b/youtube_dl/extractor/pornflip.py @@ -0,0 +1,92 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_str, +) +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class PornFlipIE(InfoExtractor): 
+ _VALID_URL = r'https?://(?:www\.)?pornflip\.com/(?:v|embed)/(?P<id>[0-9A-Za-z]{11})' + _TESTS = [{ + 'url': 'https://www.pornflip.com/v/wz7DfNhMmep', + 'md5': '98c46639849145ae1fd77af532a9278c', + 'info_dict': { + 'id': 'wz7DfNhMmep', + 'ext': 'mp4', + 'title': '2 Amateurs swallow make his dream cumshots true', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 112, + 'timestamp': 1481655502, + 'upload_date': '20161213', + 'uploader_id': '106786', + 'uploader': 'figifoto', + 'view_count': int, + 'age_limit': 18, + } + }, { + 'url': 'https://www.pornflip.com/embed/wz7DfNhMmep', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.pornflip.com/v/%s' % video_id, video_id) + + flashvars = compat_parse_qs(self._search_regex( + r'<embed[^>]+flashvars=(["\'])(?P<flashvars>(?:(?!\1).)+)\1', + webpage, 'flashvars', group='flashvars')) + + title = flashvars['video_vars[title]'][0] + + def flashvar(kind): + return try_get( + flashvars, lambda x: x['video_vars[%s]' % kind][0], compat_str) + + formats = [] + for key, value in flashvars.items(): + if not (value and isinstance(value, list)): + continue + format_url = value[0] + if key == 'video_vars[hds_manifest]': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + continue + height = self._search_regex( + r'video_vars\[video_urls\]\[(\d+)', key, 'height', default=None) + if not height: + continue + formats.append({ + 'url': format_url, + 'format_id': 'http-%s' % height, + 'height': int_or_none(height), + }) + self._sort_formats(formats) + + uploader = self._html_search_regex( + (r'<span[^>]+class="name"[^>]*>\s*<a[^>]+>\s*<strong>(?P<uploader>[^<]+)', + r'<meta[^>]+content=(["\'])[^>]*\buploaded by (?P<uploader>.+?)\1'), + webpage, 'uploader', fatal=False, group='uploader') + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': flashvar('big_thumb'), 
+ 'duration': int_or_none(flashvar('duration')), + 'timestamp': unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')), + 'uploader_id': flashvar('author_id'), + 'uploader': uploader, + 'view_count': int_or_none(flashvar('views')), + 'age_limit': 18, + } diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index abfee3ece..c59896a17 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -46,7 +46,7 @@ class SpikeIE(MTVServicesInfoExtractor): _CUSTOM_URL_REGEX = re.compile(r'spikenetworkapp://([^/]+/[-a-fA-F0-9]+)') def _extract_mgid(self, webpage): - mgid = super(SpikeIE, self)._extract_mgid(webpage, default=None) + mgid = super(SpikeIE, self)._extract_mgid(webpage) if mgid is None: url_parts = self._search_regex(self._CUSTOM_URL_REGEX, webpage, 'episode_id') video_type, episode_id = url_parts.split('/', 1) diff --git a/youtube_dl/extractor/tv4.py b/youtube_dl/extractor/tv4.py index 29f62b970..ad79db92b 100644 --- a/youtube_dl/extractor/tv4.py +++ b/youtube_dl/extractor/tv4.py @@ -4,11 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, int_or_none, parse_iso8601, try_get, - update_url_query, + determine_ext, ) @@ -28,7 +27,7 @@ class TV4IE(InfoExtractor): _TESTS = [ { 'url': 'http://www.tv4.se/kalla-fakta/klipp/kalla-fakta-5-english-subtitles-2491650', - 'md5': '909d6454b87b10a25aa04c4bdd416a9b', + 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '2491650', 'ext': 'mp4', @@ -40,7 +39,7 @@ class TV4IE(InfoExtractor): }, { 'url': 'http://www.tv4play.se/iframe/video/3054113', - 'md5': '77f851c55139ffe0ebd41b6a5552489b', + 'md5': 'cb837212f342d77cec06e6dad190e96d', 'info_dict': { 'id': '3054113', 'ext': 'mp4', @@ -75,11 +74,10 @@ class TV4IE(InfoExtractor): # If is_geo_restricted is true, it doesn't necessarily mean we can't download it if info.get('is_geo_restricted'): 
self.report_warning('This content might not be available in your country due to licensing restrictions.') - if info.get('requires_subscription'): - raise ExtractorError('This content requires subscription.', expected=True) title = info['title'] + subtitles = {} formats = [] # http formats are linked with unresolvable host for kind in ('hls', ''): @@ -87,26 +85,41 @@ class TV4IE(InfoExtractor): 'https://prima.tv4play.se/api/web/asset/%s/play.json' % video_id, video_id, 'Downloading sources JSON', query={ 'protocol': kind, - 'videoFormat': 'MP4+WEBVTTS+WEBVTT', + 'videoFormat': 'MP4+WEBVTT', }) - item = try_get(data, lambda x: x['playback']['items']['item'], dict) - manifest_url = item.get('url') - if not isinstance(manifest_url, compat_str): + items = try_get(data, lambda x: x['playback']['items']['item']) + if not items: continue - if kind == 'hls': - formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=kind, fatal=False)) - else: - formats.extend(self._extract_f4m_formats( - update_url_query(manifest_url, {'hdcore': '3.8.0'}), - video_id, f4m_id='hds', fatal=False)) + if isinstance(items, dict): + items = [items] + for item in items: + manifest_url = item.get('url') + if not isinstance(manifest_url, compat_str): + continue + ext = determine_ext(manifest_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_akamai_formats( + manifest_url, video_id, { + 'hls': 'tv4play-i.akamaihd.net', + })) + elif ext == 'webvtt': + subtitles = self._merge_subtitles( + subtitles, { + 'sv': [{ + 'url': manifest_url, + 'ext': 'vtt', + }]}) self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, + 'subtitles': subtitles, 'description': info.get('description'), 'timestamp': parse_iso8601(info.get('broadcast_date_time')), 'duration': 
int_or_none(info.get('duration')), diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 1093a3829..a983ebf05 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -12,7 +12,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', @@ -43,7 +43,7 @@ class TwentyFourVideoIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.24video.net/video/view/%s' % video_id, video_id) + 'http://www.24video.sex/video/view/%s' % video_id, video_id) title = self._og_search_title(webpage) description = self._html_search_regex( @@ -69,11 +69,11 @@ class TwentyFourVideoIE(InfoExtractor): # Sets some cookies self._download_xml( - r'http://www.24video.net/video/xml/%s?mode=init' % video_id, + r'http://www.24video.sex/video/xml/%s?mode=init' % video_id, video_id, 'Downloading init XML') video_xml = self._download_xml( - 'http://www.24video.net/video/xml/%s?mode=play' % video_id, + 'http://www.24video.sex/video/xml/%s?mode=play' % video_id, video_id, 'Downloading video XML') video = xpath_element(video_xml, './/video', 'video', fatal=True) diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py index b721ecb0a..4fd1aa4bf 100644 --- a/youtube_dl/extractor/twentymin.py +++ b/youtube_dl/extractor/twentymin.py @@ -4,91 +4,88 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + int_or_none, + try_get, +) class TwentyMinutenIE(InfoExtractor): IE_NAME = '20min' - _VALID_URL = 
r'https?://(?:www\.)?20min\.ch/(?:videotv/*\?.*\bvid=(?P<id>\d+)|(?:[^/]+/)*(?P<display_id>[^/#?]+))' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)?20min\.ch/ + (?: + videotv/*\?.*?\bvid=| + videoplayer/videoplayer\.html\?.*?\bvideoId@ + ) + (?P<id>\d+) + ''' _TESTS = [{ - # regular video 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', - 'md5': 'b52d6bc6ea6398e6a38f12cfd418149c', + 'md5': 'e7264320db31eed8c38364150c12496e', 'info_dict': { 'id': '469148', - 'ext': 'flv', - 'title': '85 000 Franken für 15 perfekte Minuten', - 'description': 'Was die Besucher vom Silvesterzauber erwarten können. (Video: Alice Grosjean/Murat Temel)', - 'thumbnail': 'http://thumbnails.20min-tv.ch/server063/469148/frame-72-469148.jpg' - } - }, { - # news article with video - 'url': 'http://www.20min.ch/schweiz/news/story/-Wir-muessen-mutig-nach-vorne-schauen--22050469', - 'md5': 'cd4cbb99b94130cff423e967cd275e5e', - 'info_dict': { - 'id': '469408', - 'display_id': '-Wir-muessen-mutig-nach-vorne-schauen--22050469', - 'ext': 'flv', - 'title': '«Wir müssen mutig nach vorne schauen»', - 'description': 'Kein Land sei innovativer als die Schweiz, sagte Johann Schneider-Ammann in seiner Neujahrsansprache. 
Das Land müsse aber seine Hausaufgaben machen.', - 'thumbnail': 'http://www.20min.ch/images/content/2/2/0/22050469/10/teaserbreit.jpg' - }, - 'skip': '"This video is no longer available" is shown both on the web page and in the downloaded file.', - }, { - # YouTube embed - 'url': 'http://www.20min.ch/ro/sports/football/story/Il-marque-une-bicyclette-de-plus-de-30-metres--21115184', - 'md5': 'cec64d59aa01c0ed9dbba9cf639dd82f', - 'info_dict': { - 'id': 'ivM7A7SpDOs', 'ext': 'mp4', - 'title': 'GOLAZO DE CHILENA DE JAVI GÓMEZ, FINALISTA AL BALÓN DE CLM 2016', - 'description': 'md5:903c92fbf2b2f66c09de514bc25e9f5a', - 'upload_date': '20160424', - 'uploader': 'RTVCM Castilla-La Mancha', - 'uploader_id': 'RTVCM', + 'title': '85 000 Franken für 15 perfekte Minuten', + 'thumbnail': r're:https?://.*\.jpg$', + }, + }, { + 'url': 'http://www.20min.ch/videoplayer/videoplayer.html?params=client@twentyDE|videoId@523629', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + 'thumbnail': r're:https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, }, - 'add_ie': ['Youtube'], }, { 'url': 'http://www.20min.ch/videotv/?cid=44&vid=468738', 'only_matching': True, - }, { - 'url': 'http://www.20min.ch/ro/sortir/cinema/story/Grandir-au-bahut--c-est-dur-18927411', - 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', + webpage)] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + video_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + video = self._download_json( + 'http://api.20min.ch/video/%s/show' % video_id, + video_id)['content'] - youtube_url = 
self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube embed URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, 'Youtube') + title = video['title'] - title = self._html_search_regex( - r'<h1>.*?<span>(.+?)</span></h1>', - webpage, 'title', default=None) - if not title: - title = remove_end(re.sub( - r'^20 [Mm]inuten.*? -', '', self._og_search_title(webpage)), ' - News') + formats = [{ + 'format_id': format_id, + 'url': 'http://podcast.20min-tv.ch/podcast/20min/%s%s.mp4' % (video_id, p), + 'quality': quality, + } for quality, (format_id, p) in enumerate([('sd', ''), ('hd', 'h')])] + self._sort_formats(formats) - if not video_id: - video_id = self._search_regex( - r'"file\d?"\s*,\s*\"(\d+)', webpage, 'video id') + description = video.get('lead') + thumbnail = video.get('thumbnail') - description = self._html_search_meta( - 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) + def extract_count(kind): + return try_get( + video, + lambda x: int_or_none(x['communityobject']['thumbs_%s' % kind])) + + like_count = extract_count('up') + dislike_count = extract_count('down') return { 'id': video_id, - 'display_id': display_id, - 'url': 'http://speed.20min-tv.ch/%sm.flv' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'formats': formats, } diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py index c27c64387..e67083004 100644 --- a/youtube_dl/extractor/uol.py +++ b/youtube_dl/extractor/uol.py @@ -84,12 +84,27 @@ class UOLIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - if not video_id.isdigit(): - embed_page = self._download_webpage('https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, video_id) - video_id = self._search_regex(r'mediaId=(\d+)', embed_page, 'media id') + media_id = None 
+ + if video_id.isdigit(): + media_id = video_id + + if not media_id: + embed_page = self._download_webpage( + 'https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, + video_id, 'Downloading embed page', fatal=False) + if embed_page: + media_id = self._search_regex( + (r'uol\.com\.br/(\d+)', r'mediaId=(\d+)'), + embed_page, 'media id', default=None) + + if not media_id: + webpage = self._download_webpage(url, video_id) + media_id = self._search_regex(r'mediaId=(\d+)', webpage, 'media id') + video_data = self._download_json( - 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % video_id, - video_id)['item'] + 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % media_id, + media_id)['item'] title = video_data['title'] query = { @@ -118,7 +133,7 @@ class UOLIE(InfoExtractor): tags.append(tag_description) return { - 'id': video_id, + 'id': media_id, 'title': title, 'description': clean_html(video_data.get('desMedia')), 'thumbnail': video_data.get('thumbnail'), diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 0c06bf36b..5737d4d16 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -69,6 +69,13 @@ class UstreamIE(InfoExtractor): }, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): def num_to_hex(n): return hex(n)[2:] diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index d82261e5e..f0a8075fb 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -206,7 +206,7 @@ class VevoIE(VevoBaseIE): note='Retrieving oauth token', errnote='Unable to retrieve oauth token') - if 'THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION' in webpage: + if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', 
webpage): self.raise_geo_restricted( '%s said: This page is currently unavailable in your region' % self.IE_NAME) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 37e1da70d..a6bbd4c05 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -254,7 +254,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', 'duration': 10, - 'description': 'This is "youtube-dl password protected test video" by on Vimeo, the home for high quality videos and the people who love them.', + 'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f', }, 'params': { 'videopassword': 'youtube-dl', @@ -306,7 +306,7 @@ class VimeoIE(VimeoBaseInfoExtractor): { # contains original format 'url': 'https://vimeo.com/33951933', - 'md5': '2d9f5475e0537f013d0073e812ab89e6', + 'md5': '53c688fa95a55bf4b7293d37a89c5c53', 'info_dict': { 'id': '33951933', 'ext': 'mp4', @@ -324,7 +324,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'url': 'https://vimeo.com/channels/tributes/6213729', 'info_dict': { 'id': '6213729', - 'ext': 'mp4', + 'ext': 'mov', 'title': 'Vimeo Tribute: The Shining', 'uploader': 'Casey Donahue', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/caseydonahue', @@ -338,7 +338,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'expected_warnings': ['Unable to download JSON metadata'], }, { - # redirects to ondemand extractor and should be passed throught it + # redirects to ondemand extractor and should be passed through it # for successful extraction 'url': 'https://vimeo.com/73445910', 'info_dict': { @@ -629,6 +629,9 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', }, + 'params': { + 'format': 'best[protocol=https]', + }, }, { # requires Referer to be passed along with og:video:url 'url': 'https://vimeo.com/ondemand/36938/126682985', @@ -727,12 +730,12 @@ class 
VimeoChannelIE(VimeoBaseInfoExtractor): # Try extracting href first since not all videos are available via # short https://vimeo.com/id URL (e.g. https://vimeo.com/channels/tributes/6213729) clips = re.findall( - r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)', webpage) + r'id="clip_(\d+)"[^>]*>\s*<a[^>]+href="(/(?:[^/]+/)*\1)(?:[^>]+\btitle="([^"]+)")?', webpage) if clips: - for video_id, video_url in clips: + for video_id, video_url, video_title in clips: yield self.url_result( compat_urlparse.urljoin(base_url, video_url), - VimeoIE.ie_key(), video_id=video_id) + VimeoIE.ie_key(), video_id=video_id, video_title=video_title) # More relaxed fallback else: for video_id in re.findall(r'id=["\']clip_(\d+)', webpage): diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index 86abef257..d017e03de 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -16,7 +16,9 @@ class XiamiBaseIE(InfoExtractor): return webpage def _extract_track(self, track, track_id=None): - title = track['title'] + track_name = track.get('songName') or track.get('name') or track['subName'] + artist = track.get('artist') or track.get('artist_name') or track.get('singers') + title = '%s - %s' % (artist, track_name) if artist else track_name track_url = self._decrypt(track['location']) subtitles = {} @@ -31,9 +33,10 @@ class XiamiBaseIE(InfoExtractor): 'thumbnail': track.get('pic') or track.get('album_pic'), 'duration': int_or_none(track.get('length')), 'creator': track.get('artist', '').split(';')[0], - 'track': title, - 'album': track.get('album_name'), - 'artist': track.get('artist'), + 'track': track_name, + 'track_number': int_or_none(track.get('track')), + 'album': track.get('album_name') or track.get('title'), + 'artist': artist, 'subtitles': subtitles, } @@ -68,14 +71,14 @@ class XiamiBaseIE(InfoExtractor): class XiamiSongIE(XiamiBaseIE): IE_NAME = 'xiami:song' IE_DESC = '虾米音乐' - _VALID_URL = 
r'https?://(?:www\.)?xiami\.com/song/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/song/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.xiami.com/song/1775610518', 'md5': '521dd6bea40fd5c9c69f913c232cb57e', 'info_dict': { 'id': '1775610518', 'ext': 'mp3', - 'title': 'Woman', + 'title': 'HONNE - Woman', 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', 'duration': 265, 'creator': 'HONNE', @@ -95,7 +98,7 @@ class XiamiSongIE(XiamiBaseIE): 'info_dict': { 'id': '1775256504', 'ext': 'mp3', - 'title': '悟空', + 'title': '戴荃 - 悟空', 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', 'duration': 200, 'creator': '戴荃', @@ -109,6 +112,26 @@ class XiamiSongIE(XiamiBaseIE): }, }, 'skip': 'Georestricted', + }, { + 'url': 'http://www.xiami.com/song/1775953850', + 'info_dict': { + 'id': '1775953850', + 'ext': 'mp3', + 'title': 'До Скону - Чума Пожирает Землю', + 'thumbnail': r're:http://img\.xiami\.net/images/album/.*\.jpg', + 'duration': 683, + 'creator': 'До Скону', + 'track': 'Чума Пожирает Землю', + 'track_number': 7, + 'album': 'Ад', + 'artist': 'До Скону', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.xiami.com/song/xLHGwgd07a1', + 'only_matching': True, }] def _real_extract(self, url): @@ -124,7 +147,7 @@ class XiamiPlaylistBaseIE(XiamiBaseIE): class XiamiAlbumIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:album' IE_DESC = '虾米音乐 - 专辑' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/album/(?P<id>[^/?#&]+)' _TYPE = '1' _TESTS = [{ 'url': 'http://www.xiami.com/album/2100300444', @@ -136,28 +159,34 @@ class XiamiAlbumIE(XiamiPlaylistBaseIE): }, { 'url': 'http://www.xiami.com/album/512288?spm=a1z1s.6843761.1110925389.6.hhE9p9', 'only_matching': True, + }, { + 'url': 'http://www.xiami.com/album/URVDji2a506', + 'only_matching': True, }] class XiamiArtistIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:artist' IE_DESC = '虾米音乐 - 歌手' - _VALID_URL = 
r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/artist/(?P<id>[^/?#&]+)' _TYPE = '2' - _TEST = { + _TESTS = [{ 'url': 'http://www.xiami.com/artist/2132?spm=0.0.0.0.dKaScp', 'info_dict': { 'id': '2132', }, 'playlist_count': 20, 'skip': 'Georestricted', - } + }, { + 'url': 'http://www.xiami.com/artist/bC5Tk2K6eb99', + 'only_matching': True, + }] class XiamiCollectionIE(XiamiPlaylistBaseIE): IE_NAME = 'xiami:collection' IE_DESC = '虾米音乐 - 精选集' - _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?xiami\.com/collect/(?P<id>[^/?#&]+)' _TYPE = '3' _TEST = { 'url': 'http://www.xiami.com/collect/156527391?spm=a1z1s.2943601.6856193.12.4jpBnr', diff --git a/youtube_dl/extractor/yourupload.py b/youtube_dl/extractor/yourupload.py index 4ce327845..9fa772838 100644 --- a/youtube_dl/extractor/yourupload.py +++ b/youtube_dl/extractor/yourupload.py @@ -2,44 +2,37 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import urljoin class YourUploadIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)? 
- (?:yourupload\.com/watch| - embed\.yourupload\.com| - embed\.yucache\.net - )/(?P<id>[A-Za-z0-9]+) - ''' - _TESTS = [ - { - 'url': 'http://yourupload.com/watch/14i14h', - 'md5': '5e2c63385454c557f97c4c4131a393cd', - 'info_dict': { - 'id': '14i14h', - 'ext': 'mp4', - 'title': 'BigBuckBunny_320x180.mp4', - 'thumbnail': r're:^https?://.*\.jpe?g', - } - }, - { - 'url': 'http://embed.yourupload.com/14i14h', - 'only_matching': True, - }, - { - 'url': 'http://embed.yucache.net/14i14h?client_file_id=803349', - 'only_matching': True, - }, - ] + _VALID_URL = r'https?://(?:www\.)?(?:yourupload\.com/(?:watch|embed)|embed\.yourupload\.com)/(?P<id>[A-Za-z0-9]+)' + _TESTS = [{ + 'url': 'http://yourupload.com/watch/14i14h', + 'md5': '5e2c63385454c557f97c4c4131a393cd', + 'info_dict': { + 'id': '14i14h', + 'ext': 'mp4', + 'title': 'BigBuckBunny_320x180.mp4', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, { + 'url': 'http://www.yourupload.com/embed/14i14h', + 'only_matching': True, + }, { + 'url': 'http://embed.yourupload.com/14i14h', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - embed_url = 'http://embed.yucache.net/{0:}'.format(video_id) + embed_url = 'http://www.yourupload.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) title = self._og_search_title(webpage) - video_url = self._og_search_video_url(webpage) + video_url = urljoin(embed_url, self._og_search_video_url(webpage)) thumbnail = self._og_search_thumbnail(webpage, default=None) return { diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 335568a10..5202beb3e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -316,6 +316,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can 
vary (https://github.com/rg3/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, @@ -862,6 +863,35 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'params': { 'skip_download': True, }, + }, + { + # YouTube Red video with episode data + 'url': 'https://www.youtube.com/watch?v=iqKdEhx-dD4', + 'info_dict': { + 'id': 'iqKdEhx-dD4', + 'ext': 'mp4', + 'title': 'Isolation - Mind Field (Ep 1)', + 'description': 'md5:3a72f23c086a1496c9e2c54a25fa0822', + 'upload_date': '20170118', + 'uploader': 'Vsauce', + 'uploader_id': 'Vsauce', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce', + 'license': 'Standard YouTube License', + 'series': 'Mind Field', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': [ + 'Skipping DASH manifest', + ], + }, + { + # itag 212 + 'url': '1t24XAntNCY', + 'only_matching': True, } ] @@ -1448,6 +1478,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_alt_title = video_creator = None + m_episode = re.search( + r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', + video_webpage) + if m_episode: + series = m_episode.group('series') + season_number = int(m_episode.group('season')) + episode_number = int(m_episode.group('episode')) + else: + series = season_number = episode_number = None + m_cat_container = self._search_regex( 
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>', video_webpage, 'categories', default=None) @@ -1737,6 +1777,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'is_live': is_live, 'start_time': start_time, 'end_time': end_time, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, } @@ -1813,6 +1856,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'YDL_Empty_List', }, 'playlist_count': 0, + 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -1844,6 +1888,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, + 'skip': 'This playlist is private', }, { 'note': 'embedded', 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', @@ -1955,14 +2000,18 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): + # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604) + for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): match = match.strip() # Check if the playlist exists or is private - if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) + mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) + if mobj: + reason = mobj.group('reason') + message = 'This playlist %s' % reason + if 'private' in reason: + message += ', use --username or --netrc to access it' + message += '.' 
+ raise ExtractorError(message, expected=True) elif re.match(r'[^<]*Invalid parameters[^<]*', match): raise ExtractorError( 'Invalid parameters. Maybe URL is incorrect.', diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py index a8df4aef0..24cdec28c 100644 --- a/youtube_dl/jsinterp.py +++ b/youtube_dl/jsinterp.py @@ -213,7 +213,7 @@ class JSInterpreter(object): def extract_object(self, objname): obj = {} obj_m = re.search( - (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) + + (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) + r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' + r'\}\s*;', self.code) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 0eb4924b6..0d2ce8d15 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -751,7 +751,7 @@ def parseOpts(overrideArguments=None): help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', - help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default') + help='Specify audio format: "best", "aac", "vorbis", "mp3", "m4a", "opus", or "wav"; "%default" by default; No effect without -x') postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', @@ -867,7 +867,7 @@ def parseOpts(overrideArguments=None): if '--ignore-config' not in system_conf: user_conf = _readUserConf() - argv = system_conf + user_conf + command_line_conf + argv = system_conf + user_conf + custom_conf + command_line_conf opts, args = parser.parse_args(argv) if opts.verbose: for conf_label, conf in ( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 39dd6c49f..12863e74a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -128,7 +128,13 @@ DATE_FORMATS = ( '%d %B %Y', '%d %b %Y', '%B %d %Y', + '%B %dst %Y', + '%B %dnd %Y', + '%B %dth %Y', '%b %d %Y', + '%b %dst %Y', + 
'%b %dnd %Y', + '%b %dth %Y', '%b %dst %Y %I:%M', '%b %dnd %Y %I:%M', '%b %dth %Y %I:%M', diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 2c8e5bcf6..9466c9637 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.01.05' +__version__ = '2017.01.22'