diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 2caca5115..e87fed573 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -6,8 +6,8 @@ --- -### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.08.31*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. -- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.08.31** +### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.09.11.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected. +- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.09.11.1** ### Before submitting an *issue* make sure you have: - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections @@ -35,7 +35,7 @@ $ youtube-dl -v [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 -[debug] youtube-dl version 2016.08.31 +[debug] youtube-dl version 2016.09.11.1 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md index a5e6a4233..ab9968129 100644 --- a/.github/ISSUE_TEMPLATE_tmpl.md +++ b/.github/ISSUE_TEMPLATE_tmpl.md @@ -55,4 +55,4 @@ $ youtube-dl -v ### Description of your *issue*, suggested solution and other information Explanation of your *issue* in arbitrary form goes here. Please make sure the [description is worded well enough to be understood](https://github.com/rg3/youtube-dl#is-the-description-of-the-issue-itself-sufficient). Provide as much context and examples as possible. -If work on your *issue* required an account credentials please provide them or explain how one can obtain them. +If work on your *issue* requires account credentials please provide them or explain how one can obtain them. 
diff --git a/AUTHORS b/AUTHORS index b9a602c12..937742c5d 100644 --- a/AUTHORS +++ b/AUTHORS @@ -182,3 +182,6 @@ Rob van Bekkum Petr Zvoníček Pratyush Singh Aleksander Nitecki +Sebastian Blunt +Matěj Cepl +Xie Yanbo diff --git a/ChangeLog b/ChangeLog index 2e75c003d..c3c8bf037 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,8 +1,112 @@ version Extractors +* [kuwo] Improve error detection (#10650) +* [bilibili] Fix extraction for specific videos (#10647) ++ [nbc] Add support for NBC Olympics (#10361) + + +version 2016.09.11.1 + +Extractors ++ [tube8] Extract categories and tags (#10579) ++ [pornhub] Extract categories and tags (#10499) +* [openload] Temporary fix (#10408) ++ [foxnews] Add support for Fox News articles (#10598) +* [viafree] Improve video id extraction (#10615) +* [iwara] Fix extraction after relaunch (#10462, #3215) ++ [tfo] Add extractor for tfo.org +* [lrt] Fix audio extraction (#10566) +* [9now] Fix extraction (#10561) ++ [canalplus] Add support for c8.fr (#10577) +* [newgrounds] Fix uploader extraction (#10584) ++ [polskieradio:category] Add support for category lists (#10576) ++ [ketnet] Add extractor for ketnet.be (#10343) ++ [canvas] Add support for een.be (#10605) ++ [telequebec] Add extractor for telequebec.tv (#1999) +* [parliamentliveuk] Fix extraction (#9137) + + +version 2016.09.08 + +Extractors ++ [jwplatform] Extract height from format label ++ [yahoo] Extract Brightcove Legacy Studio embeds (#9345) +* [videomore] Fix extraction (#10592) +* [foxgay] Fix extraction (#10480) ++ [rmcdecouverte] Add extractor for rmcdecouverte.bfmtv.com (#9709) +* [gamestar] Fix metadata extraction (#10479) +* [puls4] Fix extraction (#10583) ++ [cctv] Add extractor for CCTV and CNTV (#8153) ++ [lci] Add extractor for lci.fr (#10573) ++ [wat] Extract DASH formats ++ [viafree] Improve video id detection (#10569) ++ [trutv] Add extractor for trutv.com (#10519) ++ [nick] Add support for nickelodeon.nl (#10559) ++ [abcotvs:clips] Add support for clips.abcotvs.com ++ [abcotvs] Add support for ABC Owned Television Stations sites (#9551) ++ [miaopai] Add extractor for miaopai.com (#10556) +* [gamestar] Fix metadata extraction (#10479) ++ [bilibili] Add support for episodes (#10190) ++ [tvnoe] Add extractor for tvnoe.cz (#10524) + + +version 2016.09.04.1 + +Core +* In DASH downloader if the first segment fails, abort the whole download + process to prevent throttling (#10497) ++ Add support for --skip-unavailable-fragments and --fragment-retries in + hlsnative downloader (#10165, #10448).
++ Add support for --skip-unavailable-fragments in DASH downloader ++ Introduce --skip-unavailable-fragments option for fragment-based downloaders + that allows skipping fragments unavailable due to an HTTP error +* Fix extraction of video/audio entries with src attribute in + _parse_html5_media_entries (#10540) + +Extractors +* [theplatform] Relax URL regular expression (#10546) +* [youtube:playlist] Extend URL regular expression +* [rottentomatoes] Delegate extraction to internetvideoarchive extractor +* [internetvideoarchive] Extract all formats +* [pornvoisines] Fix extraction (#10469) +* [rottentomatoes] Fix extraction (#10467) +* [espn] Extend URL regular expression (#10549) +* [vimple] Extend URL regular expression (#10547) +* [youtube:watchlater] Fix extraction (#10544) +* [youjizz] Fix extraction (#10437) ++ [foxnews] Add support for FoxNews Insider (#10445) ++ [fc2] Recognize Flash player URLs (#10512) + + +version 2016.09.03 + +Core +* Restore usage of NAME attribute from EXT-X-MEDIA tag for format codes in + _extract_m3u8_formats (#10522) +* Handle semicolon in mimetype2ext + +Extractors ++ [youtube] Add support for rental videos' previews (#10532) +* [youtube:playlist] Fallback to video extraction for video/playlist URLs when + no playlist is actually served (#10537) ++ [drtv] Add support for dr.dk/nyheder (#10536) ++ [facebook:plugins:video] Add extractor (#10530) ++ [go] Add extractor for *.go.com sites +* [adobepass] Check for authz_token expiration (#10527) +* [nytimes] Improve extraction +* [thestar] Fix extraction (#10465) +* [glide] Fix extraction (#10478) +- [exfm] Remove extractor (#10482) +* [youporn] Fix categories and tags extraction (#10521) ++ [curiositystream] Add extractor for app.curiositystream.com - [thvideo] Remove extractor (#10464) * [movingimage] Fix for the new site name (#10466) ++ [cbs] Add support for once formats (#10515) +* [limelight] Skip ism and duplicate manifests ++ [porncom] Extract categories and tags (#10510) ++ [facebook] Extract timestamp (#10508) ++ [yahoo] Extract more formats version 2016.08.31 diff --git a/README.md b/README.md index 87465aa5e..7543f81ac 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,8 @@ which means you can modify it, redistribute it or use it however you like. --mark-watched Mark videos watched (YouTube only) --no-mark-watched Do not mark videos watched (YouTube only) --no-color Do not emit color codes in output + --abort-on-unavailable-fragment Abort downloading when some fragment is not + available ## Network Options: --proxy URL Use the specified HTTP/HTTPS/SOCKS proxy. @@ -173,7 +175,10 @@ which means you can modify it, redistribute it or use it however you like. -R, --retries RETRIES Number of retries (default is 10), or "infinite". --fragment-retries RETRIES Number of retries for a fragment (default - is 10), or "infinite" (DASH only) + is 10), or "infinite" (DASH and hlsnative + only) + --skip-unavailable-fragments Skip unavailable fragments (DASH and + hlsnative only) --buffer-size SIZE Size of download buffer (e.g. 1024 or 16K) (default is 1024) --no-resize-buffer Do not automatically adjust the buffer @@ -846,6 +851,16 @@ will download the complete `PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re` playlist and cre youtube-dl --download-archive archive.txt "https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re" +### Should I add `--hls-prefer-native` into my config? + +When youtube-dl detects an HLS video, it can download it either with the built-in downloader or ffmpeg.
Since many HLS streams are slightly invalid and ffmpeg/youtube-dl each handle some invalid cases better than the other, there is an option to switch the downloader if needed. + +When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg. + +In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](http://rg3.github.io/youtube-dl/supportedsites.html)) cannot mandate one specific downloader. + +If you put either `--hls-prefer-native` or `--hls-prefer-ffmpeg` into your configuration, a different subset of videos will fail to download correctly. Instead, it is much better to [file an issue](https://yt-dl.org/bug) or a pull request which details why the native or the ffmpeg HLS downloader is a better choice for your use case. + +### Can you add support for this anime video site, or site which shows current movies for free? + +As a matter of policy (as well as legality), youtube-dl does not include support for services that specialize in infringing copyright. As a rule of thumb, if you cannot easily find a video that the service is quite obviously allowed to distribute (i.e. that has been uploaded by the creator, the creator's distributor, or is published under a free license), the service is probably unfit for inclusion in youtube-dl. diff --git a/devscripts/release.sh b/devscripts/release.sh index ca6ae1b49..1af61aa0b 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -60,6 +60,9 @@ if ! type pandoc >/dev/null 2>/dev/null; then echo 'ERROR: pandoc is missing'; e if ! python3 -c 'import rsa' 2>/dev/null; then echo 'ERROR: python3-rsa is missing'; exit 1; fi if ! python3 -c 'import wheel' 2>/dev/null; then echo 'ERROR: wheel is missing'; exit 1; fi +read -p "Is ChangeLog up to date? (y/n) " -n 1 +if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi + /bin/echo -e "\n### First of all, testing..."
make clean if $skip_tests ; then diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 42bf291e2..7a7b268d3 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -19,9 +19,10 @@ - **9now.com.au** - **abc.net.au** - **abc.net.au:iview** - - **Abc7News** - **abcnews** - **abcnews:video** + - **abcotvs**: ABC Owned Television Stations + - **abcotvs:clips** - **AcademicEarth:Course** - **acast** - **acast:channel** @@ -128,6 +129,7 @@ - **CBSNews**: CBS News - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** + - **CCTV** - **CDA** - **CeskaTelevize** - **channel9**: Channel 9 @@ -171,6 +173,8 @@ - **CTVNews** - **culturebox.francetvinfo.fr** - **CultureUnplugged** + - **curiositystream** + - **curiositystream:collection** - **CWTV** - **DailyMail** - **dailymotion** @@ -223,13 +227,14 @@ - **EsriVideo** - **Europa** - **EveryonesMixtape** - - **exfm**: ex.fm - **ExpoTV** - **ExtremeTube** - **EyedoTV** - **facebook** + - **FacebookPluginsVideo** - **faz.net** - **fc2** + - **fc2:embed** - **Fczenit** - **features.aol.com** - **fernsehkritik.tv** @@ -242,7 +247,9 @@ - **Formula1** - **FOX** - **Foxgay** - - **FoxNews**: Fox News and Fox Business Video + - **foxnews**: Fox News and Fox Business Video + - **foxnews:article** + - **foxnews:insider** - **FoxSports** - **france2.fr:generation-quoi** - **FranceCulture** @@ -271,6 +278,7 @@ - **Glide**: Glide mobile video messages (glide.me) - **Globo** - **GloboArticle** + - **Go** - **GodTube** - **GodTV** - **Golem** @@ -319,6 +327,7 @@ - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations - **ivideon**: Ivideon TV + - **Iwara** - **Izlesene** - **JeuxVideo** - **Jove** @@ -332,6 +341,7 @@ - **KarriereVideos** - **keek** - **KeezMovies** + - **Ketnet** - **KhanAcademy** - **KickStarter** - **KonserthusetPlay** @@ -347,6 +357,7 @@ - **kuwo:song**: 酷我音乐 - **la7.it** - **Laola1Tv** + - **LCI** - **Lcp** - **LcpPlay** - **Le**: 乐视网 @@ -385,6 +396,7 @@ - **Metacritic** - **Mgoon** - **MGTV**: 芒果TV + - **MiaoPai** - **Minhateca** - **MinistryGrid** - **Minoto** @@ -406,6 +418,7 @@ - **MovieClips** - **MovieFap** - **Moviezine** + - **MovingImage** - **MPORA** - **MSN** - **mtg**: MTG services @@ -530,6 +543,7 @@ - **podomatic** - **Pokemon** - **PolskieRadio** + - **PolskieRadioCategory** - **PornCom** - **PornHd** - **PornHub**: PornHub and Thumbzilla @@ -570,6 +584,7 @@ - **revision3:embed** - **RICE** - **RingTV** + - **RMCDecouverte** - **RockstarGames** - **RoosterTeeth** - **RottenTomatoes** @@ -659,7 +674,6 @@ - **sr:mediathek**: Saarländischer Rundfunk - **SRGSSR** - **SRGSSRPlay**: srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites - - **SSA** - **stanfordoc**: Stanford Open ClassRoom - **Steam** - **Stitcher** @@ -691,9 +705,11 @@ - **Telecinco**: telecinco.es, cuatro.com and mediaset.es - **Telegraaf** - **TeleMB** + - **TeleQuebec** - **TeleTask** - **Telewebion** - **TF1** + - **TFO** - **TheIntercept** - **ThePlatform** - **ThePlatformFeed** @@ -702,8 +718,6 @@ - **TheStar** - **ThisAmericanLife** - **ThisAV** - - **THVideo** - - **THVideoPlaylist** - **tinypic**: tinypic.com videos - **tlc.de** - **TMZ** @@ -717,7 +731,7 @@ - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** - - **trollvids** + - **TruTV** - **Tube8** - **TubiTv** - **tudou** @@ -739,6 +753,7 @@ - **TVCArticle** - **tvigle**: Интернет-телевидение Tvigle.ru - **tvland.com** + - **TVNoe** - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** diff --git 
a/test/test_utils.py b/test/test_utils.py index d16ea7f77..9789d8611 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -39,6 +39,8 @@ from youtube_dl.utils import ( is_html, js_to_json, limit_length, + mimetype2ext, + month_by_name, ohdave_rsa_encrypt, OnDemandPagedList, orderedSet, @@ -625,6 +627,22 @@ class TestUtil(unittest.TestCase): limit_length('foo bar baz asd', 12).startswith('foo bar')) self.assertTrue('...' in limit_length('foo bar baz asd', 12)) + def test_mimetype2ext(self): + self.assertEqual(mimetype2ext(None), None) + self.assertEqual(mimetype2ext('video/x-flv'), 'flv') + self.assertEqual(mimetype2ext('application/x-mpegURL'), 'm3u8') + self.assertEqual(mimetype2ext('text/vtt'), 'vtt') + self.assertEqual(mimetype2ext('text/vtt;charset=utf-8'), 'vtt') + self.assertEqual(mimetype2ext('text/html; charset=utf-8'), 'html') + + def test_month_by_name(self): + self.assertEqual(month_by_name(None), None) + self.assertEqual(month_by_name('December', 'en'), 12) + self.assertEqual(month_by_name('décembre', 'fr'), 12) + self.assertEqual(month_by_name('December'), 12) + self.assertEqual(month_by_name('décembre'), None) + self.assertEqual(month_by_name('Unknown', 'unknown'), None) + def test_parse_codecs(self): self.assertEqual(parse_codecs(''), {}) self.assertEqual(parse_codecs('avc1.77.30, mp4a.40.2'), { diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 805733fb7..29d8517a3 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -131,6 +131,9 @@ class YoutubeDL(object): username: Username for authentication purposes. password: Password for authentication purposes. videopassword: Password for accessing a video. + ap_mso: Adobe Pass Multiple-system operator Identifier. + ap_username: TV Provider username for authentication purposes. + ap_password: TV Provider password for authentication purposes. usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. 
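Aside: the `ap_mso`, `ap_username` and `ap_password` entries documented in the docstring above are ordinary keys of the `YoutubeDL` parameter dictionary, wired up in `youtube_dl/__init__.py` below alongside `fragment_retries` and the new `skip_unavailable_fragments`. A minimal sketch of how an embedding script could pass them — `'DTV'` is one of the provider ids from `MSO_INFO`, while the credentials and the URL are placeholders, not a tested example:

```python
from youtube_dl import YoutubeDL

# Sketch only: credentials and URL below are placeholders.
ydl_opts = {
    'ap_mso': 'DTV',                     # Adobe Pass Multiple-system operator Identifier
    'ap_username': 'provider-username',  # TV Provider username
    'ap_password': 'provider-password',  # TV Provider password
    'fragment_retries': 10,              # retry failed fragments (DASH and hlsnative)
    'skip_unavailable_fragments': True,  # skip fragments that still fail after the retries
}

with YoutubeDL(ydl_opts) as ydl:
    ydl.download(['https://example.com/some-tv-provider-protected-video'])
```

The same options are exposed on the command line as `--ap-mso`, `--ap-username` and `--ap-password` (with `--ap-list-mso` printing the supported providers), as the option handling in `__init__.py` below shows.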
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a9730292c..1cf3140a0 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -34,12 +34,14 @@ from .utils import ( setproctitle, std_headers, write_string, + render_table, ) from .update import update_self from .downloader import ( FileDownloader, ) from .extractor import gen_extractors, list_extractors +from .extractor.adobepass import MSO_INFO from .YoutubeDL import YoutubeDL @@ -118,18 +120,26 @@ def _real_main(argv=None): desc += ' (Example: "%s%s:%s" )' % (ie.SEARCH_KEY, random.choice(_COUNTS), random.choice(_SEARCHES)) write_string(desc + '\n', out=sys.stdout) sys.exit(0) + if opts.ap_list_mso: + table = [[mso_id, mso_info['name']] for mso_id, mso_info in MSO_INFO.items()] + write_string('Supported TV Providers:\n' + render_table(['mso', 'mso name'], table) + '\n', out=sys.stdout) + sys.exit(0) # Conflicting, missing and erroneous options if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error('using .netrc conflicts with giving username/password') if opts.password is not None and opts.username is None: parser.error('account username missing\n') + if opts.ap_password is not None and opts.ap_username is None: + parser.error('TV Provider account username missing\n') if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): parser.error('using output template conflicts with using title, video ID or auto number') if opts.usetitle and opts.useid: parser.error('using title conflicts with using video ID') if opts.username is not None and opts.password is None: opts.password = compat_getpass('Type account password and press [Return]: ') + if opts.ap_username is not None and opts.ap_password is None: + opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') if opts.ratelimit is not None: numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) if numeric_limit is None: @@ -155,6 +165,8 @@ def _real_main(argv=None): parser.error('max sleep interval must be greater than or equal to min sleep interval') else: opts.max_sleep_interval = opts.sleep_interval + if opts.ap_mso and opts.ap_mso not in MSO_INFO: + parser.error('Unsupported TV Provider, use --ap-list-mso to get a list of supported TV Providers') def parse_retries(retries): if retries in ('inf', 'infinite'): @@ -293,6 +305,9 @@ def _real_main(argv=None): 'password': opts.password, 'twofactor': opts.twofactor, 'videopassword': opts.videopassword, + 'ap_mso': opts.ap_mso, + 'ap_username': opts.ap_username, + 'ap_password': opts.ap_password, 'quiet': (opts.quiet or any_getting or any_printing), 'no_warnings': opts.no_warnings, 'forceurl': opts.geturl, @@ -318,6 +333,7 @@ def _real_main(argv=None): 'nooverwrites': opts.nooverwrites, 'retries': opts.retries, 'fragment_retries': opts.fragment_retries, + 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, 'continuedl': opts.continue_dl, diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 8bbab9dbc..41fc9cfc2 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -38,8 +38,10 @@ class DashSegmentsFD(FragmentFD): segments_filenames = [] fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - def append_url_to_file(target_url, tmp_filename, segment_name): + def process_segment(segment, tmp_filename, 
fatal): + target_url, segment_name = segment target_filename = '%s-%s' % (tmp_filename, segment_name) count = 0 while count <= fragment_retries: @@ -52,26 +54,35 @@ class DashSegmentsFD(FragmentFD): down.close() segments_filenames.append(target_sanitized) break - except (compat_urllib_error.HTTPError, ) as err: + except compat_urllib_error.HTTPError as err: # YouTube may often return 404 HTTP error for a fragment causing the # whole download to fail. However if the same fragment is immediately # retried with the same request data this usually succeeds (1-2 attemps # is usually enough) thus allowing to download the whole file successfully. - # So, we will retry all fragments that fail with 404 HTTP error for now. - if err.code != 404: - raise - # Retry fragment + # To be future-proof we will retry all fragments that fail with any + # HTTP error. count += 1 if count <= fragment_retries: - self.report_retry_fragment(segment_name, count, fragment_retries) + self.report_retry_fragment(err, segment_name, count, fragment_retries) if count > fragment_retries: + if not fatal: + self.report_skip_fragment(segment_name) + return True self.report_error('giving up after %s fragment retries' % fragment_retries) return False + return True - if initialization_url: - append_url_to_file(initialization_url, ctx['tmpfilename'], 'Init') - for i, segment_url in enumerate(segment_urls): - append_url_to_file(segment_url, ctx['tmpfilename'], 'Seg%d' % i) + segments_to_download = [(initialization_url, 'Init')] if initialization_url else [] + segments_to_download.extend([ + (segment_url, 'Seg%d' % i) + for i, segment_url in enumerate(segment_urls)]) + + for i, segment in enumerate(segments_to_download): + # In DASH, the first segment contains necessary headers to + # generate a valid MP4 file, so always abort for the first segment + fatal = i == 0 or not skip_unavailable_fragments + if not process_segment(segment, ctx['tmpfilename'], fatal): + return False self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index ba903ae10..84aacf7db 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -6,6 +6,7 @@ import time from .common import FileDownloader from .http import HttpFD from ..utils import ( + error_to_compat_str, encodeFilename, sanitize_open, ) @@ -22,13 +23,19 @@ class FragmentFD(FileDownloader): Available options: - fragment_retries: Number of times to retry a fragment for HTTP error (DASH only) + fragment_retries: Number of times to retry a fragment for HTTP error (DASH + and hlsnative only) + skip_unavailable_fragments: + Skip unavailable fragments (DASH and hlsnative only) """ - def report_retry_fragment(self, fragment_name, count, retries): + def report_retry_fragment(self, err, fragment_name, count, retries): self.to_screen( - '[download] Got server HTTP error. Retrying fragment %s (attempt %d of %s)...' - % (fragment_name, count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...' + % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries))) + + def report_skip_fragment(self, fragment_name): + self.to_screen('[download] Skipping fragment %s...' 
% fragment_name) def _prepare_and_start_frag_download(self, ctx): self._prepare_frag_download(ctx) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index baaff44d5..5d70abf62 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,6 +13,7 @@ from .fragment import FragmentFD from .external import FFmpegFD from ..compat import ( + compat_urllib_error, compat_urlparse, compat_struct_pack, ) @@ -83,6 +84,10 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + fragment_retries = self.params.get('fragment_retries', 0) + skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) + test = self.params.get('test', False) + extra_query = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') if extra_param_to_segment_url: @@ -99,15 +104,37 @@ class HlsFD(FragmentFD): line if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, line)) - frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + frag_name = 'Frag%d' % i + frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name) if extra_query: frag_url = update_url_query(frag_url, extra_query) - success = ctx['dl'].download(frag_filename, {'url': frag_url}) - if not success: + count = 0 + while count <= fragment_retries: + try: + success = ctx['dl'].download(frag_filename, {'url': frag_url}) + if not success: + return False + down, frag_sanitized = sanitize_open(frag_filename, 'rb') + frag_content = down.read() + down.close() + break + except compat_urllib_error.HTTPError as err: + # Unavailable (possibly temporary) fragments may be served. + # First we try to retry then either skip or abort. + # See https://github.com/rg3/youtube-dl/issues/10165, + # https://github.com/rg3/youtube-dl/issues/10448). + count += 1 + if count <= fragment_retries: + self.report_retry_fragment(err, frag_name, count, fragment_retries) + if count > fragment_retries: + if skip_unavailable_fragments: + i += 1 + media_sequence += 1 + self.report_skip_fragment(frag_name) + continue + self.report_error( + 'giving up after %s fragment retries' % fragment_retries) return False - down, frag_sanitized = sanitize_open(frag_filename, 'rb') - frag_content = down.read() - down.close() if decrypt_info['METHOD'] == 'AES-128': iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) frag_content = AES.new( @@ -115,7 +142,7 @@ class HlsFD(FragmentFD): ctx['dest_stream'].write(frag_content) frags_filenames.append(frag_sanitized) # We only download the first fragment during the test - if self.params.get('test', False): + if test: break i += 1 media_sequence += 1 diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index c7b6df7d0..465249bbf 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -13,7 +13,7 @@ from ..utils import ( class ABCIE(InfoExtractor): IE_NAME = 'abc.net.au' - _VALID_URL = r'https?://www\.abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?abc\.net\.au/news/(?:[^/]+/){1,2}(?P\d+)' _TESTS = [{ 'url': 'http://www.abc.net.au/news/2014-11-05/australia-to-staff-ebola-treatment-centre-in-sierra-leone/5868334', @@ -100,6 +100,7 @@ class ABCIViewIE(InfoExtractor): IE_NAME = 'abc.net.au:iview' _VALID_URL = r'https?://iview\.abc\.net\.au/programs/[^/]+/(?P[^/?#]+)' + # ABC iview programs are normally available for 14 days only. 
_TESTS = [{ 'url': 'http://iview.abc.net.au/programs/gardening-australia/FA1505V024S00', 'md5': '979d10b2939101f0d27a06b79edad536', @@ -112,6 +113,7 @@ class ABCIViewIE(InfoExtractor): 'uploader_id': 'abc1', 'timestamp': 1471719600, }, + 'skip': 'Video gone', }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index b61a6327c..6ae5d9a96 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -12,7 +12,7 @@ from ..compat import compat_urlparse class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' - _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' + _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P[0-9a-z-]+)-(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', @@ -49,7 +49,7 @@ class AbcNewsVideoIE(AMPIE): class AbcNewsIE(InfoExtractor): IE_NAME = 'abcnews' - _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' + _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' _TESTS = [{ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abcotvs.py similarity index 52% rename from youtube_dl/extractor/abc7news.py rename to youtube_dl/extractor/abcotvs.py index c04949c21..054bb0596 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abcotvs.py @@ -1,13 +1,19 @@ +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import parse_iso8601 +from ..utils import ( + int_or_none, + parse_iso8601, +) -class Abc7NewsIE(InfoExtractor): - _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' +class ABCOTVSIE(InfoExtractor): + IE_NAME = 'abcotvs' + IE_DESC = 'ABC Owned Television Stations' + _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' _TESTS = [ { 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', @@ -15,7 +21,7 @@ class Abc7NewsIE(InfoExtractor): 'id': '472581', 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', - 'title': 'East Bay museum celebrates history of synthesized music', + 'title': 'East Bay museum celebrates vintage synthesizers', 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1421123075, @@ -41,7 +47,7 @@ class Abc7NewsIE(InfoExtractor): webpage = self._download_webpage(url, display_id) m3u8 = self._html_search_meta( - 'contentURL', webpage, 'm3u8 url', fatal=True) + 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0] formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') self._sort_formats(formats) @@ -66,3 +72,41 @@ class Abc7NewsIE(InfoExtractor): 'uploader': uploader, 'formats': formats, } + + +class ABCOTVSClipsIE(InfoExtractor): + IE_NAME = 'abcotvs:clips' + _VALID_URL = r'https?://clips\.abcotvs\.com/(?:[^/]+/)*video/(?P\d+)' + _TEST = { + 'url': 'https://clips.abcotvs.com/kabc/video/214814', + 'info_dict': { + 'id': '214814', + 'ext': 'mp4', + 'title': 'SpaceX launch pad explosion destroys rocket, satellite', + 'description': 'md5:9f186e5ad8f490f65409965ee9c7be1b', + 'upload_date': '20160901', + 'timestamp': 1472756695, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def 
_real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json('https://clips.abcotvs.com/vogo/video/getByIds?ids=' + video_id, video_id)['results'][0] + title = video_data['title'] + formats = self._extract_m3u8_formats( + video_data['videoURL'].split('?')[0], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnailURL'), + 'duration': int_or_none(video_data.get('duration')), + 'timestamp': int_or_none(video_data.get('pubDate')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 68ec37e00..8ef5a96ce 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -6,13 +6,29 @@ import time import xml.etree.ElementTree as etree from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( unescapeHTML, urlencode_postdata, unified_timestamp, + ExtractorError, ) +MSO_INFO = { + 'DTV': { + 'name': 'DirecTV', + 'username_field': 'username', + 'password_field': 'password', + }, + 'Rogers': { + 'name': 'Rogers Cable', + 'username_field': 'UserName', + 'password_field': 'UserPassword', + }, +} + + class AdobePassIE(InfoExtractor): _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' @@ -41,6 +57,24 @@ class AdobePassIE(InfoExtractor): token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(token, date_ele))) return token_expires and token_expires <= int(time.time()) + def post_form(form_page_res, note, data={}): + form_page, urlh = form_page_res + post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') + if not re.match(r'https?://', post_url): + post_url = compat_urlparse.urljoin(urlh.geturl(), post_url) + form_data = self._hidden_inputs(form_page) + form_data.update(data) + return self._download_webpage_handle( + post_url, video_id, note, data=urlencode_postdata(form_data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + def raise_mvpd_required(): + raise ExtractorError( + 'This video is only available for users of participating TV providers. 
' + 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' + 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) + mvpd_headers = { 'ap_42': 'anonymous', 'ap_11': 'Linux i686', @@ -49,89 +83,91 @@ class AdobePassIE(InfoExtractor): } guid = xml_text(resource, 'guid') - requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} - authn_token = requestor_info.get('authn_token') - if authn_token and is_expired(authn_token, 'simpleTokenExpires'): - authn_token = None - if not authn_token: - # TODO add support for other TV Providers - mso_id = 'DTV' - username, password = self._get_netrc_login_info(mso_id) - if not username or not password: - return '' + count = 0 + while count < 2: + requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token and is_expired(authn_token, 'simpleTokenExpires'): + authn_token = None + if not authn_token: + # TODO add support for other TV Providers + mso_id = self._downloader.params.get('ap_mso') + if not mso_id: + raise_mvpd_required() + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: + raise_mvpd_required() + mso_info = MSO_INFO[mso_id] - def post_form(form_page, note, data={}): - post_url = self._html_search_regex(r']+action=(["\'])(?P.+?)\1', form_page, 'post url', group='url') - return self._download_webpage( - post_url, video_id, note, data=urlencode_postdata(data or self._hidden_inputs(form_page)), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, }) - - provider_redirect_page = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, + provider_login_page_res = post_form( + provider_redirect_page_res, 'Downloading Provider Login Page') + mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { + mso_info['username_field']: username, + mso_info['password_field']: password, }) - provider_login_page = post_form( - provider_redirect_page, 'Downloading Provider Login Page') - mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { - 'username': username, - 'password': password, + if mso_id == 'DTV': + post_form(mvpd_confirm_page_res, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + if '[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P[^/]+)\.html' _TEST = { 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 07e67dd33..3a806a69b 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -238,7 +238,7 @@ class ARDMediathekIE(InfoExtractor): class ARDIE(InfoExtractor): - _VALID_URL = 
'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' _TEST = { 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', 'md5': 'd216c3a86493f9322545e045ddc3eb35', diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py index a813eb429..72e1bd59d 100644 --- a/youtube_dl/extractor/azubu.py +++ b/youtube_dl/extractor/azubu.py @@ -103,7 +103,7 @@ class AzubuIE(InfoExtractor): class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'https?://www.azubu.tv/(?P[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?azubu\.tv/(?P[^/]+)$' _TEST = { 'url': 'http://www.azubu.tv/MarsTVMDLen', diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index deb9cc1c0..b17916137 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -1028,7 +1028,7 @@ class BBCIE(BBCCoUkIE): class BBCCoUkArticleIE(InfoExtractor): - _VALID_URL = r'https?://www.bbc.co.uk/programmes/articles/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/programmes/articles/(?P[a-zA-Z0-9]+)' IE_NAME = 'bbc.co.uk:article' IE_DESC = 'BBC articles' diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/bellmedia.py similarity index 54% rename from youtube_dl/extractor/ctv.py rename to youtube_dl/extractor/bellmedia.py index a1fe86316..32326ed9e 100644 --- a/youtube_dl/extractor/ctv.py +++ b/youtube_dl/extractor/bellmedia.py @@ -6,8 +6,25 @@ import re from .common import InfoExtractor -class CTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?Pctv|tsn|bnn|thecomedynetwork)\.ca/.*?(?:\bvid=|-vid|~|%7E)(?P[0-9.]+)' +class BellMediaIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.)? 
+ (?P + (?: + ctv| + tsn| + bnn| + thecomedynetwork| + discovery| + discoveryvelocity| + sciencechannel| + investigationdiscovery| + animalplanet| + bravo| + mtv| + space + )\.ca| + much\.com + )/.*?(?:\bvid=|-vid|~|%7E|/(?:episode)?)(?P[0-9]{6})''' _TESTS = [{ 'url': 'http://www.ctv.ca/video/player?vid=706966', 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', @@ -32,15 +49,27 @@ class CTVIE(InfoExtractor): }, { 'url': 'http://www.ctv.ca/YourMorning/Video/S1E6-Monday-August-29-2016-vid938009', 'only_matching': True, + }, { + 'url': 'http://www.much.com/shows/atmidnight/episode948007/tuesday-september-13-2016', + 'only_matching': True, + }, { + 'url': 'http://www.much.com/shows/the-almost-impossible-gameshow/928979/episode-6', + 'only_matching': True, }] + _DOMAINS = { + 'thecomedynetwork': 'comedy', + 'discoveryvelocity': 'discvel', + 'sciencechannel': 'discsci', + 'investigationdiscovery': 'invdisc', + 'animalplanet': 'aniplan', + } def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).groups() - if domain == 'thecomedynetwork': - domain = 'comedy' + domain = domain.split('.')[0] return { '_type': 'url_transparent', 'id': video_id, - 'url': '9c9media:%s_web:%s' % (domain, video_id), + 'url': '9c9media:%s_web:%s' % (self._DOMAINS.get(domain, domain), video_id), 'ie_key': 'NineCNineMedia', } diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index a332fbb69..2d174e6f9 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -10,13 +10,14 @@ from ..utils import ( int_or_none, float_or_none, unified_timestamp, + urlencode_postdata, ) class BiliBiliIE(InfoExtractor): - _VALID_URL = r'https?://www\.bilibili\.(?:tv|com)/video/av(?P\d+)' + _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P\d+)' - _TESTS = [{ + _TEST = { 'url': 'http://www.bilibili.tv/video/av1074402/', 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { @@ -31,66 +32,26 @@ class BiliBiliIE(InfoExtractor): 'uploader': '菊子桑', 'uploader_id': '156160', }, - }, { - 'url': 'http://www.bilibili.com/video/av1041170/', - 'info_dict': { - 'id': '1041170', - 'ext': 'mp4', - 'title': '【BD1080P】刀语【诸神&异域】', - 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', - 'duration': 3382.259, - 'timestamp': 1396530060, - 'upload_date': '20140403', - 'thumbnail': 're:^https?://.+\.jpg', - 'uploader': '枫叶逝去', - 'uploader_id': '520116', - }, - }, { - 'url': 'http://www.bilibili.com/video/av4808130/', - 'info_dict': { - 'id': '4808130', - 'ext': 'mp4', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'duration': 1493.995, - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'thumbnail': 're:^https?://.+\.jpg', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - # Missing upload time - 'url': 'http://www.bilibili.com/video/av1867637/', - 'info_dict': { - 'id': '1867637', - 'ext': 'mp4', - 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', - 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', - 'duration': 5760.0, - 'uploader': '黑夜为猫', - 'uploader_id': '610729', - 'thumbnail': 're:^https?://.+\.jpg', - }, - 'params': { - # Just to test metadata extraction - 'skip_download': True, - }, - 'expected_warnings': ['upload time'], - }] + } _APP_KEY = '6f90a59ac58a4123' _BILIBILI_KEY = '0bfd84cc3940035173f35e6777508326' def _real_extract(self, 
url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - cid = compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters'))['cid'][0] + if 'anime/v' not in url: + cid = compat_parse_qs(self._search_regex( + [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', + r']+src="https://secure\.bilibili\.com/secure,([^"]+)"'], + webpage, 'player parameters'))['cid'][0] + else: + js = self._download_json( + 'http://bangumi.bilibili.com/web_api/get_source', video_id, + data=urlencode_postdata({'episode_id': video_id}), + headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'}) + cid = js['result']['cid'] payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() @@ -106,7 +67,7 @@ class BiliBiliIE(InfoExtractor): 'url': durl['url'], 'filesize': int_or_none(durl['size']), }] - for backup_url in durl['backup_url']: + for backup_url in durl.get('backup_url', []): formats.append({ 'url': backup_url, # backup URLs have lower priorities @@ -125,6 +86,7 @@ class BiliBiliIE(InfoExtractor): description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( r']+datetime="([^"]+)"', webpage, 'upload time', fatal=False)) + thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) # TODO 'view_count' requires deobfuscating Javascript info = { @@ -132,7 +94,7 @@ class BiliBiliIE(InfoExtractor): 'title': title, 'description': description, 'timestamp': timestamp, - 'thumbnail': self._html_search_meta('thumbnailUrl', webpage), + 'thumbnail': thumbnail, 'duration': float_or_none(video_info.get('timelength'), scale=1000), } diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 6ad45a1e6..9661ade4f 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -12,7 +12,7 @@ from ..utils import ( class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'https?://www\.bpb\.de/mediathek/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P[0-9]+)/' _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 268c34392..d4e6fbdce 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -112,7 +112,7 @@ class CamdemyIE(InfoExtractor): class CamdemyFolderIE(InfoExtractor): - _VALID_URL = r'https?://www.camdemy.com/folder/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?camdemy\.com/folder/(?P\d+)' _TESTS = [{ # links with trailing slash 'url': 'http://www.camdemy.com/folder/450', diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 61463f249..69e8f4f57 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -23,6 +23,7 @@ class CanalplusIE(InfoExtractor): (?:(?:www|m)\.)?canalplus\.fr| (?:www\.)?piwiplus\.fr| (?:www\.)?d8\.tv| + (?:www\.)?c8\.fr| (?:www\.)?d17\.tv| (?:www\.)?itele\.fr )/(?:(?:[^/]+/)*(?P[^/?#&]+))?(?:\?.*\bvid=(?P\d+))?| @@ -35,6 +36,7 @@ class CanalplusIE(InfoExtractor): 'canalplus': 'cplus', 'piwiplus': 'teletoon', 'd8': 'd8', + 'c8': 'd8', 'd17': 'd17', 'itele': 'itele', } diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index ec6d24d96..ef0691dcd 100644 --- 
a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import float_or_none class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pcanvas|een)\.be/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', 'md5': 'ea838375a547ac787d4064d8c7860a6c', @@ -38,22 +40,42 @@ class CanvasIE(InfoExtractor): 'params': { 'skip_download': True, } + }, { + 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles', + 'info_dict': { + 'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f', + 'display_id': 'herbekijk-sorry-voor-alles', + 'ext': 'mp4', + 'title': 'Herbekijk Sorry voor alles', + 'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3', + 'thumbnail': 're:^https?://.*\.jpg$', + 'duration': 3788.06, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + site_id, display_id = mobj.group('site_id'), mobj.group('id') webpage = self._download_webpage(url, display_id) - title = self._search_regex( + title = (self._search_regex( r']+class="video__body__header__title"[^>]*>(.+?)', - webpage, 'title', default=None) or self._og_search_title(webpage) + webpage, 'title', default=None) or self._og_search_title( + webpage)).strip() video_id = self._html_search_regex( r'data-video=(["\'])(?P.+?)\1', webpage, 'video id', group='id') data = self._download_json( - 'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id) + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), display_id) formats = [] for target in data['targetUrls']: diff --git a/youtube_dl/extractor/cartoonnetwork.py b/youtube_dl/extractor/cartoonnetwork.py index b3f30b1ca..688a6375e 100644 --- a/youtube_dl/extractor/cartoonnetwork.py +++ b/youtube_dl/extractor/cartoonnetwork.py @@ -30,7 +30,7 @@ class CartoonNetworkIE(TurnerBaseIE): return self._extract_cvp_info( 'http://www.cartoonnetwork.com/video-seo-svc/episodeservices/getCvpPlaylist?networkName=CN2&' + query, video_id, { 'secure': { - 'media_src': 'http://apple-secure.cdn.turner.com/toon/big', + 'media_src': 'http://androidhls-secure.cdn.turner.com/toon/big', 'tokenizer_src': 'http://www.cartoonnetwork.com/cntv/mvpd/processors/services/token_ipadAdobe.do', }, }) diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index bf7915626..3a62c840b 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -4,7 +4,7 @@ from .cbs import CBSBaseIE class CBSSportsIE(CBSBaseIE): - _VALID_URL = r'https?://www\.cbssports\.com/video/player/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cbssports\.com/video/player/[^/]+/(?P\d+)' _TESTS = [{ 'url': 'http://www.cbssports.com/video/player/videos/708337219968/0/ben-simmons-the-next-lebron?-not-so-fast', diff --git a/youtube_dl/extractor/cctv.py b/youtube_dl/extractor/cctv.py new file mode 100644 index 000000000..72a72cb73 --- /dev/null +++ b/youtube_dl/extractor/cctv.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import float_or_none + + +class 
CCTVIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:.+?\.)? + (?: + cctv\.(?:com|cn)| + cntv\.cn + )/ + (?: + video/[^/]+/(?P[0-9a-f]{32})| + \d{4}/\d{2}/\d{2}/(?PVID[0-9A-Za-z]+) + )''' + _TESTS = [{ + 'url': 'http://english.cntv.cn/2016/09/03/VIDEhnkB5y9AgHyIEVphCEz1160903.shtml', + 'md5': '819c7b49fc3927d529fb4cd555621823', + 'info_dict': { + 'id': '454368eb19ad44a1925bf1eb96140a61', + 'ext': 'mp4', + 'title': 'Portrait of Real Current Life 09/03/2016 Modern Inventors Part 1', + } + }, { + 'url': 'http://tv.cctv.com/2016/09/07/VIDE5C1FnlX5bUywlrjhxXOV160907.shtml', + 'only_matching': True, + }, { + 'url': 'http://tv.cntv.cn/video/C39296/95cfac44cabd3ddc4a9438780a4e5c44', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + if not video_id: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'(?:fo\.addVariable\("videoCenterId",\s*|guid\s*=\s*)"([0-9a-f]{32})', + webpage, 'video_id') + api_data = self._download_json( + 'http://vdn.apps.cntv.cn/api/getHttpVideoInfo.do?pid=' + video_id, video_id) + m3u8_url = re.sub(r'maxbr=\d+&?', '', api_data['hls_url']) + + return { + 'id': video_id, + 'title': api_data['title'], + 'formats': self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False), + 'duration': float_or_none(api_data.get('video', {}).get('totalLength')), + } diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index 5a58d1777..87c2e7089 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -17,7 +17,7 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P[^/#?]+)/*(?:[#?].*)?$' _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index b43518652..61aed0167 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -65,7 +65,7 @@ class ChirbitIE(InfoExtractor): class ChirbitProfileIE(InfoExtractor): IE_NAME = 'chirbit:profile' - _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P[^/]+)' _TEST = { 'url': 'http://chirbit.com/ScarletBeauty', 'info_dict': { diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f24568dcc..ac3bdfe8f 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -6,7 +6,7 @@ from ..utils import ExtractorError class CMTIE(MTVIE): IE_NAME = 'cmt.com' - _VALID_URL = r'https?://www\.cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows)/(?:[^/]+/)*(?P\d+)' _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' _TESTS = [{ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a9c7a8d16..ff19270ae 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -680,7 +680,7 @@ class InfoExtractor(object): return (username, password) - def _get_login_info(self): + def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): """ Get the login info as (username, password) It will look in the netrc 
file using the _NETRC_MACHINE value @@ -694,11 +694,11 @@ class InfoExtractor(object): downloader_params = self._downloader.params # Attempt to use provided username and password or .netrc data - if downloader_params.get('username') is not None: - username = downloader_params['username'] - password = downloader_params['password'] + if downloader_params.get(username_option) is not None: + username = downloader_params[username_option] + password = downloader_params[password_option] else: - username, password = self._get_netrc_login_info() + username, password = self._get_netrc_login_info(netrc_machine) return (username, password) @@ -1163,13 +1163,6 @@ class InfoExtractor(object): m3u8_id=None, note=None, errnote=None, fatal=True, live=False): - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] - - format_url = lambda u: ( - u - if re.match(r'^https?://', u) - else compat_urlparse.urljoin(m3u8_url, u)) - res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', @@ -1180,6 +1173,13 @@ class InfoExtractor(object): m3u8_doc, urlh = res m3u8_url = urlh.geturl() + formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + + format_url = lambda u: ( + u + if re.match(r'^https?://', u) + else compat_urlparse.urljoin(m3u8_url, u)) + # We should try extracting formats only from master playlists [1], i.e. # playlists that describe available qualities. On the other hand media # playlists [2] should be returned as is since they contain just the media @@ -1749,7 +1749,7 @@ class InfoExtractor(object): media_attributes = extract_attributes(media_tag) src = media_attributes.get('src') if src: - _, formats = _media_formats(src) + _, formats = _media_formats(src, media_type) media_info['formats'].extend(formats) media_info['thumbnail'] = media_attributes.get('poster') if media_content: diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py index dedb810a0..ad32673a8 100644 --- a/youtube_dl/extractor/criterion.py +++ b/youtube_dl/extractor/criterion.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class CriterionIE(InfoExtractor): - _VALID_URL = r'https?://www\.criterion\.com/films/(?P[0-9]+)-.+' + _VALID_URL = r'https?://(?:www\.)?criterion\.com/films/(?P[0-9]+)-.+' _TEST = { 'url': 'http://www.criterion.com/films/184-le-samourai', 'md5': 'bc51beba55685509883a9a7830919ec3', diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 496883d15..62b0747a5 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -394,7 +394,7 @@ class DailymotionUserIE(DailymotionPlaylistIE): class DailymotionCloudIE(DailymotionBaseInfoExtractor): - _VALID_URL_PREFIX = r'http://api\.dmcloud\.net/(?:player/)?embed/' + _VALID_URL_PREFIX = r'https?://api\.dmcloud\.net/(?:player/)?embed/' _VALID_URL = r'%s[^/]+/(?P[^/?]+)' % _VALID_URL_PREFIX _VALID_EMBED_URL = r'%s[^/]+/[^\'"]+' % _VALID_URL_PREFIX diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 9099f5046..a47e04993 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -6,7 +6,7 @@ from ..compat import compat_str class DctpTvIE(InfoExtractor): - _VALID_URL = r'https?://www.dctp.tv/(#/)?filme/(?P.+?)/$' + _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P.+?)/$' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', 'info_dict': { diff --git a/youtube_dl/extractor/democracynow.py 
b/youtube_dl/extractor/democracynow.py index 65a98d789..bdfe638b4 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -13,7 +13,7 @@ from ..utils import ( class DemocracynowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?democracynow.org/(?P[^\?]*)' + _VALID_URL = r'https?://(?:www\.)?democracynow\.org/(?P[^\?]*)' IE_NAME = 'democracynow' _TESTS = [{ 'url': 'http://www.democracynow.org/shows/2015/7/3', diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 2d74ff855..88d096b30 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -4,26 +4,45 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, + float_or_none, + mimetype2ext, parse_iso8601, + remove_end, ) class DRTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/tv/se/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' + _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' - _TEST = { - 'url': 'https://www.dr.dk/tv/se/boern/ultra/panisk-paske/panisk-paske-5', - 'md5': 'dc515a9ab50577fa14cc4e4b0265168f', + _TESTS = [{ + 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', + 'md5': '25e659cccc9a2ed956110a299fdf5983', 'info_dict': { - 'id': 'panisk-paske-5', + 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', - 'title': 'Panisk Påske (5)', - 'description': 'md5:ca14173c5ab24cd26b0fcc074dff391c', - 'timestamp': 1426984612, - 'upload_date': '20150322', - 'duration': 1455, + 'title': 'Klassen - Dårlig taber (10)', + 'description': 'md5:815fe1b7fa656ed80580f31e8b3c79aa', + 'timestamp': 1471991907, + 'upload_date': '20160823', + 'duration': 606.84, }, - } + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', + 'md5': '2c37175c718155930f939ef59952474a', + 'info_dict': { + 'id': 'christiania-pusher-street-ryddes-drdkrjpo', + 'ext': 'mp4', + 'title': 'LIVE Christianias rydning af Pusher Street er i gang', + 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'timestamp': 1472800279, + 'upload_date': '20160902', + 'duration': 131.4, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -35,7 +54,8 @@ class DRTVIE(InfoExtractor): 'Video %s is not available' % video_id, expected=True) video_id = self._search_regex( - r'data-(?:material-identifier|episode-slug)="([^"]+)"', + (r'data-(?:material-identifier|episode-slug)="([^"]+)"', + r'data-resource="[^>"]+mu/programcard/expanded/([^"]+)"'), webpage, 'video id') programcard = self._download_json( @@ -43,9 +63,12 @@ class DRTVIE(InfoExtractor): video_id, 'Downloading video JSON') data = programcard['Data'][0] - title = data['Title'] - description = data['Description'] - timestamp = parse_iso8601(data['CreatedTime']) + title = remove_end(self._og_search_title( + webpage, default=None), ' | TV | DR') or data['Title'] + description = self._og_search_description( + webpage, default=None) or data.get('Description') + + timestamp = parse_iso8601(data.get('CreatedTime')) thumbnail = None duration = None @@ -56,16 +79,18 @@ class DRTVIE(InfoExtractor): subtitles = {} for asset in data['Assets']: - if asset['Kind'] == 'Image': - thumbnail = asset['Uri'] - elif asset['Kind'] == 'VideoResource': - duration = asset['DurationInMilliseconds'] / 1000.0 - restricted_to_denmark = asset['RestrictedToDenmark'] - 
spoken_subtitles = asset['Target'] == 'SpokenSubtitles' - for link in asset['Links']: - uri = link['Uri'] - target = link['Target'] - format_id = target + if asset.get('Kind') == 'Image': + thumbnail = asset.get('Uri') + elif asset.get('Kind') == 'VideoResource': + duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) + restricted_to_denmark = asset.get('RestrictedToDenmark') + spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + for link in asset.get('Links', []): + uri = link.get('Uri') + if not uri: + continue + target = link.get('Target') + format_id = target or '' preference = None if spoken_subtitles: preference = -1 @@ -76,8 +101,8 @@ class DRTVIE(InfoExtractor): video_id, preference, f4m_id=format_id)) elif target == 'HLS': formats.extend(self._extract_m3u8_formats( - uri, video_id, 'mp4', preference=preference, - m3u8_id=format_id)) + uri, video_id, 'mp4', entry_protocol='m3u8_native', + preference=preference, m3u8_id=format_id)) else: bitrate = link.get('Bitrate') if bitrate: @@ -85,7 +110,7 @@ class DRTVIE(InfoExtractor): formats.append({ 'url': uri, 'format_id': format_id, - 'tbr': bitrate, + 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), }) subtitles_list = asset.get('SubtitlesList') @@ -94,12 +119,18 @@ class DRTVIE(InfoExtractor): 'Danish': 'da', } for subs in subtitles_list: - lang = subs['Language'] - subtitles[LANGS.get(lang, lang)] = [{'url': subs['Uri'], 'ext': 'vtt'}] + if not subs.get('Uri'): + continue + lang = subs.get('Language') or 'da' + subtitles.setdefault(LANGS.get(lang, lang), []).append({ + 'url': subs['Uri'], + 'ext': mimetype2ext(subs.get('MimeType')) or 'vtt' + }) if not formats and restricted_to_denmark: - raise ExtractorError( - 'Unfortunately, DR is not allowed to show this program outside Denmark.', expected=True) + self.raise_geo_restricted( + 'Unfortunately, DR is not allowed to show this program outside Denmark.', + expected=True) self._sort_formats(formats) diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index a39e9010d..65635c18b 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -4,7 +4,7 @@ from .common import InfoExtractor class EngadgetIE(InfoExtractor): - _VALID_URL = r'https?://www.engadget.com/video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P[^/?#]+)' _TESTS = [{ # video with 5min ID diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py index 66c08bec4..6d10f8e68 100644 --- a/youtube_dl/extractor/espn.py +++ b/youtube_dl/extractor/espn.py @@ -5,7 +5,7 @@ from ..utils import remove_end class ESPNIE(InfoExtractor): - _VALID_URL = r'https?://espn\.go\.com/(?:[^/]+/)*(?P[^/]+)' + _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/(?:[^/]+/)*(?P[^/]+)' _TESTS = [{ 'url': 'http://espn.go.com/video/clip?id=10365079', 'md5': '60e5d097a523e767d06479335d1bdc58', @@ -47,6 +47,9 @@ class ESPNIE(InfoExtractor): }, { 'url': 'http://espn.go.com/nba/playoffs/2015/story/_/id/12887571/john-wall-washington-wizards-no-swelling-left-hand-wrist-game-5-return', 'only_matching': True, + }, { + 'url': 'http://www.espn.com/video/clip?id=10365079', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index 971c918a4..ef11962f3 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -8,7 +8,7 @@ from ..utils import ( class ExpoTVIE(InfoExtractor): - _VALID_URL = 
r'https?://www\.expotv\.com/videos/[^?#]*/(?P[0-9]+)($|[?#])' + _VALID_URL = r'https?://(?:www\.)?expotv\.com/videos/[^?#]*/(?P[0-9]+)($|[?#])' _TEST = { 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 2bcd5a0cd..dd0579425 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -5,11 +5,14 @@ from .abc import ( ABCIE, ABCIViewIE, ) -from .abc7news import Abc7NewsIE from .abcnews import ( AbcNewsIE, AbcNewsVideoIE, ) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) from .academicearth import AcademicEarthCourseIE from .acast import ( ACastIE, @@ -90,6 +93,7 @@ from .bbc import ( ) from .beeg import BeegIE from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE from .beatportpro import BeatportProIE from .bet import BetIE from .bigflix import BigflixIE @@ -143,6 +147,7 @@ from .cbsnews import ( ) from .cbssports import CBSSportsIE from .ccc import CCCIE +from .cctv import CCTVIE from .cda import CDAIE from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE @@ -191,7 +196,6 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE -from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( @@ -264,9 +268,15 @@ from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE from .extremetube import ExtremeTubeIE from .eyedotv import EyedoTVIE -from .facebook import FacebookIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, +) from .faz import FazIE -from .fc2 import FC2IE +from .fc2 import ( + FC2IE, + FC2EmbedIE, +) from .fczenit import FczenitIE from .firstpost import FirstpostIE from .firsttv import FirstTVIE @@ -281,7 +291,11 @@ from .formula1 import Formula1IE from .fourtube import FourTubeIE from .fox import FOXIE from .foxgay import FoxgayIE -from .foxnews import FoxNewsIE +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, + FoxNewsInsiderIE, +) from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE @@ -382,6 +396,7 @@ from .ivi import ( IviCompilationIE ) from .ivideon import IvideonIE +from .iwara import IwaraIE from .izlesene import IzleseneIE from .jeuxvideo import JeuxVideoIE from .jove import JoveIE @@ -394,6 +409,7 @@ from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE +from .ketnet import KetnetIE from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .keek import KeekIE @@ -412,6 +428,7 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lci import LCIIE from .lcp import ( LcpPlayIE, LcpIE, @@ -462,6 +479,7 @@ from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, @@ -516,6 +534,7 @@ from .nbc import ( CSNNEIE, NBCIE, NBCNewsIE, + NBCOlympicsIE, NBCSportsIE, NBCSportsVPlayerIE, ) @@ -655,7 +674,10 @@ from .pluralsight import ( ) from .podomatic import PodomaticIE from .pokemon import PokemonIE -from .polskieradio import PolskieRadioIE +from .polskieradio import ( + PolskieRadioIE, + 
PolskieRadioCategoryIE, +) from .porn91 import Porn91IE from .porncom import PornComIE from .pornhd import PornHdIE @@ -709,6 +731,7 @@ from .revision3 import ( ) from .rice import RICEIE from .ringtv import RingTVIE +from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE from .roosterteeth import RoosterTeethIE @@ -845,10 +868,12 @@ from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE +from .telequebec import TeleQuebecIE from .teletask import TeleTaskIE from .telewebion import TelewebionIE from .testurl import TestURLIE from .tf1 import TF1IE +from .tfo import TFOIE from .theintercept import TheInterceptIE from .theplatform import ( ThePlatformIE, @@ -877,7 +902,7 @@ from .toutv import TouTvIE from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE -from .trollvids import TrollvidsIE +from .trutv import TruTVIE from .tube8 import Tube8IE from .tubitv import TubiTvIE from .tudou import ( @@ -907,6 +932,7 @@ from .tvc import ( ) from .tvigle import TvigleIE from .tvland import TVLandIE +from .tvnoe import TVNoeIE from .tvp import ( TVPEmbedIE, TVPIE, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 228b0b6d7..3a220e995 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -351,3 +351,32 @@ class FacebookIE(InfoExtractor): self._VIDEO_PAGE_TEMPLATE % video_id, video_id, fatal_if_no_video=True) return info_dict + + +class FacebookPluginsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?Phttps.+)' + + _TESTS = [{ + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560', + 'md5': '5954e92cdfe51fe5782ae9bda7058a07', + 'info_dict': { + 'id': '10154383743583686', + 'ext': 'mp4', + 'title': 'What to do during the haze?', + 'uploader': 'Gov.sg', + 'upload_date': '20160826', + 'timestamp': 1472184808, + }, + 'add_ie': [FacebookIE.ie_key()], + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self.url_result( + compat_urllib_parse_unquote(self._match_id(url)), + FacebookIE.ie_key()) diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index c7d69ff1f..c032d4d02 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -1,10 +1,12 @@ -#! 
-*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import hashlib +import re from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_urllib_request, compat_urlparse, ) @@ -16,7 +18,7 @@ from ..utils import ( class FC2IE(InfoExtractor): - _VALID_URL = r'^https?://video\.fc2\.com/(?:[^/]+/)*content/(?P[^/]+)' + _VALID_URL = r'^(?:https?://video\.fc2\.com/(?:[^/]+/)*content/|fc2:)(?P[^/]+)' IE_NAME = 'fc2' _NETRC_MACHINE = 'fc2' _TESTS = [{ @@ -75,12 +77,17 @@ class FC2IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) self._login() - webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear - self._login() + webpage = None + if not url.startswith('fc2:'): + webpage = self._download_webpage(url, video_id) + self._downloader.cookiejar.clear_session_cookies() # must clear + self._login() - title = self._og_search_title(webpage) - thumbnail = self._og_search_thumbnail(webpage) + title = 'FC2 video %s' % video_id + thumbnail = None + if webpage is not None: + title = self._og_search_title(webpage) + thumbnail = self._og_search_thumbnail(webpage) refer = url.replace('/content/', '/a/content/') if '/a/content/' not in url else url mimi = hashlib.md5((video_id + '_gGddgPfeaf_gzyr').encode('utf-8')).hexdigest() @@ -113,3 +120,41 @@ class FC2IE(InfoExtractor): 'ext': 'flv', 'thumbnail': thumbnail, } + + +class FC2EmbedIE(InfoExtractor): + _VALID_URL = r'https?://video\.fc2\.com/flv2\.swf\?(?P.+)' + IE_NAME = 'fc2:embed' + + _TEST = { + 'url': 'http://video.fc2.com/flv2.swf?t=201404182936758512407645&i=20130316kwishtfitaknmcgd76kjd864hso93htfjcnaogz629mcgfs6rbfk0hsycma7shkf85937cbchfygd74&i=201403223kCqB3Ez&d=2625&sj=11&lang=ja&rel=1&from=11&cmt=1&tk=TlRBM09EQTNNekU9&tl=プリズン・ブレイク%20S1-01%20マイケル%20【吹替】', + 'md5': 'b8aae5334cb691bdb1193a88a6ab5d5a', + 'info_dict': { + 'id': '201403223kCqB3Ez', + 'ext': 'flv', + 'title': 'プリズン・ブレイク S1-01 マイケル 【吹替】', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_parse_qs(mobj.group('query')) + + video_id = query['i'][-1] + title = query.get('tl', ['FC2 video %s' % video_id])[0] + + sj = query.get('sj', [None])[0] + thumbnail = None + if sj: + # See thumbnailImagePath() in ServerConst.as of flv2.swf + thumbnail = 'http://video%s-thumbnail.fc2.com/up/pic/%s.jpg' % ( + sj, '/'.join((video_id[:6], video_id[6:8], video_id[-2], video_id[-1], video_id))) + + return { + '_type': 'url_transparent', + 'ie_key': FC2IE.ie_key(), + 'url': 'fc2:%s' % video_id, + 'title': title, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index 70c1a815d..39174fcec 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -1,18 +1,24 @@ from __future__ import unicode_literals +import itertools + from .common import InfoExtractor +from ..utils import ( + get_element_by_id, + remove_end, +) class FoxgayIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxgay\.com/videos/(?:\S+-)?(?P\d+)\.shtml' _TEST = { 'url': 'http://foxgay.com/videos/fuck-turkish-style-2582.shtml', - 'md5': '80d72beab5d04e1655a56ad37afe6841', + 'md5': '344558ccfea74d33b7adbce22e577f54', 'info_dict': { 'id': '2582', 'ext': 'mp4', - 'title': 'md5:6122f7ae0fc6b21ebdf59c5e083ce25a', - 'description': 'md5:5e51dc4405f1fd315f7927daed2ce5cf', + 'title': 'Fuck Turkish-style', + 'description': 
'md5:6ae2d9486921891efe89231ace13ffdf', 'age_limit': 18, 'thumbnail': 're:https?://.*\.jpg$', }, @@ -22,27 +28,35 @@ class FoxgayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'(?P<title>.*?)', - webpage, 'title', fatal=False) - description = self._html_search_regex( - r'
(?P<description>.*?)
', - webpage, 'description', fatal=False) + title = remove_end(self._html_search_regex( + r'([^<]+)', webpage, 'title'), ' - Foxgay.com') + description = get_element_by_id('inf_tit', webpage) + # The default user-agent with foxgay cookies leads to pages without videos + self._downloader.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. + iframe_url = self._html_search_regex( + r']+src=([\'"])(?P[^\'"]+)\1', webpage, + 'video frame', group='url') iframe = self._download_webpage( - self._html_search_regex(r'iframe src="(?P.*?)"', webpage, 'video frame'), - video_id) - video_url = self._html_search_regex( - r"v_path = '(?Phttp://.*?)'", iframe, 'url') - thumb_url = self._html_search_regex( - r"t_path = '(?Phttp://.*?)'", iframe, 'thumbnail', fatal=False) + iframe_url, video_id, headers={'User-Agent': 'curl/7.50.1'}, + note='Downloading video frame') + video_data = self._parse_json(self._search_regex( + r'video_data\s*=\s*([^;]+);', iframe, 'video data'), video_id) + + formats = [{ + 'url': source, + 'height': resolution, + } for source, resolution in zip( + video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'url': video_url, + 'formats': formats, 'description': description, - 'thumbnail': thumb_url, + 'thumbnail': video_data.get('act_vid', {}).get('thumb'), 'age_limit': 18, } diff --git a/youtube_dl/extractor/foxnews.py b/youtube_dl/extractor/foxnews.py index b04da2415..229bcb175 100644 --- a/youtube_dl/extractor/foxnews.py +++ b/youtube_dl/extractor/foxnews.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .amp import AMPIE +from .common import InfoExtractor class FoxNewsIE(AMPIE): + IE_NAME = 'foxnews' IE_DESC = 'Fox News and Fox Business Video' - _VALID_URL = r'https?://(?Pvideo\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' + _VALID_URL = r'https?://(?Pvideo\.(?:insider\.)?fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P\d+)' _TESTS = [ { 'url': 'http://video.foxnews.com/v/3937480/frozen-in-time/#sp=show-clips', @@ -49,6 +51,11 @@ class FoxNewsIE(AMPIE): 'url': 'http://video.foxbusiness.com/v/4442309889001', 'only_matching': True, }, + { + # From http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words + 'url': 'http://video.insider.foxnews.com/v/video-embed.html?video_id=5099377331001&autoplay=true&share_url=http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words&share_title=Student%20Group:%20Saying%20%27Politically%20Correct,%27%20%27Trash%27%20and%20%27Lame%27%20Is%20Offensive&share=true', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -58,3 +65,76 @@ class FoxNewsIE(AMPIE): 'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id)) info['id'] = video_id return info + + +class FoxNewsArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P[a-z-]+)' + IE_NAME = 'foxnews:article' + + _TEST = { + 'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html', + 'md5': '62aa5a781b308fdee212ebb6f33ae7ef', + 'info_dict': { + 'id': '5116295019001', + 'ext': 'mp4', + 'title': 'Trump and Clinton asked to defend positions on Iraq War', + 'description': 'Veterans react on \'The Kelly File\'', + 'timestamp': 1473299755, + 'upload_date': '20160908', + }, + } + + def _real_extract(self, url): 
+ display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_id = self._html_search_regex( + r'data-video-id=([\'"])(?P[^\'"]+)\1', + webpage, 'video ID', group='id') + return self.url_result( + 'http://video.foxnews.com/v/' + video_id, + FoxNewsIE.ie_key()) + + +class FoxNewsInsiderIE(InfoExtractor): + _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P[a-z-]+)' + IE_NAME = 'foxnews:insider' + + _TEST = { + 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words', + 'md5': 'a10c755e582d28120c62749b4feb4c0c', + 'info_dict': { + 'id': '5099377331001', + 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words', + 'ext': 'mp4', + 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive', + 'description': 'Is campus censorship getting out of control?', + 'timestamp': 1472168725, + 'upload_date': '20160825', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': [FoxNewsIE.ie_key()], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL') + + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + return { + '_type': 'url_transparent', + 'ie_key': FoxNewsIE.ie_key(), + 'url': embed_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/franceinter.py b/youtube_dl/extractor/franceinter.py index 2369f868d..0d58f89c5 100644 --- a/youtube_dl/extractor/franceinter.py +++ b/youtube_dl/extractor/franceinter.py @@ -2,20 +2,21 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import int_or_none +from ..compat import compat_str +from ..utils import month_by_name class FranceInterIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/player/reecouter\?play=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceinter\.fr/emissions/(?P[^?#]+)' + _TEST = { - 'url': 'http://www.franceinter.fr/player/reecouter?play=793962', + 'url': 'https://www.franceinter.fr/emissions/la-marche-de-l-histoire/la-marche-de-l-histoire-18-decembre-2013', 'md5': '4764932e466e6f6c79c317d2e74f6884', 'info_dict': { - 'id': '793962', + 'id': 'la-marche-de-l-histoire/la-marche-de-l-histoire-18-decembre-2013', 'ext': 'mp3', - 'title': 'L’Histoire dans les jeux vidéo', - 'description': 'md5:7e93ddb4451e7530022792240a3049c7', - 'timestamp': 1387369800, + 'title': 'L’Histoire dans les jeux vidéo du 18 décembre 2013 - France Inter', + 'description': 'md5:7f2ce449894d1e585932273080fb410d', 'upload_date': '20131218', }, } @@ -25,23 +26,29 @@ class FranceInterIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - path = self._search_regex( - r']+class=["\']page-diffusion["\'][^>]*>.*?]+data-url=(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'video url', group='url') - title = self._html_search_regex( - r'(.+?)', webpage, 'title') - description = self._html_search_regex( - r'(.*?)', - webpage, 'description', fatal=False) - timestamp = int_or_none(self._search_regex( - r'data-date="(\d+)"', webpage, 'upload date', fatal=False)) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + + upload_date_str = self._search_regex( + 
r'class=["\']cover-emission-period["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', + webpage, 'upload date', fatal=False) + if upload_date_str: + upload_date_list = upload_date_str.split() + upload_date_list.reverse() + upload_date_list[1] = compat_str(month_by_name(upload_date_list[1], lang='fr')) + upload_date = ''.join(upload_date_list) + else: + upload_date = None return { 'id': video_id, 'title': title, 'description': description, - 'timestamp': timestamp, + 'upload_date': upload_date, 'formats': [{ 'url': video_url, 'vcodec': 'none', diff --git a/youtube_dl/extractor/freespeech.py b/youtube_dl/extractor/freespeech.py index 1477708bb..0a70ca763 100644 --- a/youtube_dl/extractor/freespeech.py +++ b/youtube_dl/extractor/freespeech.py @@ -8,7 +8,7 @@ from .common import InfoExtractor class FreespeechIE(InfoExtractor): IE_NAME = 'freespeech.org' - _VALID_URL = r'https://www\.freespeech\.org/video/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?freespeech\.org/video/(?P<title>.+)' _TEST = { 'add_ie': ['Youtube'], 'url': 'https://www.freespeech.org/video/obama-romney-campaign-colorado-ahead-debate-0', diff --git a/youtube_dl/extractor/gamestar.py b/youtube_dl/extractor/gamestar.py index 69058a583..55a34604a 100644 --- a/youtube_dl/extractor/gamestar.py +++ b/youtube_dl/extractor/gamestar.py @@ -1,19 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, - parse_duration, - str_to_int, - unified_strdate, + remove_end, ) class GameStarIE(InfoExtractor): - _VALID_URL = r'https?://www\.gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?gamestar\.de/videos/.*,(?P<id>[0-9]+)\.html' _TEST = { 'url': 'http://www.gamestar.de/videos/trailer,3/hobbit-3-die-schlacht-der-fuenf-heere,76110.html', 'md5': '96974ecbb7fd8d0d20fca5a00810cea7', @@ -21,8 +17,9 @@ class GameStarIE(InfoExtractor): 'id': '76110', 'ext': 'mp4', 'title': 'Hobbit 3: Die Schlacht der Fünf Heere - Teaser-Trailer zum dritten Teil', - 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den vollständigen Trailer an.', - 'thumbnail': 'http://images.gamestar.de/images/idgwpgsgp/bdb/2494525/600x.jpg', + 'description': 'Der Teaser-Trailer zu Hobbit 3: Die Schlacht der Fünf Heere zeigt einige Szenen aus dem dritten Teil der Saga und kündigt den...', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1406542020, 'upload_date': '20140728', 'duration': 17 } @@ -32,41 +29,27 @@ class GameStarIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - og_title = self._og_search_title(webpage) - title = re.sub(r'\s*- Video (bei|-) GameStar\.de$', '', og_title) - url = 'http://gamestar.de/_misc/videos/portal/getVideoUrl.cfm?premium=0&videoId=' + video_id - description = self._og_search_description(webpage).strip() - - thumbnail = self._proto_relative_url( - self._og_search_thumbnail(webpage), scheme='http:') - - upload_date = unified_strdate(self._html_search_regex( - r'<span style="float:left;font-size:11px;">Datum: ([0-9]+\.[0-9]+\.[0-9]+)  ', - webpage, 'upload_date', fatal=False)) - - duration = parse_duration(self._html_search_regex( - r'  Länge: ([0-9]+:[0-9]+)</span>', webpage, 'duration', - fatal=False)) - - view_count = str_to_int(self._html_search_regex( - r'  Zuschauer: ([0-9\.]+)  ', webpage, - 'view_count', fatal=False)) + # TODO: there are multiple ld+json objects in the webpage, + # while 
_search_json_ld finds only the first one + json_ld = self._parse_json(self._search_regex( + r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>[^<]+VideoObject[^<]+)</script>', + webpage, 'JSON-LD', group='json_ld'), video_id) + info_dict = self._json_ld(json_ld, video_id) + info_dict['title'] = remove_end(info_dict['title'], ' - GameStar') + view_count = json_ld.get('interactionCount') comment_count = int_or_none(self._html_search_regex( - r'>Kommentieren \(([0-9]+)\)</a>', webpage, 'comment_count', + r'([0-9]+) Kommentare</span>', webpage, 'comment_count', fatal=False)) - return { + info_dict.update({ 'id': video_id, - 'title': title, 'url': url, 'ext': 'mp4', - 'thumbnail': thumbnail, - 'description': description, - 'upload_date': upload_date, - 'duration': duration, 'view_count': view_count, 'comment_count': comment_count - } + }) + + return info_dict diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 24b217715..2e46ca179 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1369,6 +1369,11 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Vimeo'], }, + { + # generic vimeo embed that requires original URL passed as Referer + 'url': 'http://racing4everyone.eu/2016/07/30/formula-1-2016-round12-germany/', + 'only_matching': True, + }, { 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', diff --git a/youtube_dl/extractor/globo.py b/youtube_dl/extractor/globo.py index dbacbfc61..5638be48f 100644 --- a/youtube_dl/extractor/globo.py +++ b/youtube_dl/extractor/globo.py @@ -19,7 +19,7 @@ from ..utils import ( class GloboIE(InfoExtractor): - _VALID_URL = '(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' + _VALID_URL = r'(?:globo:|https?://.+?\.globo\.com/(?:[^/]+/)*(?:v/(?:[^/]+/)?|videos/))(?P<id>\d{7,})' _API_URL_TEMPLATE = 'http://api.globovideos.com/videos/%s/playlist' _SECURITY_URL_TEMPLATE = 'http://security.video.globo.com/videos/%s/hash?player=flash&version=17.0.0.132&resource_id=%s' @@ -396,7 +396,7 @@ class GloboIE(InfoExtractor): class GloboArticleIE(InfoExtractor): - _VALID_URL = 'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)(?:\.html)?' + _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/]+)(?:\.html)?' 
_VIDEOID_REGEXES = [ r'\bdata-video-id=["\'](\d{7,})', diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 6a437c54d..c7776b186 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -8,6 +8,8 @@ from ..utils import ( int_or_none, determine_ext, parse_age_limit, + urlencode_postdata, + ExtractorError, ) @@ -19,7 +21,7 @@ class GoIE(InfoExtractor): 'watchdisneyjunior': '008', 'watchdisneyxd': '009', } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/.*?vdka(?P<id>\w+)' % '|'.join(_BRANDS.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|season-\d+/\d+-(?P<display_id>[^/?#]+))' % '|'.join(_BRANDS.keys()) _TESTS = [{ 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'info_dict': { @@ -38,9 +40,13 @@ class GoIE(InfoExtractor): }] def _real_extract(self, url): - sub_domain, video_id = re.match(self._VALID_URL, url).groups() + sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + if not video_id: + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'data-video-id=["\']VDKA(\w+)', webpage, 'video id') + brand = self._BRANDS[sub_domain] video_data = self._download_json( - 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (self._BRANDS[sub_domain], video_id), + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), video_id)['video'][0] title = video_data['title'] @@ -52,6 +58,21 @@ class GoIE(InfoExtractor): format_id = asset.get('format') ext = determine_ext(asset_url) if ext == 'm3u8': + video_type = video_data.get('type') + if video_type == 'lf': + entitlement = self._download_json( + 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', + video_id, data=urlencode_postdata({ + 'video_id': video_data['id'], + 'video_type': video_type, + 'brand': brand, + 'device': '001', + })) + errors = entitlement.get('errors', {}).get('errors', []) + if errors: + error_message = ', '.join([error['message'] for error in errors]) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error_message), expected=True) + asset_url += '?' 
+ entitlement['uplynkData']['sessionKey'] formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', m3u8_id=format_id or 'hls', fatal=False)) else: diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py index 731bacd67..427499b11 100644 --- a/youtube_dl/extractor/googleplus.py +++ b/youtube_dl/extractor/googleplus.py @@ -10,7 +10,7 @@ from ..utils import unified_strdate class GooglePlusIE(InfoExtractor): IE_DESC = 'Google Plus' - _VALID_URL = r'https://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)' + _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)' IE_NAME = 'plus.google' _TEST = { 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', diff --git a/youtube_dl/extractor/goshgay.py b/youtube_dl/extractor/goshgay.py index 0c015141f..a43abd154 100644 --- a/youtube_dl/extractor/goshgay.py +++ b/youtube_dl/extractor/goshgay.py @@ -11,7 +11,7 @@ from ..utils import ( class GoshgayIE(InfoExtractor): - _VALID_URL = r'https?://www\.goshgay\.com/video(?P<id>\d+?)($|/)' + _VALID_URL = r'https?://(?:www\.)?goshgay\.com/video(?P<id>\d+?)($|/)' _TEST = { 'url': 'http://www.goshgay.com/video299069/diesel_sfw_xxx_video', 'md5': '4b6db9a0a333142eb9f15913142b0ed1', diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py index b6cc15b6f..749e9154f 100644 --- a/youtube_dl/extractor/hark.py +++ b/youtube_dl/extractor/hark.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class HarkIE(InfoExtractor): - _VALID_URL = r'https?://www\.hark\.com/clips/(?P<id>.+?)-.+' + _VALID_URL = r'https?://(?:www\.)?hark\.com/clips/(?P<id>.+?)-.+' _TEST = { 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013', 'md5': '6783a58491b47b92c7c1af5a77d4cbee', diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index 9db565209..34163725f 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -12,7 +12,7 @@ from ..utils import ( class HotNewHipHopIE(InfoExtractor): - _VALID_URL = r'https?://www\.hotnewhiphop\.com/.*\.(?P<id>.*)\.html' + _VALID_URL = r'https?://(?:www\.)?hotnewhiphop\.com/.*\.(?P<id>.*)\.html' _TEST = { 'url': 'http://www.hotnewhiphop.com/freddie-gibbs-lay-it-down-song.1435540.html', 'md5': '2c2cd2f76ef11a9b3b581e8b232f3d96', diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 3a6a6f5ad..f0fc8d49a 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -94,7 +94,7 @@ class ImdbIE(InfoExtractor): class ImdbListIE(InfoExtractor): IE_NAME = 'imdb:list' IE_DESC = 'Internet Movie Database lists' - _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' + _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})' _TEST = { 'url': 'http://www.imdb.com/list/JFs9NWw6XI0', 'info_dict': { diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py index 45add007f..76cc5ec3e 100644 --- a/youtube_dl/extractor/internetvideoarchive.py +++ b/youtube_dl/extractor/internetvideoarchive.py @@ -48,13 +48,23 @@ class InternetVideoArchiveIE(InfoExtractor): # There are multiple videos in the playlist whlie only the first one # matches the video played in browsers video_info = configuration['playlist'][0] + title = video_info['title'] formats = [] for source in video_info['sources']: file_url = source['file'] if determine_ext(file_url) == 'm3u8': - 
formats.extend(self._extract_m3u8_formats( - file_url, video_id, ext='mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + file_url = m3u8_formats[0]['url'] + formats.extend(self._extract_f4m_formats( + file_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_mpd_formats( + file_url.replace('.m3u8', '.mpd'), + video_id, mpd_id='dash', fatal=False)) else: a_format = { 'url': file_url, @@ -70,7 +80,6 @@ class InternetVideoArchiveIE(InfoExtractor): self._sort_formats(formats) - title = video_info['title'] description = video_info.get('description') thumbnail = video_info.get('image') else: diff --git a/youtube_dl/extractor/iwara.py b/youtube_dl/extractor/iwara.py new file mode 100644 index 000000000..8d7e7f472 --- /dev/null +++ b/youtube_dl/extractor/iwara.py @@ -0,0 +1,77 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse +from ..utils import remove_end + + +class IwaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', + 'md5': '1d53866b2c514b23ed69e4352fdc9839', + 'info_dict': { + 'id': 'amVwUl1EHpAD9RD', + 'ext': 'mp4', + 'title': '【MMD R-18】ガールフレンド carry_me_off', + 'age_limit': 18, + }, + }, { + 'url': 'http://ecchi.iwara.tv/videos/Vb4yf2yZspkzkBO', + 'md5': '7e5f1f359cd51a027ba4a7b7710a50f0', + 'info_dict': { + 'id': '0B1LvuHnL-sRFNXB1WHNqbGw4SXc', + 'ext': 'mp4', + 'title': '[3D Hentai] Kyonyu Ã\x97 Genkai Ã\x97 Emaki Shinobi Girls.mp4', + 'age_limit': 18, + }, + 'add_ie': ['GoogleDrive'], + }, { + 'url': 'http://www.iwara.tv/videos/nawkaumd6ilezzgq', + 'md5': '1d85f1e5217d2791626cff5ec83bb189', + 'info_dict': { + 'id': '6liAP9s2Ojc', + 'ext': 'mp4', + 'age_limit': 0, + 'title': '[MMD] Do It Again Ver.2 [1080p 60FPS] (Motion,Camera,Wav+DL)', + 'description': 'md5:590c12c0df1443d833fbebe05da8c47a', + 'upload_date': '20160910', + 'uploader': 'aMMDsork', + 'uploader_id': 'UCVOFyOSCyFkXTYYHITtqB7A', + }, + 'add_ie': ['Youtube'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage, urlh = self._download_webpage_handle(url, video_id) + + hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname + # ecchi is 'sexy' in Japanese + age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 + + entries = self._parse_html5_media_entries(url, webpage, video_id) + + if not entries: + iframe_url = self._html_search_regex( + r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', + webpage, 'iframe URL', group='url') + return { + '_type': 'url_transparent', + 'url': iframe_url, + 'age_limit': age_limit, + } + + title = remove_end(self._html_search_regex( + r'<title>([^<]+)', webpage, 'title'), ' | Iwara') + + info_dict = entries[0] + info_dict.update({ + 'id': video_id, + 'title': title, + 'age_limit': age_limit, + }) + + return info_dict diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index ce3126943..7aaa65476 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -63,10 +63,17 @@ class JWPlatformBaseIE(InfoExtractor): 'ext': ext, }) else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like 1080p. 
+ height = int_or_none(self._search_regex( + r'^(\d{3,})[pP]$', source.get('label') or '', + 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), + 'height': height, 'ext': ext, } if source_url.startswith('rtmp'): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 6a8464998..5a8403777 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -262,8 +262,16 @@ class KalturaIE(InfoExtractor): # Continue if asset is not ready if f.get('status') != 2: continue + # Original format that's not available (e.g. kaltura:1926081:0_c03e1b5g) + # skip for now. + if f.get('fileExt') == 'chun': + continue video_url = sign_url( '%s/flavorId/%s' % (data_url, f['id'])) + # audio-only has no videoCodecId (e.g. kaltura:1926081:0_c03e1b5g + # -f mp4-56) + vcodec = 'none' if 'videoCodecId' not in f and f.get( + 'frameRate') == 0 else f.get('videoCodecId') formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -271,7 +279,7 @@ class KalturaIE(InfoExtractor): 'fps': int_or_none(f.get('frameRate')), 'filesize_approx': int_or_none(f.get('size'), invscale=1024), 'container': f.get('containerFormat'), - 'vcodec': f.get('videoCodecId'), + 'vcodec': vcodec, 'height': int_or_none(f.get('height')), 'width': int_or_none(f.get('width')), 'url': video_url, diff --git a/youtube_dl/extractor/karaoketv.py b/youtube_dl/extractor/karaoketv.py index a6050c4de..bfccf89b0 100644 --- a/youtube_dl/extractor/karaoketv.py +++ b/youtube_dl/extractor/karaoketv.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class KaraoketvIE(InfoExtractor): - _VALID_URL = r'http://www.karaoketv.co.il/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?karaoketv\.co\.il/[^/]+/(?P\d+)' _TEST = { 'url': 'http://www.karaoketv.co.il/%D7%A9%D7%99%D7%A8%D7%99_%D7%A7%D7%A8%D7%99%D7%95%D7%A7%D7%99/58356/%D7%90%D7%99%D7%96%D7%95%D7%9F', 'info_dict': { diff --git a/youtube_dl/extractor/ketnet.py b/youtube_dl/extractor/ketnet.py new file mode 100644 index 000000000..aaf3f807a --- /dev/null +++ b/youtube_dl/extractor/ketnet.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class KetnetIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', + 'md5': 'd907f7b1814ef0fa285c0475d9994ed7', + 'info_dict': { + 'id': 'zomerse-filmpjes', + 'ext': 'mp4', + 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', + 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'thumbnail': 're:^https?://.*\.jpg$', + } + }, { + 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', + 'only_matching': True, + }, { + 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + config = self._parse_json( + self._search_regex( + r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, + 'player config'), + video_id) + + title = config['title'] + + formats = self._extract_m3u8_formats( + config['source']['hls'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': config.get('description'), + 'thumbnail': config.get('image'), + 'series': 
config.get('program'), + 'episode': config.get('episode'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/kickstarter.py b/youtube_dl/extractor/kickstarter.py index c61e78622..fbe499497 100644 --- a/youtube_dl/extractor/kickstarter.py +++ b/youtube_dl/extractor/kickstarter.py @@ -6,7 +6,7 @@ from ..utils import smuggle_url class KickStarterIE(InfoExtractor): - _VALID_URL = r'https?://www\.kickstarter\.com/projects/(?P[^/]*)/.*' + _VALID_URL = r'https?://(?:www\.)?kickstarter\.com/projects/(?P[^/]*)/.*' _TESTS = [{ 'url': 'https://www.kickstarter.com/projects/1404461844/intersection-the-story-of-josh-grant/description', 'md5': 'c81addca81327ffa66c642b5d8b08cab', diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 0eeb9ffeb..63e10125e 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -59,7 +59,7 @@ class KuwoBaseIE(InfoExtractor): class KuwoIE(KuwoBaseIE): IE_NAME = 'kuwo:song' IE_DESC = '酷我音乐' - _VALID_URL = r'https?://www\.kuwo\.cn/yinyue/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/yinyue/(?P\d+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/yinyue/635632/', 'info_dict': { @@ -82,7 +82,7 @@ class KuwoIE(KuwoBaseIE): 'upload_date': '20150518', }, 'params': { - 'format': 'mp3-320' + 'format': 'mp3-320', }, }, { 'url': 'http://www.kuwo.cn/yinyue/3197154?catalog=yueku2016', @@ -91,10 +91,10 @@ class KuwoIE(KuwoBaseIE): def _real_extract(self, url): song_id = self._match_id(url) - webpage = self._download_webpage( + webpage, urlh = self._download_webpage_handle( url, song_id, note='Download song detail info', errnote='Unable to get song detail info') - if '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: + if song_id not in urlh.geturl() or '对不起,该歌曲由于版权问题已被下线,将返回网站首页' in webpage: raise ExtractorError('this song has been offline because of copyright issues', expected=True) song_name = self._html_search_regex( @@ -139,7 +139,7 @@ class KuwoIE(KuwoBaseIE): class KuwoAlbumIE(InfoExtractor): IE_NAME = 'kuwo:album' IE_DESC = '酷我音乐 - 专辑' - _VALID_URL = r'https?://www\.kuwo\.cn/album/(?P\d+?)/' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/album/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/album/502294/', 'info_dict': { @@ -181,7 +181,7 @@ class KuwoChartIE(InfoExtractor): 'info_dict': { 'id': '香港中文龙虎榜', }, - 'playlist_mincount': 10, + 'playlist_mincount': 7, } def _real_extract(self, url): @@ -200,7 +200,7 @@ class KuwoChartIE(InfoExtractor): class KuwoSingerIE(InfoExtractor): IE_NAME = 'kuwo:singer' IE_DESC = '酷我音乐 - 歌手' - _VALID_URL = r'https?://www\.kuwo\.cn/mingxing/(?P[^/]+)' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mingxing/(?P[^/]+)' _TESTS = [{ 'url': 'http://www.kuwo.cn/mingxing/bruno+mars/', 'info_dict': { @@ -296,14 +296,14 @@ class KuwoCategoryIE(InfoExtractor): class KuwoMvIE(KuwoBaseIE): IE_NAME = 'kuwo:mv' IE_DESC = '酷我音乐 - MV' - _VALID_URL = r'https?://www\.kuwo\.cn/mv/(?P\d+?)/' + _VALID_URL = r'https?://(?:www\.)?kuwo\.cn/mv/(?P\d+?)/' _TEST = { 'url': 'http://www.kuwo.cn/mv/6480076/', 'info_dict': { 'id': '6480076', 'ext': 'mp4', 'title': 'My HouseMV', - 'creator': 'PM02:00', + 'creator': '2PM', }, # In this video, music URLs (anti.s) are blocked outside China and # USA, while the MV URL (mvurl) is available globally, so force the MV diff --git a/youtube_dl/extractor/lci.py b/youtube_dl/extractor/lci.py new file mode 100644 index 000000000..af34829e7 --- /dev/null +++ b/youtube_dl/extractor/lci.py @@ -0,0 +1,24 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + 
+class LCIIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lci\.fr/[^/]+/[\w-]+-(?P\d+)\.html' + _TEST = { + 'url': 'http://www.lci.fr/international/etats-unis-a-j-62-hillary-clinton-reste-sans-voix-2001679.html', + 'md5': '2fdb2538b884d4d695f9bd2bde137e6c', + 'info_dict': { + 'id': '13244802', + 'ext': 'mp4', + 'title': 'Hillary Clinton et sa quinte de toux, en plein meeting', + 'description': 'md5:a4363e3a960860132f8124b62f4a01c9', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + wat_id = self._search_regex(r'data-watid=[\'"](\d+)', webpage, 'wat id') + return self.url_result('wat:' + wat_id, 'Wat', wat_id) diff --git a/youtube_dl/extractor/litv.py b/youtube_dl/extractor/litv.py index 05c6579f1..a3784e6c6 100644 --- a/youtube_dl/extractor/litv.py +++ b/youtube_dl/extractor/litv.py @@ -14,7 +14,7 @@ from ..utils import ( class LiTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' + _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' diff --git a/youtube_dl/extractor/lrt.py b/youtube_dl/extractor/lrt.py index 1072405b3..f5c997ef4 100644 --- a/youtube_dl/extractor/lrt.py +++ b/youtube_dl/extractor/lrt.py @@ -1,8 +1,11 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( + determine_ext, int_or_none, parse_duration, remove_end, @@ -12,8 +15,10 @@ from ..utils import ( class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P[0-9]+)' - _TEST = { + _TESTS = [{ + # m3u8 download 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', + 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', 'info_dict': { 'id': '54391', 'ext': 'mp4', @@ -23,20 +28,45 @@ class LRTIE(InfoExtractor): 'view_count': int, 'like_count': int, }, - 'params': { - 'skip_download': True, # m3u8 download + }, { + # direct mp3 download + 'url': 'http://www.lrt.lt/mediateka/irasas/1013074524/', + 'md5': '389da8ca3cad0f51d12bed0c844f6a0a', + 'info_dict': { + 'id': '1013074524', + 'ext': 'mp3', + 'title': 'Kita tema 2016-09-05 15:05', + 'description': 'md5:1b295a8fc7219ed0d543fc228c931fb5', + 'duration': 3008, + 'view_count': int, + 'like_count': int, }, - } + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' - LRT') - m3u8_url = self._search_regex( - r'file\s*:\s*(["\'])(?P.+?)\1\s*\+\s*location\.hash\.substring\(1\)', - webpage, 'm3u8 url', group='url') - formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') + + formats = [] + for _, file_url in re.findall( + r'file\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage): + ext = determine_ext(file_url) + if ext not in ('m3u8', 'mp3'): + continue + # mp3 served as m3u8 produces stuttered media file + if ext == 'm3u8' and '.mp3' in file_url: + continue + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + fatal=False)) + elif ext == 'mp3': + formats.append({ + 'url': file_url, + 'vcodec': 'none', + }) self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index a98c4c530..299873ecc 100644 --- 
a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -94,7 +94,7 @@ class LyndaBaseIE(InfoExtractor): class LyndaIE(LyndaBaseIE): IE_NAME = 'lynda' IE_DESC = 'lynda.com videos' - _VALID_URL = r'https?://www\.lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?lynda\.com/(?:[^/]+/[^/]+/\d+|player/embed)/(?P\d+)' _TIMECODE_REGEX = r'\[(?P\d+:\d+:\d+[\.,]\d+)\]' diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py index 3cd4a3a19..43db9929c 100644 --- a/youtube_dl/extractor/macgamestore.py +++ b/youtube_dl/extractor/macgamestore.py @@ -7,7 +7,7 @@ from ..utils import ExtractorError class MacGameStoreIE(InfoExtractor): IE_NAME = 'macgamestore' IE_DESC = 'MacGameStore trailers' - _VALID_URL = r'https?://www\.macgamestore\.com/mediaviewer\.php\?trailer=(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?macgamestore\.com/mediaviewer\.php\?trailer=(?P\d+)' _TEST = { 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450', diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py index 444ec0310..7d468d78b 100644 --- a/youtube_dl/extractor/metacritic.py +++ b/youtube_dl/extractor/metacritic.py @@ -9,7 +9,7 @@ from ..utils import ( class MetacriticIE(InfoExtractor): - _VALID_URL = r'https?://www\.metacritic\.com/.+?/trailers/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?metacritic\.com/.+?/trailers/(?P\d+)' _TESTS = [{ 'url': 'http://www.metacritic.com/game/playstation-4/infamous-second-son/trailers/3698222', diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 27bdff8b2..e0bb5d208 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -6,7 +6,7 @@ from ..utils import int_or_none class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?mgtv\.com/v/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' _TESTS = [{ diff --git a/youtube_dl/extractor/miaopai.py b/youtube_dl/extractor/miaopai.py new file mode 100644 index 000000000..f9e35ac7f --- /dev/null +++ b/youtube_dl/extractor/miaopai.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class MiaoPaiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?miaopai\.com/show/(?P[-A-Za-z0-9~_]+)' + _TEST = { + 'url': 'http://www.miaopai.com/show/n~0hO7sfV1nBEw4Y29-Hqg__.htm', + 'md5': '095ed3f1cd96b821add957bdc29f845b', + 'info_dict': { + 'id': 'n~0hO7sfV1nBEw4Y29-Hqg__', + 'ext': 'mp4', + 'title': '西游记音乐会的秒拍视频', + 'thumbnail': 're:^https?://.*/n~0hO7sfV1nBEw4Y29-Hqg___m.jpg', + } + } + + _USER_AGENT_IPAD = 'Mozilla/5.0 (iPad; CPU OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, video_id, headers={'User-Agent': self._USER_AGENT_IPAD}) + + title = self._html_search_regex( + r'([^<]+)', webpage, 'title') + thumbnail = self._html_search_regex( + r']+class=(?P[\'"]).*\bvideo_img\b.*(?P=q1)[^>]+data-url=(?P[\'"])(?P[^\'"]+)(?P=q2)', + webpage, 'thumbnail', fatal=False, group='url') + videos = self._parse_html5_media_entries(url, webpage, video_id) + info = videos[0] + + info.update({ + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + }) + return info diff --git a/youtube_dl/extractor/ministrygrid.py b/youtube_dl/extractor/ministrygrid.py index e48eba3fa..10190d5f6 100644 --- 
a/youtube_dl/extractor/ministrygrid.py +++ b/youtube_dl/extractor/ministrygrid.py @@ -8,7 +8,7 @@ from ..utils import ( class MinistryGridIE(InfoExtractor): - _VALID_URL = r'https?://www\.ministrygrid.com/([^/?#]*/)*(?P[^/#?]+)/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?ministrygrid\.com/([^/?#]*/)*(?P[^/#?]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.ministrygrid.com/training-viewer/-/training/t4g-2014-conference/the-gospel-by-numbers-4/the-gospel-by-numbers', diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index cd169f361..2294745d4 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -74,7 +74,7 @@ class MiTeleBaseIE(InfoExtractor): class MiTeleIE(MiTeleBaseIE): IE_DESC = 'mitele.es' - _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P[^/]+)/' + _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/){3}(?P[^/]+)/' _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', diff --git a/youtube_dl/extractor/moevideo.py b/youtube_dl/extractor/moevideo.py index 978d5d5bf..91ee9c4e9 100644 --- a/youtube_dl/extractor/moevideo.py +++ b/youtube_dl/extractor/moevideo.py @@ -35,7 +35,8 @@ class MoeVideoIE(InfoExtractor): 'height': 360, 'duration': 179, 'filesize': 17822500, - } + }, + 'skip': 'Video has been removed', }, { 'url': 'http://playreplay.net/video/77107.7f325710a627383d40540d8e991a', diff --git a/youtube_dl/extractor/motorsport.py b/youtube_dl/extractor/motorsport.py index 370328b36..c9d1ab64d 100644 --- a/youtube_dl/extractor/motorsport.py +++ b/youtube_dl/extractor/motorsport.py @@ -9,7 +9,7 @@ from ..compat import ( class MotorsportIE(InfoExtractor): IE_DESC = 'motorsport.com' - _VALID_URL = r'https?://www\.motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P[^/]+)/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?motorsport\.com/[^/?#]+/video/(?:[^/?#]+/)(?P[^/]+)/?(?:$|[?#])' _TEST = { 'url': 'http://www.motorsport.com/f1/video/main-gallery/red-bull-racing-2014-rules-explained/', 'info_dict': { diff --git a/youtube_dl/extractor/moviezine.py b/youtube_dl/extractor/moviezine.py index f130b75c4..aa091a62c 100644 --- a/youtube_dl/extractor/moviezine.py +++ b/youtube_dl/extractor/moviezine.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class MoviezineIE(InfoExtractor): - _VALID_URL = r'https?://www\.moviezine\.se/video/(?P[^?#]+)' + _VALID_URL = r'https?://(?:www\.)?moviezine\.se/video/(?P[^?#]+)' _TEST = { 'url': 'http://www.moviezine.se/video/205866', diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py index 1ca7b1a9e..2afe535b5 100644 --- a/youtube_dl/extractor/myspass.py +++ b/youtube_dl/extractor/myspass.py @@ -11,7 +11,7 @@ from ..utils import ( class MySpassIE(InfoExtractor): - _VALID_URL = r'https?://www\.myspass\.de/.*' + _VALID_URL = r'https?://(?:www\.)?myspass\.de/.*' _TEST = { 'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/', 'md5': '0b49f4844a068f8b33f4b7c88405862b', diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index f694e210b..7f1bd9229 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -13,7 +13,7 @@ from ..utils import ( class NBCIE(InfoExtractor): - _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?Pn?\d+)' + _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?Pn?\d+)' _TESTS = [ { @@ -138,7 +138,7 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): # Does not include https 
because its certificate is invalid - _VALID_URL = r'https?://www\.nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' _TEST = { 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', @@ -161,7 +161,7 @@ class NBCSportsIE(InfoExtractor): class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://www\.csnne\.com/video/(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' _TEST = { 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', @@ -335,3 +335,43 @@ class NBCNewsIE(ThePlatformIE): 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byId=%s' % video_id, 'ie_key': 'ThePlatformFeed', } + + +class NBCOlympicsIE(InfoExtractor): + _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P[a-z-]+)' + + _TEST = { + # Geo-restricted to US + 'url': 'http://www.nbcolympics.com/video/justin-roses-son-leo-was-tears-after-his-dad-won-gold', + 'md5': '54fecf846d05429fbaa18af557ee523a', + 'info_dict': { + 'id': 'WjTBzDXx5AUq', + 'display_id': 'justin-roses-son-leo-was-tears-after-his-dad-won-gold', + 'ext': 'mp4', + 'title': 'Rose\'s son Leo was in tears after his dad won gold', + 'description': 'Olympic gold medalist Justin Rose gets emotional talking to the impact his win in men\'s golf has already had on his children.', + 'timestamp': 1471274964, + 'upload_date': '20160815', + 'uploader': 'NBCU-SPORTS', + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + drupal_settings = self._parse_json(self._search_regex( + r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', + webpage, 'drupal settings'), display_id) + + iframe_url = drupal_settings['vod']['iframe_url'] + theplatform_url = iframe_url.replace( + 'vplayer.nbcolympics.com', 'player.theplatform.com') + + return { + '_type': 'url_transparent', + 'url': theplatform_url, + 'ie_key': ThePlatformIE.ie_key(), + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/ndr.py b/youtube_dl/extractor/ndr.py index 0cded6b5c..e3b0da2e9 100644 --- a/youtube_dl/extractor/ndr.py +++ b/youtube_dl/extractor/ndr.py @@ -23,7 +23,7 @@ class NDRBaseIE(InfoExtractor): class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' + _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[^/?#]+),[\da-z]+\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', @@ -105,7 +105,7 @@ class NDRIE(NDRBaseIE): class NJoyIE(NDRBaseIE): IE_NAME = 'njoy' IE_DESC = 'N-JOY' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?:(?P[^/?#]+),)?(?P[\da-z]+)\.html' + _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?:(?P[^/?#]+),)?(?P[\da-z]+)\.html' _TESTS = [{ # httpVideo, same content id 'url': 'http://www.n-joy.de/entertainment/comedy/comedy_contest/Benaissa-beim-NDR-Comedy-Contest,comedycontest2480.html', @@ -238,7 +238,7 @@ class NDREmbedBaseIE(InfoExtractor): class NDREmbedIE(NDREmbedBaseIE): IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://www\.ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:www\.)?ndr\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': 
'8b9306142fe65bbdefb5ce24edb6b0a9', @@ -332,7 +332,7 @@ class NDREmbedIE(NDREmbedBaseIE): class NJoyEmbedIE(NDREmbedBaseIE): IE_NAME = 'njoy:embed' - _VALID_URL = r'https?://www\.n-joy\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' + _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?P[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' _TESTS = [{ # httpVideo 'url': 'http://www.n-joy.de/events/reeperbahnfestival/doku948-player_image-bc168e87-5263-4d6d-bd27-bb643005a6de_theme-n-joy.html', diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 705940323..9bea610c8 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -1,15 +1,12 @@ from __future__ import unicode_literals -import json -import re - from .common import InfoExtractor class NewgroundsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P[0-9]+)' _TESTS = [{ - 'url': 'http://www.newgrounds.com/audio/listen/549479', + 'url': 'https://www.newgrounds.com/audio/listen/549479', 'md5': 'fe6033d297591288fa1c1f780386f07a', 'info_dict': { 'id': '549479', @@ -18,7 +15,7 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Burn7', } }, { - 'url': 'http://www.newgrounds.com/portal/view/673111', + 'url': 'https://www.newgrounds.com/portal/view/673111', 'md5': '3394735822aab2478c31b1004fe5e5bc', 'info_dict': { 'id': '673111', @@ -29,24 +26,20 @@ class NewgroundsIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - music_id = mobj.group('id') - webpage = self._download_webpage(url, music_id) + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) title = self._html_search_regex( r'([^>]+)', webpage, 'title') uploader = self._html_search_regex( - [r',"artist":"([^"]+)",', r'[\'"]owner[\'"]\s*:\s*[\'"]([^\'"]+)[\'"],'], - webpage, 'uploader') + r'Author\s*]+>([^<]+)', webpage, 'uploader', fatal=False) - music_url_json_string = self._html_search_regex( - r'({"url":"[^"]+"),', webpage, 'music url') + '}' - music_url_json = json.loads(music_url_json_string) - music_url = music_url_json['url'] + music_url = self._parse_json(self._search_regex( + r'"url":("[^"]+"),', webpage, ''), media_id) return { - 'id': music_id, + 'id': media_id, 'title': title, 'url': music_url, 'uploader': uploader, diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py index aae7aeeeb..a08e48c4b 100644 --- a/youtube_dl/extractor/nextmedia.py +++ b/youtube_dl/extractor/nextmedia.py @@ -7,7 +7,7 @@ from ..utils import parse_iso8601 class NextMediaIE(InfoExtractor): IE_DESC = '蘋果日報' - _VALID_URL = r'https?://hk.apple.nextmedia.com/[^/]+/[^/]+/(?P\d+)/(?P\d+)' + _VALID_URL = r'https?://hk\.apple\.nextmedia\.com/[^/]+/[^/]+/(?P\d+)/(?P\d+)' _TESTS = [{ 'url': 'http://hk.apple.nextmedia.com/realtime/news/20141108/53109199', 'md5': 'dff9fad7009311c421176d1ac90bfe4f', @@ -68,7 +68,7 @@ class NextMediaIE(InfoExtractor): class NextMediaActionNewsIE(NextMediaIE): IE_DESC = '蘋果日報 - 動新聞' - _VALID_URL = r'https?://hk.dv.nextmedia.com/actionnews/[^/]+/(?P\d+)/(?P\d+)/\d+' + _VALID_URL = r'https?://hk\.dv\.nextmedia\.com/actionnews/[^/]+/(?P\d+)/(?P\d+)/\d+' _TESTS = [{ 'url': 'http://hk.dv.nextmedia.com/actionnews/hit/20150121/19009428/20061460', 'md5': '05fce8ffeed7a5e00665d4b7cf0f9201', @@ -93,7 +93,7 @@ class NextMediaActionNewsIE(NextMediaIE): class AppleDailyIE(NextMediaIE): IE_DESC = '臺灣蘋果日報' - _VALID_URL = 
r'https?://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' + _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P\d+)/(?P\d+)(/.*)?' _TESTS = [{ 'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694', 'md5': 'a843ab23d150977cc55ef94f1e2c1e4d', diff --git a/youtube_dl/extractor/nhk.py b/youtube_dl/extractor/nhk.py index 691bdfa4e..5c8cd76dc 100644 --- a/youtube_dl/extractor/nhk.py +++ b/youtube_dl/extractor/nhk.py @@ -1,14 +1,15 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ExtractorError class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P.+?)\.html' + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/en/vod/(?P[^/]+/[^/?#&]+)' _TEST = { # Videos available only for a limited period of time. Visit # http://www3.nhk.or.jp/nhkworld/en/vod/ for working samples. - 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815.html', + 'url': 'http://www3.nhk.or.jp/nhkworld/en/vod/tokyofashion/20160815', 'info_dict': { 'id': 'A1bnNiNTE6nY3jLllS-BIISfcC_PpvF5', 'ext': 'flv', @@ -19,25 +20,25 @@ class NhkVodIE(InfoExtractor): }, 'skip': 'Videos available only for a limited period of time', } + _API_URL = 'http://api.nhk.or.jp/nhkworld/vodesdlist/v1/all/all/all.json?apikey=EJfK8jdS57GqlupFgAfAAwr573q01y6k' def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + data = self._download_json(self._API_URL, video_id) - embed_code = self._search_regex( - r'nw_vod_ooplayer\([^,]+,\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'ooyala embed code', group='id') + try: + episode = next( + e for e in data['data']['episodes'] + if e.get('url') and video_id in e['url']) + except StopIteration: + raise ExtractorError('Unable to find episode') - title = self._search_regex( - r']+class=["\']episode-detail["\']>\s*([^<]+)', - webpage, 'title', default=None) - description = self._html_search_regex( - r'(?s)]+class=["\']description["\'][^>]*>(.+?)
', - webpage, 'description', default=None) - series = self._search_regex( - r']+class=["\']detail-top-player-title[^>]+>]+>([^<]+)', - webpage, 'series', default=None) + embed_code = episode['vod_id'] + + title = episode.get('sub_title_clean') or episode['sub_title'] + description = episode.get('description_clean') or episode.get('description') + series = episode.get('title_clean') or episode.get('title') return { '_type': 'url_transparent', diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 64730a624..57cf1ce8e 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -69,13 +69,16 @@ class NickIE(MTVServicesInfoExtractor): class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?nick\.de/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?:nick\.de|nickelodeon\.nl)/(?:playlist|shows)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', 'only_matching': True, }, { 'url': 'http://www.nick.de/shows/342-icarly', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.nl/shows/474-spongebob/videos/17403-een-kijkje-in-de-keuken-met-sandy-van-binnenuit', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index dd75a48af..6eaaa8416 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -252,7 +252,7 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P\d+)' _TEST = { 'url': 'http://www.nicovideo.jp/mylist/27411728', diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py index faa577237..351bea7ba 100644 --- a/youtube_dl/extractor/ninenow.py +++ b/youtube_dl/extractor/ninenow.py @@ -44,7 +44,20 @@ class NineNowIE(InfoExtractor): page_data = self._parse_json(self._search_regex( r'window\.__data\s*=\s*({.*?});', webpage, 'page data'), display_id) - common_data = page_data.get('episode', {}).get('episode') or page_data.get('clip', {}).get('clip') + + for kind in ('episode', 'clip'): + current_key = page_data.get(kind, {}).get( + 'current%sKey' % kind.capitalize()) + if not current_key: + continue + cache = page_data.get(kind, {}).get('%sCache' % kind, {}) + if not cache: + continue + common_data = (cache.get(current_key) or list(cache.values())[0])[kind] + break + else: + raise ExtractorError('Unable to find video data') + video_data = common_data['video'] if video_data.get('drm'): diff --git a/youtube_dl/extractor/oktoberfesttv.py b/youtube_dl/extractor/oktoberfesttv.py index 4a41c0542..f2ccc53dc 100644 --- a/youtube_dl/extractor/oktoberfesttv.py +++ b/youtube_dl/extractor/oktoberfesttv.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class OktoberfestTVIE(InfoExtractor): - _VALID_URL = r'https?://www\.oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?oktoberfest-tv\.de/[^/]+/[^/]+/video/(?P[^/?#]+)' _TEST = { 'url': 'http://www.oktoberfest-tv.de/de/kameras/video/hb-zelt', diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py index fc22ad5eb..9cbc7c2e2 100644 --- a/youtube_dl/extractor/onet.py +++ b/youtube_dl/extractor/onet.py @@ -90,7 +90,7 @@ class OnetBaseIE(InfoExtractor): class OnetIE(OnetBaseIE): - _VALID_URL = 
'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' + _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P[0-9a-z-]+)/(?P[0-9a-z]+)' IE_NAME = 'onet.tv' _TEST = { diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 03baf8e32..c261a7455 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -13,7 +13,7 @@ from ..utils import ( class OpenloadIE(InfoExtractor): - _VALID_URL = r'https://openload.(?:co|io)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' + _VALID_URL = r'https?://openload\.(?:co|io)/(?:f|embed)/(?P[a-zA-Z0-9-_]+)' _TESTS = [{ 'url': 'https://openload.co/f/kUEfGclsU9o', @@ -60,7 +60,7 @@ class OpenloadIE(InfoExtractor): if j >= 33 and j <= 126: j = ((j + 14) % 94) + 33 if idx == len(enc_data) - 1: - j += 1 + j += 3 video_url_chars += compat_chr(j) video_url = 'https://openload.co/stream/%s?mime=true' % ''.join(video_url_chars) diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py index 0a423a08f..874aacc55 100644 --- a/youtube_dl/extractor/parliamentliveuk.py +++ b/youtube_dl/extractor/parliamentliveuk.py @@ -1,53 +1,40 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class ParliamentLiveUKIE(InfoExtractor): IE_NAME = 'parliamentlive.tv' IE_DESC = 'UK parliament videos' - _VALID_URL = r'https?://www\.parliamentlive\.tv/Main/Player\.aspx\?(?:[^&]+&)*?meetingId=(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TEST = { - 'url': 'http://www.parliamentlive.tv/Main/Player.aspx?meetingId=15121&player=windowsmedia', + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'info_dict': { - 'id': '15121', - 'ext': 'asf', - 'title': 'hoc home affairs committee, 18 mar 2014.pm', - 'description': 'md5:033b3acdf83304cd43946b2d5e5798d1', + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'uploader_id': 'FFMPEG-01', + 'timestamp': 1422696664, + 'upload_date': '20150131', }, - 'params': { - 'skip_download': True, # Requires mplayer (mms) - } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - asx_url = self._html_search_regex( - r'embed.*?src="([^"]+)" name="MediaPlayer"', webpage, - 'metadata URL') - asx = self._download_xml(asx_url, video_id, 'Downloading ASX metadata') - video_url = asx.find('.//REF').attrib['HREF'] - - title = self._search_regex( - r'''(?x)player\.setClipDetails\( - (?:(?:[0-9]+|"[^"]+"),\s*){2} - "([^"]+",\s*"[^"]+)" - ''', - webpage, 'title').replace('", "', ', ') - description = self._html_search_regex( - r'(?s)(.*?)', - webpage, 'description') - + video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) + widget_config = self._parse_json(self._search_regex( + r'kWidgetConfig\s*=\s*({.+});', + webpage, 'kaltura widget config'), video_id) + kaltura_url = 'kaltura:%s:%s' % (widget_config['wid'][1:], widget_config['entry_id']) + event_title = self._download_json( + 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] return { + '_type': 'url_transparent', 'id': video_id, - 'ext': 'asf', - 'url': video_url, - 'title': title, - 'description': description, + 'title': event_title, + 'description': '', + 'url': kaltura_url, + 'ie_key': 
'Kaltura', } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 6c640089d..eb1aeba46 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -94,7 +94,7 @@ class PeriscopeIE(PeriscopeBaseIE): class PeriscopeUserIE(PeriscopeBaseIE): - _VALID_URL = r'https?://www\.periscope\.tv/(?P[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py index 78d219299..79c2db085 100644 --- a/youtube_dl/extractor/playvid.py +++ b/youtube_dl/extractor/playvid.py @@ -14,7 +14,7 @@ from ..utils import ( class PlayvidIE(InfoExtractor): - _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P.+?)(?:#|$)' + _VALID_URL = r'https?://(?:www\.)?playvid\.com/watch(\?v=|/)(?P.+?)(?:#|$)' _TESTS = [{ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py index f559b899f..5ff173774 100644 --- a/youtube_dl/extractor/polskieradio.py +++ b/youtube_dl/extractor/polskieradio.py @@ -1,14 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor from ..compat import ( compat_str, compat_urllib_parse_unquote, + compat_urlparse ) from ..utils import ( + extract_attributes, int_or_none, strip_or_none, unified_timestamp, @@ -97,3 +100,81 @@ class PolskieRadioIE(InfoExtractor): description = strip_or_none(self._og_search_description(webpage)) return self.playlist_result(entries, playlist_id, title, description) + + +class PolskieRadioCategoryIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P\d+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', + 'info_dict': { + 'id': '5102', + 'title': 'HISTORIA ŻYWA', + }, + 'playlist_mincount': 38, + }, { + 'url': 'http://www.polskieradio.pl/7/4807', + 'info_dict': { + 'id': '4807', + 'title': 'Vademecum 1050. 
rocznicy Chrztu Polski' + }, + 'playlist_mincount': 5 + }, { + 'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', + 'only_matching': True + }, { + 'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', + 'info_dict': { + 'id': '4143', + 'title': 'Kierunek Kraków', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', + 'info_dict': { + 'id': '214', + 'title': 'Muzyka', + }, + 'playlist_mincount': 61 + }, { + 'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if PolskieRadioIE.suitable(url) else super(PolskieRadioCategoryIE, cls).suitable(url) + + def _entries(self, url, page, category_id): + content = page + for page_num in itertools.count(2): + for a_entry, entry_id in re.findall( + r'(?s)]+>.*?(]+href=["\']/\d+/\d+/Artykul/(\d+)[^>]+>).*?', + content): + entry = extract_attributes(a_entry) + href = entry.get('href') + if not href: + continue + yield self.url_result( + compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), + entry_id, entry.get('title')) + mobj = re.search( + r']+class=["\']next["\'][^>]*>\s*]+href=(["\'])(?P(?:(?!\1).)+)\1', + content) + if not mobj: + break + next_url = compat_urlparse.urljoin(url, mobj.group('url')) + content = self._download_webpage( + next_url, category_id, 'Downloading page %s' % page_num) + + def _real_extract(self, url): + category_id = self._match_id(url) + webpage = self._download_webpage(url, category_id) + title = self._html_search_regex( + r'([^<]+) - [^<]+ - [^<]+', + webpage, 'title', fatal=False) + return self.playlist_result( + self._entries(url, webpage, category_id), + category_id, title) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 20976c101..0724efc09 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -15,6 +15,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + js_to_json, orderedSet, sanitized_Request, str_to_int, @@ -48,6 +49,8 @@ class PornHubIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 18, + 'tags': list, + 'categories': list, }, }, { # non-ASCII title @@ -63,6 +66,8 @@ class PornHubIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, 'age_limit': 18, + 'tags': list, + 'categories': list, }, 'params': { 'skip_download': True, @@ -183,6 +188,15 @@ class PornHubIE(InfoExtractor): }) self._sort_formats(formats) + page_params = self._parse_json(self._search_regex( + r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', + webpage, 'page parameters', group='data', default='{}'), + video_id, transform_source=js_to_json, fatal=False) + tags = categories = None + if page_params: + tags = page_params.get('tags', '').split(',') + categories = page_params.get('categories', '').split(',') + return { 'id': video_id, 'uploader': video_uploader, @@ -195,6 +209,8 @@ class PornHubIE(InfoExtractor): 'comment_count': comment_count, 'formats': formats, 'age_limit': 18, + 'tags': tags, + 'categories': categories, } diff --git a/youtube_dl/extractor/pornovoisines.py b/youtube_dl/extractor/pornovoisines.py index 6b51e5c54..58f557e39 100644 --- a/youtube_dl/extractor/pornovoisines.py +++ b/youtube_dl/extractor/pornovoisines.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import random from 
.common import InfoExtractor from ..utils import ( @@ -13,61 +12,69 @@ from ..utils import ( class PornoVoisinesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/showvideo/(?P\d+)/(?P[^/]+)' - - _VIDEO_URL_TEMPLATE = 'http://stream%d.pornovoisines.com' \ - '/static/media/video/transcoded/%s-640x360-1000-trscded.mp4' - - _SERVER_NUMBERS = (1, 2) + _VALID_URL = r'https?://(?:www\.)?pornovoisines\.com/videos/show/(?P\d+)/(?P[^/.]+)' _TEST = { - 'url': 'http://www.pornovoisines.com/showvideo/1285/recherche-appartement/', - 'md5': '5ac670803bc12e9e7f9f662ce64cf1d1', + 'url': 'http://www.pornovoisines.com/videos/show/919/recherche-appartement.html', + 'md5': '6f8aca6a058592ab49fe701c8ba8317b', 'info_dict': { - 'id': '1285', + 'id': '919', 'display_id': 'recherche-appartement', 'ext': 'mp4', 'title': 'Recherche appartement', - 'description': 'md5:819ea0b785e2a04667a1a01cdc89594e', + 'description': 'md5:fe10cb92ae2dd3ed94bb4080d11ff493', 'thumbnail': 're:^https?://.*\.jpg$', 'upload_date': '20140925', 'duration': 120, 'view_count': int, 'average_rating': float, - 'categories': ['Débutantes', 'Scénario', 'Sodomie'], + 'categories': ['Débutante', 'Débutantes', 'Scénario', 'Sodomie'], 'age_limit': 18, + 'subtitles': { + 'fr': [{ + 'ext': 'vtt', + }] + }, } } - @classmethod - def build_video_url(cls, num): - return cls._VIDEO_URL_TEMPLATE % (random.choice(cls._SERVER_NUMBERS), num) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') display_id = mobj.group('display_id') + settings_url = self._download_json( + 'http://www.pornovoisines.com/api/video/%s/getsettingsurl/' % video_id, + video_id, note='Getting settings URL')['video_settings_url'] + settings = self._download_json(settings_url, video_id)['data'] + + formats = [] + for kind, data in settings['variants'].items(): + if kind == 'HLS': + formats.extend(self._extract_m3u8_formats( + data, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls')) + elif kind == 'MP4': + for item in data: + formats.append({ + 'url': item['url'], + 'height': item.get('height'), + 'bitrate': item.get('bitrate'), + }) + self._sort_formats(formats) + webpage = self._download_webpage(url, video_id) - video_url = self.build_video_url(video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) - title = self._html_search_regex( - r'
(.+?)', webpage, 'title', flags=re.DOTALL) - description = self._html_search_regex( - r'(.+?)', - webpage, 'description', fatal=False, flags=re.DOTALL) - - thumbnail = self._search_regex( - r'
\s*]+class=([\'"])thumb\1[^>]*src=([\'"])(?P[^"]+)\2', + webpage, 'thumbnail', fatal=False, group='url') upload_date = unified_strdate(self._search_regex( - r'Publié le ([\d-]+)', webpage, 'upload date', fatal=False)) - duration = int_or_none(self._search_regex( - 'Durée (\d+)', webpage, 'duration', fatal=False)) + r'Le\s*([\d/]+)', webpage, 'upload date', fatal=False)) + duration = settings.get('main', {}).get('duration') view_count = int_or_none(self._search_regex( r'(\d+) vues', webpage, 'view count', fatal=False)) average_rating = self._search_regex( @@ -75,15 +82,19 @@ class PornoVoisinesIE(InfoExtractor): if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) - categories = self._html_search_meta( - 'keywords', webpage, 'categories', fatal=False) + categories = self._html_search_regex( + r'(?s)Catégories\s*:\s*(.+?)', webpage, 'categories', fatal=False) if categories: categories = [category.strip() for category in categories.split(',')] + subtitles = {'fr': [{ + 'url': subtitle, + } for subtitle in settings.get('main', {}).get('vtt_tracks', {}).values()]} + return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'description': description, 'thumbnail': thumbnail, @@ -93,4 +104,5 @@ class PornoVoisinesIE(InfoExtractor): 'average_rating': average_rating, 'categories': categories, 'age_limit': 18, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index c6eee3b72..7335dc2af 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -15,7 +15,111 @@ from ..utils import ( ) -class ProSiebenSat1IE(InfoExtractor): +class ProSiebenSat1BaseIE(InfoExtractor): + def _extract_video_info(self, url, clip_id): + client_location = url + + video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': self._TOKEN, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'ids': clip_id, + })[0] + + if video.get('is_protected') is True: + raise ExtractorError('This video is DRM protected.', expected=True) + + duration = float_or_none(video.get('duration')) + source_ids = [compat_str(source['id']) for source in video['sources']] + + client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + }) + server_id = sources['server_id'] + + def fix_bitrate(bitrate): + bitrate = int_or_none(bitrate) + if not bitrate: + return None + return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate + + formats = [] + for source_id in source_ids: + client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + urls = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': self._TOKEN, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': self._CLIENT_NAME, + 'server_id': server_id, + 'source_ids': source_id, + 
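The ProSiebenSat1BaseIE refactor above moves the access-token handling into a base class (reused further down by the puls4.py hunk, which only supplies its own `_TOKEN`, `_SALT` and `_CLIENT_NAME`). The `client_id` sent with each API call is the first two characters of the salt followed by a hex SHA1 over a salted concatenation of the request parameters; the per-source URL request repeats the same recipe with a different field order. A minimal standalone sketch of that derivation, assuming Python 3 and placeholder values rather than real site credentials:

```python
from hashlib import sha1


def make_client_id(clip_id, salt, token, client_location, client_name):
    # Mirrors the first client_id computation in the hunk above: the first two
    # characters of the salt, then a hex SHA1 over the salted parameter string.
    payload = ''.join([clip_id, salt, token, client_location, salt, client_name])
    return salt[:2] + sha1(payload.encode('utf-8')).hexdigest()


# Placeholder values, for illustration only.
print(make_client_id(
    clip_id='1234567',
    salt='01!example_salt',
    token='prosieben',
    client_location='http://www.prosieben.de/some/clip',
    client_name='kolibri-2.0.19-splec4'))
```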
}) + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?Prtmpe?://[^/]+)/(?P.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) + self._sort_formats(formats) + + return { + 'duration': duration, + 'formats': formats, + } + + +class ProSiebenSat1IE(ProSiebenSat1BaseIE): IE_NAME = 'prosiebensat1' IE_DESC = 'ProSiebenSat.1 Digital' _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany|7tv)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P.+)' @@ -188,6 +292,9 @@ class ProSiebenSat1IE(InfoExtractor): }, ] + _TOKEN = 'prosieben' + _SALT = '01!8d8F_)r9]4s[qeuXfP%' + _CLIENT_NAME = 'kolibri-2.0.19-splec4' _CLIPID_REGEXES = [ r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', @@ -234,123 +341,22 @@ class ProSiebenSat1IE(InfoExtractor): def _extract_clip(self, url, webpage): clip_id = self._html_search_regex( self._CLIPID_REGEXES, webpage, 'clip id') - - access_token = 'prosieben' - client_name = 'kolibri-2.0.19-splec4' - client_location = url - - video = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos', - clip_id, 'Downloading videos JSON', query={ - 'access_token': access_token, - 'client_location': client_location, - 'client_name': client_name, - 'ids': clip_id, - })[0] - - if video.get('is_protected') is True: - raise ExtractorError('This video is DRM protected.', expected=True) - - duration = float_or_none(video.get('duration')) - source_ids = [compat_str(source['id']) for source in video['sources']] - - g = '01!8d8F_)r9]4s[qeuXfP%' - client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest() - - sources = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, - clip_id, 'Downloading sources JSON', query={ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - }) - server_id = sources['server_id'] - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - - def fix_bitrate(bitrate): - bitrate = int_or_none(bitrate) - if not bitrate: - return None - return (bitrate // 1000) if 
bitrate % 1000 == 0 else bitrate - - formats = [] - for source_id in source_ids: - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() - urls = self._download_json( - 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, - clip_id, 'Downloading urls JSON', fatal=False, query={ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': source_id, - }) - if not urls: - continue - if urls.get('status_code') != 0: - raise ExtractorError('This video is unavailable', expected=True) - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() - for source in urls_sources: - source_url = source.get('url') - if not source_url: - continue - protocol = source.get('protocol') - mimetype = source.get('mimetype') - if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats( - source_url, clip_id, f4m_id='hds', fatal=False)) - elif mimetype == 'application/x-mpegURL': - formats.extend(self._extract_m3u8_formats( - source_url, clip_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - tbr = fix_bitrate(source['bitrate']) - if protocol in ('rtmp', 'rtmpe'): - mobj = re.search(r'^(?Prtmpe?://[^/]+)/(?P.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'tbr': tbr, - 'ext': 'flv', - 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), - }) - else: - formats.append({ - 'url': source_url, - 'tbr': tbr, - 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), - }) - self._sort_formats(formats) - + info = self._extract_video_info(url, clip_id) description = self._html_search_regex( self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._html_search_regex( self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - return { + info.update({ 'id': clip_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'upload_date': upload_date, - 'duration': duration, - 'formats': formats, - } + }) + return info def _extract_playlist(self, url, webpage): playlist_id = self._html_search_regex( diff --git a/youtube_dl/extractor/puls4.py b/youtube_dl/extractor/puls4.py index fca30e1aa..9c2ccbe2d 100644 --- a/youtube_dl/extractor/puls4.py +++ b/youtube_dl/extractor/puls4.py @@ -1,88 +1,51 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals -from .common import InfoExtractor +from .prosiebensat1 import ProSiebenSat1BaseIE from ..utils import ( - ExtractorError, unified_strdate, - int_or_none, + parse_duration, + compat_str, ) -class Puls4IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?puls4\.com/video/[^/]+/play/(?P[0-9]+)' +class Puls4IE(ProSiebenSat1BaseIE): + _VALID_URL = r'https?://(?:www\.)?puls4\.com/(?P(?:[^/]+/)*?videos/[^?#]+)' _TESTS = [{ - 'url': 'http://www.puls4.com/video/pro-und-contra/play/2716816', - 'md5': '49f6a6629747eeec43cef6a46b5df81d', + 'url': 
'http://www.puls4.com/2-minuten-2-millionen/staffel-3/videos/2min2miotalk/Tobias-Homberger-von-myclubs-im-2min2miotalk-118118', + 'md5': 'fd3c6b0903ac72c9d004f04bc6bb3e03', 'info_dict': { - 'id': '2716816', - 'ext': 'mp4', - 'title': 'Pro und Contra vom 23.02.2015', - 'description': 'md5:293e44634d9477a67122489994675db6', - 'duration': 2989, - 'upload_date': '20150224', + 'id': '118118', + 'ext': 'flv', + 'title': 'Tobias Homberger von myclubs im #2min2miotalk', + 'description': 'md5:f9def7c5e8745d6026d8885487d91955', + 'upload_date': '20160830', 'uploader': 'PULS_4', }, - 'skip': 'Only works from Germany', - }, { - 'url': 'http://www.puls4.com/video/kult-spielfilme/play/1298106', - 'md5': '6a48316c8903ece8dab9b9a7bf7a59ec', - 'info_dict': { - 'id': '1298106', - 'ext': 'mp4', - 'title': 'Lucky Fritz', - }, - 'skip': 'Only works from Germany', }] + _TOKEN = 'puls4' + _SALT = '01!kaNgaiNgah1Ie4AeSha' + _CLIENT_NAME = '' def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - error_message = self._html_search_regex( - r']+class="message-error"[^>]*>(.+?)
', - webpage, 'error message', default=None) - if error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) - - real_url = self._html_search_regex( - r'\"fsk-button\".+?href=\"([^"]+)', - webpage, 'fsk_button', default=None) - if real_url: - webpage = self._download_webpage(real_url, video_id) - - player = self._search_regex( - r'p4_video_player(?:_iframe)?\("video_\d+_container"\s*,(.+?)\);\s*\}', - webpage, 'player') - - player_json = self._parse_json( - '[%s]' % player, video_id, - transform_source=lambda s: s.replace('undefined,', '')) - - formats = None - result = None - - for v in player_json: - if isinstance(v, list) and not formats: - formats = [{ - 'url': f['url'], - 'format': 'hd' if f.get('hd') else 'sd', - 'width': int_or_none(f.get('size_x')), - 'height': int_or_none(f.get('size_y')), - 'tbr': int_or_none(f.get('bitrate')), - } for f in v] - self._sort_formats(formats) - elif isinstance(v, dict) and not result: - result = { - 'id': video_id, - 'title': v['videopartname'].strip(), - 'description': v.get('videotitle'), - 'duration': int_or_none(v.get('videoduration') or v.get('episodeduration')), - 'upload_date': unified_strdate(v.get('clipreleasetime')), - 'uploader': v.get('channel'), - } - - result['formats'] = formats - - return result + path = self._match_id(url) + content_path = self._download_json( + 'http://www.puls4.com/api/json-fe/page/' + path, path)['content'][0]['url'] + media = self._download_json( + 'http://www.puls4.com' + content_path, + content_path)['mediaCurrent'] + player_content = media['playerContent'] + info = self._extract_video_info(url, player_content['id']) + info.update({ + 'id': compat_str(media['objectId']), + 'title': player_content['title'], + 'description': media.get('description'), + 'thumbnail': media.get('previewLink'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(player_content.get('duration')), + 'episode': player_content.get('episodePartName'), + 'show': media.get('channel'), + 'season_id': player_content.get('seasonId'), + 'uploader': player_content.get('sourceCompany'), + }) + return info diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py index ff0af9543..37cb9e2c9 100644 --- a/youtube_dl/extractor/qqmusic.py +++ b/youtube_dl/extractor/qqmusic.py @@ -18,7 +18,7 @@ from ..utils import ( class QQMusicIE(InfoExtractor): IE_NAME = 'qqmusic' IE_DESC = 'QQ音乐' - _VALID_URL = r'https?://y.qq.com/#type=song&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD', 'md5': '9ce1c1c8445f561506d2e3cfb0255705', @@ -172,7 +172,7 @@ class QQPlaylistBaseIE(InfoExtractor): class QQMusicSingerIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:singer' IE_DESC = 'QQ音乐 - 歌手' - _VALID_URL = r'https?://y.qq.com/#type=singer&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P[0-9A-Za-z]+)' _TEST = { 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2', 'info_dict': { @@ -217,7 +217,7 @@ class QQMusicSingerIE(QQPlaylistBaseIE): class QQMusicAlbumIE(QQPlaylistBaseIE): IE_NAME = 'qqmusic:album' IE_DESC = 'QQ音乐 - 专辑' - _VALID_URL = r'https?://y.qq.com/#type=album&mid=(?P[0-9A-Za-z]+)' + _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P[0-9A-Za-z]+)' _TESTS = [{ 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1', diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py new file mode 
100644 index 000000000..f3bb4fa66 --- /dev/null +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -0,0 +1,39 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .brightcove import BrightcoveLegacyIE +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) + + +class RMCDecouverteIE(InfoExtractor): + _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P\d+)' + + _TEST = { + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'info_dict': { + 'id': '5111223049001', + 'ext': 'mp4', + 'title': ': LES HEROS DU 88e ETAGE', + 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'uploader_id': '1969646226001', + 'upload_date': '20160904', + 'timestamp': 1472951103, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + 'skip': 'Only works from France', + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] + return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/rottentomatoes.py b/youtube_dl/extractor/rottentomatoes.py index f9cd48790..1d404d20a 100644 --- a/youtube_dl/extractor/rottentomatoes.py +++ b/youtube_dl/extractor/rottentomatoes.py @@ -1,31 +1,32 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse from .internetvideoarchive import InternetVideoArchiveIE class RottenTomatoesIE(InfoExtractor): - _VALID_URL = r'https?://www\.rottentomatoes\.com/m/[^/]+/trailers/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?rottentomatoes\.com/m/[^/]+/trailers/(?P\d+)' _TEST = { 'url': 'http://www.rottentomatoes.com/m/toy_story_3/trailers/11028566/', 'info_dict': { - 'id': '613340', + 'id': '11028566', 'ext': 'mp4', 'title': 'Toy Story 3', + 'description': 'From the creators of the beloved TOY STORY films, comes a story that will reunite the gang in a whole new way.', + 'thumbnail': 're:^https?://.*\.jpg$', }, } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - og_video = self._og_search_video_url(webpage) - query = compat_urlparse.urlparse(og_video).query + iva_id = self._search_regex(r'publishedid=(\d+)', webpage, 'internet video archive id') return { '_type': 'url_transparent', - 'url': InternetVideoArchiveIE._build_xml_url(query), + 'url': 'http://video.internetvideoarchive.net/player/6/configuration.ashx?domain=www.videodetective.com&customerid=69249&playerid=641&publishedid=' + iva_id, 'ie_key': InternetVideoArchiveIE.ie_key(), + 'id': video_id, 'title': self._og_search_title(webpage), } diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py index 41638c1d0..65284643b 100644 --- a/youtube_dl/extractor/roxwel.py +++ b/youtube_dl/extractor/roxwel.py @@ -7,7 +7,7 @@ from ..utils import unified_strdate, determine_ext class RoxwelIE(InfoExtractor): - _VALID_URL = r'https?://www\.roxwel\.com/player/(?P.+?)(\.|\?|$)' + _VALID_URL = 
r'https?://(?:www\.)?roxwel\.com/player/(?P.+?)(\.|\?|$)' _TEST = { 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index 34f9c4a99..f1b92f6da 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -64,7 +64,7 @@ def _decrypt_url(png): class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' IE_DESC = 'RTVE a la carta' - _VALID_URL = r'https?://www\.rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/(m/)?(alacarta/videos|filmoteca)/[^/]+/[^/]+/(?P\d+)' _TESTS = [{ 'url': 'http://www.rtve.es/alacarta/videos/balonmano/o-swiss-cup-masculina-final-espana-suecia/2491869/', @@ -184,7 +184,7 @@ class RTVEInfantilIE(InfoExtractor): class RTVELiveIE(InfoExtractor): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' - _VALID_URL = r'https?://www\.rtve\.es/directo/(?P[a-zA-Z0-9-]+)' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' _TESTS = [{ 'url': 'http://www.rtve.es/directo/la-1/', @@ -226,7 +226,7 @@ class RTVELiveIE(InfoExtractor): class RTVETelevisionIE(InfoExtractor): IE_NAME = 'rtve.es:television' - _VALID_URL = r'https?://www\.rtve\.es/television/[^/]+/[^/]+/(?P\d+).shtml' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/television/[^/]+/[^/]+/(?P\d+).shtml' _TEST = { 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index 9ca4ae147..5d0ace5bf 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -88,7 +88,7 @@ class RutubeIE(InfoExtractor): class RutubeEmbedIE(InfoExtractor): IE_NAME = 'rutube:embed' IE_DESC = 'Rutube embedded videos' - _VALID_URL = 'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' + _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P[0-9]+)' _TESTS = [{ 'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=', diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 08ddbe3c4..eabe41efe 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -103,13 +103,13 @@ class SafariIE(SafariBaseIE): webpage = self._download_webpage(url, video_id) reference_id = self._search_regex( - r'data-reference-id=(["\'])(?P.+?)\1', + r'data-reference-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura reference id', group='id') partner_id = self._search_regex( - r'data-partner-id=(["\'])(?P.+?)\1', + r'data-partner-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura widget id', group='id') ui_id = self._search_regex( - r'data-ui-id=(["\'])(?P.+?)\1', + r'data-ui-id=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'kaltura uiconf id', group='id') query = { diff --git a/youtube_dl/extractor/screenjunkies.py b/youtube_dl/extractor/screenjunkies.py index dd0a6ba19..02e574cd8 100644 --- a/youtube_dl/extractor/screenjunkies.py +++ b/youtube_dl/extractor/screenjunkies.py @@ -11,7 +11,7 @@ from ..utils import ( class ScreenJunkiesIE(InfoExtractor): - _VALID_URL = r'https?://www.screenjunkies.com/video/(?P[^/]+?)(?:-(?P\d+))?(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:www\.)?screenjunkies\.com/video/(?P[^/]+?)(?:-(?P\d+))?(?:[/?#&]|$)' _TESTS = [{ 'url': 'http://www.screenjunkies.com/video/best-quentin-tarantino-movie-2841915', 'md5': '5c2b686bec3d43de42bde9ec047536b0', diff --git a/youtube_dl/extractor/senateisvp.py b/youtube_dl/extractor/senateisvp.py index c5f474dd1..35540c082 100644 --- 
a/youtube_dl/extractor/senateisvp.py +++ b/youtube_dl/extractor/senateisvp.py @@ -48,7 +48,7 @@ class SenateISVPIE(InfoExtractor): ['arch', '', 'http://ussenate-f.akamaihd.net/'] ] _IE_NAME = 'senate.gov' - _VALID_URL = r'https?://www\.senate\.gov/isvp/?\?(?P.+)' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P.+)' _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', 'info_dict': { diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 4967c1b77..74a1dc672 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -14,7 +14,7 @@ from ..utils import ( class SlideshareIE(InfoExtractor): - _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P.+?)($|\?)' + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 3c552807e..b41d9f59f 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -103,7 +103,7 @@ class SpiegelIE(InfoExtractor): class SpiegelArticleIE(InfoExtractor): - _VALID_URL = 'https?://www\.spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' IE_NAME = 'Spiegel:Article' IE_DESC = 'Articles on spiegel.de' _TESTS = [{ diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index ab8bab5cd..def7e5a2c 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -8,7 +8,7 @@ from ..utils import ( class SyfyIE(AdobePassIE): - _VALID_URL = r'https?://www\.syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', 'info_dict': { diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py index 79b00e376..0c351e045 100644 --- a/youtube_dl/extractor/tbs.py +++ b/youtube_dl/extractor/tbs.py @@ -53,7 +53,7 @@ class TBSIE(TurnerBaseIE): 'media_src': 'http://ht.cdn.turner.com/%s/big' % site, }, 'secure': { - 'media_src': 'http://apple-secure.cdn.turner.com/%s/big' % site, + 'media_src': 'http://androidhls-secure.cdn.turner.com/%s/big' % site, 'tokenizer_src': 'http://www.%s.com/video/processors/services/token_ipadAdobe.do' % domain, }, }) diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py index d14d93e3a..e89759714 100644 --- a/youtube_dl/extractor/teachingchannel.py +++ b/youtube_dl/extractor/teachingchannel.py @@ -7,7 +7,7 @@ from .ooyala import OoyalaIE class TeachingChannelIE(InfoExtractor): - _VALID_URL = r'https?://www\.teachingchannel\.org/videos/(?P<title>.+)' + _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos/(?P<title>.+)' _TEST = { 'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution', diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 2ecfd0405..d5abfc9e4 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -6,7 +6,7 @@ from .mitele import MiTeleBaseIE class TelecincoIE(MiTeleBaseIE): IE_DESC = 'telecinco.es, 
cuatro.com and mediaset.es' - _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' + _VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py new file mode 100644 index 000000000..4043fcb92 --- /dev/null +++ b/youtube_dl/extractor/telequebec.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TeleQuebecIE(InfoExtractor): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' + _TEST = { + 'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york', + 'md5': 'fe95a0957e5707b1b01f5013e725c90f', + 'info_dict': { + 'id': '20984', + 'ext': 'mp4', + 'title': 'Le couronnement de New York', + 'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', + 'upload_date': '20160220', + 'timestamp': 1455965438, + } + } + + def _real_extract(self, url): + media_id = self._match_id(url) + media_data = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media_id)['media'] + return { + '_type': 'url_transparent', + 'id': media_id, + 'url': 'limelight:media:' + media_data['streamInfo']['sourceId'], + 'title': media_data['title'], + 'description': media_data.get('descriptions', [{'text': None}])[0].get('text'), + 'duration': int_or_none(media_data.get('durationInMilliseconds'), 1000), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/telewebion.py b/youtube_dl/extractor/telewebion.py index 77916c601..7786b2813 100644 --- a/youtube_dl/extractor/telewebion.py +++ b/youtube_dl/extractor/telewebion.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class TelewebionIE(InfoExtractor): - _VALID_URL = r'https?://www\.telewebion\.com/#!/episode/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?telewebion\.com/#!/episode/(?P<id>\d+)' _TEST = { 'url': 'http://www.telewebion.com/#!/episode/1263668/', diff --git a/youtube_dl/extractor/tfo.py b/youtube_dl/extractor/tfo.py new file mode 100644 index 000000000..6f1eeac57 --- /dev/null +++ b/youtube_dl/extractor/tfo.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + HEADRequest, + ExtractorError, + int_or_none, +) + + +class TFOIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)' + _TEST = { + 'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon', + 'md5': '47c987d0515561114cf03d1226a9d4c7', + 'info_dict': { + 'id': '100463871', + 'ext': 'mp4', + 'title': 'Video Game Hackathon', + 'description': 'md5:558afeba217c6c8d96c60e5421795c07', + 'upload_date': '20160212', + 'timestamp': 1455310233, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + self._request_webpage(HEADRequest('http://www.tfo.org/'), video_id) + infos = self._download_json( + 'http://www.tfo.org/api/web/video/get_infos', video_id, data=json.dumps({ + 'product_id': video_id, + }).encode(), headers={ + 'X-tfo-session': self._get_cookies('http://www.tfo.org/')['tfo-session'].value, + }) + if infos.get('success') == 0: + raise ExtractorError('%s said: %s' % (self.IE_NAME, infos['msg']), expected=True) + 
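The new tfo.org extractor in the hunk running through this point first warms up a session with a HEAD request so the `tfo-session` cookie gets set, then POSTs `{"product_id": ...}` to the `get_infos` endpoint with that cookie value echoed back in an `X-tfo-session` header. A rough standalone sketch of the same handshake, assuming Python 3's standard library in place of youtube-dl's helpers; the endpoint and header name are taken from the hunk as of 2016 and may no longer work:

```python
import json
from urllib.request import build_opener, HTTPCookieProcessor, Request
from http.cookiejar import CookieJar


def fetch_tfo_infos(video_id):
    jar = CookieJar()
    opener = build_opener(HTTPCookieProcessor(jar))

    # Step 1: hit the site once so the 'tfo-session' cookie is set in the jar.
    opener.open(Request('http://www.tfo.org/', method='HEAD'))
    session = next(c.value for c in jar if c.name == 'tfo-session')

    # Step 2: call the JSON API, echoing the session cookie in a header,
    # as the extractor hunk does.
    req = Request(
        'http://www.tfo.org/api/web/video/get_infos',
        data=json.dumps({'product_id': video_id}).encode(),
        headers={'X-tfo-session': session})
    return json.loads(opener.open(req).read().decode('utf-8'))
```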
video_data = infos['data'] + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:' + video_data['llid'], + 'title': video_data['title'], + 'description': video_data.get('description'), + 'series': video_data.get('collection'), + 'season_number': int_or_none(video_data.get('season')), + 'episode_number': int_or_none(video_data.get('episode')), + 'duration': int_or_none(video_data.get('duration')), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/theintercept.py b/youtube_dl/extractor/theintercept.py index 8cb3c3669..ec6f4ecaa 100644 --- a/youtube_dl/extractor/theintercept.py +++ b/youtube_dl/extractor/theintercept.py @@ -11,7 +11,7 @@ from ..utils import ( class TheInterceptIE(InfoExtractor): - _VALID_URL = r'https://theintercept.com/fieldofvision/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://theintercept\.com/fieldofvision/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/', 'md5': '145f28b41d44aab2f87c0a4ac8ec95bd', diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 23067e8c6..6febf805b 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -96,7 +96,7 @@ class ThePlatformBaseIE(OnceIE): class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ - (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? + (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' _TESTS = [{ @@ -116,6 +116,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): # rtmp download 'skip_download': True, }, + 'skip': '404 Not Found', }, { # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/ 'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT', diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py index 3e4e14031..ce1326c03 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -7,7 +7,7 @@ from ..utils import qualities class TheSceneIE(InfoExtractor): - _VALID_URL = r'https://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' + _VALID_URL = r'https?://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' _TEST = { 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', diff --git a/youtube_dl/extractor/tlc.py b/youtube_dl/extractor/tlc.py index abad3ff64..ce4f91f46 100644 --- a/youtube_dl/extractor/tlc.py +++ b/youtube_dl/extractor/tlc.py @@ -1,15 +1,19 @@ # encoding: utf-8 from __future__ import unicode_literals + import re from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE -from ..compat import compat_parse_qs +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) class TlcDeIE(InfoExtractor): IE_NAME = 'tlc.de' - _VALID_URL = r'https?://www\.tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' + _VALID_URL = r'https?://(?:www\.)?tlc\.de/(?:[^/]+/)*videos/(?P<title>[^/?#]+)?(?:.*#(?P<id>\d+))?' 
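Both new extractors above (telequebec.py and tfo.py) resolve only metadata themselves and hand actual playback to the Limelight extractor through a `url_transparent` result pointing at `limelight:media:<source id>`. A minimal sketch of that delegation pattern; `ExampleLimelightDelegateIE`, its `_VALID_URL` and the metadata endpoint are hypothetical, only the `url_transparent`/`ie_key` wiring mirrors the hunks:

```python
from youtube_dl.extractor.common import InfoExtractor


class ExampleLimelightDelegateIE(InfoExtractor):
    """Hypothetical extractor that resolves its own metadata and delegates
    the media to the Limelight extractor, like TeleQuebecIE/TFOIE above."""
    _VALID_URL = r'https?://example\.com/media/(?P<id>\d+)'

    def _real_extract(self, url):
        media_id = self._match_id(url)
        # Hypothetical metadata endpoint returning a Limelight source id.
        media = self._download_json(
            'https://api.example.com/media/' + media_id, media_id)
        return {
            '_type': 'url_transparent',  # LimelightMedia fills in the formats
            'id': media_id,
            'url': 'limelight:media:' + media['source_id'],
            'title': media['title'],
            'ie_key': 'LimelightMedia',
        }
```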
_TEST = { 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001', @@ -35,5 +39,5 @@ class TlcDeIE(InfoExtractor): title = mobj.group('title') webpage = self._download_webpage(url, title) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(brightcove_legacy_url)['@videoPlayer'][0] + brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/trollvids.py b/youtube_dl/extractor/trollvids.py deleted file mode 100644 index 657705623..000000000 --- a/youtube_dl/extractor/trollvids.py +++ /dev/null @@ -1,36 +0,0 @@ -# encoding: utf-8 -from __future__ import unicode_literals - -import re - -from .nuevo import NuevoBaseIE - - -class TrollvidsIE(NuevoBaseIE): - _VALID_URL = r'https?://(?:www\.)?trollvids\.com/video/(?P<id>\d+)/(?P<display_id>[^/?#&]+)' - IE_NAME = 'trollvids' - _TEST = { - 'url': 'http://trollvids.com/video/2349002/%E3%80%90MMD-R-18%E3%80%91%E3%82%AC%E3%83%BC%E3%83%AB%E3%83%95%E3%83%AC%E3%83%B3%E3%83%89-carrymeoff', - 'md5': '1d53866b2c514b23ed69e4352fdc9839', - 'info_dict': { - 'id': '2349002', - 'ext': 'mp4', - 'title': '【MMD R-18】ガールフレンド carry_me_off', - 'age_limit': 18, - 'duration': 216.78, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - info = self._extract_nuevo( - 'http://trollvids.com/nuevo/player/config.php?v=%s' % video_id, - video_id) - info.update({ - 'display_id': display_id, - 'age_limit': 18 - }) - return info diff --git a/youtube_dl/extractor/trutv.py b/youtube_dl/extractor/trutv.py new file mode 100644 index 000000000..e60d8a181 --- /dev/null +++ b/youtube_dl/extractor/trutv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .turner import TurnerBaseIE + + +class TruTVIE(TurnerBaseIE): + _VALID_URL = r'https?://(?:www\.)?trutv\.com(?:(?P<path>/shows/[^/]+/videos/[^/?#]+?)\.html|/full-episodes/[^/]+/(?P<id>\d+))' + _TEST = { + 'url': 'http://www.trutv.com/shows/10-things/videos/you-wont-believe-these-sports-bets.html', + 'md5': '2cdc844f317579fed1a7251b087ff417', + 'info_dict': { + 'id': '/shows/10-things/videos/you-wont-believe-these-sports-bets', + 'ext': 'mp4', + 'title': 'You Won\'t Believe These Sports Bets', + 'description': 'Jamie Lee sits down with a bookie to discuss the bizarre world of illegal sports betting.', + 'upload_date': '20130305', + } + } + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + if path: + data_src = 'http://www.trutv.com/video/cvp/v2/xml/content.xml?id=%s.xml' % path + else: + data_src = 'http://www.trutv.com/tveverywhere/services/cvpXML.do?titleId=' + video_id + return self._extract_cvp_info( + data_src, path, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/trutv/big', + 'tokenizer_src': 'http://www.trutv.com/tveverywhere/processors/services/token_ipadAdobe.do', + }, + }) diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 4053f6c21..1853a1104 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals +import re + from ..utils import ( int_or_none, str_to_int, @@ -21,7 +23,9 @@ class Tube8IE(KeezMoviesIE): 'title': 'Kasia music video', 'age_limit': 18, 
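The tlc.de fix above (the new rmcdecouverte.py uses the same pattern) reads the `@videoPlayer` id from the query string only, i.e. `compat_parse_qs(compat_urlparse.urlparse(url).query)`, because running `parse_qs` over the full URL folds the scheme and path into the first parameter name. A small standalone check of the difference, assuming Python 3's `urllib.parse` in place of the compat shims and a made-up player URL:

```python
from urllib.parse import urlparse, parse_qs

# Made-up Brightcove legacy player URL, for illustration only.
legacy_url = ('http://c.brightcove.com/services/viewer/htmlFederated'
              '?playerID=123&@videoPlayer=3235167922001')

# Removed approach: parse_qs over the full URL. '@videoPlayer' still happens to
# be found here, but the first key comes out mangled as
# 'http://c.brightcove.com/services/viewer/htmlFederated?playerID'.
print(sorted(parse_qs(legacy_url)))

# Fixed approach from the hunk: isolate the query part first.
query = parse_qs(urlparse(legacy_url).query)
print(query['@videoPlayer'][0])  # -> '3235167922001'
```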
'duration': 230, - } + 'categories': ['Teen'], + 'tags': ['dancing'], + }, }, { 'url': 'http://www.tube8.com/shemale/teen/blonde-cd-gets-kidnapped-by-two-blacks-and-punished-for-being-a-slutty-girl/19569151/', 'only_matching': True, @@ -51,6 +55,17 @@ class Tube8IE(KeezMoviesIE): r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False)) + category = self._search_regex( + r'Category:\s*</strong>\s*<a[^>]+href=[^>]+>([^<]+)', + webpage, 'category', fatal=False) + categories = [category] if category else None + + tags_str = self._search_regex( + r'(?s)Tags:\s*</strong>(.+?)</(?!a)', + webpage, 'tags', fatal=False) + tags = [t for t in re.findall( + r'<a[^>]+href=[^>]+>([^<]+)', tags_str)] if tags_str else None + info.update({ 'description': description, 'uploader': uploader, @@ -58,6 +73,8 @@ class Tube8IE(KeezMoviesIE): 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, }) return info diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index b59dafda6..4228c1ccc 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -12,7 +12,7 @@ from ..utils import ( parse_duration, xpath_attr, update_url_query, - compat_urlparse, + ExtractorError, ) @@ -24,6 +24,7 @@ class TurnerBaseIE(InfoExtractor): video_data = self._download_xml(data_src, video_id) video_id = video_data.attrib['id'] title = xpath_text(video_data, 'headline', fatal=True) + content_id = xpath_text(video_data, 'contentId') or video_id # rtmp_src = xpath_text(video_data, 'akamai/src') # if rtmp_src: # splited_rtmp_src = rtmp_src.split(',') @@ -54,7 +55,7 @@ class TurnerBaseIE(InfoExtractor): # auth = self._download_webpage( # protected_path_data['tokenizer_src'], query={ # 'path': protected_path, - # 'videoId': video_id, + # 'videoId': content_id, # 'aifp': aifp, # }) # token = xpath_text(auth, 'token') @@ -72,8 +73,11 @@ class TurnerBaseIE(InfoExtractor): auth = self._download_xml( secure_path_data['tokenizer_src'], video_id, query={ 'path': secure_path, - 'videoId': video_id, + 'videoId': content_id, }) + error_msg = xpath_text(auth, 'error/msg') + if error_msg: + raise ExtractorError(error_msg, expected=True) token = xpath_text(auth, 'token') if not token: continue @@ -93,19 +97,9 @@ class TurnerBaseIE(InfoExtractor): formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) elif ext == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id or 'hls', - fatal=False) - if m3u8_formats: - # Sometimes final URLs inside m3u8 are unsigned, let's fix this - # ourselves - qs = compat_urlparse.urlparse(video_url).query - if qs: - query = compat_urlparse.parse_qs(qs) - for m3u8_format in m3u8_formats: - m3u8_format['url'] = update_url_query(m3u8_format['url'], query) - m3u8_format['extra_param_to_segment_url'] = qs - formats.extend(m3u8_formats) + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + m3u8_id=format_id or 'hls', fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( update_url_query(video_url, {'hdcore': '3.7.0'}), diff --git a/youtube_dl/extractor/tvnoe.py b/youtube_dl/extractor/tvnoe.py new file mode 100644 index 000000000..6d5c74826 --- /dev/null +++ b/youtube_dl/extractor/tvnoe.py @@ -0,0 +1,49 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + clean_html, + get_element_by_class, + 
js_to_json, +) + + +class TVNoeIE(JWPlatformBaseIE): + _VALID_URL = r'https?://(?:www\.)?tvnoe\.cz/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.tvnoe.cz/video/10362', + 'md5': 'aee983f279aab96ec45ab6e2abb3c2ca', + 'info_dict': { + 'id': '10362', + 'ext': 'mp4', + 'series': 'Noční univerzita', + 'title': 'prof. Tomáš Halík, Th.D. - Návrat náboženství a střet civilizací', + 'description': 'md5:f337bae384e1a531a52c55ebc50fff41', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + iframe_url = self._search_regex( + r'<iframe[^>]+src="([^"]+)"', webpage, 'iframe URL') + + ifs_page = self._download_webpage(iframe_url, video_id) + jwplayer_data = self._parse_json( + self._find_jwplayer_data(ifs_page), + video_id, transform_source=js_to_json) + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, base_url=iframe_url) + + info_dict.update({ + 'id': video_id, + 'title': clean_html(get_element_by_class( + 'field-name-field-podnazev', webpage)), + 'description': clean_html(get_element_by_class( + 'field-name-body', webpage)), + 'series': clean_html(get_element_by_class('title', webpage)) + }) + + return info_dict diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 4186e82db..3eda0a399 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -348,6 +348,29 @@ class ViafreeIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [TVPlayIE.ie_key()], + }, { + # with relatedClips + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-1', + 'info_dict': { + 'id': '758770', + 'ext': 'mp4', + 'title': 'Sommaren med YouTube-stjärnorna S01E01', + 'description': 'md5:2bc69dce2c4bb48391e858539bbb0e3f', + 'series': 'Sommaren med YouTube-stjärnorna', + 'season': 'Säsong 1', + 'season_number': 1, + 'duration': 1326, + 'timestamp': 1470905572, + 'upload_date': '20160811', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + # Different og:image URL schema + 'url': 'http://www.viafree.se/program/reality/sommaren-med-youtube-stjarnorna/sasong-1/avsnitt-2', + 'only_matching': True, }, { 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', 'only_matching': True, @@ -365,8 +388,38 @@ class ViafreeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_id = self._search_regex( - r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](?P<id>\d{6,})', - webpage, 'video id') + data = self._parse_json( + self._search_regex( + r'(?s)window\.App\s*=\s*({.+?})\s*;\s*</script', + webpage, 'data', default='{}'), + video_id, transform_source=lambda x: re.sub( + r'(?s)function\s+[a-zA-Z_][\da-zA-Z_]*\s*\([^)]*\)\s*{[^}]*}\s*', + 'null', x), fatal=False) + + video_id = None + + if data: + video_id = try_get( + data, lambda x: x['context']['dispatcher']['stores'][ + 'ContentPageProgramStore']['currentVideo']['id'], + compat_str) + + # Fallback #1 (extract from og:image URL schema) + if not video_id: + thumbnail = self._og_search_thumbnail(webpage, default=None) + if thumbnail: + video_id = self._search_regex( + # Patterns seen: + # http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/inbox/765166/a2e95e5f1d735bab9f309fa345cc3f25.jpg + # http://cdn.playapi.mtgx.tv/imagecache/600x315/cloud/content-images/seasons/15204/758770/4a5ba509ca8bc043e1ebd1a76131cdf2.jpg + 
r'https?://[^/]+/imagecache/(?:[^/]+/)+(\d{6,})/', + thumbnail, 'video id', default=None) + + # Fallback #2. Extract from raw JSON string. + # May extract wrong video id if relatedClips is present. + if not video_id: + video_id = self._search_regex( + r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](\d{6,})', + webpage, 'video id') return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index b73842986..c5a5843b6 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -342,7 +342,7 @@ class TwitterIE(InfoExtractor): class TwitterAmplifyIE(TwitterBaseIE): IE_NAME = 'twitter:amplify' - _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' + _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-]{36})' _TEST = { 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 89b869559..c2f507233 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -307,7 +307,7 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://www\.udemy\.com/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P<id>[^/?#&]+)' _TESTS = [] @classmethod diff --git a/youtube_dl/extractor/ustream.py b/youtube_dl/extractor/ustream.py index 54605d863..a3dc9d33e 100644 --- a/youtube_dl/extractor/ustream.py +++ b/youtube_dl/extractor/ustream.py @@ -14,7 +14,7 @@ from ..utils import ( class UstreamIE(InfoExtractor): - _VALID_URL = r'https?://www\.ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' IE_NAME = 'ustream' _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', @@ -117,7 +117,7 @@ class UstreamIE(InfoExtractor): class UstreamChannelIE(InfoExtractor): - _VALID_URL = r'https?://www\.ustream\.tv/channel/(?P<slug>.+)' + _VALID_URL = r'https?://(?:www\.)?ustream\.tv/channel/(?P<slug>.+)' IE_NAME = 'ustream:channel' _TEST = { 'url': 'http://www.ustream.tv/channel/channeljapan', diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 388b4debe..783efda7d 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -31,7 +31,7 @@ class VevoIE(VevoBaseIE): (currently used by MTVIE and MySpaceIE) ''' _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| + (?:https?://(?:www\.)?vevo\.com/watch/(?!playlist|genre)(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) @@ -374,7 +374,7 @@ class VevoIE(VevoBaseIE): class VevoPlaylistIE(VevoBaseIE): - _VALID_URL = r'https?://www\.vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vevo\.com/watch/(?P<kind>playlist|genre)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.vevo.com/watch/playlist/dadbf4e7-b99f-4184-9670-6f0e547b6a29', diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 2ed5d9643..a19411a05 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -6,7 +6,7 @@ from .internetvideoarchive import InternetVideoArchiveIE class VideoDetectiveIE(InfoExtractor): - _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)' + 
_VALID_URL = r'https?://(?:www\.)?videodetective\.com/[^/]+/[^/]+/(?P<id>\d+)' _TEST = { 'url': 'http://www.videodetective.com/movies/kick-ass-2/194487', diff --git a/youtube_dl/extractor/videomore.py b/youtube_dl/extractor/videomore.py index 04e95c66e..328b5b7fb 100644 --- a/youtube_dl/extractor/videomore.py +++ b/youtube_dl/extractor/videomore.py @@ -6,8 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, - parse_age_limit, - parse_iso8601, + xpath_element, xpath_text, ) @@ -17,38 +16,32 @@ class VideomoreIE(InfoExtractor): _VALID_URL = r'videomore:(?P<sid>\d+)$|https?://videomore\.ru/(?:(?:embed|[^/]+/[^/]+)/|[^/]+\?.*\btrack_id=)(?P<id>\d+)(?:[/?#&]|\.(?:xml|json)|$)' _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', - 'md5': '70875fbf57a1cd004709920381587185', + 'md5': '44455a346edc0d509ac5b5a5b531dc35', 'info_dict': { 'id': '367617', 'ext': 'flv', - 'title': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'description': 'В гостях – лучшие романтические комедии года, «Выживший» Иньярриту и «Стив Джобс» Дэнни Бойла.', + 'title': 'Кино в деталях 5 сезон В гостях Алексей Чумаков и Юлия Ковальчук', 'series': 'Кино в деталях', 'episode': 'В гостях Алексей Чумаков и Юлия Ковальчук', - 'episode_number': None, - 'season': 'Сезон 2015', - 'season_number': 5, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 2910, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', 'ext': 'flv', - 'title': '80 серия', - 'description': '«Медведей» ждет решающий матч. Макеев выясняет отношения со Стрельцовым. Парни узнают подробности прошлого Макеева.', + 'title': 'Молодежка 2 сезон 40 серия', 'series': 'Молодежка', - 'episode': '80 серия', - 'episode_number': 40, - 'season': '2 сезон', - 'season_number': 2, + 'episode': '40 серия', 'thumbnail': 're:^https?://.*\.jpg', 'duration': 2809, - 'age_limit': 16, 'view_count': int, + 'comment_count': int, + 'age_limit': 16, }, 'params': { 'skip_download': True, @@ -58,13 +51,8 @@ class VideomoreIE(InfoExtractor): 'info_dict': { 'id': '341073', 'ext': 'flv', - 'title': 'Команда проиграла из-за Бакина?', - 'description': 'Молодежка 3 сезон скоро', - 'series': 'Молодежка', + 'title': 'Промо Команда проиграла из-за Бакина?', 'episode': 'Команда проиграла из-за Бакина?', - 'episode_number': None, - 'season': 'Промо', - 'season_number': 99, 'thumbnail': 're:^https?://.*\.jpg', 'duration': 29, 'age_limit': 16, @@ -109,43 +97,33 @@ class VideomoreIE(InfoExtractor): 'http://videomore.ru/video/tracks/%s.xml' % video_id, video_id, 'Downloading video XML') - video_url = xpath_text(video, './/video_url', 'video url', fatal=True) + item = xpath_element(video, './/playlist/item', fatal=True) + + title = xpath_text( + item, ('./title', './episode_name'), 'title', fatal=True) + + video_url = xpath_text(item, './video_url', 'video url', fatal=True) formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') self._sort_formats(formats) - data = self._download_json( - 'http://videomore.ru/video/tracks/%s.json' % video_id, - video_id, 'Downloading video JSON') + thumbnail = xpath_text(item, './thumbnail_url') + duration = int_or_none(xpath_text(item, './duration')) + view_count = int_or_none(xpath_text(item, './views')) + comment_count = int_or_none(xpath_text(item, './count_comments')) + age_limit = int_or_none(xpath_text(item, './min_age')) - title = data.get('title') or data['project_title'] - description = 
data.get('description') or data.get('description_raw') - timestamp = parse_iso8601(data.get('published_at')) - duration = int_or_none(data.get('duration')) - view_count = int_or_none(data.get('views')) - age_limit = parse_age_limit(data.get('min_age')) - thumbnails = [{ - 'url': thumbnail, - } for thumbnail in data.get('big_thumbnail_urls', [])] - - series = data.get('project_title') - episode = data.get('title') - episode_number = int_or_none(data.get('episode_of_season') or None) - season = data.get('season_title') - season_number = int_or_none(data.get('season_pos') or None) + series = xpath_text(item, './project_name') + episode = xpath_text(item, './episode_name') return { 'id': video_id, 'title': title, - 'description': description, 'series': series, 'episode': episode, - 'episode_number': episode_number, - 'season': season, - 'season_number': season_number, - 'thumbnails': thumbnails, - 'timestamp': timestamp, + 'thumbnail': thumbnail, 'duration': duration, 'view_count': view_count, + 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7e854f326..50aacc6ac 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -350,6 +350,10 @@ class VimeoIE(VimeoBaseInfoExtractor): } ] + @staticmethod + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + @staticmethod def _extract_vimeo_url(url, webpage): # Look for embedded (iframe) Vimeo player @@ -357,8 +361,7 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) if mobj: player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'http_headers': {'Referer': url}}) - return surl + return VimeoIE._smuggle_referrer(player_url, url) # Look for embedded (swf embed) Vimeo player mobj = re.search( r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) @@ -585,6 +588,20 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/gumfilms', 'uploader_id': 'gumfilms', }, + }, { + # requires Referer to be passed along with og:video:url + 'url': 'https://vimeo.com/ondemand/36938/126682985', + 'info_dict': { + 'id': '126682985', + 'ext': 'mp4', + 'title': 'Rävlock, rätt läte på rätt plats', + 'uploader': 'Lindroth & Norin', + 'uploader_url': 're:https?://(?:www\.)?vimeo\.com/user14430847', + 'uploader_id': 'user14430847', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://vimeo.com/ondemand/nazmaalik', 'only_matching': True, @@ -599,7 +616,12 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage), VimeoIE.ie_key()) + return self.url_result( + # Some videos require Referer to be passed along with og:video:url + # similarly to generic vimeo embeds (e.g. + # https://vimeo.com/ondemand/36938/126682985). 
+ VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url), + VimeoIE.ie_key()) class VimeoChannelIE(VimeoBaseInfoExtractor): diff --git a/youtube_dl/extractor/vimple.py b/youtube_dl/extractor/vimple.py index 92321d66e..7fd9b777b 100644 --- a/youtube_dl/extractor/vimple.py +++ b/youtube_dl/extractor/vimple.py @@ -28,23 +28,24 @@ class SprutoBaseIE(InfoExtractor): class VimpleIE(SprutoBaseIE): IE_DESC = 'Vimple - one-click video hosting' - _VALID_URL = r'https?://(?:player\.vimple\.ru/iframe|vimple\.ru)/(?P<id>[\da-f-]{32,36})' - _TESTS = [ - { - 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', - 'md5': '2e750a330ed211d3fd41821c6ad9a279', - 'info_dict': { - 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', - 'ext': 'mp4', - 'title': 'Sunset', - 'duration': 20, - 'thumbnail': 're:https?://.*?\.jpg', - }, - }, { - 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', - 'only_matching': True, - } - ] + _VALID_URL = r'https?://(?:player\.vimple\.(?:ru|co)/iframe|vimple\.(?:ru|co))/(?P<id>[\da-f-]{32,36})' + _TESTS = [{ + 'url': 'http://vimple.ru/c0f6b1687dcd4000a97ebe70068039cf', + 'md5': '2e750a330ed211d3fd41821c6ad9a279', + 'info_dict': { + 'id': 'c0f6b168-7dcd-4000-a97e-be70068039cf', + 'ext': 'mp4', + 'title': 'Sunset', + 'duration': 20, + 'thumbnail': 're:https?://.*?\.jpg', + }, + }, { + 'url': 'http://player.vimple.ru/iframe/52e1beec-1314-4a83-aeac-c61562eadbf9', + 'only_matching': True, + }, { + 'url': 'http://vimple.co/04506a053f124483b8fb05ed73899f19', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 9f1b8b4b5..20fef1f04 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -86,38 +86,50 @@ class WatIE(InfoExtractor): def extract_url(path_template, url_type): req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type) - red_url = head.geturl() - if req_url == red_url: - raise ExtractorError( - '%s said: Sorry, this video is not available from your country.' 
% self.IE_NAME, - expected=True) - return red_url + head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) + if head: + red_url = head.geturl() + if req_url != red_url: + return red_url + return None + + def remove_bitrate_limit(manifest_url): + return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) formats = [] try: - http_url = extract_url('android5/%s.mp4', 'http') - m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - formats.extend(m3u8_formats) - formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - for m3u8_format in m3u8_formats: - vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') - if not vbr or not abr: - continue - format_id = m3u8_format['format_id'].replace('hls', 'http') - fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) - if self._is_valid_url(fmt_url, video_id, format_id): - f = m3u8_format.copy() - f.update({ - 'url': fmt_url, - 'format_id': format_id, - 'protocol': 'http', - }) - formats.append(f) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) + m3u8_url = manifest_urls.get('hls') + if m3u8_url: + m3u8_url = remove_bitrate_limit(m3u8_url) + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + formats.extend(self._extract_f4m_formats( + m3u8_url.replace('ios', 'web').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + http_url = extract_url('android5/%s.mp4', 'http') + if http_url: + for m3u8_format in m3u8_formats: + vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') + if not vbr or not abr: + continue + format_id = m3u8_format['format_id'].replace('hls', 'http') + fmt_url = re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) + if self._is_valid_url(fmt_url, video_id, format_id): + f = m3u8_format.copy() + f.update({ + 'url': fmt_url, + 'format_id': format_id, + 'protocol': 'http', + }) + formats.append(f) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + formats.extend(self._extract_mpd_formats(remove_bitrate_limit( + mpd_url), video_id, mpd_id='dash', fatal=False)) self._sort_formats(formats) except ExtractorError: abr = 64 diff --git a/youtube_dl/extractor/weiqitv.py b/youtube_dl/extractor/weiqitv.py index 3dafbeec2..8e09156c2 100644 --- a/youtube_dl/extractor/weiqitv.py +++ b/youtube_dl/extractor/weiqitv.py @@ -6,7 +6,7 @@ from .common import InfoExtractor class WeiqiTVIE(InfoExtractor): IE_DESC = 'WQTV' - _VALID_URL = r'https?://www\.weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?weiqitv\.com/index/video_play\?videoId=(?P<id>[A-Za-z0-9]+)' _TESTS = [{ 'url': 'http://www.weiqitv.com/index/video_play?videoId=53c744f09874f0e76a8b46f3', diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d7a81ab8c..91f0a0dbb 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -19,7 +19,10 @@ from ..utils import ( determine_ext, ) -from .brightcove import BrightcoveNewIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .nbc import NBCSportsVPlayerIE @@ -223,6 +226,11 @@ class YahooIE(InfoExtractor): if 
nbc_sports_url: return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + # Look for Brightcove Legacy Studio embeds + bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) + if bc_url: + return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) + # Look for Brightcove New Studio embeds bc_url = BrightcoveNewIE._extract_url(webpage) if bc_url: diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py index 63bbc0634..ef5535547 100644 --- a/youtube_dl/extractor/yam.py +++ b/youtube_dl/extractor/yam.py @@ -15,7 +15,7 @@ from ..utils import ( class YamIE(InfoExtractor): IE_DESC = '蕃薯藤yam天空部落' - _VALID_URL = r'https?://mymedia.yam.com/m/(?P<id>\d+)' + _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)' _TESTS = [{ # An audio hosted on Yam diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 31e2f9263..b50f34e9b 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -1,21 +1,16 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class YouJizzIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])' _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', - 'md5': '07e15fa469ba384c7693fd246905547c', + 'md5': '78fc1901148284c69af12640e01c6310', 'info_dict': { 'id': '2189178', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Zeichentrick 1', 'age_limit': 18, } @@ -27,38 +22,18 @@ class YouJizzIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + # YouJizz's HTML5 player has invalid HTML + webpage = webpage.replace('"controls', '" controls') age_limit = self._rta_search(webpage) video_title = self._html_search_regex( r'<title>\s*(.*)\s*', webpage, 'title') - embed_page_url = self._search_regex( - r'(https?://www.youjizz.com/videos/embed/[0-9]+)', - webpage, 'embed page') - webpage = self._download_webpage( - embed_page_url, video_id, note='downloading embed page') + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - # Get the video URL - m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P.+?)"\);', webpage) - if m_playlist is not None: - playlist_url = m_playlist.group('playlist') - playlist_page = self._download_webpage(playlist_url, video_id, - 'Downloading playlist page') - m_levels = list(re.finditer(r'[^"]+)"\)\);', - webpage, 'video URL') - - return { + info_dict.update({ 'id': video_id, - 'url': video_url, 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url, 'age_limit': age_limit, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d5d5b7334..5ca903825 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -264,7 +264,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID - (?!.*?&list=) # combined list/video URLs are handled by the playlist IE + (?!.*?\blist=) # combined list/video URLs are handled by the playlist IE (?(1).+)? 
# if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' @@ -844,6 +844,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059) 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', 'only_matching': True, + }, + { + # Rental video preview + 'url': 'https://www.youtube.com/watch?v=yYr8q0y5Jfg', + 'info_dict': { + 'id': 'uGpuVWrhIzE', + 'ext': 'mp4', + 'title': 'Piku - Trailer', + 'description': 'md5:c36bd60c3fd6f1954086c083c72092eb', + 'upload_date': '20150811', + 'uploader': 'FlixMatrix', + 'uploader_id': 'FlixMatrixKaravan', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/FlixMatrixKaravan', + 'license': 'Standard YouTube License', + }, + 'params': { + 'skip_download': True, + }, } ] @@ -1254,6 +1272,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/rg3/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): @@ -1754,11 +1778,14 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VALID_URL = r"""(?x)(?: (?:https?://)? (?:\w+\.)? - youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) - \? (?:.*?[&;])*? (?:p|a|list)= - | p/ + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) + \? (?:.*?[&;])*? (?:p|a|list)= + | p/ + )| + youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} @@ -1841,6 +1868,31 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincout': 21, + }, { + # Playlist URL that does not actually serve a playlist + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', + 'only_matching': True, }] def _real_initialize(self): @@ -1901,9 +1953,20 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): playlist_title = self._html_search_regex( r'(?s)

<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>
', - page, 'title') + page, 'title', default=None) - return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title) + has_videos = True + + if not playlist_title: + try: + # Some playlist URLs don't actually serve a playlist (e.g. + # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) + next(self._entries(page, playlist_id)) + except StopIteration: + has_videos = False + + return has_videos, self.playlist_result( + self._entries(page, playlist_id), playlist_id, playlist_title) def _check_download_just_video(self, url, playlist_id): # Check if it's a video-specific URL @@ -1912,9 +1975,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - return self.url_result(video_id, 'Youtube', video_id=video_id) + return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + return video_id, None + return None, None def _real_extract(self, url): # Extract playlist id @@ -1923,7 +1988,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): raise ExtractorError('Invalid URL: %s' % url) playlist_id = mobj.group(1) or mobj.group(2) - video = self._check_download_just_video(url, playlist_id) + video_id, video = self._check_download_just_video(url, playlist_id) if video: return video @@ -1931,7 +1996,15 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - return self._extract_playlist(playlist_id) + has_videos, playlist = self._extract_playlist(playlist_id) + if has_videos or not video_id: + return playlist + + # Some playlist URLs don't actually serve a playlist (see + # https://github.com/rg3/youtube-dl/issues/10537). + # Fallback to plain video extraction if there is a video id + # along with playlist id. 
+ return self.url_result(video_id, 'Youtube', video_id=video_id) class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): @@ -2229,7 +2302,7 @@ class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): IE_DESC = 'YouTube.com (multi-season) shows' - _VALID_URL = r'https?://www\.youtube\.com/show/(?P[^?#]*)' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P[^?#]*)' IE_NAME = 'youtube:show' _TESTS = [{ 'url': 'https://www.youtube.com/show/airdisasters', @@ -2298,7 +2371,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeWatchLaterIE(YoutubePlaylistIE): IE_NAME = 'youtube:watchlater' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=WL', @@ -2309,16 +2382,17 @@ class YoutubeWatchLaterIE(YoutubePlaylistIE): }] def _real_extract(self, url): - video = self._check_download_just_video(url, 'WL') + _, video = self._check_download_just_video(url, 'WL') if video: return video - return self._extract_playlist('WL') + _, playlist = self._extract_playlist('WL') + return playlist class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = 'youtube:favorites' IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url): @@ -2329,21 +2403,21 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = 'Youtube Recommended videos' class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' 
_FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = 'Youtube Subscriptions' class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' - _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' _FEED_NAME = 'history' _PLAYLIST_TITLE = 'Youtube History' diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 5d62deef4..b2e863119 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -94,7 +94,7 @@ def parseOpts(overrideArguments=None): setattr(parser.values, option.dest, value.split(',')) def _hide_login_info(opts): - PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password'] + PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'] eqre = re.compile('^(?P' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') def _scrub_eq(o): @@ -351,6 +351,24 @@ def parseOpts(overrideArguments=None): dest='videopassword', metavar='PASSWORD', help='Video password (vimeo, smotri, youku)') + adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') + adobe_pass.add_option( + '--ap-mso', + dest='ap_mso', metavar='MSO', + help='Adobe Pass Multiple-system operator Identifier') + adobe_pass.add_option( + '--ap-username', + dest='ap_username', metavar='USERNAME', + help='TV Provider Login with this account ID') + adobe_pass.add_option( + '--ap-password', + dest='ap_password', metavar='PASSWORD', + help='TV Provider Account password. If this option is left out, youtube-dl will ask interactively.') + adobe_pass.add_option( + '--ap-list-mso', + action='store_true', dest='ap_list_mso', default=False, + help='List all supported TV Providers') + video_format = optparse.OptionGroup(parser, 'Video Format Options') video_format.add_option( '-f', '--format', @@ -423,7 +441,15 @@ def parseOpts(overrideArguments=None): downloader.add_option( '--fragment-retries', dest='fragment_retries', metavar='RETRIES', default=10, - help='Number of retries for a fragment (default is %default), or "infinite" (DASH only)') + help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)') + downloader.add_option( + '--skip-unavailable-fragments', + action='store_true', dest='skip_unavailable_fragments', default=True, + help='Skip unavailable fragments (DASH and hlsnative only)') + general.add_option( + '--abort-on-unavailable-fragment', + action='store_false', dest='skip_unavailable_fragments', + help='Abort downloading when some fragment is not available') downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1091f17f3..69ca88c85 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -91,6 +91,13 @@ ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'] +MONTH_NAMES = { + 'en': ENGLISH_MONTH_NAMES, + 'fr': [ + 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', + 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], +} + KNOWN_EXTENSIONS = ( 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', 'flv', 'f4v', 'f4a', 'f4b', @@ -1587,11 +1594,13 @@ def parse_count(s): return lookup_unit_table(_UNIT_TABLE, s) -def month_by_name(name): +def month_by_name(name, lang='en'): """ Return the number of a month by (locale-independently) English name """ 
+    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
+
     try:
-        return ENGLISH_MONTH_NAMES.index(name) + 1
+        return month_names.index(name) + 1
     except ValueError:
         return None


@@ -2148,7 +2157,7 @@ def mimetype2ext(mt):
         return ext

     _, _, res = mt.rpartition('/')
-    res = res.lower()
+    res = res.split(';')[0].strip().lower()

     return {
         '3gpp': '3gp',
@@ -2168,6 +2177,7 @@
         'f4m+xml': 'f4m',
         'hds+xml': 'f4m',
         'vnd.ms-sstr+xml': 'ism',
+        'quicktime': 'mov',
     }.get(res, res)


diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index fe442dd88..903aede58 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2016.08.31'
+__version__ = '2016.09.11.1'
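
The utils.py hunk directly above extends month_by_name() with an optional lang argument backed by the new MONTH_NAMES table ('en' and 'fr' so far), falling back to the English names for an unknown language. A minimal sketch of the resulting behaviour; the sample month names are illustrative:

# -*- coding: utf-8 -*-
from youtube_dl.utils import month_by_name

assert month_by_name('December') == 12          # default English lookup
assert month_by_name(u'août', lang='fr') == 8   # new French table
assert month_by_name(u'août') is None           # not an English month name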
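
Similarly, the mimetype2ext() change strips MIME parameters after ';' before the extension lookup and adds a 'quicktime' to 'mov' mapping. A small sketch of what that implies; the MIME strings below are made-up examples:

from youtube_dl.utils import mimetype2ext

assert mimetype2ext('video/mp4; codecs="avc1.42E01E"') == 'mp4'  # parameters now ignored
assert mimetype2ext('video/quicktime') == 'mov'                  # new mapping
assert mimetype2ext('video/3gpp') == '3gp'                       # existing mappings unchanged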
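
Earlier in the diff, YoutubePlaylistIE learns to fall back to plain video extraction when a watch URL carries a list= parameter that does not resolve to an actual playlist (issue 10537). A rough sketch of the resulting behaviour through the public API, using the URL from the new test case; this needs network access and assumes nothing beyond YoutubeDL.extract_info():

import youtube_dl

ydl = youtube_dl.YoutubeDL({'quiet': True})
info = ydl.extract_info(
    'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
    download=False)
# The bogus playlist serves no entries, so the single video is extracted instead.
print(info.get('id'))  # FqZTN594JQw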
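
The Tube8 change earlier in the diff starts extracting categories and tags with two new regexes. A sketch of how they behave when run against made-up markup; the HTML snippet is an assumption for illustration, not copied from the site:

import re

webpage = '''
<strong>Category: </strong> <a href="/cat/Teen">Teen</a>
<strong>Tags: </strong><a href="/tag/dancing">dancing</a>, <a href="/tag/music">music</a></p>
'''

# Same patterns as the new Tube8IE code above.
category = re.search(
    r'Category:\s*</strong>\s*<a[^>]+href=[^>]+>([^<]+)', webpage).group(1)
tags_str = re.search(r'(?s)Tags:\s*</strong>(.+?)</(?!a)', webpage).group(1)
tags = re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tags_str)

assert category == 'Teen'
assert tags == ['dancing', 'music']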
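
The Vimeo changes factor the Referer smuggling into a _smuggle_referrer() helper and reuse it for on-demand pages whose og:video:url only plays with the right Referer. For context, a rough equivalent built on the existing smuggle_url()/unsmuggle_url() utilities; the URLs are illustrative, based on the new test case:

from youtube_dl.utils import smuggle_url, unsmuggle_url


def smuggle_referrer(url, referrer_url):
    # Mirrors the new VimeoIE._smuggle_referrer staticmethod.
    return smuggle_url(url, {'http_headers': {'Referer': referrer_url}})


player_url = smuggle_referrer(
    'https://player.vimeo.com/video/126682985',
    'https://vimeo.com/ondemand/36938/126682985')
real_url, data = unsmuggle_url(player_url)

assert real_url == 'https://player.vimeo.com/video/126682985'
assert data['http_headers']['Referer'] == 'https://vimeo.com/ondemand/36938/126682985'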