From 51090d636b06bd75b6b567bf3790301975c88256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 15 Jun 2013 11:04:59 +0200 Subject: [PATCH 001/135] VimeoIE: allow to download password protected videos --- youtube_dl/InfoExtractors.py | 23 +++++++++++++++++++++++ youtube_dl/__init__.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 33ba0fdd1..e27e0cb7c 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1063,6 +1063,25 @@ class VimeoIE(InfoExtractor): _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' IE_NAME = u'vimeo' + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('password', None) + if password is None: + raise ExtractorError(u'This video is protected by a password, use the --password option') + token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) + data = compat_urllib_parse.urlencode({'password': password, + 'token': token}) + # I didn't manage to use the password with https + if url.startswith('https'): + pass_url = url.replace('https','http') + else: + pass_url = url + password_request = compat_urllib_request.Request(pass_url+'/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Cookie', 'xsrft=%s' % token) + pass_web = self._download_webpage(password_request, video_id, + u'Verifying the password', + u'Wrong password') + def _real_extract(self, url, new_video=True): # Extract ID from URL mobj = re.match(self._VALID_URL, url) @@ -1091,6 +1110,10 @@ class VimeoIE(InfoExtractor): except: if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') + + if re.search('If so please provide the correct password.', webpage): + self._verify_video_password(url, video_id, webpage) + return self._real_extract(url) else: raise ExtractorError(u'Unable to extract info section') diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9279ce776..18be9f156 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -423,7 +423,7 @@ def _real_main(argv=None): if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error(u'using .netrc conflicts with giving username/password') if opts.password is not None and opts.username is None: - parser.error(u'account username missing') + print(u'WARNING: account username missing') if opts.outtmpl is not None and (opts.usetitle or opts.autonumber or opts.useid): parser.error(u'using output template conflicts with using title, video ID or auto number') if opts.usetitle and opts.useid: From 77d0f05f714eec81f076025f309d5d39325d5d0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 21 Jun 2013 19:28:23 +0200 Subject: [PATCH 002/135] YoutubeIE: Detect new Vevo style videos The url_encoded_fmt_stream_map can be found in the video page, but the signature must be decrypted, we get it from the webpage instead of the `get_video_info` pages because we have only discover the algorithm for keys with both sub keys of size 43. --- youtube_dl/InfoExtractors.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 8d228d40d..a12bffbe3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -724,6 +724,16 @@ class YoutubeIE(InfoExtractor): # Decide which formats to download req_format = self._downloader.params.get('format', None) + try: + mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) + info = json.loads(mobj.group(1)) + if 'dashmpd' in info['args']: + # Vevo videos with encrypted signatures + self.to_screen(u'Vevo video detected.') + video_info['url_encoded_fmt_stream_map'] = [info['args']['url_encoded_fmt_stream_map']] + except ValueError: + pass + if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() video_url_list = [(None, video_info['conn'][0])] @@ -735,6 +745,16 @@ class YoutubeIE(InfoExtractor): url = url_data['url'][0] if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] + if 's' in url_data: + def k(s): + """Decrypt the key the two subkeys must have a length of 43""" + (a,b) = s.split('.') + b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] + a = a[-40:] + s_dec = '.'.join((a,b))[::-1] + return s_dec + key = k(url_data['s'][0]) + url += '&signature=' + key if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url From ef75f76f5c0704bc5d91a436c724630b6aa9b7d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 22 Jun 2013 13:13:40 +0200 Subject: [PATCH 003/135] Detect more vevo videos --- youtube_dl/InfoExtractors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index a12bffbe3..2a748b175 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -727,10 +727,11 @@ class YoutubeIE(InfoExtractor): try: mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) info = json.loads(mobj.group(1)) - if 'dashmpd' in info['args']: + args = info['args'] + if args.get('ptk','') == 'vevo' or 'dashmpd': # Vevo videos with encrypted signatures self.to_screen(u'Vevo video detected.') - video_info['url_encoded_fmt_stream_map'] = [info['args']['url_encoded_fmt_stream_map']] + video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] except ValueError: pass From b37fbb990bd29521f5891b0874bb485eb72981fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sat, 22 Jun 2013 13:20:06 +0200 Subject: [PATCH 004/135] Move the decrypting function to a static method --- youtube_dl/InfoExtractors.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 2a748b175..af11333d1 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -379,6 +379,17 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') + @staticmethod + def _decrypt_signature(s): + """Decrypt the key the two subkeys must have a length of 43""" + (a,b) = s.split('.') + if len(a) != 43 or len(b) != 43: + raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid') + b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] + a = a[-40:] + s_dec = '.'.join((a,b))[::-1] + return s_dec + def _get_available_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) @@ -747,15 +758,8 @@ class YoutubeIE(InfoExtractor): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] if 's' in url_data: - def k(s): - """Decrypt the key the two subkeys must have a length of 43""" - (a,b) = s.split('.') - b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] - a = a[-40:] - s_dec = '.'.join((a,b))[::-1] - return s_dec - key = k(url_data['s'][0]) - url += '&signature=' + key + signature = self._decrypt_signature(url_data['s'][0]) + url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url From 94eb2dd1fe00ad3cd751c0f01d593b89b331f6f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Sat, 22 Jun 2013 19:03:28 -0300 Subject: [PATCH 005/135] README: Add brief description for manpages/apropos. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trying to mimic the manpage of (GNU) `ls`, we don't conjugate the verb as "downloads" or something else. Signed-off-by: Rogério Brito --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f3c81a7c..ccab537e7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ % YOUTUBE-DL(1) # NAME -youtube-dl +youtube-dl - download videos from youtube.com or other video platforms # SYNOPSIS **youtube-dl** [OPTIONS] URL [URL...] From c3ab8f866c37c8773b74fa5d0a57235722fe2c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Jun 2013 12:59:20 +0200 Subject: [PATCH 006/135] Change metavar of "--sub-format" from LANG to FORMAT --- youtube_dl/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 9279ce776..32141439f 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -200,7 +200,7 @@ def parseOpts(overrideArguments=None): action='store_true', dest='listsubtitles', help='lists all available subtitles for the video (currently youtube only)', default=False) video_format.add_option('--sub-format', - action='store', dest='subtitlesformat', metavar='LANG', + action='store', dest='subtitlesformat', metavar='FORMAT', help='subtitle format [srt/sbv] (default=srt) (currently youtube only)', default='srt') video_format.add_option('--sub-lang', '--srt-lang', action='store', dest='subtitleslang', metavar='LANG', From 1037d53988387db920748861e6af644547be0f20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 23 Jun 2013 13:26:49 +0200 Subject: [PATCH 007/135] GenericIE: look for Open Graph info Only if there is a direct link to the file, don't try if it points to a Flash player --- youtube_dl/InfoExtractors.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 619ddeba1..cd91c13f5 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1409,6 +1409,13 @@ class GenericIE(InfoExtractor): if mobj is None: # Try to find twitter cards info mobj = re.search(r' Date: Sun, 23 Jun 2013 17:35:19 +0500 Subject: [PATCH 008/135] added StatigrIE --- youtube_dl/InfoExtractors.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index cd91c13f5..6788bf402 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4552,6 +4552,30 @@ class GametrailersIE(InfoExtractor): 'description': video_description, } +class StatigrIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + video_url = re.search(r'',webpage).group(1) + thumbnail_url = re.search(r'',webpage).group(1) + title = (re.search(r'(.+?)',webpage).group(1)).strip("| Statigram") + uploader = re.search(r'@(.+) \(Videos\)',title).group(1) + print uploader + ext = "mp4" + return [{ + 'id': video_id, + 'url': video_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + 'uploader' : uploader + }] + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4618,6 +4642,7 @@ def gen_extractors(): HypemIE(), Vbox7IE(), GametrailersIE(), + StatigrIE(), GenericIE() ] From 01ba4b80a732308e8da66ba89bac9273181db1ad Mon Sep 17 00:00:00 2001 From: "M.Yasoob Khalid" Date: Sun, 23 Jun 2013 18:02:55 +0500 Subject: [PATCH 009/135] added StatigrIE --- youtube_dl/InfoExtractors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 6788bf402..4aec8c687 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4565,7 +4565,6 @@ class StatigrIE(InfoExtractor): thumbnail_url = re.search(r'',webpage).group(1) title = (re.search(r'(.+?)',webpage).group(1)).strip("| Statigram") uploader = re.search(r'@(.+) \(Videos\)',title).group(1) - print uploader ext = "mp4" return [{ 'id': video_id, From 4fdd4e6f6f2b34afb657901c036609a9c6a25c9a Mon Sep 17 00:00:00 2001 From: "M.Yasoob Khalid" Date: Sun, 23 Jun 2013 18:56:26 +0500 Subject: [PATCH 010/135] added test for Statigr --- test/tests.json | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/tests.json b/test/tests.json index 3e0db297d..c2c6421fd 100644 --- a/test/tests.json +++ b/test/tests.json @@ -630,5 +630,15 @@ "title": "E3 2013: Debut Trailer" }, "skip": "Requires rtmpdump" + }, + { + "name": "Statigr", + "url": "http://statigr.am/p/484091715184808010_284179915", + "file": "484091715184808010_284179915.mp4", + "md5": "deda4ff333abe2e118740321e992605b", + "info_dict": { + "uploader": "videoseconds", + "title": "Instagram photo by @videoseconds (Videos)" + } } ] From 6b3f5a329bc1d1f91655d37222253de2d71fad96 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 18:58:53 +0200 Subject: [PATCH 011/135] Improve Statigr.am IE --- test/tests.json | 2 +- youtube_dl/InfoExtractors.py | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/test/tests.json b/test/tests.json index c2c6421fd..b112e6318 100644 --- a/test/tests.json +++ b/test/tests.json @@ -632,7 +632,7 @@ "skip": "Requires rtmpdump" }, { - "name": "Statigr", + "name": "Statigram", "url": "http://statigr.am/p/484091715184808010_284179915", "file": "484091715184808010_284179915.mp4", "md5": "deda4ff333abe2e118740321e992605b", diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4aec8c687..507dfc324 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4552,20 +4552,28 @@ class GametrailersIE(InfoExtractor): 'description': video_description, } -class StatigrIE(InfoExtractor): +class StatigramIE(InfoExtractor): _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = re.search(r'',webpage).group(1) - thumbnail_url = re.search(r'',webpage).group(1) - title = (re.search(r'(.+?)',webpage).group(1)).strip("| Statigram") - uploader = re.search(r'@(.+) \(Videos\)',title).group(1) - ext = "mp4" + video_url = self._html_search_regex( + r'', + webpage, u'video URL') + thumbnail_url = self._html_search_regex( + r'', + webpage, u'thumbnail URL', fatal=False) + html_title = self._html_search_regex( + r'(.+?)', + webpage, u'title') + title = html_title.rpartition(u' | Statigram')[0] + uploader = self._html_search_regex( + r'@(.+) \(Videos\)', title, u'uploader name', fatal=False) + ext = 'mp4' + return [{ 'id': video_id, 'url': video_url, @@ -4641,7 +4649,7 @@ def gen_extractors(): HypemIE(), Vbox7IE(), GametrailersIE(), - StatigrIE(), + StatigramIE(), GenericIE() ] From 828dba298328d3fd719bc9b4f076ebf5eb48c3d8 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 18:59:01 +0200 Subject: [PATCH 012/135] Improvge error reporting --- test/test_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 577bcdbf2..84b3204fe 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -129,7 +129,7 @@ def generator(test_case): if isinstance(value, compat_str) and value.startswith('md5:'): self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field))) else: - self.assertEqual(value, info_dict.get(info_field)) + self.assertEqual(value, info_dict.get(info_field), u'invalid value for field ' + info_field) # If checkable fields are missing from the test case, print the info_dict test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) From a130adb25bb16422a4a03da9252f09839d490494 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 19:41:28 +0200 Subject: [PATCH 013/135] [Statigr.am] Correct uploader id --- youtube_dl/InfoExtractors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 574d417be..f25732bf5 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4593,8 +4593,8 @@ class StatigramIE(InfoExtractor): r'(.+?)', webpage, u'title') title = html_title.rpartition(u' | Statigram')[0] - uploader = self._html_search_regex( - r'@(.+) \(Videos\)', title, u'uploader name', fatal=False) + uploader_id = self._html_search_regex( + r'@([^ ]+)', title, u'uploader name', fatal=False) ext = 'mp4' return [{ @@ -4603,7 +4603,7 @@ class StatigramIE(InfoExtractor): 'ext': ext, 'title': title, 'thumbnail': thumbnail_url, - 'uploader' : uploader + 'uploader_id' : uploader_id }] def gen_extractors(): From 9b5fffb14973bf35ede515a482d701f34343abd9 Mon Sep 17 00:00:00 2001 From: "M.Yasoob Khalid" Date: Sun, 23 Jun 2013 22:42:51 +0500 Subject: [PATCH 014/135] added an IE and test for break.com --- test/tests.json | 9 +++++++++ youtube_dl/InfoExtractors.py | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/test/tests.json b/test/tests.json index c2c6421fd..30ab11fd8 100644 --- a/test/tests.json +++ b/test/tests.json @@ -640,5 +640,14 @@ "uploader": "videoseconds", "title": "Instagram photo by @videoseconds (Videos)" } + }, + { + "name": "Break", + "url": "http://www.break.com/video/when-girls-act-like-guys-2468056", + "file": "2468056.mp4", + "md5": "a3513fb1547fba4fb6cfac1bffc6c46b", + "info_dict": { + "title": "When Girls Act Like D-Bags" + } } ] diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4aec8c687..11176dd6a 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4575,6 +4575,29 @@ class StatigrIE(InfoExtractor): 'uploader' : uploader }] +class BreakIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?break\.com/video/([^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1).split("-")[-1] + webpage = self._download_webpage(url, video_id) + video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1) + key = re.search(r"icon: '(.+?)',",webpage).group(1) + final_url = str(video_url)+"?"+str(key) + thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1) + title = re.search(r"sVidTitle: '(.+)',",webpage).group(1) + ext = video_url.split('.')[-1] + return [{ + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + }] + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4642,6 +4665,7 @@ def gen_extractors(): Vbox7IE(), GametrailersIE(), StatigrIE(), + BreakIE(), GenericIE() ] From 89cb0eb0b6037c9769761eb58026b949bfd1277b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 19:43:18 +0200 Subject: [PATCH 015/135] Use new signature calculation method only if sig is not present --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 39d2ef9d4..062e60ca2 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -757,7 +757,7 @@ class YoutubeIE(InfoExtractor): url = url_data['url'][0] if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] - if 's' in url_data: + elif 's' in url_data: signature = self._decrypt_signature(url_data['s'][0]) url += '&signature=' + signature if 'ratebypass' not in url: From dd9829292e4bdb1a3f48a6278cf359eb693da36a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 19:45:42 +0200 Subject: [PATCH 016/135] Improve vevo message --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 062e60ca2..b4ce96b05 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -741,7 +741,7 @@ class YoutubeIE(InfoExtractor): args = info['args'] if args.get('ptk','') == 'vevo' or 'dashmpd': # Vevo videos with encrypted signatures - self.to_screen(u'Vevo video detected.') + self.to_screen(u'%s: Vevo video detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] except ValueError: pass From d6983cb460c86cc0f0786173ae6e497afaa8e032 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 19:57:38 +0200 Subject: [PATCH 017/135] Fix generic class move (add all files) --- youtube_dl/InfoExtractors.py | 248 +--------------------------- youtube_dl/extractor/__init__.py | 0 youtube_dl/extractor/common.py | 266 +++++++++++++++++++++++++++++++ 3 files changed, 267 insertions(+), 247 deletions(-) create mode 100644 youtube_dl/extractor/__init__.py create mode 100644 youtube_dl/extractor/common.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index b4ce96b05..b3335a89c 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -23,253 +23,7 @@ import urllib from .utils import * -class InfoExtractor(object): - """Information Extractor class. - - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this - information possibly downloading the video to the file system, among - other possible outcomes. - - The dictionaries must include the following fields: - - id: Video identifier. - url: Final video URL. - title: Video title, unescaped. - ext: Video filename extension. - - The following fields are optional: - - format: The video format, defaults to ext (used for --get-format) - thumbnail: Full URL to a video thumbnail image. - description: One-line video description. - uploader: Full name of the video uploader. - upload_date: Video upload date (YYYYMMDD). - uploader_id: Nickname or id of the video uploader. - location: Physical location of the video. - player_url: SWF Player URL (used for rtmpdump). - subtitles: The subtitle file contents. - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen - - The fields should all be Unicode strings. - - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. - - _real_extract() must return a *list* of information dictionaries as - described above. - - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ - - _ready = False - _downloader = None - _WORKING = True - - def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" - self._ready = False - self.set_downloader(downloader) - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None - - @classmethod - def working(cls): - """Getter method for _WORKING.""" - return cls._WORKING - - def initialize(self): - """Initializes an instance (authentication, etc).""" - if not self._ready: - self._real_initialize() - self._ready = True - - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - self.initialize() - return self._real_extract(url) - - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader - - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass - - def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass - - @property - def IE_NAME(self): - return type(self).__name__[:-2] - - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): - """ Returns the response handle """ - if note is None: - self.report_download_webpage(video_id) - elif note is not False: - self.to_screen(u'%s: %s' % (video_id, note)) - try: - return compat_urllib_request.urlopen(url_or_request) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - if errnote is None: - errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): - """ Returns a tuple (page content as string, URL handle) """ - urlh = self._request_webpage(url_or_request, video_id, note, errnote) - content_type = urlh.headers.get('Content-Type', '') - m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) - if m: - encoding = m.group(1) - else: - encoding = 'utf-8' - webpage_bytes = urlh.read() - if self._downloader.params.get('dump_intermediate_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - self.to_screen(u'Dumping request to ' + url) - dump = base64.b64encode(webpage_bytes).decode('ascii') - self._downloader.to_screen(dump) - content = webpage_bytes.decode(encoding, 'replace') - return (content, urlh) - - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): - """ Returns the data of the page as a string """ - return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] - - def to_screen(self, msg): - """Print msg to screen, prefixing it with '[ie_name]'""" - self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) - - def report_extraction(self, id_or_name): - """Report information extraction.""" - self.to_screen(u'%s: Extracting information' % id_or_name) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self.to_screen(u'%s: Downloading webpage' % video_id) - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self.to_screen(u'Confirming age') - - #Methods for following #608 - #They set the correct value of the '_type' key - def video_result(self, video_info): - """Returns a video""" - video_info['_type'] = 'video' - return video_info - def url_result(self, url, ie=None): - """Returns a url that points to a page that should be processed""" - #TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - return video_info - def playlist_result(self, entries, playlist_id=None, playlist_title=None): - """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - if playlist_id: - video_info['id'] = playlist_id - if playlist_title: - video_info['title'] = playlist_title - return video_info - - def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): - """ - Perform a regex search on the given string, using a single or a list of - patterns returning the first matching group. - In case of failure return a default value or raise a WARNING or a - ExtractorError, depending on fatal, specifying the field name. - """ - if isinstance(pattern, (str, compat_str, compiled_regex_type)): - mobj = re.search(pattern, string, flags) - else: - for p in pattern: - mobj = re.search(p, string, flags) - if mobj: break - - if sys.stderr.isatty() and os.name != 'nt': - _name = u'\033[0;34m%s\033[0m' % name - else: - _name = name - - if mobj: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) - elif default is not None: - return default - elif fatal: - raise ExtractorError(u'Unable to extract %s' % _name) - else: - self._downloader.report_warning(u'unable to extract %s; ' - u'please report this issue on GitHub.' % _name) - return None - - def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): - """ - Like _search_regex, but strips HTML tags and unescapes entities. - """ - res = self._search_regex(pattern, string, name, default, fatal, flags) - if res: - return clean_html(res).strip() - else: - return res - -class SearchInfoExtractor(InfoExtractor): - """ - Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. - """ - - @classmethod - def _make_valid_url(cls): - return r'%s(?P|[1-9][0-9]*|all):(?P[\s\S]+)' % cls._SEARCH_KEY - - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError(u'Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - return self._get_n_results(query, self._MAX_RESULTS) - else: - n = int(prefix) - if n <= 0: - raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) - elif n > self._MAX_RESULTS: - self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) - n = self._MAX_RESULTS - return self._get_n_results(query, n) - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - raise NotImplementedError("This method must be implemented by sublclasses") +from .extractor.common import InfoExtractor, SearchInfoExtractor class YoutubeIE(InfoExtractor): diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py new file mode 100644 index 000000000..54df689f2 --- /dev/null +++ b/youtube_dl/extractor/common.py @@ -0,0 +1,266 @@ +from __future__ import absolute_import + +import base64 +import os +import re +import socket +import sys + +from ..utils import ( + compat_http_client, + compat_urllib_error, + compat_urllib_request, + compat_str, + + clean_html, + compiled_regex_type, + ExtractorError, +) + +class InfoExtractor(object): + """Information Extractor class. + + Information extractors are the classes that, given a URL, extract + information about the video (or videos) the URL refers to. This + information includes the real video URL, the video title, author and + others. The information is stored in a dictionary which is then + passed to the FileDownloader. The FileDownloader processes this + information possibly downloading the video to the file system, among + other possible outcomes. + + The dictionaries must include the following fields: + + id: Video identifier. + url: Final video URL. + title: Video title, unescaped. + ext: Video filename extension. + + The following fields are optional: + + format: The video format, defaults to ext (used for --get-format) + thumbnail: Full URL to a video thumbnail image. + description: One-line video description. + uploader: Full name of the video uploader. + upload_date: Video upload date (YYYYMMDD). + uploader_id: Nickname or id of the video uploader. + location: Physical location of the video. + player_url: SWF Player URL (used for rtmpdump). + subtitles: The subtitle file contents. + urlhandle: [internal] The urlHandle to be used to download the file, + like returned by urllib.request.urlopen + + The fields should all be Unicode strings. + + Subclasses of this one should re-define the _real_initialize() and + _real_extract() methods and define a _VALID_URL regexp. + Probably, they should also be added to the list of extractors. + + _real_extract() must return a *list* of information dictionaries as + described above. + + Finally, the _WORKING attribute should be set to False for broken IEs + in order to warn the users and skip the tests. + """ + + _ready = False + _downloader = None + _WORKING = True + + def __init__(self, downloader=None): + """Constructor. Receives an optional downloader.""" + self._ready = False + self.set_downloader(downloader) + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url) is not None + + @classmethod + def working(cls): + """Getter method for _WORKING.""" + return cls._WORKING + + def initialize(self): + """Initializes an instance (authentication, etc).""" + if not self._ready: + self._real_initialize() + self._ready = True + + def extract(self, url): + """Extracts URL information and returns it in list of dicts.""" + self.initialize() + return self._real_extract(url) + + def set_downloader(self, downloader): + """Sets the downloader for this IE.""" + self._downloader = downloader + + def _real_initialize(self): + """Real initialization process. Redefine in subclasses.""" + pass + + def _real_extract(self, url): + """Real extraction process. Redefine in subclasses.""" + pass + + @property + def IE_NAME(self): + return type(self).__name__[:-2] + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the response handle """ + if note is None: + self.report_download_webpage(video_id) + elif note is not False: + self.to_screen(u'%s: %s' % (video_id, note)) + try: + return compat_urllib_request.urlopen(url_or_request) + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is None: + errnote = u'Unable to download webpage' + raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) + + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): + """ Returns a tuple (page content as string, URL handle) """ + urlh = self._request_webpage(url_or_request, video_id, note, errnote) + content_type = urlh.headers.get('Content-Type', '') + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + encoding = 'utf-8' + webpage_bytes = urlh.read() + if self._downloader.params.get('dump_intermediate_pages', False): + try: + url = url_or_request.get_full_url() + except AttributeError: + url = url_or_request + self.to_screen(u'Dumping request to ' + url) + dump = base64.b64encode(webpage_bytes).decode('ascii') + self._downloader.to_screen(dump) + content = webpage_bytes.decode(encoding, 'replace') + return (content, urlh) + + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + """ Returns the data of the page as a string """ + return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + + def to_screen(self, msg): + """Print msg to screen, prefixing it with '[ie_name]'""" + self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) + + def report_extraction(self, id_or_name): + """Report information extraction.""" + self.to_screen(u'%s: Extracting information' % id_or_name) + + def report_download_webpage(self, video_id): + """Report webpage download.""" + self.to_screen(u'%s: Downloading webpage' % video_id) + + def report_age_confirmation(self): + """Report attempt to confirm age.""" + self.to_screen(u'Confirming age') + + #Methods for following #608 + #They set the correct value of the '_type' key + def video_result(self, video_info): + """Returns a video""" + video_info['_type'] = 'video' + return video_info + def url_result(self, url, ie=None): + """Returns a url that points to a page that should be processed""" + #TODO: ie should be the class used for getting the info + video_info = {'_type': 'url', + 'url': url, + 'ie_key': ie} + return video_info + def playlist_result(self, entries, playlist_id=None, playlist_title=None): + """Returns a playlist""" + video_info = {'_type': 'playlist', + 'entries': entries} + if playlist_id: + video_info['id'] = playlist_id + if playlist_title: + video_info['title'] = playlist_title + return video_info + + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: + raise ExtractorError(u'Unable to extract %s' % _name) + else: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + return None + + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + +class SearchInfoExtractor(InfoExtractor): + """ + Base class for paged search queries extractors. + They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + Instances should define _SEARCH_KEY and _MAX_RESULTS. + """ + + @classmethod + def _make_valid_url(cls): + return r'%s(?P|[1-9][0-9]*|all):(?P[\s\S]+)' % cls._SEARCH_KEY + + @classmethod + def suitable(cls, url): + return re.match(cls._make_valid_url(), url) is not None + + def _real_extract(self, query): + mobj = re.match(self._make_valid_url(), query) + if mobj is None: + raise ExtractorError(u'Invalid search query "%s"' % query) + + prefix = mobj.group('prefix') + query = mobj.group('query') + if prefix == '': + return self._get_n_results(query, 1) + elif prefix == 'all': + return self._get_n_results(query, self._MAX_RESULTS) + else: + n = int(prefix) + if n <= 0: + raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) + elif n > self._MAX_RESULTS: + self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) + n = self._MAX_RESULTS + return self._get_n_results(query, n) + + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" + raise NotImplementedError("This method must be implemented by sublclasses") From c5e8d7af0ed867d70502491e3a80ee09b78ed2ce Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 19:58:33 +0200 Subject: [PATCH 018/135] Move youtube extractors to youtube_dl.extractor.youtube --- youtube_dl/InfoExtractors.py | 735 +------------------------------ youtube_dl/extractor/youtube.py | 757 ++++++++++++++++++++++++++++++++ 2 files changed, 758 insertions(+), 734 deletions(-) create mode 100644 youtube_dl/extractor/youtube.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index b3335a89c..d950880ad 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -24,557 +24,9 @@ from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor +from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE -class YoutubeIE(InfoExtractor): - """Information extractor for youtube.com.""" - - _VALID_URL = r"""^ - ( - (?:https?://)? # http(s):// (optional) - (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) - v= - ) - )? # optional -> youtube.com/xxxx is OK - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]+) # here is it! the YouTube video ID - (?(1).+)? # if we found the ID, everything can follow - $""" - _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' - # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] - _video_extensions = { - '13': '3gp', - '17': 'mp4', - '18': 'mp4', - '22': 'mp4', - '37': 'mp4', - '38': 'video', # You actually don't know if this will be MOV, AVI or whatever - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - } - _video_dimensions = { - '5': '240x400', - '6': '???', - '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', - } - IE_NAME = u'youtube' - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url): return False - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - - def report_lang(self): - """Report attempt to set language.""" - self.to_screen(u'Setting language') - - def report_login(self): - """Report attempt to log in.""" - self.to_screen(u'Logging in') - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self.to_screen(u'%s: Downloading video webpage' % video_id) - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video info webpage' % video_id) - - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Checking available subtitles' % video_id) - - def report_video_subtitles_request(self, video_id, sub_lang, format): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen(u'%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen(u'%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen(u'RTMP download detected') - - @staticmethod - def _decrypt_signature(s): - """Decrypt the key the two subkeys must have a length of 43""" - (a,b) = s.split('.') - if len(a) != 43 or len(b) != 43: - raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid') - b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] - a = a[-40:] - s_dec = '.'.join((a,b))[::-1] - return s_dec - - def _get_available_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None) - sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) - if not sub_lang_list: - return (u'video doesn\'t have subtitles', None) - return sub_lang_list - - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ - Return tuple: - (error_message, sub_lang, sub) - """ - self.report_video_subtitles_request(video_id, sub_lang, format) - params = compat_urllib_parse.urlencode({ - 'lang': sub_lang, - 'name': sub_name, - 'v': video_id, - 'fmt': format, - }) - url = 'http://www.youtube.com/api/timedtext?' + params - try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None, None) - if not sub: - return (u'Did not fetch video subtitles', None, None) - return (None, sub_lang, sub) - - def _request_automatic_caption(self, video_id, webpage): - """We need the webpage for getting the captions url, pass it as an - argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') or 'en' - sub_format = self._downloader.params.get('subtitlesformat') - self.to_screen(u'%s: Looking for automatic captions' % video_id) - mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang - if mobj is None: - return [(err_msg, None, None)] - player_config = json.loads(mobj.group(1)) - try: - args = player_config[u'args'] - caption_url = args[u'ttsurl'] - timestamp = args[u'timestamp'] - params = compat_urllib_parse.urlencode({ - 'lang': 'en', - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', - }) - subtitles_url = caption_url + '&' + params - sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return [(None, sub_lang, sub)] - except KeyError: - return [(err_msg, None, None)] - - def _extract_subtitle(self, video_id): - """ - Return a list with a tuple: - [(error_message, sub_lang, sub)] - """ - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' - else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] - - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - return [subtitle] - - def _extract_all_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - subtitles = [] - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - subtitles.append(subtitle) - return subtitles - - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - - def _real_initialize(self): - if self._downloader is None: - return - - username = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - username = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) - return - - # No authentication to be performed - if username is None: - return - - request = compat_urllib_request.Request(self._LOGIN_URL) - try: - login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) - return - - galx = None - dsh = None - match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - - def _extract_id(self, url): - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - def _real_extract(self, url): - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') - video_id = self._extract_id(url) - - # Get video webpage - self.report_video_webpage_download(video_id) - url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - request = compat_urllib_request.Request(url) - try: - video_webpage_bytes = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) - - video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - # Get video info - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break - if 'token' not in video_info: - if 'reason' in video_info: - raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0]) - else: - raise ExtractorError(u'"token" parameter not in video info for unknown reason') - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError(u'"rental" videos not supported') - - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - raise ExtractorError(u'Unable to extract uploader name') - video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) - - # uploader_id - video_uploader_id = None - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_uploader_id = mobj.group(1) - else: - self._downloader.report_warning(u'unable to extract uploader nickname') - - # title - if 'title' not in video_info: - raise ExtractorError(u'Unable to extract video title') - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) - - # thumbnail image - if 'thumbnail_url' not in video_info: - self._downloader.report_warning(u'unable to extract video thumbnail') - video_thumbnail = '' - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = None - mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) - if mobj is not None: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - upload_date = unified_strdate(upload_date) - - # description - video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - video_description = clean_html(video_description) - else: - fd_mobj = re.search(r'= 1: - url_map = {} - for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' in url_data and 'url' in url_data: - url = url_data['url'][0] - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - signature = self._decrypt_signature(url_data['s'][0]) - url += '&signature=' + signature - if 'ratebypass' not in url: - url += '&ratebypass=yes' - url_map[url_data['itag'][0]] = url - - format_limit = self._downloader.params.get('format_limit', None) - available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats - if format_limit is not None and format_limit in available_formats: - format_list = available_formats[available_formats.index(format_limit):] - else: - format_list = available_formats - existing_formats = [x for x in format_list if x in url_map] - if len(existing_formats) == 0: - raise ExtractorError(u'no known formats available for video') - if self._downloader.params.get('listformats', None): - self._print_formats(existing_formats) - return - if req_format is None or req_format == 'best': - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality - elif req_format in ('-1', 'all'): - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. - req_formats = req_format.split('/') - video_url_list = None - for rf in req_formats: - if rf in url_map: - video_url_list = [(rf, url_map[rf])] - break - if video_url_list is None: - raise ExtractorError(u'requested format not available') - else: - raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') - - results = [] - for format_param, video_real_url in video_url_list: - # Extension - video_extension = self._video_extensions.get(format_param, 'flv') - - video_format = '{0} - {1}'.format(format_param if format_param else video_extension, - self._video_dimensions.get(format_param, '???')) - - results.append({ - 'id': video_id, - 'url': video_real_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles, - 'duration': video_duration - }) - return results - class MetacafeIE(InfoExtractor): """Information Extractor for metacafe.com.""" @@ -1365,191 +817,6 @@ class YahooSearchIE(SearchInfoExtractor): return res -class YoutubePlaylistIE(InfoExtractor): - """Information Extractor for YouTube playlists.""" - - _VALID_URL = r"""(?: - (?:https?://)? - (?:\w+\.)? - youtube\.com/ - (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch) - \? (?:.*?&)*? (?:p|a|list)= - | p/ - ) - ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) - .* - | - ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) - )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' - _MAX_RESULTS = 50 - IE_NAME = u'youtube:playlist' - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - - def _real_extract(self, url): - # Extract playlist id - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - # Download playlist videos from API - playlist_id = mobj.group(1) or mobj.group(2) - page_num = 1 - videos = [] - - while True: - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) - page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) - - try: - response = json.loads(page) - except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) - - if 'feed' not in response: - raise ExtractorError(u'Got a malformed response from YouTube API') - playlist_title = response['feed']['title']['$t'] - if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS - break - - for entry in response['feed']['entry']: - index = entry['yt$position']['$t'] - if 'media$group' in entry and 'media$player' in entry['media$group']: - videos.append((index, entry['media$group']['media$player']['url'])) - - if len(response['feed']['entry']) < self._MAX_RESULTS: - break - page_num += 1 - - videos = [v[1] for v in sorted(videos)] - - url_results = [self.url_result(url, 'Youtube') for url in videos] - return [self.playlist_result(url_results, playlist_id, playlist_title)] - - -class YoutubeChannelIE(InfoExtractor): - """Information Extractor for YouTube channels.""" - - _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" - _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' - _MORE_PAGES_INDICATOR = 'yt-uix-load-more' - _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' - IE_NAME = u'youtube:channel' - - def extract_videos_from_page(self, page): - ids_in_page = [] - for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - return ids_in_page - - def _real_extract(self, url): - # Extract channel id - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - # Download channel page - channel_id = mobj.group(1) - video_ids = [] - pagenum = 1 - - url = self._TEMPLATE_URL % (channel_id, pagenum) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - # Extract video identifiers - ids_in_page = self.extract_videos_from_page(page) - video_ids.extend(ids_in_page) - - # Download any subsequent channel pages using the json-based channel_ajax query - if self._MORE_PAGES_INDICATOR in page: - while True: - pagenum = pagenum + 1 - - url = self._MORE_PAGES_URL % (pagenum, channel_id) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - page = json.loads(page) - - ids_in_page = self.extract_videos_from_page(page['content_html']) - video_ids.extend(ids_in_page) - - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: - break - - self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - - urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] - url_entries = [self.url_result(url, 'Youtube') for url in urls] - return [self.playlist_result(url_entries, channel_id)] - - -class YoutubeUserIE(InfoExtractor): - """Information Extractor for YouTube users.""" - - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' - _GDATA_PAGE_SIZE = 50 - _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' - _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' - IE_NAME = u'youtube:user' - - def _real_extract(self, url): - # Extract username - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - username = mobj.group(1) - - # Download video ids using YouTube Data API. Result size per - # query is limited (currently to 50 videos) so we need to query - # page by page until there are no video ids - it means we got - # all of them. - - video_ids = [] - pagenum = 0 - - while True: - start_index = pagenum * self._GDATA_PAGE_SIZE + 1 - - gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) - page = self._download_webpage(gdata_url, username, - u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) - - # Extract video identifiers - ids_in_page = [] - - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - - video_ids.extend(ids_in_page) - - # A little optimization - if current page is not - # "full", ie. does not contain PAGE_SIZE video ids then - # we can assume that this page is the last one - there - # are no more ids on further pages - no need to query - # again. - - if len(ids_in_page) < self._GDATA_PAGE_SIZE: - break - - pagenum += 1 - - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] - url_results = [self.url_result(url, 'Youtube') for url in urls] - return [self.playlist_result(url_results, playlist_title = username)] - - class BlipTVUserIE(InfoExtractor): """Information Extractor for blip.tv users.""" diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py new file mode 100644 index 000000000..d09128555 --- /dev/null +++ b/youtube_dl/extractor/youtube.py @@ -0,0 +1,757 @@ +# coding: utf-8 +from __future__ import absolute_import + +import json +import netrc +import re +import socket + +from .common import InfoExtractor +from ..utils import ( + compat_http_client, + compat_parse_qs, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_request, + compat_str, + + clean_html, + get_element_by_id, + ExtractorError, + unescapeHTML, + unified_strdate, +) + + +class YoutubeIE(InfoExtractor): + """Information extractor for youtube.com.""" + + _VALID_URL = r"""^ + ( + (?:https?://)? # http(s):// (optional) + (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| + tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + (?:.*?\#/)? # handle anchor (#/) redirect urls + (?: # the various things that can precede the ID: + (?:(?:v|embed|e)/) # v/ or embed/ or e/ + |(?: # or the v= param in all its forms + (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:\?|\#!?) # the params delimiter ? or # or #! + (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) + v= + ) + )? # optional -> youtube.com/xxxx is OK + )? # all until now is optional -> you can pass the naked ID + ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + (?(1).+)? # if we found the ID, everything can follow + $""" + _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' + _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' + _NETRC_MACHINE = 'youtube' + # Listed in order of quality + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] + _video_extensions = { + '13': '3gp', + '17': 'mp4', + '18': 'mp4', + '22': 'mp4', + '37': 'mp4', + '38': 'video', # You actually don't know if this will be MOV, AVI or whatever + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + } + _video_dimensions = { + '5': '240x400', + '6': '???', + '13': '???', + '17': '144x176', + '18': '360x640', + '22': '720x1280', + '34': '360x640', + '35': '480x854', + '37': '1080x1920', + '38': '3072x4096', + '43': '360x640', + '44': '480x854', + '45': '720x1280', + '46': '1080x1920', + } + IE_NAME = u'youtube' + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + if YoutubePlaylistIE.suitable(url): return False + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + + def report_lang(self): + """Report attempt to set language.""" + self.to_screen(u'Setting language') + + def report_login(self): + """Report attempt to log in.""" + self.to_screen(u'Logging in') + + def report_video_webpage_download(self, video_id): + """Report attempt to download video webpage.""" + self.to_screen(u'%s: Downloading video webpage' % video_id) + + def report_video_info_webpage_download(self, video_id): + """Report attempt to download video info webpage.""" + self.to_screen(u'%s: Downloading video info webpage' % video_id) + + def report_video_subtitles_download(self, video_id): + """Report attempt to download video info webpage.""" + self.to_screen(u'%s: Checking available subtitles' % video_id) + + def report_video_subtitles_request(self, video_id, sub_lang, format): + """Report attempt to download video info webpage.""" + self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) + + def report_information_extraction(self, video_id): + """Report attempt to extract video information.""" + self.to_screen(u'%s: Extracting video information' % video_id) + + def report_unavailable_format(self, video_id, format): + """Report extracted video URL.""" + self.to_screen(u'%s: Format %s not available' % (video_id, format)) + + def report_rtmp_download(self): + """Indicate the download will use the RTMP protocol.""" + self.to_screen(u'RTMP download detected') + + @staticmethod + def _decrypt_signature(s): + """Decrypt the key the two subkeys must have a length of 43""" + (a,b) = s.split('.') + if len(a) != 43 or len(b) != 43: + raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid') + b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40] + a = a[-40:] + s_dec = '.'.join((a,b))[::-1] + return s_dec + + def _get_available_subtitles(self, video_id): + self.report_video_subtitles_download(video_id) + request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + return (u'unable to download video subtitles: %s' % compat_str(err), None) + sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + if not sub_lang_list: + return (u'video doesn\'t have subtitles', None) + return sub_lang_list + + def _list_available_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + """ + Return tuple: + (error_message, sub_lang, sub) + """ + self.report_video_subtitles_request(video_id, sub_lang, format) + params = compat_urllib_parse.urlencode({ + 'lang': sub_lang, + 'name': sub_name, + 'v': video_id, + 'fmt': format, + }) + url = 'http://www.youtube.com/api/timedtext?' + params + try: + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + return (u'unable to download video subtitles: %s' % compat_str(err), None, None) + if not sub: + return (u'Did not fetch video subtitles', None, None) + return (None, sub_lang, sub) + + def _request_automatic_caption(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + sub_lang = self._downloader.params.get('subtitleslang') or 'en' + sub_format = self._downloader.params.get('subtitlesformat') + self.to_screen(u'%s: Looking for automatic captions' % video_id) + mobj = re.search(r';ytplayer.config = ({.*?});', webpage) + err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + if mobj is None: + return [(err_msg, None, None)] + player_config = json.loads(mobj.group(1)) + try: + args = player_config[u'args'] + caption_url = args[u'ttsurl'] + timestamp = args[u'timestamp'] + params = compat_urllib_parse.urlencode({ + 'lang': 'en', + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + subtitles_url = caption_url + '&' + params + sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') + return [(None, sub_lang, sub)] + except KeyError: + return [(err_msg, None, None)] + + def _extract_subtitle(self, video_id): + """ + Return a list with a tuple: + [(error_message, sub_lang, sub)] + """ + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] + + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + return [subtitle] + + def _extract_all_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles + return [(sub_lang_list[0], None, None)] + subtitles = [] + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + subtitles.append(subtitle) + return subtitles + + def _print_formats(self, formats): + print('Available formats:') + for x in formats: + print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) + + def _real_initialize(self): + if self._downloader is None: + return + + username = None + password = None + downloader_params = self._downloader.params + + # Attempt to use provided username and password or .netrc data + if downloader_params.get('username', None) is not None: + username = downloader_params['username'] + password = downloader_params['password'] + elif downloader_params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(self._NETRC_MACHINE) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) + return + + # Set language + request = compat_urllib_request.Request(self._LANG_URL) + try: + self.report_lang() + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) + return + + # No authentication to be performed + if username is None: + return + + request = compat_urllib_request.Request(self._LOGIN_URL) + try: + login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) + return + + galx = None + dsh = None + match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + return + + # Confirm age + age_form = { + 'next_url': '/', + 'action_confirm': 'Confirm', + } + request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + try: + self.report_age_confirmation() + age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + + def _extract_id(self, url): + mobj = re.match(self._VALID_URL, url, re.VERBOSE) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(2) + return video_id + + def _real_extract(self, url): + # Extract original video URL from URL with redirection, like age verification, using next_url parameter + mobj = re.search(self._NEXT_URL_RE, url) + if mobj: + url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') + video_id = self._extract_id(url) + + # Get video webpage + self.report_video_webpage_download(video_id) + url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + request = compat_urllib_request.Request(url) + try: + video_webpage_bytes = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) + + video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') + + # Attempt to extract SWF player URL + mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + if mobj is not None: + player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) + else: + player_url = None + + # Get video info + self.report_video_info_webpage_download(video_id) + for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (video_id, el_type)) + video_info_webpage = self._download_webpage(video_info_url, video_id, + note=False, + errnote='unable to download video info webpage') + video_info = compat_parse_qs(video_info_webpage) + if 'token' in video_info: + break + if 'token' not in video_info: + if 'reason' in video_info: + raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0]) + else: + raise ExtractorError(u'"token" parameter not in video info for unknown reason') + + # Check for "rental" videos + if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: + raise ExtractorError(u'"rental" videos not supported') + + # Start extracting information + self.report_information_extraction(video_id) + + # uploader + if 'author' not in video_info: + raise ExtractorError(u'Unable to extract uploader name') + video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) + + # uploader_id + video_uploader_id = None + mobj = re.search(r'', video_webpage) + if mobj is not None: + video_uploader_id = mobj.group(1) + else: + self._downloader.report_warning(u'unable to extract uploader nickname') + + # title + if 'title' not in video_info: + raise ExtractorError(u'Unable to extract video title') + video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + + # thumbnail image + if 'thumbnail_url' not in video_info: + self._downloader.report_warning(u'unable to extract video thumbnail') + video_thumbnail = '' + else: # don't panic if we can't find it + video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) + + # upload date + upload_date = None + mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) + if mobj is not None: + upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) + upload_date = unified_strdate(upload_date) + + # description + video_description = get_element_by_id("eow-description", video_webpage) + if video_description: + video_description = clean_html(video_description) + else: + fd_mobj = re.search(r'= 1: + url_map = {} + for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): + url_data = compat_parse_qs(url_data_str) + if 'itag' in url_data and 'url' in url_data: + url = url_data['url'][0] + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + signature = self._decrypt_signature(url_data['s'][0]) + url += '&signature=' + signature + if 'ratebypass' not in url: + url += '&ratebypass=yes' + url_map[url_data['itag'][0]] = url + + format_limit = self._downloader.params.get('format_limit', None) + available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats + if format_limit is not None and format_limit in available_formats: + format_list = available_formats[available_formats.index(format_limit):] + else: + format_list = available_formats + existing_formats = [x for x in format_list if x in url_map] + if len(existing_formats) == 0: + raise ExtractorError(u'no known formats available for video') + if self._downloader.params.get('listformats', None): + self._print_formats(existing_formats) + return + if req_format is None or req_format == 'best': + video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality + elif req_format == 'worst': + video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality + elif req_format in ('-1', 'all'): + video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats + else: + # Specific formats. We pick the first in a slash-delimeted sequence. + # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + req_formats = req_format.split('/') + video_url_list = None + for rf in req_formats: + if rf in url_map: + video_url_list = [(rf, url_map[rf])] + break + if video_url_list is None: + raise ExtractorError(u'requested format not available') + else: + raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') + + results = [] + for format_param, video_real_url in video_url_list: + # Extension + video_extension = self._video_extensions.get(format_param, 'flv') + + video_format = '{0} - {1}'.format(format_param if format_param else video_extension, + self._video_dimensions.get(format_param, '???')) + + results.append({ + 'id': video_id, + 'url': video_real_url, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'upload_date': upload_date, + 'title': video_title, + 'ext': video_extension, + 'format': video_format, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'player_url': player_url, + 'subtitles': video_subtitles, + 'duration': video_duration + }) + return results + +class YoutubePlaylistIE(InfoExtractor): + """Information Extractor for YouTube playlists.""" + + _VALID_URL = r"""(?: + (?:https?://)? + (?:\w+\.)? + youtube\.com/ + (?: + (?:course|view_play_list|my_playlists|artist|playlist|watch) + \? (?:.*?&)*? (?:p|a|list)= + | p/ + ) + ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + .* + | + ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + )""" + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' + _MAX_RESULTS = 50 + IE_NAME = u'youtube:playlist' + + @classmethod + def suitable(cls, url): + """Receives a URL and returns True if suitable for this IE.""" + return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + + def _real_extract(self, url): + # Extract playlist id + mobj = re.match(self._VALID_URL, url, re.VERBOSE) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + # Download playlist videos from API + playlist_id = mobj.group(1) or mobj.group(2) + page_num = 1 + videos = [] + + while True: + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) + page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) + + try: + response = json.loads(page) + except ValueError as err: + raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + + if 'feed' not in response: + raise ExtractorError(u'Got a malformed response from YouTube API') + playlist_title = response['feed']['title']['$t'] + if 'entry' not in response['feed']: + # Number of videos is a multiple of self._MAX_RESULTS + break + + for entry in response['feed']['entry']: + index = entry['yt$position']['$t'] + if 'media$group' in entry and 'media$player' in entry['media$group']: + videos.append((index, entry['media$group']['media$player']['url'])) + + if len(response['feed']['entry']) < self._MAX_RESULTS: + break + page_num += 1 + + videos = [v[1] for v in sorted(videos)] + + url_results = [self.url_result(url, 'Youtube') for url in videos] + return [self.playlist_result(url_results, playlist_id, playlist_title)] + + +class YoutubeChannelIE(InfoExtractor): + """Information Extractor for YouTube channels.""" + + _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" + _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' + _MORE_PAGES_INDICATOR = 'yt-uix-load-more' + _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' + IE_NAME = u'youtube:channel' + + def extract_videos_from_page(self, page): + ids_in_page = [] + for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page): + if mobj.group(1) not in ids_in_page: + ids_in_page.append(mobj.group(1)) + return ids_in_page + + def _real_extract(self, url): + # Extract channel id + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + # Download channel page + channel_id = mobj.group(1) + video_ids = [] + pagenum = 1 + + url = self._TEMPLATE_URL % (channel_id, pagenum) + page = self._download_webpage(url, channel_id, + u'Downloading page #%s' % pagenum) + + # Extract video identifiers + ids_in_page = self.extract_videos_from_page(page) + video_ids.extend(ids_in_page) + + # Download any subsequent channel pages using the json-based channel_ajax query + if self._MORE_PAGES_INDICATOR in page: + while True: + pagenum = pagenum + 1 + + url = self._MORE_PAGES_URL % (pagenum, channel_id) + page = self._download_webpage(url, channel_id, + u'Downloading page #%s' % pagenum) + + page = json.loads(page) + + ids_in_page = self.extract_videos_from_page(page['content_html']) + video_ids.extend(ids_in_page) + + if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + break + + self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) + + urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] + url_entries = [self.url_result(url, 'Youtube') for url in urls] + return [self.playlist_result(url_entries, channel_id)] + + +class YoutubeUserIE(InfoExtractor): + """Information Extractor for YouTube users.""" + + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' + _GDATA_PAGE_SIZE = 50 + _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' + _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' + IE_NAME = u'youtube:user' + + def _real_extract(self, url): + # Extract username + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + username = mobj.group(1) + + # Download video ids using YouTube Data API. Result size per + # query is limited (currently to 50 videos) so we need to query + # page by page until there are no video ids - it means we got + # all of them. + + video_ids = [] + pagenum = 0 + + while True: + start_index = pagenum * self._GDATA_PAGE_SIZE + 1 + + gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) + page = self._download_webpage(gdata_url, username, + u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + + # Extract video identifiers + ids_in_page = [] + + for mobj in re.finditer(self._VIDEO_INDICATOR, page): + if mobj.group(1) not in ids_in_page: + ids_in_page.append(mobj.group(1)) + + video_ids.extend(ids_in_page) + + # A little optimization - if current page is not + # "full", ie. does not contain PAGE_SIZE video ids then + # we can assume that this page is the last one - there + # are no more ids on further pages - no need to query + # again. + + if len(ids_in_page) < self._GDATA_PAGE_SIZE: + break + + pagenum += 1 + + urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] + url_results = [self.url_result(url, 'Youtube') for url in urls] + return [self.playlist_result(url_results, playlist_title = username)] From 93d3a642a9cfdd268f6967dd8012e42dc4412907 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 19:59:40 +0200 Subject: [PATCH 019/135] [youtube] remove dead code --- youtube_dl/extractor/youtube.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d09128555..a7cb88b5a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -343,7 +343,7 @@ class YoutubeIE(InfoExtractor): request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) try: self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) @@ -471,7 +471,7 @@ class YoutubeIE(InfoExtractor): self._downloader.report_warning(sub_error) if self._downloader.params.get('listsubtitles', False): - sub_lang_list = self._list_available_subtitles(video_id) + self._list_available_subtitles(video_id) return if 'length_seconds' not in video_info: @@ -480,9 +480,6 @@ class YoutubeIE(InfoExtractor): else: video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]) - # token - video_token = compat_urllib_parse.unquote_plus(video_info['token'][0]) - # Decide which formats to download req_format = self._downloader.params.get('format', None) From 38cbc40a64513718228ad14cdb616cf955246224 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 20:07:51 +0200 Subject: [PATCH 020/135] Move Metacafe and Statigram into their own files, and remove absolute import --- youtube_dl/InfoExtractors.py | 129 +----------------------------- youtube_dl/extractor/metacafe.py | 110 +++++++++++++++++++++++++ youtube_dl/extractor/statigram.py | 33 ++++++++ youtube_dl/extractor/youtube.py | 1 - 4 files changed, 145 insertions(+), 128 deletions(-) create mode 100644 youtube_dl/extractor/metacafe.py create mode 100644 youtube_dl/extractor/statigram.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d950880ad..fcc5d02cf 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -24,106 +24,12 @@ from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor +from .extractor.metacafe import MetacafeIE +from .extractor.statigram import StatigramIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE -class MetacafeIE(InfoExtractor): - """Information Extractor for metacafe.com.""" - - _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = u'metacafe' - - def report_disclaimer(self): - """Report disclaimer retrieval.""" - self.to_screen(u'Retrieving disclaimer') - - def _real_initialize(self): - # Retrieve disclaimer - request = compat_urllib_request.Request(self._DISCLAIMER) - try: - self.report_disclaimer() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err)) - - # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) - try: - self.report_age_confirmation() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1) - - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] - - # Retrieve video webpage to extract further information - webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - raise ExtractorError(u'Unable to extract media URL') - mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - - mobj = re.search(r'(?im)(.*) - Video', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader nickname') - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" @@ -3621,37 +3527,6 @@ class GametrailersIE(InfoExtractor): 'description': video_description, } -class StatigramIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group(1) - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'video URL') - thumbnail_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) - html_title = self._html_search_regex( - r'(.+?)', - webpage, u'title') - title = html_title.rpartition(u' | Statigram')[0] - uploader_id = self._html_search_regex( - r'@([^ ]+)', title, u'uploader name', fatal=False) - ext = 'mp4' - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'uploader_id' : uploader_id - }] - def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py new file mode 100644 index 000000000..66d6554fe --- /dev/null +++ b/youtube_dl/extractor/metacafe.py @@ -0,0 +1,110 @@ +import re +import socket + +from .common import InfoExtractor +from ..utils import ( + compat_http_client, + compat_parse_qs, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_request, + compat_str, + + ExtractorError, +) + +class MetacafeIE(InfoExtractor): + """Information Extractor for metacafe.com.""" + + _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _DISCLAIMER = 'http://www.metacafe.com/family_filter/' + _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' + IE_NAME = u'metacafe' + + def report_disclaimer(self): + """Report disclaimer retrieval.""" + self.to_screen(u'Retrieving disclaimer') + + def _real_initialize(self): + # Retrieve disclaimer + request = compat_urllib_request.Request(self._DISCLAIMER) + try: + self.report_disclaimer() + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err)) + + # Confirm age + disclaimer_form = { + 'filters': '0', + 'submit': "Continue - I'm over 18", + } + request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) + try: + self.report_age_confirmation() + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + + def _real_extract(self, url): + # Extract id and simplified title from URL + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + video_id = mobj.group(1) + + # Check if video comes from YouTube + mobj2 = re.match(r'^yt-(.*)$', video_id) + if mobj2 is not None: + return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] + + # Retrieve video webpage to extract further information + webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) + + # Extract URL, uploader and title from webpage + self.report_extraction(video_id) + mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) + if mobj is not None: + mediaURL = compat_urllib_parse.unquote(mobj.group(1)) + video_extension = mediaURL[-3:] + + # Extract gdaKey if available + mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) + if mobj is None: + video_url = mediaURL + else: + gdaKey = mobj.group(1) + video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) + else: + mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + vardict = compat_parse_qs(mobj.group(1)) + if 'mediaData' not in vardict: + raise ExtractorError(u'Unable to extract media URL') + mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + mediaURL = mobj.group('mediaURL').replace('\\/', '/') + video_extension = mediaURL[-3:] + video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) + + mobj = re.search(r'(?im)(.*) - Video', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1).decode('utf-8') + + mobj = re.search(r'submitter=(.*?);', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract uploader nickname') + video_uploader = mobj.group(1) + + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader.decode('utf-8'), + 'upload_date': None, + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + }] diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py new file mode 100644 index 000000000..95d2ee3c0 --- /dev/null +++ b/youtube_dl/extractor/statigram.py @@ -0,0 +1,33 @@ +import re + +from .common import InfoExtractor + +class StatigramIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + video_url = self._html_search_regex( + r'', + webpage, u'video URL') + thumbnail_url = self._html_search_regex( + r'', + webpage, u'thumbnail URL', fatal=False) + html_title = self._html_search_regex( + r'(.+?)', + webpage, u'title') + title = html_title.rpartition(u' | Statigram')[0] + uploader_id = self._html_search_regex( + r'@([^ ]+)', title, u'uploader name', fatal=False) + ext = 'mp4' + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + 'uploader_id' : uploader_id + }] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a7cb88b5a..4aef85d0c 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,5 +1,4 @@ # coding: utf-8 -from __future__ import absolute_import import json import netrc From 219b8130dfe564701a0ebd27bedfba7785b24b52 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 20:09:47 +0200 Subject: [PATCH 021/135] Move DailyMotion into its own file --- youtube_dl/InfoExtractors.py | 69 +------------------------- youtube_dl/extractor/dailymotion.py | 77 +++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 68 deletions(-) create mode 100644 youtube_dl/extractor/dailymotion.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index fcc5d02cf..b32bd3d94 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -24,80 +24,13 @@ from .utils import * from .extractor.common import InfoExtractor, SearchInfoExtractor +from .extractor.dailymotion import DailymotionIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE -class DailymotionIE(InfoExtractor): - """Information Extractor for Dailymotion""" - - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' - IE_NAME = u'dailymotion' - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1).split('_')[0].split('?')[0] - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - webpage = self._download_webpage(request, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: - raise ExtractorError(u'Unable to extract video URL') - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None - video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', - # Looking for official user - r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) - if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - }] class PhotobucketIE(InfoExtractor): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py new file mode 100644 index 000000000..34306b073 --- /dev/null +++ b/youtube_dl/extractor/dailymotion.py @@ -0,0 +1,77 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + compat_urllib_parse, + + ExtractorError, + unescapeHTML, +) + +class DailymotionIE(InfoExtractor): + """Information Extractor for Dailymotion""" + + _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' + IE_NAME = u'dailymotion' + + def _real_extract(self, url): + # Extract id and simplified title from URL + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group(1).split('_')[0].split('?')[0] + + video_extension = 'mp4' + + # Retrieve video webpage to extract further information + request = compat_urllib_request.Request(url) + request.add_header('Cookie', 'family_filter=off') + webpage = self._download_webpage(request, video_id) + + # Extract URL, uploader and title from webpage + self.report_extraction(video_id) + mobj = re.search(r'\s*var flashvars = (.*)', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + flashvars = compat_urllib_parse.unquote(mobj.group(1)) + + for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: + if key in flashvars: + max_quality = key + self.to_screen(u'Using %s' % key) + break + else: + raise ExtractorError(u'Unable to extract video URL') + + mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) + if mobj is None: + raise ExtractorError(u'Unable to extract video URL') + + video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') + + # TODO: support choosing qualities + + mobj = re.search(r'', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = unescapeHTML(mobj.group('title')) + + video_uploader = None + video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', + # Looking for official user + r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) + if mobj is not None: + video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'title': video_title, + 'ext': video_extension, + }] From 97d6faaced24ffb3e2e91980ba068e22c76b0416 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 20:12:18 +0200 Subject: [PATCH 022/135] Move Photobucket into its own file --- youtube_dl/InfoExtractors.py | 57 +------------------------ youtube_dl/extractor/photobucket.py | 66 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 56 deletions(-) create mode 100644 youtube_dl/extractor/photobucket.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index b32bd3d94..d1488ade9 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -27,68 +27,13 @@ from .extractor.common import InfoExtractor, SearchInfoExtractor from .extractor.dailymotion import DailymotionIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE +from .extractor.photobucket import PhotobucketIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE -class PhotobucketIE(InfoExtractor): - """Information extractor for photobucket.com.""" - - # TODO: the original _VALID_URL was: - # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' - # Check if it's necessary to keep the old extracion process - _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P.*)\.(?P(flv)|(mp4))' - IE_NAME = u'photobucket' - - def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - - video_extension = mobj.group('ext') - - # Retrieve video webpage to extract further information - webpage = self._download_webpage(url, video_id) - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - # We try first by looking the javascript code: - mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P.*?)\);', webpage) - if mobj is not None: - info = json.loads(mobj.group('json')) - return [{ - 'id': video_id, - 'url': info[u'downloadUrl'], - 'uploader': info[u'username'], - 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'), - 'title': info[u'title'], - 'ext': video_extension, - 'thumbnail': info[u'thumbUrl'], - }] - - # We try looking in other parts of the webpage - video_url = self._search_regex(r'', - webpage, u'video URL') - - mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] class YahooIE(InfoExtractor): diff --git a/youtube_dl/extractor/photobucket.py b/youtube_dl/extractor/photobucket.py new file mode 100644 index 000000000..cd7fe6f52 --- /dev/null +++ b/youtube_dl/extractor/photobucket.py @@ -0,0 +1,66 @@ +import datetime +import json +import re + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, +) + +class PhotobucketIE(InfoExtractor): + """Information extractor for photobucket.com.""" + + # TODO: the original _VALID_URL was: + # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' + # Check if it's necessary to keep the old extracion process + _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P.*)\.(?P(flv)|(mp4))' + IE_NAME = u'photobucket' + + def _real_extract(self, url): + # Extract id from URL + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + video_id = mobj.group('id') + + video_extension = mobj.group('ext') + + # Retrieve video webpage to extract further information + webpage = self._download_webpage(url, video_id) + + # Extract URL, uploader, and title from webpage + self.report_extraction(video_id) + # We try first by looking the javascript code: + mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P.*?)\);', webpage) + if mobj is not None: + info = json.loads(mobj.group('json')) + return [{ + 'id': video_id, + 'url': info[u'downloadUrl'], + 'uploader': info[u'username'], + 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'), + 'title': info[u'title'], + 'ext': video_extension, + 'thumbnail': info[u'thumbUrl'], + }] + + # We try looking in other parts of the webpage + video_url = self._search_regex(r'', + webpage, u'video URL') + + mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1).decode('utf-8') + video_uploader = mobj.group(2).decode('utf-8') + + return [{ + 'id': video_id.decode('utf-8'), + 'url': video_url.decode('utf-8'), + 'uploader': video_uploader, + 'upload_date': None, + 'title': video_title, + 'ext': video_extension.decode('utf-8'), + }] From d6039175e5b66740de0258898ff3fc44b2760a3d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 20:13:52 +0200 Subject: [PATCH 023/135] Move yahoo into its own file --- youtube_dl/InfoExtractors.py | 69 +------------------------------ youtube_dl/extractor/yahoo.py | 76 +++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 68 deletions(-) create mode 100644 youtube_dl/extractor/yahoo.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d1488ade9..68e21c635 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -28,6 +28,7 @@ from .extractor.dailymotion import DailymotionIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE from .extractor.photobucket import PhotobucketIE +from .extractor.yahoo import YahooIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE @@ -36,74 +37,6 @@ from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, Yout -class YahooIE(InfoExtractor): - """Information extractor for screen.yahoo.com.""" - _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) - - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - \d*?)\.html' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) + + if m_id is None: + # TODO: Check which url parameters are required + info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id + webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') + info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* + .*?)\]\]>.* + .*?)\ .*\]\]>.* + Date: Sun, 23 Jun 2013 20:18:21 +0200 Subject: [PATCH 024/135] Move Vimeo into its own file --- youtube_dl/InfoExtractors.py | 125 +----------------------------- youtube_dl/extractor/vimeo.py | 138 ++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 124 deletions(-) create mode 100644 youtube_dl/extractor/vimeo.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 68e21c635..f06a8e0c9 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -28,6 +28,7 @@ from .extractor.dailymotion import DailymotionIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE from .extractor.photobucket import PhotobucketIE +from .extractor.vimeo import VimeoIE from .extractor.yahoo import YahooIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE @@ -37,130 +38,6 @@ from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, Yout -class VimeoIE(InfoExtractor): - """Information extractor for vimeo.com.""" - - # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' - IE_NAME = u'vimeo' - - def _verify_video_password(self, url, video_id, webpage): - password = self._downloader.params.get('password', None) - if password is None: - raise ExtractorError(u'This video is protected by a password, use the --password option') - token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) - data = compat_urllib_parse.urlencode({'password': password, - 'token': token}) - # I didn't manage to use the password with https - if url.startswith('https'): - pass_url = url.replace('https','http') - else: - pass_url = url - password_request = compat_urllib_request.Request(pass_url+'/password', data) - password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - password_request.add_header('Cookie', 'xsrft=%s' % token) - pass_web = self._download_webpage(password_request, video_id, - u'Verifying the password', - u'Wrong password') - - def _real_extract(self, url, new_video=True): - # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - if not mobj.group('proto'): - url = 'https://' + url - if mobj.group('direct_link') or mobj.group('pro'): - url = 'https://vimeo.com/' + video_id - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url, None, std_headers) - webpage = self._download_webpage(request, video_id) - - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) - - # Extract the config JSON - try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] - config = json.loads(config) - except: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') - - if re.search('If so please provide the correct password.', webpage): - self._verify_video_password(url, video_id, webpage) - return self._real_extract(url) - else: - raise ExtractorError(u'Unable to extract info section') - - # Extract title - video_title = config["video"]["title"] - - # Extract uploader and uploader_id - video_uploader = config["video"]["owner"]["name"] - video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None - - # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] - - # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' - - # Extract upload date - video_upload_date = None - mobj = re.search(r' 0: - video_quality = files[quality][0][2] - video_codec = files[quality][0][0] - video_extension = files[quality][0][1] - self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) - break - else: - raise ExtractorError(u'No known codec found') - - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - 'thumbnail': video_thumbnail, - 'description': video_description, - }] class ArteTvIE(InfoExtractor): diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py new file mode 100644 index 000000000..677cf4e1c --- /dev/null +++ b/youtube_dl/extractor/vimeo.py @@ -0,0 +1,138 @@ +import json +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + compat_urllib_request, + + clean_html, + get_element_by_attribute, + ExtractorError, + std_headers, +) + +class VimeoIE(InfoExtractor): + """Information extractor for vimeo.com.""" + + # _VALID_URL matches Vimeo URLs + _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' + IE_NAME = u'vimeo' + + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('password', None) + if password is None: + raise ExtractorError(u'This video is protected by a password, use the --password option') + token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) + data = compat_urllib_parse.urlencode({'password': password, + 'token': token}) + # I didn't manage to use the password with https + if url.startswith('https'): + pass_url = url.replace('https','http') + else: + pass_url = url + password_request = compat_urllib_request.Request(pass_url+'/password', data) + password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + password_request.add_header('Cookie', 'xsrft=%s' % token) + self._download_webpage(password_request, video_id, + u'Verifying the password', + u'Wrong password') + + def _real_extract(self, url, new_video=True): + # Extract ID from URL + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + + video_id = mobj.group('id') + if not mobj.group('proto'): + url = 'https://' + url + if mobj.group('direct_link') or mobj.group('pro'): + url = 'https://vimeo.com/' + video_id + + # Retrieve video webpage to extract further information + request = compat_urllib_request.Request(url, None, std_headers) + webpage = self._download_webpage(request, video_id) + + # Now we begin extracting as much information as we can from what we + # retrieved. First we extract the information common to all extractors, + # and latter we extract those that are Vimeo specific. + self.report_extraction(video_id) + + # Extract the config JSON + try: + config = webpage.split(' = {config:')[1].split(',assets:')[0] + config = json.loads(config) + except: + if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): + raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') + + if re.search('If so please provide the correct password.', webpage): + self._verify_video_password(url, video_id, webpage) + return self._real_extract(url) + else: + raise ExtractorError(u'Unable to extract info section') + + # Extract title + video_title = config["video"]["title"] + + # Extract uploader and uploader_id + video_uploader = config["video"]["owner"]["name"] + video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None + + # Extract video thumbnail + video_thumbnail = config["video"]["thumbnail"] + + # Extract video description + video_description = get_element_by_attribute("itemprop", "description", webpage) + if video_description: video_description = clean_html(video_description) + else: video_description = u'' + + # Extract upload date + video_upload_date = None + mobj = re.search(r' 0: + video_quality = files[quality][0][2] + video_codec = files[quality][0][0] + video_extension = files[quality][0][1] + self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) + break + else: + raise ExtractorError(u'No known codec found') + + video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ + %(video_id, sig, timestamp, video_quality, video_codec.upper()) + + return [{ + 'id': video_id, + 'url': video_url, + 'uploader': video_uploader, + 'uploader_id': video_uploader_id, + 'upload_date': video_upload_date, + 'title': video_title, + 'ext': video_extension, + 'thumbnail': video_thumbnail, + 'description': video_description, + }] From d5822b96b00fce48e04a14953c4cb25cef1cdbaf Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 23 Jun 2013 20:24:07 +0200 Subject: [PATCH 025/135] Move ARD, Arte, ZDF into their own files --- youtube_dl/InfoExtractors.py | 220 +---------------------------------- youtube_dl/extractor/ard.py | 45 +++++++ youtube_dl/extractor/arte.py | 134 +++++++++++++++++++++ youtube_dl/extractor/zdf.py | 65 +++++++++++ 4 files changed, 248 insertions(+), 216 deletions(-) create mode 100644 youtube_dl/extractor/ard.py create mode 100644 youtube_dl/extractor/arte.py create mode 100644 youtube_dl/extractor/zdf.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index f06a8e0c9..e8940ac36 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -21,9 +21,10 @@ import binascii import urllib from .utils import * - - from .extractor.common import InfoExtractor, SearchInfoExtractor + +from .extractor.ard import ARDIE +from .extractor.arte import ArteTvIE from .extractor.dailymotion import DailymotionIE from .extractor.metacafe import MetacafeIE from .extractor.statigram import StatigramIE @@ -31,6 +32,7 @@ from .extractor.photobucket import PhotobucketIE from .extractor.vimeo import VimeoIE from .extractor.yahoo import YahooIE from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE +from .extractor.zdf import ZDFIE @@ -40,125 +42,6 @@ from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, Yout -class ArteTvIE(InfoExtractor): - """arte.tv information extractor.""" - - _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' - _LIVE_URL = r'index-[0-9]+\.html$' - - IE_NAME = u'arte.tv' - - def fetch_webpage(self, url): - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(url) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) - except ValueError as err: - raise ExtractorError(u'Invalid URL: %s' % url) - return webpage - - def grep_webpage(self, url, regex, regexFlags, matchTuples): - page = self.fetch_webpage(url) - mobj = re.search(regex, page, regexFlags) - info = {} - - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - for (i, key, err) in matchTuples: - if mobj.group(i) is None: - raise ExtractorError(err) - else: - info[key] = mobj.group(i) - - return info - - def extractLiveStream(self, url): - video_lang = url.split('/')[-4] - info = self.grep_webpage( - url, - r'src="(.*?/videothek_js.*?\.js)', - 0, - [ - (1, 'url', u'Invalid URL: %s' % url) - ] - ) - http_host = url.split('/')[2] - next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) - info = self.grep_webpage( - next_url, - r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + - '(http://.*?\.swf).*?' + - '(rtmp://.*?)\'', - re.DOTALL, - [ - (1, 'path', u'could not extract video path: %s' % url), - (2, 'player', u'could not extract video player: %s' % url), - (3, 'url', u'could not extract video url: %s' % url) - ] - ) - video_url = u'%s/%s' % (info.get('url'), info.get('path')) - - def extractPlus7Stream(self, url): - video_lang = url.split('/')[-3] - info = self.grep_webpage( - url, - r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', - 0, - [ - (1, 'url', u'Invalid URL: %s' % url) - ] - ) - next_url = compat_urllib_parse.unquote(info.get('url')) - info = self.grep_webpage( - next_url, - r'