diff --git a/.travis.yml b/.travis.yml index 7f1fa8a3c..45b71f11b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,7 @@ notifications: - filippo.valsorda@gmail.com - phihag@phihag.de - jaime.marquinez.ferrandiz+travis@gmail.com + - yasoob.khld@gmail.com # irc: # channels: # - "irc.freenode.org#youtube-dl" diff --git a/README.md b/README.md index b246d3c53..560bcdca1 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ which means you can modify it, redistribute it or use it however you like. # OPTIONS -h, --help print this help text and exit --version print program version and exit - -U, --update update this program to latest version + -U, --update update this program to latest version. Make sure + that you have sufficient permissions (run with + sudo if needed) -i, --ignore-errors continue on download errors --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index c3d69e6f4..22977ccd9 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -5,27 +5,39 @@ import sys tests = [ + # 92 - vflQw-fB4 2013/07/17 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"", + "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"), + # 90 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", + "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), # 88 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), - # 87 + # 87 - vflART1Nf 2013/07/24 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), + "tyuioplkjhgfdsazxcv"), # 86 - vfl_ymO4Z 2013/06/27 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), - # 85 + # 85 - vflSAFCP9 2013/07/19 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<", - "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), + "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), # 84 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 + # 83 - vflcaqGO8 2013/07/11 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS.<"), # 82 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), + # 81 - vflLC8JvQ 2013/07/25 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", + "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 79 - vflLC8JvQ 2013/07/25 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", + "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), ] def find_matching(wrong, right): diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 
39a5ee33a..c73d0e467 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -61,6 +61,17 @@ class TestAllURLsMatching(unittest.TestCase):
             else:
                 self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))
 
+    def test_keywords(self):
+        ies = gen_extractors()
+        matching_ies = lambda url: [ie.IE_NAME for ie in ies
+                                    if ie.suitable(url) and ie.IE_NAME != 'generic']
+        self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':tds'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':cr'), ['ComedyCentral'])
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py
index c4b71362e..be1069105 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,6 +4,7 @@
 import sys
 import unittest
+import xml.etree.ElementTree
 
 # Allow direct execution
 import os
@@ -16,6 +17,7 @@ from youtube_dl.utils import unescapeHTML
 from youtube_dl.utils import orderedSet
 from youtube_dl.utils import DateRange
 from youtube_dl.utils import unified_strdate
+from youtube_dl.utils import find_xpath_attr
 
 if sys.version_info < (3, 0):
     _compat_str = lambda b: b.decode('unicode-escape')
@@ -112,5 +114,18 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
         self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
 
+    def test_find_xpath_attr(self):
+        testxml = u'''<root>
+            <node/>
+            <node x="a"/>
+            <node x="a" y="c"/>
+            <node x="b" y="d"/>
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+
+        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py
old mode 100755
new mode 100644
index e87b6259b..4d45a0e08
--- a/test/test_youtube_sig.py
+++ b/test/test_youtube_sig.py
@@ -13,9 +13,14 @@ from helper import FakeYDL
 sig = YoutubeIE(FakeYDL())._decrypt_signature
 
 class TestYoutubeSig(unittest.TestCase):
-    def test_43_43(self):
-        wrong = '5AEEAE0EC39677BC65FD9021CCD115F1F2DBD5A59E4.C0B243A3E2DED6769199AF3461781E75122AE135135'
-        right = '931EA22157E1871643FA9519676DED253A342B0C.4E95A5DBD2F1F511DCC1209DF56CB77693CE0EAE'
+    def test_92(self):
+        wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8"
+        right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7"
+        self.assertEqual(sig(wrong), right)
+
+    def test_90(self):
+        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`"
+        right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"
         self.assertEqual(sig(wrong), right)
 
     def test_88(self):
@@ -25,7 +30,7 @@ class TestYoutubeSig(unittest.TestCase):
     def test_87(self):
         wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<"
-        right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"
+        right = "tyuioplkjhgfdsazxcv"
         self.assertEqual(sig(wrong), right)
 
     def test_86(self):
@@ -35,7 +40,7 @@ class TestYoutubeSig(unittest.TestCase):
     def test_85(self):
         wrong =
"qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<" - right = "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr" + right = "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c" self.assertEqual(sig(wrong), right) def test_84(self): @@ -45,7 +50,7 @@ class TestYoutubeSig(unittest.TestCase): def test_83(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<" - right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS[^?/]+)(?:[?].*)?$' + _TEST = { + u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect", + u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv', + u'md5': u'8af1d4cf447933ed3c7f4871162602db', + u'info_dict': { + u"title": u"1968 Demo - FJCC Conference Presentation Reel #1", + u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also Doug's 1968 Demo page for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | Reel 2 | Reel 3", + u"upload_date": u"19681210", + u"uploader": u"SRI International" + } + } + + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + json_url = url + (u'?' if u'?' in url else '&') + u'output=json' + json_data = self._download_webpage(json_url, video_id) + data = json.loads(json_data) + + title = data['metadata']['title'][0] + description = data['metadata']['description'][0] + uploader = data['metadata']['creator'][0] + upload_date = unified_strdate(data['metadata']['date'][0]) + + formats = [{ + 'format': fdata['format'], + 'url': 'http://' + data['server'] + data['dir'] + fn, + 'file_size': int(fdata['size']), + } + for fn,fdata in data['files'].items() + if 'Video' in fdata['format']] + formats.sort(key=lambda fdata: fdata['file_size']) + + info = { + '_type': 'video', + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + 'upload_date': upload_date, + } + thumbnail = data.get('misc', {}).get('image') + if thumbnail: + info['thumbnail'] = thumbnail + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = determine_ext(formats[-1]['url']) + + return info \ No newline at end of file diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5793a4129..dbf8eed99 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -32,7 +32,7 @@ class ARDIE(InfoExtractor): # determine title and media streams from webpage html = self._download_webpage(url, video_id) title = re.search(self._TITLE, html).group('title') - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] + streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)] if not streams: assert '"fsk"' in html raise ExtractorError(u'This video is only available after 8:00 pm') diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index a030a28bb..993e30f7a 100644 --- 
a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,12 +1,11 @@ import re import json +import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - # This is used by the not implemented extractLiveStream method - compat_urllib_parse, - ExtractorError, + find_xpath_attr, unified_strdate, ) @@ -17,7 +16,7 @@ class ArteTvIE(InfoExtractor): The videos expire in 7 days, so we can't add tests. """ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P.*?).html' + _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @@ -27,6 +26,7 @@ class ArteTvIE(InfoExtractor): return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) # TODO implement Live Stream + # from ..utils import compat_urllib_parse # def extractLiveStream(self, url): # video_lang = url.split('/')[-4] # info = self.grep_webpage( @@ -56,7 +56,6 @@ class ArteTvIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._EMISSION_URL, url) if mobj is not None: - name = mobj.group('name') lang = mobj.group('lang') # This is not a real id, it can be for example AJT for the news # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal @@ -66,7 +65,8 @@ class ArteTvIE(InfoExtractor): mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') - return self._extract_video(url, id) + lang = mobj.group('lang') + return self._extract_video(url, id, lang) if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') @@ -75,7 +75,8 @@ class ArteTvIE(InfoExtractor): def _extract_emission(self, url, video_id, lang): """Extract from www.arte.tv/guide""" - json_url = 'http://org-www.arte.tv/papi/tvguide/videos/stream/player/F/%s_PLUS7-F/ALL/ALL.json' % video_id + webpage = self._download_webpage(url, video_id) + json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') json_info = self._download_webpage(json_url, video_id, 'Downloading info json') self.report_extraction(video_id) @@ -113,13 +114,15 @@ class ArteTvIE(InfoExtractor): return info_dict - def _extract_video(self, url, video_id): + def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" - config_xml_url = url.replace('/videos/', '/do_delegate/videos/') - config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml') - config_xml = self._download_webpage(config_xml_url, video_id) - config_xml_url = self._html_search_regex(r'(?P.*?)', config_xml)) def _key(m): diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index f7af65606..08b28c994 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -189,5 +189,5 @@ class BlipTVUserIE(InfoExtractor): pagenum += 1 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] - url_entries = [self.url_result(url, 'BlipTV') for url in urls] + url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls] return [self.playlist_result(url_entries, playlist_title = username)] diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 34f555e89..53a898de3 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -1,6 +1,8 @@ import re +import json from .common import InfoExtractor +from ..utils import determine_ext class 
BreakIE(InfoExtractor):
@@ -17,17 +19,20 @@ class BreakIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group(1).split("-")[-1]
-        webpage = self._download_webpage(url, video_id)
-        video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1)
-        key = re.search(r"icon: '(.+?)',",webpage).group(1)
-        final_url = str(video_url)+"?"+str(key)
-        thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1)
-        title = re.search(r"sVidTitle: '(.+)',",webpage).group(1)
-        ext = video_url.split('.')[-1]
+        embed_url = 'http://www.break.com/embed/%s' % video_id
+        webpage = self._download_webpage(embed_url, video_id)
+        info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
+                                       u'info json', flags=re.DOTALL)
+        info = json.loads(info_json)
+        video_url = info['videoUri']
+        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
+        if m_youtube is not None:
+            return self.url_result(m_youtube.group(1), 'Youtube')
+        final_url = video_url + '?' + info['AuthToken']
         return [{
             'id': video_id,
             'url': final_url,
-            'ext': ext,
-            'title': title,
-            'thumbnail': thumbnail_url,
+            'ext': determine_ext(final_url),
+            'title': info['contentName'],
+            'thumbnail': info['thumbUri'],
         }]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
new file mode 100644
index 000000000..71e3c7883
--- /dev/null
+++ b/youtube_dl/extractor/brightcove.py
@@ -0,0 +1,86 @@
+import re
+import json
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    find_xpath_attr,
+    compat_urlparse,
+)
+
+class BrightcoveIE(InfoExtractor):
+    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
+
+    # There is a test for Brightcove in GenericIE, that way we test both the download
+    # and the detection of videos, and we don't have to find a URL that is always valid
+
+    @classmethod
+    def _build_brighcove_url(cls, object_str):
+        """
+        Build a Brightcove url from an xml string containing
+        <object class="BrightcoveExperience">{params}</object>
+        """
+        object_doc = xml.etree.ElementTree.fromstring(object_str)
+        assert u'BrightcoveExperience' in object_doc.attrib['class']
+        params = {'flashID': object_doc.attrib['id'],
+                  'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
+                  }
+        playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+        # Not all pages define this value
+        if playerKey is not None:
+            params['playerKey'] = playerKey.attrib['value']
+        videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+        if videoPlayer is not None:
+            params['@videoPlayer'] = videoPlayer.attrib['value']
+        data = compat_urllib_parse.urlencode(params)
+        return cls._FEDERATED_URL_TEMPLATE % data
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        query_str = mobj.group('query')
+        query = compat_urlparse.parse_qs(query_str)
+
+        videoPlayer = query.get('@videoPlayer')
+        if videoPlayer:
+            return self._get_video_info(videoPlayer[0], query_str)
+        else:
+            player_key = query['playerKey']
+            return self._get_playlist_info(player_key[0])
+
+    def _get_video_info(self, video_id, query):
+        request_url = self._FEDERATED_URL_TEMPLATE % query
+        webpage = self._download_webpage(request_url, video_id)
+
+
self.report_extraction(video_id) + info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') + info = json.loads(info)['data'] + video_info = info['programmedContent']['videoPlayer']['mediaDTO'] + + return self._extract_video_info(video_info) + + def _get_playlist_info(self, player_key): + playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, + player_key, u'Downloading playlist information') + + playlist_info = json.loads(playlist_info)['videoList'] + videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + + return self.playlist_result(videos, playlist_id=playlist_info['id'], + playlist_title=playlist_info['mediaCollectionDTO']['displayName']) + + def _extract_video_info(self, video_info): + renditions = video_info['renditions'] + renditions = sorted(renditions, key=lambda r: r['size']) + best_format = renditions[-1] + + return {'id': video_info['id'], + 'title': video_info['displayName'], + 'url': best_format['defaultURL'], + 'ext': 'mp4', + 'description': video_info.get('shortDescription'), + 'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'), + 'uploader': video_info.get('publisherName'), + } diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py new file mode 100644 index 000000000..4c8a8af09 --- /dev/null +++ b/youtube_dl/extractor/c56.py @@ -0,0 +1,36 @@ +# coding: utf-8 + +import re +import json + +from .common import InfoExtractor +from ..utils import determine_ext + +class C56IE(InfoExtractor): + _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P.+?)\.(html|swf)' + IE_NAME = u'56.com' + + _TEST ={ + u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', + u'file': u'93440716.mp4', + u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'info_dict': { + u'title': u'网事知多少 第32期:车怒', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + text_id = mobj.group('textid') + info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id, + text_id, u'Downloading video info') + info = json.loads(info_page)['info'] + best_format = sorted(info['rfiles'], key=lambda f: int(f['filesize']))[-1] + video_url = best_format['url'] + + return {'id': info['vid'], + 'title': info['Subject'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': info.get('bimg') or info.get('img'), + } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py new file mode 100644 index 000000000..3b1c88876 --- /dev/null +++ b/youtube_dl/extractor/canalplus.py @@ -0,0 +1,46 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import unified_strdate + +class CanalplusIE(InfoExtractor): + _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P\d+)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' + IE_NAME = u'canalplus.fr' + + _TEST = { + u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', + u'file': u'889861.flv', + u'md5': u'590a888158b5f0d6832f84001fbf3e99', + u'info_dict': { + u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', + u'upload_date': u'20130620', + }, + u'skip': u'Requires rtmpdump' + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id + info_page = self._download_webpage(info_url,video_id, + 
u'Downloading video info') + + self.report_extraction(video_id) + doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) + video_info = [video for video in doc if video.find('ID').text == video_id][0] + infos = video_info.find('INFOS') + media = video_info.find('MEDIA') + formats = [media.find('VIDEOS/%s' % format) + for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] + video_url = [format.text for format in formats if format is not None][-1] + + return {'id': video_id, + 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'url': video_url, + 'ext': 'flv', + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 7ae0972e5..5badde03a 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,26 +1,26 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, compat_urllib_parse_urlparse, - compat_urllib_request, ExtractorError, ) class CollegeHumorIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P[0-9]+)/(?P.*)$' - def report_manifest(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Downloading XML manifest' % video_id) + _TEST = { + u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', + u'file': u'6902724.mp4', + u'md5': u'1264c12ad95dca142a9f0bf7968105a0', + u'info_dict': { + u'title': u'Comic-Con Cosplay Catastrophe', + u'description': u'Fans get creative this year at San Diego. Too creative. 
And yes, that\'s really Joss Whedon.', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -36,14 +36,16 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + metaXml = self._download_webpage(xmlUrl, video_id, + u'Downloading info XML', + u'Unable to download video info XML') mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] + youtubeIdNode = videoNode.find('./youtubeID') + if youtubeIdNode is not None: + return self.url_result(youtubeIdNode.text, 'Youtube') info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text @@ -52,11 +54,9 @@ class CollegeHumorIE(InfoExtractor): raise ExtractorError(u'Invalid metadata XML file') manifest_url += '?hdcore=2.10.3' - self.report_manifest(video_id) - try: - manifestXml = compat_urllib_request.urlopen(manifest_url).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') adoc = xml.etree.ElementTree.fromstring(manifestXml) try: @@ -66,9 +66,8 @@ class CollegeHumorIE(InfoExtractor): except IndexError as err: raise ExtractorError(u'Invalid manifest file') - url_pr = compat_urllib_parse_urlparse(manifest_url) - url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) - info['url'] = url - info['ext'] = 'f4f' + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' return [info] diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 93d9e3d5e..bf8d711ee 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor): (full-episodes/(?P.*)| (?P (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*))))) + |(watch/(?P[^/]*)/(?P.*)))| + (?P + extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) $""" _TEST = { u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', @@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor): else: epTitle = mobj.group('cntitle') dlNewest = False + elif mobj.group('interview'): + epTitle = mobj.group('interview_title') + dlNewest = False else: dlNewest = not mobj.group('episode') if dlNewest: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 655836ff6..da50abfc1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -3,6 +3,7 @@ import os import re import socket import sys +import netrc from ..utils import ( compat_http_client, @@ -13,6 +14,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + unescapeHTML, ) class InfoExtractor(object): @@ -36,6 +38,8 @@ class InfoExtractor(object): The following fields 
are optional: format: The video format, defaults to ext (used for --get-format) + thumbnails: A list of dictionaries (with the entries "resolution" and + "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. @@ -122,6 +126,11 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): """ Returns a tuple (page content as string, URL handle) """ + + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -161,12 +170,11 @@ class InfoExtractor(object): """Report attempt to confirm age.""" self.to_screen(u'Confirming age') + def report_login(self): + """Report attempt to log in.""" + self.to_screen(u'Logging in') + #Methods for following #608 - #They set the correct value of the '_type' key - def video_result(self, video_info): - """Returns a video""" - video_info['_type'] = 'video' - return video_info def url_result(self, url, ie=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info @@ -225,6 +233,61 @@ class InfoExtractor(object): else: return res + def _get_login_info(self): + """ + Get the the login info as (username, password) + It will look in the netrc file using the _NETRC_MACHINE value + If there's no info available, return (None, None) + """ + if self._downloader is None: + return (None, None) + + username = None + password = None + downloader_params = self._downloader.params + + # Attempt to use provided username and password or .netrc data + if downloader_params.get('username', None) is not None: + username = downloader_params['username'] + password = downloader_params['password'] + elif downloader_params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(self._NETRC_MACHINE) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) + + return (username, password) + + # Helper functions for extracting OpenGraph info + @staticmethod + def _og_regex(prop): + return r'%s).com/(?Pwatch|series|video)/(?P.+)' % '|'.join(_SITES.keys()) + IE_DESC = u'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) + + _TEST = { + u'url': u'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', + u'file': u'5171b343c2b4c00dd0c1ccb3.mp4', + u'md5': u'1921f713ed48aabd715691f774c451f7', + u'info_dict': { + u'title': u'3D Printed Speakers Lit With LED', + u'description': u'Check out these beautiful 3D printed LED speakers. 
You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + } + } + + def _extract_series(self, url, webpage): + title = self._html_search_regex(r'.*?(.+?)', + webpage, u'series title', flags=re.DOTALL) + url_object = compat_urllib_parse_urlparse(url) + base_url = '%s://%s' % (url_object.scheme, url_object.netloc) + m_paths = re.finditer(r'.*?(.+?)', + r'(.+?)', + ], + webpage, u'description', + fatal=False, flags=re.DOTALL) + params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, + u'player params', flags=re.DOTALL) + video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, u'video id') + player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, u'player id') + target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, u'target') + data = compat_urllib_parse.urlencode({'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) + base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', + webpage, u'base info url', + default='http://player.cnevids.com/player/loader.js?') + info_url = base_info_url + data + info_page = self._download_webpage(info_url, video_id, + u'Downloading video info') + video_info = self._search_regex(r'var video = ({.+?});', info_page, u'video info') + video_info = json.loads(video_info) + + def _formats_sort_key(f): + type_ord = 1 if f['type'] == 'video/mp4' else 0 + quality_ord = 1 if f['quality'] == 'high' else 0 + return (quality_ord, type_ord) + best_format = sorted(video_info['sources'][0], key=_formats_sort_key)[-1] + + return {'id': video_id, + 'url': best_format['src'], + 'ext': best_format['type'].split('/')[-1], + 'title': video_info['title'], + 'thumbnail': video_info['poster_frame'], + 'description': description, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') + url_type = mobj.group('type') + id = mobj.group('id') + + self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site]) + webpage = self._download_webpage(url, id) + + if url_type == 'series': + return self._extract_series(url, webpage) + else: + return self._extract_video(webpage) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py new file mode 100644 index 000000000..31fe3d57b --- /dev/null +++ b/youtube_dl/extractor/criterion.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class CriterionIE(InfoExtractor): + _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+' + _TEST = { + u'url': u'http://www.criterion.com/films/184-le-samourai', + u'file': u'184.mp4', + u'md5': u'bc51beba55685509883a9a7830919ec3', + u'info_dict': { + u"title": u"Le Samouraï", + u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + + final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;', + webpage, 'video url') + title = self._html_search_regex(r'', + webpage, 'video title') + description = self._html_search_regex(r'', + webpage, 'video description') + thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', + webpage, 'thumbnail url') + + return {'id': video_id, + 'url' : final_url, + 'title': title, + 'ext': determine_ext(final_url), + 'description': description, + 'thumbnail': thumbnail, + } diff --git 
a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index a4853279b..7bf03c584 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor): description = self._html_search_regex(r'(.*?)', video_info, 'video url') @@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor): 'url': url, 'play_path': path, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 3297a8549..9bf7a28ca 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,12 +1,11 @@ import re +import json from .common import InfoExtractor from ..utils import ( compat_urllib_request, - compat_urllib_parse, ExtractorError, - unescapeHTML, ) class DailymotionIE(InfoExtractor): @@ -39,33 +38,7 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: - raise ExtractorError(u'Unable to extract video URL') - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?)'], @@ -76,11 +49,31 @@ class DailymotionIE(InfoExtractor): if mobj is not None: video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) + embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id + embed_page = self._download_webpage(embed_url, video_id, + u'Downloading embed page') + info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info') + info = json.loads(info) + + # TODO: support choosing qualities + + for key in ['stream_h264_hd1080_url','stream_h264_hd_url', + 'stream_h264_hq_url','stream_h264_url', + 'stream_h264_ld_url']: + if info.get(key):#key in info and info[key]: + max_quality = key + self.to_screen(u'Using %s' % key) + break + else: + raise ExtractorError(u'Unable to extract video URL') + video_url = info[max_quality] + return [{ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': video_upload_date, - 'title': video_title, + 'title': self._og_search_title(webpage), 'ext': video_extension, + 'thumbnail': info['thumbnail_url'] }] diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py new file mode 100644 index 000000000..0ee9a684e --- /dev/null +++ b/youtube_dl/extractor/dotsub.py @@ -0,0 +1,41 @@ +import re +import json +import time + +from .common import InfoExtractor + + +class DotsubIE(InfoExtractor): + _VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)' + _TEST = { + u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27', + u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv', + 
u'md5': u'0914d4d69605090f623b7ac329fea66e',
+        u'info_dict': {
+            u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
+            u"uploader": u"4v4l0n42",
+            u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
+            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
+            u'upload_date': u'20101213',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        info_url = "https://dotsub.com/api/media/%s/metadata" % video_id
+        webpage = self._download_webpage(info_url, video_id)
+        info = json.loads(webpage)
+        date = time.gmtime(info['dateCreated']/1000)  # The timestamp is in milliseconds
+
+        return [{
+            'id': video_id,
+            'url': info['mediaURI'],
+            'ext': 'flv',
+            'title': info['title'],
+            'thumbnail': info['screenshotURI'],
+            'description': info['description'],
+            'uploader': info['user'],
+            'view_count': info['numberOfViews'],
+            'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
+        }]
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py
new file mode 100644
index 000000000..64b465805
--- /dev/null
+++ b/youtube_dl/extractor/dreisat.py
@@ -0,0 +1,85 @@
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unified_strdate,
+)
+
+
+class DreiSatIE(InfoExtractor):
+    IE_NAME = '3sat'
+    _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+    _TEST = {
+        u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
+        u'file': u'36983.webm',
+        u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
+        u'info_dict': {
+            u"title": u"Kaffeeland Schweiz",
+            u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr.
SCHWEIZWEIT nimmt die Kaffeekultur unter die...", + u"uploader": u"3sat", + u"upload_date": u"20130622" + } + } + + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details') + details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8')) + + thumbnail_els = details_doc.findall('.//teaserimage') + thumbnails = [{ + 'width': te.attrib['key'].partition('x')[0], + 'height': te.attrib['key'].partition('x')[2], + 'url': te.text, + } for te in thumbnail_els] + + information_el = details_doc.find('.//information') + video_title = information_el.find('./title').text + video_description = information_el.find('./detail').text + + details_el = details_doc.find('.//details') + video_uploader = details_el.find('./channel').text + upload_date = unified_strdate(details_el.find('./airtime').text) + + format_els = details_doc.findall('.//formitaet') + formats = [{ + 'format_id': fe.attrib['basetype'], + 'width': int(fe.find('./width').text), + 'height': int(fe.find('./height').text), + 'url': fe.find('./url').text, + 'filesize': int(fe.find('./filesize').text), + 'video_bitrate': int(fe.find('./videoBitrate').text), + '3sat_qualityname': fe.find('./quality').text, + } for fe in format_els + if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')] + + def _sortkey(format): + qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname']) + prefer_http = 1 if 'rtmp' in format['url'] else 0 + return (qidx, prefer_http, format['video_bitrate']) + formats.sort(key=_sortkey) + + info = { + '_type': 'video', + 'id': video_id, + 'title': video_title, + 'formats': formats, + 'description': video_description, + 'thumbnails': thumbnails, + 'thumbnail': thumbnails[-1]['url'], + 'uploader': video_uploader, + 'upload_date': upload_date, + } + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = determine_ext(formats[-1]['url']) + + return info \ No newline at end of file diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py new file mode 100644 index 000000000..2bb77aec6 --- /dev/null +++ b/youtube_dl/extractor/ehow.py @@ -0,0 +1,46 @@ +import re + +from ..utils import ( + compat_urllib_parse, + determine_ext +) +from .common import InfoExtractor + + +class EHowIE(InfoExtractor): + IE_NAME = u'eHow' + _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' + _TEST = { + u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', + u'file': u'12245069.flv', + u'md5': u'9809b4e3f115ae2088440bcb4efbf371', + u'info_dict': { + u"title": u"Hardwood Flooring Basics", + u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. 
Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...", + u"uploader": u"Erick Nathan" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)', + webpage, u'video URL') + final_url = compat_urllib_parse.unquote(video_url) + uploader = self._search_regex(r'', + webpage, u'uploader') + title = self._og_search_title(webpage).replace(' | eHow', '') + ext = determine_ext(final_url) + + return { + '_type': 'video', + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'uploader': uploader, + } + diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 794460e84..3aa2da52c 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor): videoDesc = self._html_search_regex('[^/]+)' + _TEST = { + u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/', + u'file': u'194503.mp3', + u'md5': u'12280ceb42c81f19a515c745eae07650', + u'info_dict': { + u"title": u"gulls in the city.wav", + u"uploader" : u"miklovan", + u'description': u'the sounds of seagulls in the city', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + music_id = mobj.group('id') + webpage = self._download_webpage(url, music_id) + title = self._html_search_regex(r'.*?(.+?)', + webpage, 'music title', flags=re.DOTALL) + music_url = self._og_search_property('audio', webpage, 'music url') + description = self._html_search_regex(r'(.*?)', + webpage, 'description', fatal=False, flags=re.DOTALL) + + return [{ + 'id': music_id, + 'title': title, + 'url': music_url, + 'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'), + 'ext': determine_ext(music_url), + 'description': description, + }] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 388aacf2f..67a7e5f76 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor): title = self._html_search_regex((r"(?P.*?)", r'(?P[^<]+?)'), webpage, 'title', flags=re.DOTALL) - video_description = self._html_search_regex(r'\d+)/?' _TEST = { u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/", u"file": u"6410818.mp4", - u"md5": u"5569d64ca98db01f0177c934fe8c1e9b", + u"md5": u"b2a30deaa8654fcccd43713a6b6a4825", u"info_dict": { u"title": u"Arma III - Community Guide: SITREP I", u"upload_date": u"20130627", @@ -21,13 +22,22 @@ class GameSpotIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(3).split("-")[-1] - info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id) + page_id = mobj.group('page_id') + webpage = self._download_webpage(url, page_id) + video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"', + r'http://www\.gamespot\.com/videoembed/(\d+)'], + webpage, 'video id') + data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'}) + info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' 
+ data
         info_xml = self._download_webpage(info_url, video_id)
         doc = xml.etree.ElementTree.fromstring(info_xml)
         clip_el = doc.find('./playList/clip')
-        video_url = clip_el.find('./URI').text
+        http_urls = [{'url': node.find('filePath').text,
+                      'rate': int(node.find('rate').text)}
+                     for node in clip_el.find('./httpURI')]
+        best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
+        video_url = best_quality['url']
         title = clip_el.find('./title').text
         ext = video_url.rpartition('.')[2]
         thumbnail_url = clip_el.find('./screenGrabURI').text
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py
index 3ce93b492..3cc02d97e 100644
--- a/youtube_dl/extractor/gametrailers.py
+++ b/youtube_dl/extractor/gametrailers.py
@@ -1,68 +1,36 @@
 import re
 
-from .common import InfoExtractor
-from ..utils import (
-    compat_urllib_parse,
+from .mtv import MTVIE, _media_xml_tag
 
-    ExtractorError,
-)
-
-class GametrailersIE(InfoExtractor):
+class GametrailersIE(MTVIE):
+    """
+    Gametrailers uses the same video system as MTVIE; it just changes the feed
+    url, where the uri is, and the method to get the thumbnails.
+    """
     _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
     _TEST = {
         u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
-        u'file': u'zbvr8i.flv',
-        u'md5': u'c3edbc995ab4081976e16779bd96a878',
+        u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
+        u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
         u'info_dict': {
-            u"title": u"E3 2013: Debut Trailer"
+            u'title': u'E3 2013: Debut Trailer',
+            u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
         },
-        u'skip': u'Requires rtmpdump'
     }
+    # Overwrite MTVIE properties we don't want
+    _TESTS = []
+
+    _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+        return itemdoc.find(search_path).attrib['url']
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
         video_id = mobj.group('id')
-        video_type = mobj.group('type')
         webpage = self._download_webpage(url, video_id)
-        if video_type == 'full-episodes':
-            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
-        else:
-            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
-        mgid = self._search_regex(mgid_re, webpage, u'mgid')
-        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
-
-        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
-                                           video_id, u'Downloading video info')
-        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
-                                               video_id, u'Downloading video urls info')
-
-        self.report_extraction(video_id)
-        info_re = r'''.*?)\]\]>.*
-                      .*?)\]\]>.*
-                      .*
-                      (?P.*?).*
-                      '''
-
-        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
-        if m_info is None:
-            raise ExtractorError(u'Unable to extract video info')
-        video_title = m_info.group('title')
-        video_description = m_info.group('description')
-        video_thumb = m_info.group('thumb')
-
-        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
-        if m_urls is None or len(m_urls) == 0:
-            raise ExtractorError(u'Unable to extract video url')
-        # They are sorted from worst to best quality
-        video_url = m_urls[-1].group('url')
-
-        return {'url': video_url,
-                'id': video_id,
-                'title': video_title,
-                # Videos are actually flv not mp4
-                'ext': 'flv',
-                'thumbnail': video_thumb,
-                'description': video_description,
-                }
+        mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
+                                   r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
+                                  webpage, u'mgid')
+        return self._get_videos_info(mgid)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 20bc53330..b633e896c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
 import os
 import re
 
@@ -9,20 +11,34 @@ from ..utils import (
     ExtractorError,
 )
 
+from .brightcove import BrightcoveIE
 
 class GenericIE(InfoExtractor):
     IE_DESC = u'Generic downloader that works on some sites'
     _VALID_URL = r'.*'
     IE_NAME = u'generic'
-    _TEST = {
-        u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
-        u'file': u'13601338388002.mp4',
-        u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
-        u'info_dict': {
-            u"uploader": u"www.hodiho.fr",
-            u"title": u"R\u00e9gis plante sa Jeep"
-        }
-    }
+    _TESTS = [
+        {
+            u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+            u'file': u'13601338388002.mp4',
+            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+            u'info_dict': {
+                u"uploader": u"www.hodiho.fr",
+                u"title": u"R\u00e9gis plante sa Jeep"
+            }
+        },
+        {
+            u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
+            u'file': u'2371591881001.mp4',
+            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+            u'note': u'Test Brightcove downloads and detection in GenericIE',
+            u'info_dict': {
+                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+                u'uploader': u'8TV',
+                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+            }
+        },
+    ]
 
     def report_download_webpage(self, video_id):
         """Report webpage download."""
@@ -103,6 +119,13 @@
             raise ExtractorError(u'Invalid URL: %s' % url)
         self.report_extraction(video_id)
 
+        # Look for Brightcove:
+        m_brightcove = re.search(r'', webpage, re.DOTALL)
+        if m_brightcove is not None:
+            self.to_screen(u'Brightcove video detected.')
+            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
+            return self.url_result(bc_url, 'Brightcove')
+
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
         if mobj is None:
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py
index ca3abb7d7..ccca1d7e0 100644
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor):
 
         video_title = self._html_search_regex(r"<title>(.*)</title>",
             webpage_src, u'title')
-
-        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
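[Editor's note: several hunks in this patch — cspan.py, hotnewhiphop.py here, instagram.py, ehow.py, keek.py, liveleak.py — replace one-off thumbnail/description regexes with the new _og_search_* helpers added to common.py. A minimal, self-contained sketch of the idea follows; the standalone og_search_property function and the sample webpage string are illustrative assumptions, not the exact common.py implementation, which lives as methods on InfoExtractor and goes through self._search_regex and unescapeHTML:

    import re

    # Sketch: match <meta property="og:<prop>" content="..."> with either
    # quote style; the real helpers are methods on InfoExtractor.
    def _og_regex(prop):
        return (r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')'
                % re.escape(prop))

    def og_search_property(prop, html):
        m = re.search(_og_regex(prop), html, flags=re.DOTALL)
        if m is None:
            return None
        # Only one of the two groups matches, depending on the quote style used
        return m.group(1) or m.group(2)

    webpage = '<meta property="og:image" content="http://example.com/thumb.jpg">'
    print(og_search_property('image', webpage))  # -> http://example.com/thumb.jpg

Centralizing this means the quote handling, HTML unescaping, and fatal=False behaviour are implemented once instead of per extractor.]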
- thumbnail = self._html_search_regex(r'"og:image" content="(.*)"', - webpage_src, u'thumbnail', fatal=False) results = [{ 'id': video_id, 'url' : video_url, 'title' : video_title, - 'thumbnail' : thumbnail, + 'thumbnail' : self._og_search_thumbnail(webpage_src), 'ext' : 'mp3', }] - return results \ No newline at end of file + return results diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py new file mode 100644 index 000000000..62abab655 --- /dev/null +++ b/youtube_dl/extractor/ign.py @@ -0,0 +1,91 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class IGNIE(InfoExtractor): + """ + Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. + Some videos of it.ign.com are also supported + """ + + _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P.+)' + IE_NAME = u'ign.com' + + _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' + _DESCRIPTION_RE = [r'(.+?)', + r'id="my_show_video">.*?(.*?)', + ] + + _TEST = { + u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + u'file': u'8f862beef863986b2785559b9e1aa599.mp4', + u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', + u'info_dict': { + u'title': u'The Last of Us Review', + u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', + } + } + + def _find_video_id(self, webpage): + res_id = [r'data-video-id="(.+?)"', + r'.+)' + IE_NAME = '1up.com' + + _DESCRIPTION_RE = r'(.+?)' + + _TEST = { + u'url': u'http://gamevideos.1up.com/video/id/34976', + u'file': u'34976.mp4', + u'md5': u'68a54ce4ebc772e4b71e3123d413163d', + u'info_dict': { + u'title': u'Sniper Elite V2 - Trailer', + u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + id = mobj.group('name_or_id') + result = super(OneUPIE, self)._real_extract(url) + result['id'] = id + return result diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 962c59214..652f19b7b 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI[0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI?[A-F0-9]+)/.*' _TEST = { u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', u'file': u'I12055569.mp4', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 6ae704efd..ddc42882a 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -5,12 +5,13 @@ from .common import InfoExtractor class InstagramIE(InfoExtractor): _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/' _TEST = { - u'url': u'http://instagram.com/p/aye83DjauH/#', + u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc', u'file': u'aye83DjauH.mp4', u'md5': u'0d2da106a9d2631273e192b372806516', u'info_dict': { u"uploader_id": u"naomipq", - u"title": u"Video by naomipq" + u"title": u"Video by naomipq", + u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', } } @@ -18,25 +19,17 @@ class InstagramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) - html_title = self._html_search_regex( - r'(.+?)', - webpage, u'title', 
flags=re.DOTALL) - title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip() - uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram', - webpage, u'uploader name', fatal=False) - ext = 'mp4' + uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', + webpage, u'uploader id', fatal=False) + desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description', + fatal=False) return [{ 'id': video_id, - 'url': video_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'uploader_id' : uploader_id + 'url': self._og_search_video_url(webpage), + 'ext': 'mp4', + 'title': u'Video by %s' % uploader_id, + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader_id' : uploader_id, + 'description': desc, }] diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index 72ad6a3d0..a7b88d2d9 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' _TEST = { - u'url': u'http://www.keek.com/ytdl/keeks/NODfbab', + u'url': u'https://www.keek.com/ytdl/keeks/NODfbab', u'file': u'NODfbab.mp4', u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', u'info_dict': { @@ -24,8 +24,7 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'[\S\s]+?(?P.+?)', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index cf8a2c931..dd062a14e 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._html_search_regex(r'', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py new file mode 100644 index 000000000..309921078 --- /dev/null +++ b/youtube_dl/extractor/livestream.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import compat_urllib_parse_urlparse, compat_urlparse + + +class LivestreamIE(InfoExtractor): + _VALID_URL = r'http://new.livestream.com/.*?/(?P.*?)(/videos/(?P\d+))?/?$' + _TEST = { + u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', + u'file': u'4719370.mp4', + u'md5': u'0d2186e3187d185a04b3cdd02b828836', + u'info_dict': { + u'title': u'Live from Webster Hall NYC', + u'upload_date': u'20121012', + } + } + + def _extract_video_info(self, video_data): + video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') + return {'id': video_data['id'], + 'url': video_url, + 'ext': 'mp4', + 'title': video_data['caption'], + 'thumbnail': video_data['thumbnail_url'], + 'upload_date': video_data['updated_at'].replace('-','')[:8], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + event_name = mobj.group('event_name') + webpage = self._download_webpage(url, video_id or event_name) + + if video_id is None: + # This is an event page: + api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', + webpage, 'api url') + info = json.loads(self._download_webpage(api_url, event_name, + 
u'Downloading event info')) + videos = [self._extract_video_info(video_data['data']) + for video_data in info['feed']['data'] if video_data['type'] == u'video'] + return self.playlist_result(videos, info['id'], info['full_name']) + else: + og_video = self._og_search_video_url(webpage, name=u'player url') + query_str = compat_urllib_parse_urlparse(og_video).query + query = compat_urlparse.parse_qs(query_str) + api_url = query['play_url'][0].replace('.smil', '') + info = json.loads(self._download_webpage(api_url, video_id, + u'Downloading video info')) + return self._extract_video_info(info) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 4c3f81b98..e38dc98b4 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -9,7 +9,7 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, compat_str, - + determine_ext, ExtractorError, ) @@ -20,7 +20,7 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' - _TEST = { + _TESTS = [{ u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", u"file": u"_aUehQsCQtM.flv", @@ -31,7 +31,16 @@ class MetacafeIE(InfoExtractor): u"uploader": u"PBS", u"uploader_id": u"PBS" } - } + }, + { + u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/", + u"file": u"an-dVVXnuY7Jh77J.mp4", + u"info_dict": { + u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3", + u"uploader": u"anyclip", + u"description": u"md5:38c711dd98f5bb87acf973d573442e67" + } + }] def report_disclaimer(self): @@ -73,14 +82,16 @@ class MetacafeIE(InfoExtractor): return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] # Retrieve video webpage to extract further information - webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) + req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) + req.headers['Cookie'] = 'flashVersion=0;' + webpage = self._download_webpage(req, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) if mobj is not None: mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] + video_ext = mediaURL[-3:] # Extract gdaKey if available mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) @@ -90,34 +101,37 @@ class MetacafeIE(InfoExtractor): gdaKey = mobj.group(1) video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - raise ExtractorError(u'Unable to extract media URL') - mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) + mobj = re.search(r'http.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + mediaURL = mobj.group('mediaURL').replace('\\/', '/') + video_url = 
'%s?__gda__=%s' % (mediaURL, mobj.group('key')) + video_ext = determine_ext(video_url) - mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') + video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title') + description = self._og_search_description(webpage) + video_uploader = self._html_search_regex( + r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);', + webpage, u'uploader nickname', fatal=False) - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader nickname') - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'description': description, + 'uploader': video_uploader, 'upload_date': None, 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] + 'ext': video_ext, + } diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index 969db7113..8f956571d 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -1,28 +1,110 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, - compat_urllib_request, - + compat_urllib_parse, ExtractorError, ) +def _media_xml_tag(tag): + return '{http://search.yahoo.com/mrss/}%s' % tag class MTVIE(InfoExtractor): - _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' - _WORKING = False + _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$' + + _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/' + + _TESTS = [ + { + u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml', + u'file': u'853555.mp4', + u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8', + u'info_dict': { + u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"', + u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + }, + }, + { + u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml', + u'file': u'USCJY1331283.mp4', + u'md5': u'73b4e7fcadd88929292fe52c3ced8caf', + u'info_dict': { + u'title': u'Everything Has Changed', + u'upload_date': u'20130606', + u'uploader': u'Taylor Swift', + }, + u'skip': u'VEVO is only available in some countries', + }, + ] + + @staticmethod + def _id_from_uri(uri): + return uri.split(':')[-1] + + # This was originally implemented for ComedyCentral, but it also works here + @staticmethod + def _transform_rtmp_url(rtmp_video_url): + m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url) + if not m: + raise ExtractorError(u'Cannot transform RTMP url') + base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/' + return base + m.group('finalid') + + def _get_thumbnail_url(self, uri, itemdoc): + return 'http://mtv.mtvnimages.com/uri/' + uri + + def _extract_video_url(self, metadataXml): + if '/error_country_block.swf' in metadataXml: + raise ExtractorError(u'This video is not available from your country.', expected=True) + mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8')) + renditions = mdoc.findall('.//rendition') + + # For now, always pick the highest quality.
+ rendition = renditions[-1] + + try: + _,_,ext = rendition.attrib['type'].partition('/') + format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] + rtmp_video_url = rendition.find('./src').text + except KeyError: + raise ExtractorError('Invalid rendition field.') + video_url = self._transform_rtmp_url(rtmp_video_url) + return {'ext': ext, 'url': video_url, 'format': format} + + def _get_video_info(self, itemdoc): + uri = itemdoc.find('guid').text + video_id = self._id_from_uri(uri) + self.report_extraction(video_id) + mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url'] + if 'acceptMethods' not in mediagen_url: + mediagen_url += '&acceptMethods=fms' + mediagen_page = self._download_webpage(mediagen_url, video_id, + u'Downloading video urls') + video_info = self._extract_video_url(mediagen_page) + + description_node = itemdoc.find('description') + if description_node is not None: + description = description_node.text + else: + description = None + video_info.update({'title': itemdoc.find('title').text, + 'id': video_id, + 'thumbnail': self._get_thumbnail_url(uri, itemdoc), + 'description': description, + }) + return video_info + + def _get_videos_info(self, uri): + video_id = self._id_from_uri(uri) + data = compat_urllib_parse.urlencode({'uri': uri}) + infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id, + u'Downloading info') + idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8')) + return [self._get_video_info(item) for item in idoc.findall('.//item')] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - if not mobj.group('proto'): - url = 'http://' + url video_id = mobj.group('videoid') webpage = self._download_webpage(url, video_id) @@ -35,46 +117,5 @@ class MTVIE(InfoExtractor): self.to_screen(u'Vevo video detected: %s' % vevo_id) return self.url_result('vevo:%s' % vevo_id, ie='Vevo') - #song_name = self._html_search_regex(r'', - # webpage, u'song name', fatal=False) - - video_title = self._html_search_regex(r'', - webpage, u'title') - - mtvn_uri = self._html_search_regex(r'', - webpage, u'mtvn_uri', fatal=False) - - content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', - webpage, u'content id', fatal=False) - - videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri - self.report_extraction(video_id) - request = compat_urllib_request.Request(videogen_url) - try: - metadataXml = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err)) - - mdoc = xml.etree.ElementTree.fromstring(metadataXml) - renditions = mdoc.findall('.//rendition') - - # For now, always pick the highest quality. 
- rendition = renditions[-1] - - try: - _,_,ext = rendition.attrib['type'].partition('/') - format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] - video_url = rendition.find('./src').text - except KeyError: - raise ExtractorError('Invalid rendition field.') - - info = { - 'id': video_id, - 'url': video_url, - 'upload_date': None, - 'title': video_title, - 'ext': ext, - 'format': format, - } - - return [info] + uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri') + return self._get_videos_info(uri) diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py index 122b7dd26..0f178905b 100644 --- a/youtube_dl/extractor/nba.py +++ b/youtube_dl/extractor/nba.py @@ -30,8 +30,7 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._html_search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py new file mode 100644 index 000000000..d339e6cb5 --- /dev/null +++ b/youtube_dl/extractor/roxwel.py @@ -0,0 +1,49 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import unified_strdate, determine_ext + + +class RoxwelIE(InfoExtractor): + _VALID_URL = r'https?://www\.roxwel\.com/player/(?P.+?)(\.|\?|$)' + + _TEST = { + u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html', + u'file': u'passionpittakeawalklive.flv', + u'md5': u'd9dea8360a1e7d485d2206db7fe13035', + u'info_dict': { + u'title': u'Take A Walk (live)', + u'uploader': u'Passion Pit', + u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. 
', + }, + u'skip': u'Requires rtmpdump', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + filename = mobj.group('filename') + info_url = 'http://www.roxwel.com/api/videos/%s' % filename + info_page = self._download_webpage(info_url, filename, + u'Downloading video info') + + self.report_extraction(filename) + info = json.loads(info_page) + rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) + best_rate = rtmp_rates[-1] + url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) + rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url') + ext = determine_ext(rtmp_url) + if ext == 'f4v': + rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) + + return {'id': filename, + 'title': info['title'], + 'url': rtmp_url, + 'ext': 'flv', + 'description': info['description'], + 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), + 'uploader': info['artist'], + 'uploader_id': info['artistname'], + 'upload_date': unified_strdate(info['dbdate']), + } diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py new file mode 100644 index 000000000..14b1c656c --- /dev/null +++ b/youtube_dl/extractor/sina.py @@ -0,0 +1,67 @@ +# coding: utf-8 + +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + compat_urllib_parse, +) + + +class SinaIE(InfoExtractor): + _VALID_URL = r'''https?://(.*?\.)?video\.sina\.com\.cn/ + ( + (.+?/(((?P\d+).html)|(.*?(\#|(vid=))(?P\d+?)($|&)))) + | + # This is used by external sites like Weibo + (api/sinawebApi/outplay.php/(?P.+?)\.swf) + ) + ''' + + _TEST = { + u'url': u'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898', + u'file': u'110028898.flv', + u'md5': u'd65dd22ddcf44e38ce2bf58a10c3e71f', + u'info_dict': { + u'title': u'《中国新闻》 朝鲜要求巴拿马立即释放被扣船员', + } + } + + @classmethod + def suitable(cls, url): + return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None + + def _extract_video(self, video_id): + data = compat_urllib_parse.urlencode({'vid': video_id}) + url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data, + video_id, u'Downloading video url') + image_page = self._download_webpage( + 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data, + video_id, u'Downloading thumbnail info') + url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8')) + + return {'id': video_id, + 'url': url_doc.find('./durl/url').text, + 'ext': 'flv', + 'title': url_doc.find('./vname').text, + 'thumbnail': image_page.split('=')[1], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + video_id = mobj.group('id') + if mobj.group('token') is not None: + # The video id is in the redirected url + self.to_screen(u'Getting video id') + request = compat_urllib_request.Request(url) + request.get_method = lambda: 'HEAD' + (_, urlh) = self._download_webpage_handle(request, 'NA', False) + return self._real_extract(urlh.geturl()) + elif video_id is None: + pseudo_id = mobj.group('pseudo_id') + webpage = self._download_webpage(url, pseudo_id) + video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, u'video id') + + return self._extract_video(video_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d47c49c03..7c9f1c6b6 100644 --- a/youtube_dl/extractor/soundcloud.py +++ 
b/youtube_dl/extractor/soundcloud.py @@ -19,7 +19,11 @@ class SoundcloudIE(InfoExtractor): of the stream token and uid """ - _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)(?:[?].*)?$' + _VALID_URL = r'''^(?:https?://)? + (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$) + |(?:api\.soundcloud\.com/tracks/(?P\d+)) + ) + ''' IE_NAME = u'soundcloud' _TEST = { u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy', @@ -33,59 +37,65 @@ class SoundcloudIE(InfoExtractor): } } + _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28' + + @classmethod + def suitable(cls, url): + return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen(u'%s: Resolving id' % video_id) - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) + @classmethod + def _resolv_url(cls, url): + return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID - # extract uploader (which is in the url) - uploader = mobj.group(1) - # extract simple title (uploader + slug of song title) - slug_title = mobj.group(2) - full_title = '%s/%s' % (uploader, slug_title) - - self.report_resolve(full_title) - - url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) - resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON') - - info = json.loads(info_json) + def _extract_info_dict(self, info, full_title=None): video_id = info['id'] - self.report_extraction(full_title) + name = full_title or video_id + self.report_extraction(name) - streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - stream_json = self._download_webpage(streams_url, full_title, - u'Downloading stream definitions', - u'unable to download stream definitions') - - streams = json.loads(stream_json) - mediaURL = streams['http_mp3_128_url'] - upload_date = unified_strdate(info['created_at']) - - return [{ + thumbnail = info['artwork_url'] + if thumbnail is not None: + thumbnail = thumbnail.replace('-large', '-t500x500') + return { 'id': info['id'], - 'url': mediaURL, + 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID, 'uploader': info['user']['username'], - 'upload_date': upload_date, + 'upload_date': unified_strdate(info['created_at']), 'title': info['title'], 'ext': u'mp3', 'description': info['description'], - }] + 'thumbnail': thumbnail, + } -class SoundcloudSetIE(InfoExtractor): - """Information extractor for soundcloud.com sets - To access the media, the uid of the song and a stream token - must be extracted from the page source and the script must make - a request to media.soundcloud.com/crossdomain.xml. 
Then - the media can be grabbed by requesting from an url composed - of the stream token and uid - """ + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + track_id = mobj.group('track_id') + if track_id is not None: + info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID + full_title = track_id + else: + # extract uploader (which is in the url) + uploader = mobj.group(1) + # extract simple title (uploader + slug of song title) + slug_title = mobj.group(2) + full_title = '%s/%s' % (uploader, slug_title) + + self.report_resolve(full_title) + + url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title) + info_json_url = self._resolv_url(url) + info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON') + + info = json.loads(info_json) + return self._extract_info_dict(info, full_title) + +class SoundcloudSetIE(SoundcloudIE): _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$' IE_NAME = u'soundcloud:set' _TEST = { @@ -153,10 +163,6 @@ class SoundcloudSetIE(InfoExtractor): ] } - def report_resolve(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Resolving id' % video_id) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: @@ -171,7 +177,7 @@ class SoundcloudSetIE(InfoExtractor): self.report_resolve(full_title) url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title) - resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28' + resolv_url = self._resolv_url(url) info_json = self._download_webpage(resolv_url, full_title) videos = [] @@ -182,23 +188,8 @@ class SoundcloudSetIE(InfoExtractor): return self.report_extraction(full_title) - for track in info['tracks']: - video_id = track['id'] - - streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28' - stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON') - - self.report_extraction(video_id) - streams = json.loads(stream_json) - mediaURL = streams['http_mp3_128_url'] - - videos.append({ - 'id': video_id, - 'url': mediaURL, - 'uploader': track['user']['username'], - 'upload_date': unified_strdate(track['created_at']), - 'title': track['title'], - 'ext': u'mp3', - 'description': track['description'], - }) - return videos + return {'_type': 'playlist', + 'entries': [self._extract_info_dict(track) for track in info['tracks']], + 'id': info['id'], + 'title': info['title'], + } diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py index ae9a63e8b..b8e6b3bf9 100644 --- a/youtube_dl/extractor/statigram.py +++ b/youtube_dl/extractor/statigram.py @@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'video URL') - thumbnail_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) html_title = self._html_search_regex( r'(.+?)', webpage, u'title') @@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor): return [{ 'id': video_id, - 'url': video_url, + 'url': self._og_search_video_url(webpage), 'ext': ext, 'title': title, - 'thumbnail': thumbnail_url, + 'thumbnail': self._og_search_thumbnail(webpage), 
'uploader_id' : uploader_id }] diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index ecac4ec40..91658f892 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -23,14 +23,16 @@ class SteamIE(InfoExtractor): u"file": u"81300.flv", u"md5": u"f870007cee7065d7c76b88f0a45ecc07", u"info_dict": { - u"title": u"Terraria 1.1 Trailer" + u"title": u"Terraria 1.1 Trailer", + u'playlist_index': 1, } }, { u"file": u"80859.flv", u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751", u"info_dict": { - u"title": u"Terraria Trailer" + u"title": u"Terraria Trailer", + u'playlist_index': 2, } } ] diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py index 1dd5e1b68..c910110ca 100644 --- a/youtube_dl/extractor/teamcoco.py +++ b/youtube_dl/extractor/teamcoco.py @@ -30,26 +30,17 @@ class TeamcocoIE(InfoExtractor): self.report_extraction(video_id) - video_title = self._html_search_regex(r'(.*?)', + video_url = self._html_search_regex(r']*type="high".*?>(.*?)', data, u'video URL') return [{ 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'description': video_description, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), }] diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 8b73b8340..4c11f7a03 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -67,7 +67,7 @@ class TEDIE(InfoExtractor): webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) self.report_extraction(video_name) # If the url includes the language we get the title translated - title = self._html_search_regex(r'(?P.*)', + title = self._html_search_regex(r'(?P.*)', webpage, 'title') json_data = self._search_regex(r'var talkDetails = ({.*?})', webpage, 'json data') diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py index e0ffeced5..a8af89f83 100644 --- a/youtube_dl/extractor/tf1.py +++ b/youtube_dl/extractor/tf1.py @@ -10,6 +10,7 @@ class TF1IE(InfoExtractor): TF1 uses the wat.tv player, currently it can only download videos with the html5 player enabled, it cannot download HD videos. 
""" + _WORKING = False _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html' _TEST = { u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py new file mode 100644 index 000000000..9dcfc28b3 --- /dev/null +++ b/youtube_dl/extractor/thisav.py @@ -0,0 +1,47 @@ +#coding: utf-8 + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + +class ThisAVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P[0-9]+)/.*' + _TEST = { + u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html", + u"file": u"47734.flv", + u"md5": u"0480f1ef3932d901f0e0e719f188f19b", + u"info_dict": { + u"title": u"高樹マリア - Just fit", + u"uploader": u"dj7970", + u"uploader_id": u"dj7970" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'([^<]*)', webpage, u'title') + video_url = self._html_search_regex( + r"addVariable\('file','([^']+)'\);", webpage, u'video url') + uploader = self._html_search_regex( + r': ([^<]+)', + webpage, u'uploader name', fatal=False) + uploader_id = self._html_search_regex( + r': (?:[^<]+)', + webpage, u'uploader id', fatal=False) + ext = determine_ext(video_url) + + return { + '_type': 'video', + 'id': video_id, + 'url': video_url, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'title': title, + 'ext': ext, + } diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py index 9dd26c163..35f89e9ee 100644 --- a/youtube_dl/extractor/traileraddict.py +++ b/youtube_dl/extractor/traileraddict.py @@ -4,11 +4,11 @@ from .common import InfoExtractor class TrailerAddictIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:trailer|feature-trailer)' + _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P.+?)/(?P.+)' _TEST = { u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer', u'file': u'76184.mp4', - u'md5': u'41365557f3c8c397d091da510e73ceb4', + u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71', u'info_dict': { u"title": u"Prince Avalanche Trailer", u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind." 
@@ -17,33 +17,36 @@ class TrailerAddictIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) - webpage = self._download_webpage(url, video_id) - + name = mobj.group('movie') + '/' + mobj.group('trailer_name') + webpage = self._download_webpage(url, name) + title = self._search_regex(r'(.+?)', webpage, 'video title').replace(' - Trailer Addict','') view_count = self._search_regex(r'Views: (.+?)', webpage, 'Views Count') - description = self._search_regex(r'', - webpage, 'video description') - video_id = self._search_regex(r'', - webpage, 'Video id').split('=')[1] - - info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id)) + video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1] + + # Presence of (no)watchplus function indicates HD quality is available + if re.search(r'function (no)?watchplus()', webpage): + fvar = "fvarhd" + else: + fvar = "fvar" + + info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id)) info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage") - + final_url = self._search_regex(r'&fileurl=(.+)', info_webpage, 'Download url').replace('%3F','?') thumbnail_url = self._search_regex(r'&image=(.+?)&', info_webpage, 'thumbnail url') ext = final_url.split('.')[-1].split('?')[0] - + return [{ 'id' : video_id, 'url' : final_url, 'ext' : ext, 'title' : title, 'thumbnail' : thumbnail_url, - 'description' : description, + 'description' : self._og_search_description(webpage), 'view_count' : view_count, }] diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py index fcaa6ac01..4e404fbf5 100644 --- a/youtube_dl/extractor/tutv.py +++ b/youtube_dl/extractor/tutv.py @@ -22,8 +22,6 @@ class TutvIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'', webpage, u'title') internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID') data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id) @@ -36,6 +34,6 @@ class TutvIE(InfoExtractor): 'id': internal_id, 'url': video_url, 'ext': ext, - 'title': title, + 'title': self._og_search_title(webpage), } return [info] diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py new file mode 100644 index 000000000..00672c9e5 --- /dev/null +++ b/youtube_dl/extractor/veoh.py @@ -0,0 +1,47 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + +class VeohIE(InfoExtractor): + _VALID_URL = r'http://www\.veoh\.com/watch/v(?P\d*)' + + _TEST = { + u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3', + u'file': u'56314296.mp4', + u'md5': u'620e68e6a3cff80086df3348426c9ca3', + u'info_dict': { + u'title': u'Straight Backs Are Stronger', + u'uploader': u'LUMOback', + u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage) + if m_youtube is not None: + youtube_id = m_youtube.group(1) + self.to_screen(u'%s: detected Youtube video.' 
% video_id) + return self.url_result(youtube_id, 'Youtube') + + self.report_extraction(video_id) + info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info') + info = json.loads(info) + video_url = info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath') + + return {'id': info['videoId'], + 'title': info['title'], + 'ext': determine_ext(video_url), + 'url': video_url, + 'uploader': info['username'], + 'thumbnail': info.get('highResImage') or info.get('medResImage'), + 'description': info['description'], + 'view_count': info['views'], + } diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 7c4562790..ac32043c1 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -17,6 +17,7 @@ class VimeoIE(InfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)(?:[?].*)?$' + _NETRC_MACHINE = 'vimeo' IE_NAME = u'vimeo' _TEST = { u'url': u'http://vimeo.com/56015672', @@ -31,6 +32,25 @@ class VimeoIE(InfoExtractor): } } + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + self.report_login() + login_url = 'https://vimeo.com/log_in' + webpage = self._download_webpage(login_url, None, False) + token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1) + data = compat_urllib_parse.urlencode({'email': username, + 'password': password, + 'action': 'login', + 'service': 'vimeo', + 'token': token, + }) + login_request = compat_urllib_request.Request(login_url, data) + login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + login_request.add_header('Cookie', 'xsrft=%s' % token) + self._download_webpage(login_request, None, False, u'Wrong login info') + def _verify_video_password(self, url, video_id, webpage): password = self._downloader.params.get('videopassword', None) if password is None: @@ -50,6 +70,9 @@ class VimeoIE(InfoExtractor): u'Verifying the password', u'Wrong password') + def _real_initialize(self): + self._login() + def _real_extract(self, url, new_video=True): # Extract ID from URL mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index bdd3522eb..c4ec1f06f 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -27,12 +27,6 @@ class VineIE(InfoExtractor): video_url = self._html_search_regex(r'.*?(.+?)', webpage, u'uploader', fatal=False, flags=re.DOTALL) @@ -40,7 +34,7 @@ class VineIE(InfoExtractor): 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), 'uploader': uploader, }] diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index 0d1302cd2..0407a2d26 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -12,6 +12,7 @@ from ..utils import ( class WatIE(InfoExtractor): + _WORKING = False _VALID_URL=r'http://www.wat.tv/.*-(?P.*?)_.*?.html' IE_NAME = 'wat.tv' _TEST = { diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py new file mode 100644 index 000000000..0757495bd --- /dev/null +++ b/youtube_dl/extractor/weibo.py @@ -0,0 +1,48 @@ +# coding: utf-8 + +import re +import json + +from .common import InfoExtractor + +class WeiboIE(InfoExtractor): + """ + The videos in Weibo come from different sites, 
this IE just finds the link + to the external video and returns it. + """ + _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P.+?)\.htm' + + _TEST = { + u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm', + u'file': u'98322879.flv', + u'info_dict': { + u'title': u'魔声耳机最新广告“All Eyes On Us”', + }, + u'note': u'Sina video', + u'params': { + u'skip_download': True, + }, + } + + # Additional example videos from different sites + # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm + # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + video_id = mobj.group('id') + info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id + info_page = self._download_webpage(info_url, video_id) + info = json.loads(info_page) + + videos_urls = map(lambda v: v['play_page_url'], info['result']['data']) + #Prefer sina video since they have thumbnails + videos_urls = sorted(videos_urls, key=lambda u: u'video.sina.com' in u) + player_url = videos_urls[-1] + m_sina = re.match(r'https?://video.sina.com.cn/v/b/(\d+)-\d+.html', player_url) + if m_sina is not None: + self.to_screen('Sina video detected') + sina_id = m_sina.group(1) + player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id + return self.url_result(player_url) + diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 6f022670c..1265639e8 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', - webpage, u'video URL') + m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P.+?)"\);', webpage) + if m_playlist is not None: + playlist_url = m_playlist.group('playlist') + playlist_page = self._download_webpage(playlist_url, video_id, + u'Downloading playlist page') + m_levels = list(re.finditer(r'[^"]+)"\)\);', + webpage, u'video URL') info = {'id': video_id, 'url': video_url, diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index eb9829801..996d38478 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -13,7 +13,7 @@ from ..utils import ( class YoukuIE(InfoExtractor): - _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P[A-Za-z0-9]+)\.html' + _VALID_URL = r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P[A-Za-z0-9]+)(\.html|/v.swf)' _TEST = { u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", u"file": u"XNDgyMDQ2NTQw_part00.flv", diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 14a8bd6ea..f10f2e3dd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -4,6 +4,7 @@ import json import netrc import re import socket +import itertools from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( @@ -19,10 +20,117 @@ from ..utils import ( ExtractorError, unescapeHTML, unified_strdate, + orderedSet, ) +class YoutubeBaseInfoExtractor(InfoExtractor): + """Provide base functions for Youtube extractors""" + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' + _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' + _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NETRC_MACHINE = 
'youtube' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = False -class YoutubeIE(InfoExtractor): + def report_lang(self): + """Report attempt to set language.""" + self.to_screen(u'Setting language') + + def _set_language(self): + request = compat_urllib_request.Request(self._LANG_URL) + try: + self.report_lang() + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) + return False + return True + + def _login(self): + (username, password) = self._get_login_info() + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + return False + + request = compat_urllib_request.Request(self._LOGIN_URL) + try: + login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) + return False + + galx = None + dsh = None + match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return False + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + return False + return True + + def _confirm_age(self): + age_form = { + 'next_url': '/', + 'action_confirm': 'Confirm', + } + request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + try: + self.report_age_confirmation() + compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + return True + + def _real_initialize(self): + if self._downloader is None: + return + if not self._set_language(): + return + if not self._login(): + return + self._confirm_age() + +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( @@ -43,11 +151,7 @@ class YoutubeIE(InfoExtractor): ([0-9A-Za-z_-]+) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow $""" - _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] @@ -115,24 +219,28 @@ class YoutubeIE(InfoExtractor): u"uploader": u"IconaPop", u"uploader_id": u"IconaPop" } - } + }, + { + u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ", + u"file": u"07FYdnEawAQ.mp4", + u"note": u"Test VEVO video with age protection (#956)", + u"info_dict": { + u"upload_date": u"20130703", + u"title": u"Justin Timberlake - Tunnel Vision (Explicit)", + u"description": u"md5:64249768eec3bc4276236606ea996373", + u"uploader": u"justintimberlakeVEVO", + u"uploader_id": u"justintimberlakeVEVO" + } + }, ] @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url): return False + if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def report_lang(self): - """Report attempt to set language.""" - self.to_screen(u'Setting language') - - def report_login(self): - """Report attempt to log in.""" - self.to_screen(u'Logging in') - def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -169,20 +277,28 @@ class YoutubeIE(InfoExtractor): def _decrypt_signature(self, s): """Turn the encrypted s field into a working signature""" - if len(s) == 88: + if len(s) == 92: + return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] + elif len(s) == 90: + return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81] + elif len(s) == 88: return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] elif len(s) == 87: - return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1] + return s[4:23] + s[86] + s[24:85] elif len(s) == 86: return s[2:63] + s[82] + s[64:82] + s[63] elif len(s) == 85: - return s[76] + s[82:76:-1] + s[83] + s[75:60:-1] + s[0] + s[59:50:-1] + s[1] + s[49:2:-1] + return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21] elif len(s) == 84: return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26] elif len(s) == 83: - return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36] + return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[53] + s[34:53] + s[24] + s[54:] elif len(s) == 82: return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] + elif len(s) == 81: + return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] + elif len(s) == 79: + return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' 
% (len(s))) @@ -290,109 +406,6 @@ class YoutubeIE(InfoExtractor): for x in formats: print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - def _real_initialize(self): - if self._downloader is None: - return - - username = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - username = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) - return - - # No authentication to be performed - if username is None: - return - - request = compat_urllib_request.Request(self._LOGIN_URL) - try: - login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) - return - - galx = None - dsh = None - match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: @@ -430,15 +443,35 @@ class YoutubeIE(InfoExtractor): # Get video info self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) + if re.search(r'player-age-gate-content">', video_webpage) is not None: + self.report_age_confirmation() + age_gate = True + # We simulate the access to the video from www.youtube.com/v/{video_id} + # this can be viewed without login into Youtube + data = compat_urllib_parse.urlencode({'video_id': video_id, + 'el': 'embedded', + 'gl': 'US', + 'hl': 'en', + 'eurl': 'https://youtube.googleapis.com/v/' + video_id, + 'asv': 3, + 'sts':'1588', + }) + video_info_url = 'https://www.youtube.com/get_video_info?' 
+ data video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break + else: + age_gate = False + for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: + video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + % (video_id, el_type)) + video_info_webpage = self._download_webpage(video_info_url, video_id, + note=False, + errnote='unable to download video info webpage') + video_info = compat_parse_qs(video_info_webpage) + if 'token' in video_info: + break if 'token' not in video_info: if 'reason' in video_info: raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True) @@ -471,7 +504,12 @@ class YoutubeIE(InfoExtractor): video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) # thumbnail image - if 'thumbnail_url' not in video_info: + # We try first to get a high quality image: + m_thumb = re.search(r'', + video_webpage, re.DOTALL) + if m_thumb is not None: + video_thumbnail = m_thumb.group(1) + elif 'thumbnail_url' not in video_info: self._downloader.report_warning(u'unable to extract video thumbnail') video_thumbnail = '' else: # don't panic if we can't find it @@ -550,6 +588,8 @@ class YoutubeIE(InfoExtractor): self.report_rtmp_download() video_url_list = [(None, video_info['conn'][0])] elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: + if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]: + raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) url_map = {} for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): url_data = compat_parse_qs(url_data_str) @@ -560,10 +600,17 @@ class YoutubeIE(InfoExtractor): elif 's' in url_data: if self._downloader.params.get('verbose'): s = url_data['s'][0] - player = self._search_regex(r'html5player-(.+?)\.js', video_webpage, - 'html5 player', fatal=False) - self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' % - (len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player)) + if age_gate: + player_version = self._search_regex(r'ad3-(.+?)\.swf', + video_info['ad3_module'][0], 'flash player', + fatal=False) + player = 'flash player %s' % player_version + else: + player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, + 'html5 player', fatal=False) + parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) + self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % + (len(s), parts_sizes, url_data['itag'][0], player)) signature = self._decrypt_signature(url_data['s'][0]) url += '&signature=' + signature if 'ratebypass' not in url: @@ -638,10 +685,10 @@ class YoutubePlaylistIE(InfoExtractor): \? (?:.*?&)*? 
(?:p|a|list)= | p/ ) - ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) .* | - ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' _MAX_RESULTS = 50 @@ -660,11 +707,14 @@ class YoutubePlaylistIE(InfoExtractor): # Download playlist videos from API playlist_id = mobj.group(1) or mobj.group(2) - page_num = 1 videos = [] - while True: - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) + for page_num in itertools.count(1): + start_index = self._MAX_RESULTS * (page_num - 1) + 1 + if start_index >= 1000: + self._downloader.report_warning(u'Max number of results reached') + break + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) try: @@ -684,13 +734,9 @@ class YoutubePlaylistIE(InfoExtractor): if 'media$group' in entry and 'media$player' in entry['media$group']: videos.append((index, entry['media$group']['media$player']['url'])) - if len(response['feed']['entry']) < self._MAX_RESULTS: - break - page_num += 1 - videos = [v[1] for v in sorted(videos)] - url_results = [self.url_result(url, 'Youtube') for url in videos] + url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] return [self.playlist_result(url_results, playlist_id, playlist_title)] @@ -699,7 +745,7 @@ class YoutubeChannelIE(InfoExtractor): _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' _MORE_PAGES_INDICATOR = 'yt-uix-load-more' - _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' + _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' def extract_videos_from_page(self, page): @@ -730,9 +776,7 @@ class YoutubeChannelIE(InfoExtractor): # Download any subsequent channel pages using the json-based channel_ajax query if self._MORE_PAGES_INDICATOR in page: - while True: - pagenum = pagenum + 1 - + for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) page = self._download_webpage(url, channel_id, u'Downloading page #%s' % pagenum) @@ -748,7 +792,7 @@ class YoutubeChannelIE(InfoExtractor): self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] - url_entries = [self.url_result(url, 'Youtube') for url in urls] + url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls] return [self.playlist_result(url_entries, channel_id)] @@ -775,9 +819,8 @@ class YoutubeUserIE(InfoExtractor): # all of them. 
video_ids = [] - pagenum = 0 - while True: + for pagenum in itertools.count(0): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) @@ -802,10 +845,8 @@ class YoutubeUserIE(InfoExtractor): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - pagenum += 1 - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] - url_results = [self.url_result(url, 'Youtube') for url in urls] + url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] return [self.playlist_result(url_results, playlist_title = username)] class YoutubeSearchIE(SearchInfoExtractor): @@ -864,3 +905,77 @@ class YoutubeShowIE(InfoExtractor): m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage)) self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons))) return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] + + +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): + """ + Base class for extractors that fetch info from + http://www.youtube.com/feed_ajax + Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. + """ + _LOGIN_REQUIRED = True + _PAGING_STEP = 30 + # use action_load_personal_feed instead of action_load_system_feed + _PERSONAL_FEED = False + + @property + def _FEED_TEMPLATE(self): + action = 'action_load_system_feed' + if self._PERSONAL_FEED: + action = 'action_load_personal_feed' + return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) + + @property + def IE_NAME(self): + return u'youtube:%s' % self._FEED_NAME + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + feed_entries = [] + # The step argument is available only in 2.7 or higher + for i in itertools.count(0): + paging = i*self._PAGING_STEP + info = self._download_webpage(self._FEED_TEMPLATE % paging, + u'%s feed' % self._FEED_NAME, + u'Downloading page %s' % i) + info = json.loads(info) + feed_html = info['feed_html'] + m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) + ids = orderedSet(m.group(1) for m in m_ids) + feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) + if info['paging'] is None: + break + return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) + +class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + _FEED_NAME = 'subscriptions' + _PLAYLIST_TITLE = u'Youtube Subscriptions' + +class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' 
+ _FEED_NAME = 'recommended' + _PLAYLIST_TITLE = u'Youtube Recommended videos' + +class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' + _FEED_NAME = 'watch_later' + _PLAYLIST_TITLE = u'Youtube Watch Later' + _PAGING_STEP = 100 + _PERSONAL_FEED = True + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = u'youtube:favorites' + IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' + _LOGIN_REQUIRED = True + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') + playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id') + return self.url_result(playlist_id, 'YoutubePlaylist') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 9137a4f70..cf2ea654e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -35,6 +35,11 @@ try: except ImportError: # Python 2 from urlparse import urlparse as compat_urllib_parse_urlparse +try: + import urllib.parse as compat_urlparse +except ImportError: # Python 2 + import urlparse as compat_urlparse + try: import http.cookiejar as compat_cookiejar except ImportError: # Python 2 @@ -198,6 +203,20 @@ else: with open(fn, 'w', encoding='utf-8') as f: json.dump(obj, f) +if sys.version_info >= (2,7): + def find_xpath_attr(node, xpath, key, val): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z]+$', key) + assert re.match(r'^[a-zA-Z@]*$', val) + expr = xpath + u"[@%s='%s']" % (key, val) + return node.find(expr) +else: + def find_xpath_attr(node, xpath, key, val): + for f in node.findall(xpath): + if f.attrib.get(key) == val: + return f + return None + def htmlentity_transform(matchobj): """Transforms an HTML entity to a character. @@ -623,7 +642,7 @@ def unified_strdate(date_str): date_str = date_str.replace(',',' ') # %z (UTC offset) is only supported in python>=3.2 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str) - format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S'] + format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M'] for expression in format_expressions: try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') @@ -631,6 +650,13 @@ def unified_strdate(date_str): pass return upload_date +def determine_ext(url, default_ext=u'unknown_video'): + guess = url.partition(u'?')[0].rpartition(u'.')[2] + if re.match(r'^[A-Za-z0-9]+$', guess): + return guess + else: + return default_ext + def date_from_str(date_str): """ Return a datetime object from a string in the format YYYYMMDD or diff --git a/youtube_dl/version.py b/youtube_dl/version.py index bc4ad90be..8b436720d 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.07.02' +__version__ = '2013.07.25.2'
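A side note on the utils.py changes just above: both new helpers are self-contained and easy to exercise outside youtube-dl. The sketch below copies find_xpath_attr (with its pre-2.7 ElementTree fallback, needed because attribute predicates like xpath[@key='val'] require Python 2.7+) and determine_ext verbatim from this patch; the sample XML document and URLs at the end are made-up inputs for illustration, not anything taken from the patch.

import re
import sys
import xml.etree.ElementTree

if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z@]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # ElementTree before 2.7 cannot evaluate [@key='val'], so scan manually
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None

def determine_ext(url, default_ext=u'unknown_video'):
    # Strip the query string, take the text after the last dot, and only
    # trust it if it looks like a plain alphanumeric extension
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext

# Illustrative inputs (not from the patch):
doc = xml.etree.ElementTree.fromstring('<root><node x="a"/><node x="b"/></root>')
assert find_xpath_attr(doc, './node', 'x', 'b') is doc[1]
assert determine_ext('http://example.com/video.f4v?token=123') == 'f4v'
assert determine_ext('http://example.com/stream') == 'unknown_video'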
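A note on the youtube.py signature handling above: _decrypt_signature is a pure string shuffle dispatched on the length of the encrypted signature, with no network access involved. Below is a minimal standalone sketch using the two slicing recipes this patch introduces (lengths 92 and 81), copied from the diff; the sample input is illustrative.

def decrypt_signature(s):
    # Each supported length maps to a fixed rearrangement of the input
    # characters (some positions are dropped along the way)
    if len(s) == 92:
        return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
    elif len(s) == 81:
        return (s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] +
                s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9])
    raise ValueError('key length %d not supported' % len(s))

# Illustrative check: the 92-character recipe keeps 81 characters
assert len(decrypt_signature('x' * 92)) == 81

Likewise, MTVIE._transform_rtmp_url in the mtv.py rewrite above is a pure URL rewrite: it lifts the path starting at 'gsp.' out of the RTMP URL and grafts it onto an HTTP mirror. A standalone sketch follows, with a hypothetical RTMP URL in the usage line (the mirror base is the one hard-coded in the patch).

import re

def transform_rtmp_url(rtmp_video_url):
    # Keep everything from 'gsp.' onwards and serve it over plain HTTP
    m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
    if not m:
        raise ValueError('Cannot transform RTMP url')
    base = ('http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+'
            '_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/')
    return base + m.group('finalid')

# Hypothetical input, shown only for the shape of the result:
print(transform_rtmp_url('rtmpe://cp9950.edgefcs.net/ondemand/gsp.music/some/clip.mp4'))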