From ac3e9394e76c0e8baeff1bc77eb67fa184ceb81c Mon Sep 17 00:00:00 2001 From: Anna Bernardi Date: Thu, 6 Jun 2013 13:27:27 +0200 Subject: [PATCH 01/31] Implement search_regex from #847 --- youtube_dl/InfoExtractors.py | 633 ++++++++++++++--------------------- 1 file changed, 252 insertions(+), 381 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index b40edf5fb..4d13c17e4 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,6 +191,20 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info + def _search_regex(self, pattern, text, name, fatal=True, flags=0): + """Extract a field from some text based on regex""" + mobj = re.search(pattern, text, flags) + if mobj is None and fatal: + raise ExtractorError(u'Unable to extract %s; ' + u'please report this issue on GitHub.' % name) + elif mobj is None: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % name) + return None + else: + # return the first matched group + return next(g for g in mobj.groups() if g is not None) + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -964,18 +978,13 @@ class PhotobucketIE(InfoExtractor): }] # We try looking in other parts of the webpage - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL + video_url = self._search_regex(r'', + webpage, u'video URL') mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') return [{ @@ -1803,10 +1812,7 @@ class DepositFilesIE(InfoExtractor): file_extension = os.path.splitext(file_url)[1][1:] # Search for file title - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - file_title = mobj.group(1).decode('utf-8') + file_title = self._search_regex(r'', webpage, u'title') return [{ 'id': file_id.decode('utf-8'), @@ -1900,10 +1906,9 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - m = re.search('

([^<]+)

', webpage) - if not m: - raise ExtractorError(u'Cannot find title in webpage') - video_title = unescapeHTML(m.group(1)) + video_title = self._search_regex('

([^<]+)

', + webpage, u'title') + video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2065,15 +2070,10 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - mobj = re.search('([^<]+)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex('([^<]+)', + webpage, u'title') - mobj = re.search('[.](.+?)$', video_url) - if mobj is None: - raise ExtractorError(u'Unable to extract extention') - video_ext = mobj.group(1) + video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') return [{ 'id': video_id, @@ -2121,25 +2121,23 @@ class MyVideoIE(InfoExtractor): # extracting infos self.report_extraction(video_id) + video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract rtmpurl') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) - if 'myvideo2flash' in video_rtmpurl: - self._downloader.report_warning(u'forcing RTMPT ...') - video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://') + if mobj: + video_url = compat_urllib_parse.unquote(mobj.group(1)) + if 'myvideo2flash' in video_url: + self._downloader.report_warning(u'forcing RTMPT ...') + video_url = video_url.replace('rtmpe://', 'rtmpt://') - # extract non rtmp videos - if (video_rtmpurl is None) or (video_rtmpurl == ''): + if not video_url: + # extract non rtmp videos mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError(u'unable to extract url') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) + video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) - mobj = re.search('source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_file = compat_urllib_parse.unquote(mobj.group(1)) + video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file') + video_file = compat_urllib_parse.unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') @@ -2151,20 +2149,16 @@ class MyVideoIE(InfoExtractor): video_filepath + video_file ).replace('.f4m', '.m3u8') - mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_swfobj = compat_urllib_parse.unquote(mobj.group(1)) + video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') + video_swfobj = compat_urllib_parse.unquote(video_swfobj) - mobj = re.search("(.*?)", webpage) - if mobj is None: - raise ExtractorError(u'unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex("(.*?)", + webpage, u'title') return [{ 'id': video_id, - 'url': video_rtmpurl, - 'tc_url': video_rtmpurl, + 'url': video_url, + 'tc_url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, @@ -2175,6 +2169,7 @@ class MyVideoIE(InfoExtractor): 'player_url': video_swfobj, }] + class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ @@ -2357,16 +2352,22 @@ class EscapistIE(InfoExtractor): videoId = mobj.group('episode') self.report_extraction(showName) - webPage = self._download_webpage(url, showName) + webpage = self._download_webpage(url, showName) - descMatch = re.search('(.*?)\s+-\s+XVID', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) - + video_title = self._search_regex(r'(.*?)\s+-\s+XVID', + webpage, u'title') # Extract video thumbnail - mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = mobj.group(0) + video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', + webpage, u'thumbnail', fatal=False) info = { 'id': video_id, @@ -2652,16 +2644,12 @@ class InfoQIE(InfoExtractor): video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title - mobj = re.search(r'contentTitle = "(.*?)";', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) + video_title = self._search_regex(r'contentTitle = "(.*?)";', + webpage, u'title') # Extract description - video_description = u'No description available.' - mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) - if mobj is not None: - video_description = mobj.group(1) + video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') @@ -2832,15 +2820,16 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') + # TODO: implement default_value in search_regex m = re.search('<h1>([^<]+)</h1>', coursepage) if m: info['title'] = unescapeHTML(m.group(1)) else: info['title'] = info['id'] - m = re.search('<description>([^<]+)</description>', coursepage) - if m: - info['description'] = unescapeHTML(m.group(1)) + info['description'] = self._search_regex('<description>([^<]+)</description>', + coursepage, u'description', fatal=False) + if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2901,25 +2890,19 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract song name') - song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract performer') - performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name + song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + webpage, u'song name', fatal=False) + if song_name: song_name = unescapeHTML(song_name) - mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to mtvn_uri') - mtvn_uri = mobj.group(1) + video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract content id') - content_id = mobj.group(1) + mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + webpage, u'mtvn_uri', fatal=False) + + content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', + webpage, u'content id', fatal=False) videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri self.report_extraction(video_id) @@ -3067,20 +3050,15 @@ class XNXXIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) - result = re.search(self.VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group(1)) + video_url = self._search_regex(self.VIDEO_URL_RE, + webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) - result = re.search(self.VIDEO_TITLE_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group(1) + video_title = self._search_regex(self.VIDEO_TITLE_RE, + webpage, u'title') - result = re.search(self.VIDEO_THUMB_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = result.group(1) + video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -3100,26 +3078,6 @@ class GooglePlusIE(InfoExtractor): _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' - def report_extract_entry(self, url): - """Report downloading extry""" - self.to_screen(u'Downloading entry: %s' % url) - - def report_date(self, upload_date): - """Report downloading extry""" - self.to_screen(u'Entry date: %s' % upload_date) - - def report_uploader(self, uploader): - """Report downloading extry""" - self.to_screen(u'Uploader: %s' % uploader) - - def report_title(self, video_title): - """Report downloading extry""" - self.to_screen(u'Title: %s' % video_title) - - def report_extract_vid_page(self, video_page): - """Report information extraction.""" - self.to_screen(u'Extracting video page: %s' % video_page) - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -3132,47 +3090,35 @@ class GooglePlusIE(InfoExtractor): video_extension = 'flv' # Step 1, Retrieve post webpage to extract further information - self.report_extract_entry(post_url) webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') + self.report_extraction(video_id) + # Extract update date - upload_date = None - pattern = 'title="Timestamp">(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - upload_date = mobj.group(1) + upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + webpage, u'upload date', fatal=False) + if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") upload_date = upload_date.strftime('%Y%m%d') - self.report_date(upload_date) # Extract uploader - uploader = None - pattern = r'rel\="author".*?>(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - uploader = mobj.group(1) - self.report_uploader(uploader) + uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + webpage, u'uploader', fatal=False) # Extract title # Get the first line for title + # TODO: implement default_value in search_regex video_title = u'NA' pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]' mobj = re.search(pattern, webpage) if mobj: video_title = mobj.group(1) - self.report_title(video_title) # Step 2, Stimulate clicking the image box to launch video - pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' - mobj = re.search(pattern, webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video page URL') - - video_page = mobj.group(1) + video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') - self.report_extract_vid_page(video_page) - # Extract video links on video page """Extract video links of all sizes""" @@ -3220,6 +3166,8 @@ class NBAIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' + + # TODO: implement default_value in search_regex def _findProp(rexp, default=None): m = re.search(rexp, webpage) if m: @@ -3383,11 +3331,11 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL) - if not m: - raise ExtractorError(u'Unable to find video information') - video_url = unescapeHTML(m.group('url')) + video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + webpage, u'video URL', flags=re.DOTALL) + video_url = unescapeHTML(video_url) + # TODO: implement fallbacks in regex_search m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) if not m: m = re.search(r'<title>(?P<title>[^<]+?)', webpage) @@ -3395,18 +3343,16 @@ class FunnyOrDieIE(InfoExtractor): raise ExtractorError(u'Cannot find video title') title = clean_html(m.group('title')) - m = re.search(r'.+)"',webpage) - title = m.group('title') - m = re.search(r'data-content-type="channel".*?>(?P.*?)', - webpage, re.DOTALL) - uploader = unescapeHTML(m.group('uploader').strip()) - m = re.search(r'.+)"', + webpage, u'title') + + uploader = self._search_regex(r'data-content-type="channel".*?>(?P.*?)', + webpage, u'uploader', fatal=False, flags=re.DOTALL) + if uploader: uploader = unescapeHTML(uploader.strip()) + + thumbnail = self._search_regex(r'(.*)", webpage_src) + video_title = self._search_regex(r"(.*)", + webpage_src, u'title') - if mobj is None: - raise ExtractorError(u'Cannot determine title') - title = mobj.group(1) - - mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src) # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - if mobj is not None: - thumbnail = mobj.group(1) - else: + thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + webpage_src, u'thumbnail', fatal=False) + + if not thumbnail: _title = r"""candytitles.*>(.*)""" mobj = re.search(_title, webpage_src) if mobj is not None: - title = mobj.group(1) - thumbnail = None + video_title = mobj.group(1) results = [{ 'id': video_id, 'url' : video_url, - 'title' : title, + 'title' : video_title, 'thumbnail' : thumbnail, 'ext' : ext, }] @@ -3542,10 +3482,9 @@ class RBMARadioIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'', webpage) - if not m: - raise ExtractorError(u'Cannot find metadata') - json_data = m.group(1) + + json_data = self._search_regex(r'', + webpage, u'json data') try: data = json.loads(json_data) @@ -3592,7 +3531,6 @@ class YouPornIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('videoid') req = compat_urllib_request.Request(url) @@ -3600,34 +3538,23 @@ class YouPornIE(InfoExtractor): webpage = self._download_webpage(req, video_id) # Get the video title - result = re.search(r'(?P.*)</h1>', webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group('title').strip() + video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>', + webpage, u'title').strip() # Get the video date - result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract video date') - upload_date = None - else: - upload_date = unified_strdate(result.group('date').strip()) + upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', + webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date.strip()) # Get the video uploader - result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract uploader') - video_uploader = None - else: - video_uploader = result.group('uploader').strip() - video_uploader = clean_html( video_uploader ) + video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', + webpage, u'uploader', fatal=False) + if video_uploader: video_uploader = clean_html(video_uploader.strip()) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - result = re.search(DOWNLOAD_LIST_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract download list') - download_list_html = result.group('download_list').strip() + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, + webpage, u'download list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' @@ -3704,17 +3631,13 @@ class PornotubeIE(InfoExtractor): # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - result = re.search(VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group('url')) + video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') + video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - result = re.search(VIDEO_UPLOADED_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - upload_date = unified_strdate(result.group('date')) + upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, 'url': video_url, @@ -3741,10 +3664,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - result = re.search(r'<title>(?P<title>.*)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') - video_title = result.group('title').strip() + video_title = self._search_regex(r'(?P<title>.*)', + webpage, u'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) @@ -3757,10 +3678,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video url') - video_url = result.group('source') + video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', + webpage, u'video URL') info = {'id': video_id, 'url': video_url, @@ -3783,10 +3702,7 @@ class EightTracksIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) - if not m: - raise ExtractorError(u'Cannot find trax information') - json_like = m.group(1) + json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) data = json.loads(json_like) session = str(random.randint(0, 1000000000)) @@ -3822,18 +3738,24 @@ class KeekIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - m = re.search(r'[\S\s]+?

(?P.+?)

', webpage) - uploader = clean_html(m.group('uploader')) + + video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + webpage, u'uploader', fatal=False) + if uploader: uploader = clean_html(uploader) + info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } @@ -3980,10 +3902,10 @@ class SpiegelIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'
(.*?)
', webpage) - if not m: - raise ExtractorError(u'Cannot find title') - video_title = unescapeHTML(m.group(1)) + + video_title = self._search_regex(r'
(.*?)
', + webpage, u'title') + video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -4019,35 +3941,27 @@ class LiveLeakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m = re.search(r'file: "(.*?)",', webpage) - if not m: - raise ExtractorError(u'Unable to find video url') - video_url = m.group(1) + video_url = self._search_regex(r'file: "(.*?)",', + webpage, u'video URL') - m = re.search(r'', webpage) - if m: - uploader = clean_html(m.group(1)) - else: - uploader = None + video_uploader = self._search_regex(r'By:.*?(\w+)', + webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': desc, - 'uploader': uploader + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader } return [info] @@ -4105,23 +4019,24 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?Phttp://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - self.to_screen("No video found") - return [] + raise ExtractorError(u'Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') - re_thumb = r'posters(.*?)\[\\x22(?P.*?)\\x22' # We pick the first poster - thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '') + video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P.*?)\\x22', + webpage, u'thumbnail', fatal=False) # We pick the first poster + if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - re_title = r'(?P<title>.*?)' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) + video_title = self._search_regex(r'(?P<title>.*?)', + webpage, u'title', flags=re.DOTALL) + video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, - 'title': title, - 'thumbnail': thumb, + 'title': video_title, + 'thumbnail': video_thumbnail, 'ext': ext }] @@ -4135,7 +4050,7 @@ class BandcampIE(InfoExtractor): # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: - raise ExtractorError(u'No free songs founded') + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) id = re.search(r'var TralbumData = {(.*?)id: (?P\d*?)$', @@ -4163,10 +4078,10 @@ class BandcampIE(InfoExtractor): track_info = {'id':id, 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, + 'ext' : 'mp3', + 'url' : final_url, 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] + 'uploader' : info[u'artist'] } return [track_info] @@ -4183,17 +4098,14 @@ class RedTubeIE(InfoExtractor): video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) - mobj = re.search(r'',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + video_url = self._search_regex(r'', + webpage, u'video URL') - video_url = mobj.group(1) - mobj = re.search('

(.+)

',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._search_regex('

(.+?)

', + webpage, u'title') return [{ 'id': video_id, @@ -4214,15 +4126,13 @@ class InaIE(InfoExtractor): video_extension = 'mp4' webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'.*?)]]>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_url = self._search_regex(r'.*?)]]>', + webpage, u'title') return [{ 'id': video_id, @@ -4244,27 +4154,17 @@ class HowcastIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) + video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', + webpage, u'video URL') - mobj = re.search(r'\w+)' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -4289,25 +4188,17 @@ class VineIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'.*?

(.+?)

', webpage, re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader') - uploader = mobj.group(1) + uploader = self._search_regex(r'
.*?

(.+?)

', + webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, @@ -4330,18 +4221,13 @@ class FlickrIE(InfoExtractor): webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) - mobj = re.search(r"photo_secret: '(\w+)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video secret') - secret = mobj.group(1) + secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - mobj = re.search(r'(\d+-\d+)', first_xml) - if mobj is None: - raise ExtractorError(u'Unable to extract node_id') - node_id = mobj.group(1) + node_id = self._search_regex(r'(\d+-\d+)', + first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') @@ -4353,22 +4239,14 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - mobj = re.search(r'(.*?)', data) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = mobj.group(1) + + video_url = self._search_regex(r'(.*?)', + data, u'video URL') return [{ 'id': video_id, @@ -4423,7 +4294,7 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, - 'description': description, + 'description': video_description, }] class XHamsterIE(InfoExtractor): From 468e2e926b8d1f55d6ce67fee67e33a7fa6d8371 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Thu, 6 Jun 2013 14:35:08 +0200 Subject: [PATCH 02/31] implement fallbacks and defaults in _search_regex --- youtube_dl/InfoExtractors.py | 86 +++++++++++++++++++----------------- youtube_dl/utils.py | 3 ++ 2 files changed, 48 insertions(+), 41 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4d13c17e4..fbf40f3ca 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,19 +191,37 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info - def _search_regex(self, pattern, text, name, fatal=True, flags=0): - """Extract a field from some text based on regex""" - mobj = re.search(pattern, text, flags) - if mobj is None and fatal: - raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % name) - elif mobj is None: - self._downloader.report_warning(u'unable to extract %s; ' - u'please report this issue on GitHub.' % name) - return None + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. + """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) else: - # return the first matched group + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: + raise ExtractorError(u'Unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + else: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + return None class SearchInfoExtractor(InfoExtractor): """ @@ -2820,12 +2838,8 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - # TODO: implement default_value in search_regex - m = re.search('

([^<]+)

', coursepage) - if m: - info['title'] = unescapeHTML(m.group(1)) - else: - info['title'] = info['id'] + info['title'] = self._search_regex('

([^<]+)

', coursepage, 'title', default=info['id']) + info['title'] = unescapeHTML(info['title']) info['description'] = self._search_regex('([^<]+)', coursepage, u'description', fatal=False) @@ -3108,12 +3122,8 @@ class GooglePlusIE(InfoExtractor): # Extract title # Get the first line for title - # TODO: implement default_value in search_regex - video_title = u'NA' - pattern = r'Date:
(.*?)
', webpage, 'upload_date', fatal=False) + + description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': _findProp(r'Date: (.*?)
'), - 'description': _findProp(r'
(.*?)'), + 'uploader_date': uploader_date, + 'description': description, } return [info] @@ -3335,13 +3343,9 @@ class FunnyOrDieIE(InfoExtractor): webpage, u'video URL', flags=re.DOTALL) video_url = unescapeHTML(video_url) - # TODO: implement fallbacks in regex_search - m = re.search(r"

(?P.*?)</h1>", webpage, flags=re.DOTALL) - if not m: - m = re.search(r'<title>(?P<title>[^<]+?)', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = clean_html(m.group('title')) + title = self._search_regex((r"

(?P.*?)</h1>", + r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) + title = clean_html(title) video_description = self._search_regex(r' Date: Thu, 6 Jun 2013 15:07:05 +0200 Subject: [PATCH 03/31] print WARNINGs during test + minor fix to NBAIE --- test/test_download.py | 9 +++++++++ youtube_dl/InfoExtractors.py | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/test/test_download.py b/test/test_download.py index 3eca333f2..3e6bdd44e 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -40,9 +40,18 @@ def _try_rm(filename): class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): + self._to_stderr = self.to_stderr self.to_stderr = self.to_screen self.processed_info_dicts = [] return youtube_dl.FileDownloader.__init__(self, *args, **kwargs) + def report_warning(self, message): + # let warnings pass to output + if sys.stderr.isatty() and os.name != 'nt': + _msg_header=u'\033[0;33mWARNING:\033[0m' + else: + _msg_header=u'WARNING:' + warning_message=u'%s %s' % (_msg_header,message) + self._to_stderr(warning_message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return youtube_dl.FileDownloader.process_info(self, info_dict) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index fbf40f3ca..0f1880756 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3183,7 +3183,7 @@ class NBAIE(InfoExtractor): uploader_date = self._search_regex(r'Date: (.*?)

', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'
(.*?)', webpage, 'description', fatal=False) + description = self._search_regex(r'', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, From be95cac157a75da1a0fa512b36eb90bc2c28cc96 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 7 Jun 2013 11:19:27 +0200 Subject: [PATCH 04/31] raise exceptions on warnings during tests - and solve a couple of them --- test/test_download.py | 10 ++------- youtube_dl/InfoExtractors.py | 41 ++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 3e6bdd44e..565b1ebc5 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -40,18 +40,12 @@ def _try_rm(filename): class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): - self._to_stderr = self.to_stderr self.to_stderr = self.to_screen self.processed_info_dicts = [] return youtube_dl.FileDownloader.__init__(self, *args, **kwargs) def report_warning(self, message): - # let warnings pass to output - if sys.stderr.isatty() and os.name != 'nt': - _msg_header=u'\033[0;33mWARNING:\033[0m' - else: - _msg_header=u'WARNING:' - warning_message=u'%s %s' % (_msg_header,message) - self._to_stderr(warning_message) + # Don't accept warnings during tests + raise ExtractorError(message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return youtube_dl.FileDownloader.process_info(self, info_dict) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0f1880756..bd6fce3b6 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3161,7 +3161,7 @@ class GooglePlusIE(InfoExtractor): }] class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): @@ -3170,8 +3170,6 @@ class NBAIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) - if video_id.endswith('/index.html'): - video_id = video_id[:-len('/index.html')] webpage = self._download_webpage(url, video_id) @@ -3181,7 +3179,8 @@ class NBAIE(InfoExtractor): title = self._search_regex(r'Date: (.*?)
', webpage, 'upload_date', fatal=False) + # It isn't there in the HTML it returns to us + # uploader_date = self._search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False) description = self._search_regex(r'', webpage, 'description', fatal=False) @@ -3190,7 +3189,7 @@ class NBAIE(InfoExtractor): 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': uploader_date, + # 'uploader_date': uploader_date, 'description': description, } return [info] @@ -3541,19 +3540,22 @@ class YouPornIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - # Get the video title - video_title = self._search_regex(r'(?P.*)</h1>', - webpage, u'title').strip() + # Get JSON parameters + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') - # Get the video date - upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>', - webpage, u'upload date', fatal=False) - if upload_date: upload_date = unified_strdate(upload_date.strip()) - - # Get the video uploader - video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>', - webpage, u'uploader', fatal=False) - if video_uploader: video_uploader = clean_html(video_uploader.strip()) + self.report_extraction(video_id) + try: + video_title = params['title'] + upload_date = unified_strdate(params['release_date_f']) + video_description = params['description'] + video_uploader = params['submitted_by'] + thumbnail = params['thumbnails'][0]['image'] + except KeyError: + raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' @@ -3592,9 +3594,8 @@ class YouPornIE(InfoExtractor): 'title': title, 'ext': extension, 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None + 'thumbnail': thumbnail, + 'description': video_description }) if self._downloader.params.get('listformats', None): From 8409501206e37d57f01e5fe72bfc54a5562e4e0a Mon Sep 17 00:00:00 2001 From: Filippo Valsorda <filippo.valsorda@gmail.com> Date: Fri, 7 Jun 2013 11:46:03 +0200 Subject: [PATCH 05/31] use search_regex in new IEs --- youtube_dl/InfoExtractors.py | 50 ++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index bd6fce3b6..5d54e93e7 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3347,7 +3347,7 @@ class FunnyOrDieIE(InfoExtractor): title = clean_html(title) video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', - webpage, u'description', flags=re.DOTALL) + webpage, u'description', fatal=False, flags=re.DOTALL) if video_description: video_description = unescapeHTML(video_description) info = { @@ -4301,7 +4301,7 @@ class TeamcocoIE(InfoExtractor): 'thumbnail': thumbnail, 'description': video_description, }] - + class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html' @@ -4310,8 +4310,9 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url='http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -4321,32 +4322,26 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) + video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + webpage, u'title') + video_title = unescapeHTML(video_title) - mobj = re.search(r'Description: (?P[^<]+)', webpage) - if mobj is None: - video_description = u'' - else: - video_description = unescapeHTML(mobj.group('description')) + video_description = self._search_regex(r'Description: (?P[^<]+)', + webpage, u'description', fatal=False) + if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract upload date') - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - - mobj = re.search(r']+>(?P[^>]+)', webpage) - if mobj is None: - video_uploader_id = u'anonymous' + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: - video_uploader_id = mobj.group('uploader_id') + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') - mobj = re.search(r'\'image\':\'(?P[^\']+)\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail URL') - video_thumbnail = mobj.group('thumbnail') + video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -4377,10 +4372,9 @@ class HypemIE(InfoExtractor): cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) - mobj = re.search(r'', response, flags=re.MULTILINE|re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extrack tracks') - html_tracks = mobj.group(1).strip() + + html_tracks = self._search_regex(r'', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] From 8b59a9861040482c9af58e85fb397353ea2e8080 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Fri, 7 Jun 2013 12:10:02 +0200 Subject: [PATCH 06/31] XHamster: Can't see the description anywhere in the UI --- youtube_dl/InfoExtractors.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 5d54e93e7..0d7db013b 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4326,9 +4326,10 @@ class XHamsterIE(InfoExtractor): webpage, u'title') video_title = unescapeHTML(video_title) - video_description = self._search_regex(r'Description: (?P[^<]+)', - webpage, u'description', fatal=False) - if video_description: video_description = unescapeHTML(video_description) + # Can't see the description anywhere in the UI + # video_description = self._search_regex(r'Description: (?P[^<]+)', + # webpage, u'description', fatal=False) + # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) if mobj: @@ -4348,7 +4349,7 @@ class XHamsterIE(InfoExtractor): 'url': video_url, 'ext': video_extension, 'title': video_title, - 'description': video_description, + # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail From f5a290eed949b7726a8d745960bbe9c6b8b7de52 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sat, 8 Jun 2013 09:56:34 +0200 Subject: [PATCH 07/31] print "please report this issue on GitHub" on every ExtractorError --- youtube_dl/InfoExtractors.py | 3 +-- youtube_dl/utils.py | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0d7db013b..86cc7c748 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -216,8 +216,7 @@ class InfoExtractor(object): elif default is not None: return default elif fatal: - raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % _name) + raise ExtractorError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' u'please report this issue on GitHub.' % _name) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 3a8dcf4d3..718ee3aae 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -472,6 +472,7 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception From d5979c5d55b0df11973b9a2b6630fd676e5726d1 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:55:08 +0200 Subject: [PATCH 08/31] do not ask the user to report network errors --- youtube_dl/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 718ee3aae..66ae41e31 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,7 +12,7 @@ import sys import traceback import zlib import email.utils -import json +import socket import datetime try: @@ -472,8 +472,11 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). """ - msg = msg + u'; please report this issue on GitHub.' + + if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) + self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception From 979a9dd4c4d46e0f2b11bc4bcac51ad8d446d186 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 11:57:13 +0200 Subject: [PATCH 09/31] _html_search_regex with clean_html superpowers --- test/tests.json | 2 +- youtube_dl/InfoExtractors.py | 151 ++++++++++++++++------------------- 2 files changed, 72 insertions(+), 81 deletions(-) diff --git a/test/tests.json b/test/tests.json index c39d1d9c1..82da27d5b 100644 --- a/test/tests.json +++ b/test/tests.json @@ -325,7 +325,7 @@ "file": "wshh6a7q1ny0G34ZwuIO.mp4", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! " + "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 86cc7c748..6060a5988 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -222,6 +222,16 @@ class InfoExtractor(object): u'please report this issue on GitHub.' % _name) return None + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -1923,9 +1933,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._search_regex('

([^<]+)

', + video_title = self._html_search_regex('

([^<]+)

', webpage, u'title') - video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2087,7 +2096,7 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - video_title = self._search_regex('([^<]+)', + video_title = self._html_search_regex('([^<]+)', webpage, u'title') video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') @@ -2169,7 +2178,7 @@ class MyVideoIE(InfoExtractor): video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') video_swfobj = compat_urllib_parse.unquote(video_swfobj) - video_title = self._search_regex("(.*?)", + video_title = self._html_search_regex("(.*?)", webpage, u'title') return [{ @@ -2371,17 +2380,14 @@ class EscapistIE(InfoExtractor): self.report_extraction(showName) webpage = self._download_webpage(url, showName) - videoDesc = self._search_regex('(.*?)\s+-\s+XVID', + video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', webpage, u'title') # Extract video thumbnail @@ -2665,7 +2671,7 @@ class InfoQIE(InfoExtractor): webpage, u'title') # Extract description - video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] @@ -2837,12 +2843,10 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['title'] = unescapeHTML(info['title']) + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['description'] = self._search_regex('<description>([^<]+)</description>', + info['description'] = self._html_search_regex('<description>([^<]+)</description>', coursepage, u'description', fatal=False) - if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2903,15 +2907,13 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage, u'song name', fatal=False) - if song_name: song_name = unescapeHTML(song_name) - video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', webpage, u'title') - video_title = unescapeHTML(video_title) - mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage, u'mtvn_uri', fatal=False) content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', @@ -3067,7 +3069,7 @@ class XNXXIE(InfoExtractor): webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) - video_title = self._search_regex(self.VIDEO_TITLE_RE, + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title') video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, @@ -3108,7 +3110,7 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename @@ -3116,12 +3118,12 @@ class GooglePlusIE(InfoExtractor): upload_date = upload_date.strftime('%Y%m%d') # Extract uploader - uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video @@ -3175,13 +3177,13 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._search_regex(r'<meta property="og:title" content="(.*?)"', + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us - # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, @@ -3337,17 +3339,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, u'video URL', flags=re.DOTALL) - video_url = unescapeHTML(video_url) - title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", + title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - title = clean_html(title) - video_description = self._search_regex(r'.+)"', + video_title = self._html_search_regex(r'data-title="(?P.+)"', webpage, u'title') - uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', webpage, u'uploader', fatal=False, flags=re.DOTALL) - if uploader: uploader = unescapeHTML(uploader.strip()) - thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage, u'thumbnail', fatal=False) info = { @@ -3454,11 +3452,11 @@ class WorldStarHipHopIE(InfoExtractor): else: ext = 'flv' - video_title = self._search_regex(r"<title>(.*)", + video_title = self._html_search_regex(r"(.*)", webpage_src, u'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail', fatal=False) if not thumbnail: @@ -3640,7 +3638,7 @@ class PornotubeIE(InfoExtractor): #Get the uploaded date VIDEO_UPLOADED_RE = r'
Added (?P[0-9\/]+) by' - upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, @@ -3668,7 +3666,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - video_title = self._search_regex(r'(?P<title>.*)', + video_title = self._html_search_regex(r'(?P<title>.*)', webpage, u'title').strip() # Get the embed page @@ -3747,13 +3745,11 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + uploader = self._html_search_regex(r'
[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) - if uploader: uploader = clean_html(uploader) info = { 'id': video_id, @@ -3907,9 +3903,8 @@ class SpiegelIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'
(.*?)
', + video_title = self._html_search_regex(r'
(.*?)
', webpage, u'title') - video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -3948,15 +3943,13 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._search_regex(r'', + video_uploader = self._html_search_regex(r'By:.*?(\w+)
', webpage, u'uploader', fatal=False) info = { @@ -4033,9 +4026,8 @@ class TumblrIE(InfoExtractor): # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - video_title = self._search_regex(r'(?P<title>.*?)', + video_title = self._html_search_regex(r'(?P<title>.*?)', webpage, u'title', flags=re.DOTALL) - video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, @@ -4105,10 +4097,10 @@ class RedTubeIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'', + video_url = self._html_search_regex(r'', webpage, u'video URL') - video_title = self._search_regex('

(.+?)

', + video_title = self._html_search_regex('

(.+?)

', webpage, u'title') return [{ @@ -4132,7 +4124,7 @@ class InaIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'.*?)]]>', @@ -4161,13 +4153,13 @@ class HowcastIE(InfoExtractor): video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', webpage, u'video URL') - video_title = self._search_regex(r'.*?

(.+?)

', + uploader = self._html_search_regex(r'
.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ @@ -4230,7 +4222,7 @@ class FlickrIE(InfoExtractor): first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - node_id = self._search_regex(r'(\d+-\d+)', + node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' @@ -4243,13 +4235,13 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - video_title = self._search_regex(r'(.*?)', + video_url = self._html_search_regex(r'(.*?)', data, u'video URL') return [{ @@ -4321,12 +4313,11 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - video_title = self._search_regex(r'(?P<title>.+?) - xHamster\.com', + video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', webpage, u'title') - video_title = unescapeHTML(video_title) # Can't see the description anywhere in the UI - # video_description = self._search_regex(r'Description: (?P[^<]+)', + # video_description = self._html_search_regex(r'Description: (?P[^<]+)', # webpage, u'description', fatal=False) # if video_description: video_description = unescapeHTML(video_description) @@ -4337,7 +4328,7 @@ class XHamsterIE(InfoExtractor): video_upload_date = None self._downloader.report_warning(u'Unable to extract upload date') - video_uploader_id = self._search_regex(r']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^>]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', @@ -4373,7 +4364,7 @@ class HypemIE(InfoExtractor): self.report_extraction(track_id) - html_tracks = self._search_regex(r'', + html_tracks = self._html_search_regex(r'', response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) From 78d3442b1209d3858cfea1f7ca958f661784b5ab Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 14:21:42 +0200 Subject: [PATCH 10/31] test: extend the reach of info_dict checking * print the info_dict in a format suitable to easy adding to tests.json during tests if un-tested fields are detected * make it possible to put the crc32 in tests.json if the field is too long * complete the "info_dict" fields in existing tests * fixed the bugs catched doing this --- test/test_download.py | 21 +++- test/tests.json | 185 ++++++++++++++++++++++++++++------- youtube_dl/InfoExtractors.py | 17 ++-- 3 files changed, 177 insertions(+), 46 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 565b1ebc5..862152033 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -7,8 +7,8 @@ import os import json import unittest import sys -import hashlib import socket +import binascii # Allow direct execution sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -38,6 +38,9 @@ def _try_rm(filename): if ose.errno != errno.ENOENT: raise +def crc32(value): + return '%08x' % (binascii.crc32(value.encode('utf8')) & 0xffffffff) + class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen @@ -124,7 +127,21 @@ def generator(test_case): with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, value) in tc.get('info_dict', {}).items(): - self.assertEqual(value, info_dict.get(info_field)) + if isinstance(value, compat_str) and value.startswith('crc32:'): + self.assertEqual(value, 'crc32:' + crc32(info_dict.get(info_field))) + else: + self.assertEqual(value, info_dict.get(info_field)) + + # If checkable fields are missing from the test case, print the info_dict + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'crc32:' + crc32(value)) + for key, value in info_dict.items() + if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) + if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): + sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n') + + # Check for the presence of mandatory fields + for key in ('id', 'url', 'title', 'ext'): + self.assertTrue(key in info_dict.keys() and info_dict[key]) finally: for tc in test_cases: _try_rm(tc['file']) diff --git a/test/tests.json b/test/tests.json index 82da27d5b..e9abb0950 100644 --- a/test/tests.json +++ b/test/tests.json @@ -15,43 +15,76 @@ "name": "Dailymotion", "md5": "392c4b85a60a90dc4792da41ce3144eb", "url": "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech", - "file": "x33vw9.mp4" + "file": "x33vw9.mp4", + "info_dict": { + "uploader": "Alex and Van .", + "title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" + } }, { "name": "Metacafe", "add_ie": ["Youtube"], "url": "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - "file": "_aUehQsCQtM.flv" + "file": "_aUehQsCQtM.flv", + "info_dict": { + "upload_date": "20090102", + "title": "The Electric Company | \"Short I\" | PBS KIDS GO!", + "description": "crc32:5ef3bc57", + "uploader": "PBS", + "uploader_id": "PBS" + } }, { "name": "BlipTV", "md5": "b2d849efcf7ee18917e4b4d9ff37cafe", "url": "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352", - "file": "5779306.m4v" + "file": "5779306.m4v", + "info_dict": { + "upload_date": "20111205", + "description": "crc32:fa658d49", + "uploader": "Comic Book Resources - CBR TV", + "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" + } }, { "name": "XVideos", "md5": "1d0c835822f0a71a7bf011855db929d0", "url": "http://www.xvideos.com/video939581/funny_porns_by_s_-1", - "file": "939581.flv" + "file": "939581.flv", + "info_dict": { + "title": "Funny Porns By >>>>S<<<<<< -1" + } }, { "name": "YouPorn", "md5": "c37ddbaaa39058c76a7e86c6813423c1", "url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/", - "file": "505835.mp4" + "file": "505835.mp4", + "info_dict": { + "upload_date": "20101221", + "description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", + "uploader": "Ask Dan And Jennifer", + "title": "Sex Ed: Is It Safe To Masturbate Daily?" + } }, { "name": "Pornotube", "md5": "374dd6dcedd24234453b295209aa69b6", "url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing", - "file": "1689755.flv" + "file": "1689755.flv", + "info_dict": { + "upload_date": "20090708", + "title": "Marilyn-Monroe-Bathing" + } }, { "name": "YouJizz", "md5": "07e15fa469ba384c7693fd246905547c", "url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html", - "file": "2189178.flv" + "file": "2189178.flv", + "info_dict": { + "title": "Zeichentrick 1" + } }, { "name": "Vimeo", @@ -70,61 +103,103 @@ "name": "Soundcloud", "md5": "ebef0a451b909710ed1d7787dddbf0d7", "url": "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy", - "file": "62986583.mp3" + "file": "62986583.mp3", + "info_dict": { + "upload_date": "20121011", + "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", + "uploader": "E.T. ExTerrestrial Music", + "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" + } }, { "name": "StanfordOpenClassroom", "md5": "544a9468546059d4e80d76265b0443b8", "url": "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100", - "file": "PracticalUnix_intro-environment.mp4" + "file": "PracticalUnix_intro-environment.mp4", + "info_dict": { + "title": "Intro Environment" + } }, { "name": "XNXX", "md5": "0831677e2b4761795f68d417e0b7b445", "url": "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_", - "file": "1135332.flv" + "file": "1135332.flv", + "info_dict": { + "title": "lida » Naked Funny Actress (5)" + } }, { "name": "Youku", "url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", "file": "XNDgyMDQ2NTQw_part00.flv", "md5": "ffe3f2e435663dc2d1eea34faeff5b5b", - "params": { "test": false } + "params": { "test": false }, + "info_dict": { + "title": "youtube-dl test video \"'/\\ä↭𝕐" + } }, { "name": "NBA", "url": "http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html", "file": "0021200253-okc-bkn-recap.nba.mp4", - "md5": "c0edcfc37607344e2ff8f13c378c88a4" + "md5": "c0edcfc37607344e2ff8f13c378c88a4", + "info_dict": { + "description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.", + "title": "Thunder vs. Nets" + } }, { "name": "JustinTV", "url": "http://www.twitch.tv/thegamedevhub/b/296128360", "file": "296128360.flv", - "md5": "ecaa8a790c22a40770901460af191c9a" + "md5": "ecaa8a790c22a40770901460af191c9a", + "info_dict": { + "upload_date": "20110927", + "uploader_id": 25114803, + "uploader": "thegamedevhub", + "title": "Beginner Series - Scripting With Python Pt.1" + } }, { "name": "MyVideo", "url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win", "file": "8229274.flv", - "md5": "2d2753e8130479ba2cb7e0a37002053e" + "md5": "2d2753e8130479ba2cb7e0a37002053e", + "info_dict": { + "title": "bowling-fail-or-win" + } }, { "name": "Escapist", "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate", "file": "6618-Breaking-Down-Baldurs-Gate.mp4", - "md5": "c6793dbda81388f4264c1ba18684a74d" + "md5": "c6793dbda81388f4264c1ba18684a74d", + "info_dict": { + "description": "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", + "uploader": "the-escapist-presents", + "title": "Breaking Down Baldur's Gate" + } }, { "name": "GooglePlus", "url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH", - "file": "ZButuJc6CtH.flv" + "file": "ZButuJc6CtH.flv", + "info_dict": { + "upload_date": "20120613", + "uploader": "井上ヨシマサ", + "title": "嘆きの天使 降臨" + } }, { "name": "FunnyOrDie", "url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version", "file": "0732f586d7.mp4", - "md5": "f647e9e90064b53b6e046e75d0241fbd" + "md5": "f647e9e90064b53b6e046e75d0241fbd", + "info_dict": { + "description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. Scott.", + "title": "Heart-Shaped Box: Literal Video Version" + } }, { "name": "Steam", @@ -161,6 +236,7 @@ "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", "file": "12-jan-pythonthings.mp4", "info_dict": { + "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", "title": "A Few of My Favorite [Python] Things" }, "params": { @@ -173,7 +249,10 @@ "file": "422212.mp4", "md5": "4e2f5cb088a83cd8cdb7756132f9739d", "info_dict": { - "title": "thedailyshow-kristen-stewart part 1" + "upload_date": "20121214", + "description": "Kristen Stewart", + "uploader": "thedailyshow", + "title": "thedailyshow-kristen-stewart part 1" } }, { @@ -224,42 +303,48 @@ "file": "11885679.m4a", "md5": "d30b5b5f74217410f4689605c35d1fd7", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885680.m4a", "md5": "4eb0a669317cd725f6bbd336a29f923a", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885682.m4a", "md5": "1893e872e263a2705558d1d319ad19e8", "info_dict": { - "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885683.m4a", "md5": "b673c46f47a216ab1741ae8836af5899", "info_dict": { - "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885684.m4a", "md5": "1d74534e95df54986da7f5abf7d842b7", "info_dict": { - "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885685.m4a", "md5": "f081f47af8f6ae782ed131d38b9cd1c0", "info_dict": { - "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } } ] @@ -270,9 +355,9 @@ "file": "NODfbab.mp4", "md5": "9b0636f8c0f7614afa4ea5e4c6e57e83", "info_dict": { + "uploader": "ytdl", "title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ." } - }, { "name": "TED", @@ -290,14 +375,19 @@ "file": "11741.mp4", "md5": "0b49f4844a068f8b33f4b7c88405862b", "info_dict": { - "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" + "description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", + "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" } }, { "name": "Generic", "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html", "file": "13601338388002.mp4", - "md5": "85b90ccc9d73b4acd9138d3af4c27f89" + "md5": "85b90ccc9d73b4acd9138d3af4c27f89", + "info_dict": { + "uploader": "www.hodiho.fr", + "title": "Régis plante sa Jeep" + } }, { "name": "Spiegel", @@ -355,42 +445,59 @@ "file":"30510138.mp3", "md5":"f9136bf103901728f29e419d2c70f55d", "info_dict": { - "title":"D-D-Dance" + "upload_date": "20111213", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "D-D-Dance" } }, { "file":"47127625.mp3", "md5":"09b6758a018470570f8fd423c9453dd8", "info_dict": { - "title":"The Royal Concept - Gimme Twice" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "The Royal Concept - Gimme Twice" } }, { "file":"47127627.mp3", "md5":"154abd4e418cea19c3b901f1e1306d9c", "info_dict": { - "title":"Goldrushed" + "upload_date": "20120521", + "uploader": "The Royal Concept", + "title": "Goldrushed" } }, { "file":"47127629.mp3", "md5":"2f5471edc79ad3f33a683153e96a79c1", "info_dict": { - "title":"In the End" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "In the End" } }, { "file":"47127631.mp3", "md5":"f9ba87aa940af7213f98949254f1c6e2", "info_dict": { - "title":"Knocked Up" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", + "uploader": "The Royal Concept", + "title": "Knocked Up" } }, { "file":"75206121.mp3", "md5":"f9d1fe9406717e302980c30de4af9353", "info_dict": { - "title":"World On Fire" + "upload_date": "20130116", + "description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ", + "uploader": "The Royal Concept", + "title": "World On Fire" } } ] @@ -419,8 +526,10 @@ "url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0", "file": "zpsc0c3b9fa.mp4", "md5": "7dabfb92b0a31f6c16cebc0f8e60ff99", - "info_dict":{ - "title":"Tired of Link Building? Try BacklinkMyDomain.com!" + "info_dict": { + "upload_date": "20130504", + "uploader": "rachaneronas", + "title": "Tired of Link Building? Try BacklinkMyDomain.com!" } }, { @@ -488,8 +597,10 @@ "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html", "file": "1509445.flv", "md5": "9f48e0e8d58e3076bb236ff412ab62fa", - "info_dict":{ - "title":"FemaleAgent Shy beauty takes the bait" + "info_dict": { + "upload_date": "20121014", + "uploader_id": "Ruseful2011", + "title": "FemaleAgent Shy beauty takes the bait" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 6060a5988..24e9c4cc7 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -2377,8 +2377,8 @@ class EscapistIE(InfoExtractor): showName = mobj.group('showname') videoId = mobj.group('episode') - self.report_extraction(showName) - webpage = self._download_webpage(url, showName) + self.report_extraction(videoId) + webpage = self._download_webpage(url, videoId) videoDesc = self._html_search_regex(']+>(?P[^>]+)', + video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', webpage, u'uploader id', default=u'anonymous') video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', From ee55fcbe121baa0dacc9f87b9aa3abd974291355 Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Sun, 9 Jun 2013 15:03:54 +0200 Subject: [PATCH 11/31] switch long info_dict fields checking to md5 --- test/test_download.py | 9 ++++----- test/tests.json | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/test/test_download.py b/test/test_download.py index 862152033..577bcdbf2 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -38,8 +38,7 @@ def _try_rm(filename): if ose.errno != errno.ENOENT: raise -def crc32(value): - return '%08x' % (binascii.crc32(value.encode('utf8')) & 0xffffffff) +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): @@ -127,13 +126,13 @@ def generator(test_case): with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, value) in tc.get('info_dict', {}).items(): - if isinstance(value, compat_str) and value.startswith('crc32:'): - self.assertEqual(value, 'crc32:' + crc32(info_dict.get(info_field))) + if isinstance(value, compat_str) and value.startswith('md5:'): + self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field))) else: self.assertEqual(value, info_dict.get(info_field)) # If checkable fields are missing from the test case, print the info_dict - test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'crc32:' + crc32(value)) + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) for key, value in info_dict.items() if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): diff --git a/test/tests.json b/test/tests.json index e9abb0950..8a3e8e8e1 100644 --- a/test/tests.json +++ b/test/tests.json @@ -29,7 +29,7 @@ "info_dict": { "upload_date": "20090102", "title": "The Electric Company | \"Short I\" | PBS KIDS GO!", - "description": "crc32:5ef3bc57", + "description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8", "uploader": "PBS", "uploader_id": "PBS" } @@ -41,7 +41,7 @@ "file": "5779306.m4v", "info_dict": { "upload_date": "20111205", - "description": "crc32:fa658d49", + "description": "md5:9bc31f227219cde65e47eeec8d2dc596", "uploader": "Comic Book Resources - CBR TV", "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" } From 88cebbd7b8ce5c564adecdfdd6a41ba76193fa26 Mon Sep 17 00:00:00 2001 From: Anisse Astier Date: Thu, 13 Jun 2013 23:42:48 +0200 Subject: [PATCH 12/31] YoutubePlaylistIE: get *all* videos For that, we add parameter safeSearch=none that asks youtube not filter results before sending them to us. Note: this parameter could be added to YoutubeSearchIE and YoutubeUserIE as well, but I don't know what would be the impact in term of unwanted results. Maybe expose that as a parameter? For a playlist it's different since the user chose what she put in the playlist. --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 33ba0fdd1..39278a2e9 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1544,7 +1544,7 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' From 32aa88bcae5060c457406229a80de3a9f29a6d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 16 Jun 2013 20:34:45 +0200 Subject: [PATCH 13/31] Add GametrailersIE --- test/tests.json | 10 +++++++ youtube_dl/InfoExtractors.py | 55 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/test/tests.json b/test/tests.json index 6c2373321..a2181bccc 100644 --- a/test/tests.json +++ b/test/tests.json @@ -509,5 +509,15 @@ "info_dict":{ "title":"Смях! Чудо - чист за секунди - Скрита камера" } + }, + { + "name": "Gametrailers", + "url": "http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer", + "file": "zbvr8i.flv", + "md5": "c3edbc995ab4081976e16779bd96a878", + "info_dict": { + "title": "E3 2013: Debut Trailer" + }, + "skip": "Requires rtmpdump" } ] diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 33ba0fdd1..6904a6686 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4629,6 +4629,60 @@ class Vbox7IE(InfoExtractor): 'thumbnail': thumbnail_url, }] +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('id') + video_type = mobj.group('type') + webpage = self._download_webpage(url, video_id) + if video_type == 'full-episodes': + mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' + else: + mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' + m_mgid = re.search(mgid_re, webpage) + if m_mgid is None: + raise ExtractorError(u'Unable to extract mgid') + mgid = m_mgid.group(1) + data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) + + info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, + video_id, u'Downloading video info') + links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data, + video_id, u'Downloading video urls info') + + self.report_extraction(video_id) + info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* + .*?)\]\]>.* + .* + (?P.*?).* + ''' + + m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) + if m_info is None: + raise ExtractorError(u'Unable to extract video info') + video_title = m_info.group('title') + video_description = m_info.group('description') + video_thumb = m_info.group('thumb') + + m_urls = re.finditer(r'(?P.*)', links_webpage) + if m_urls is None: + raise ExtractError(u'Unable to extrat video url') + # They are sorted from worst to best quality + video_url = list(m_urls)[-1].group('url') + + return {'url': video_url, + 'id': video_id, + 'title': video_title, + # Videos are actually flv not mp4 + 'ext': 'flv', + 'thumbnail': video_thumb, + 'description': video_description, + } + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@ -4694,6 +4748,7 @@ def gen_extractors(): XHamsterIE(), HypemIE(), Vbox7IE(), + GametrailersIE(), GenericIE() ] From 36ed7177f01d278935ab5eac3d44fcea421c5325 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Sun, 16 Jun 2013 20:42:28 +0200 Subject: [PATCH 14/31] Fix HypemIE test: the song name has been changed --- test/tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/tests.json b/test/tests.json index a2181bccc..62170baf7 100644 --- a/test/tests.json +++ b/test/tests.json @@ -498,7 +498,7 @@ "file": "1v6ga.mp3", "md5": "b9cc91b5af8995e9f0c1cee04c575828", "info_dict":{ - "title":"TAME" + "title":"Tame" } }, { From af44c9486255f16ab180a9e45aaab06a6b38bdde Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 17 Jun 2013 19:25:35 +0200 Subject: [PATCH 15/31] use _search_regex in GenericIE --- youtube_dl/InfoExtractors.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 24e9c4cc7..3c95012b1 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1430,16 +1430,12 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - mobj = re.search(r'(.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'(.*)', + webpage, u'video title') # video uploader is domain name - mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_uploader = mobj.group(1) + video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', + url, u'video uploader') return [{ 'id': video_id, From 0251f9c9c085234505b8a65b066ff54052d5fcdb Mon Sep 17 00:00:00 2001 From: Filippo Valsorda Date: Mon, 17 Jun 2013 19:47:44 +0200 Subject: [PATCH 16/31] add _search_regex to the new IEs --- youtube_dl/InfoExtractors.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index a25ccc173..2f926f243 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -4465,11 +4465,12 @@ class Vbox7IE(InfoExtractor): video_id = mobj.group(1) redirect_page, urlh = self._download_webpage_handle(url, video_id) - redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1) + new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') + redirect_url = urlh.geturl() + new_location webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') - title = re.search(r'(.*)', webpage) - title = (title.group(1)).split('/')[0].strip() + title = self._html_search_regex(r'(.*)', + webpage, u'title').split('/')[0].strip() ext = "flv" info_url = "http://vbox7.com/play/magare.do" @@ -4503,10 +4504,7 @@ class GametrailersIE(InfoExtractor): mgid_re = r'data-video="(?Pmgid:.*?)"' else: mgid_re = r'data-contentId=\'(?Pmgid:.*?)\'' - m_mgid = re.search(mgid_re, webpage) - if m_mgid is None: - raise ExtractorError(u'Unable to extract mgid') - mgid = m_mgid.group(1) + mgid = self._search_regex(mgid_re, webpage, u'mgid') data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, @@ -4528,11 +4526,11 @@ class GametrailersIE(InfoExtractor): video_description = m_info.group('description') video_thumb = m_info.group('thumb') - m_urls = re.finditer(r'(?P.*)', links_webpage) - if m_urls is None: + m_urls = list(re.finditer(r'(?P.*)', links_webpage)) + if m_urls is None or len(m_urls) == 0: raise ExtractError(u'Unable to extrat video url') # They are sorted from worst to best quality - video_url = list(m_urls)[-1].group('url') + video_url = m_urls[-1].group('url') return {'url': video_url, 'id': video_id, From 449d5c910cefdbfdfc5aa13d682dc46988a5318e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Tue, 18 Jun 2013 14:22:16 +0200 Subject: [PATCH 17/31] Fix GooglePlusIE: the video_page url has changed of place --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 2f926f243..f36503d21 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3128,7 +3128,7 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video - video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + video_page = self._search_regex('href="(https\://plus\.google\.com/photos/.*?)"', webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') From bb474376868469b5bcbaed6b1667b752ceba6119 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Wed, 19 Jun 2013 22:13:16 +0200 Subject: [PATCH 18/31] Ignore invalid dates (Fixes #894) --- youtube_dl/FileDownloader.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 72f03c217..f4ce48046 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -322,6 +322,9 @@ class FileDownloader(object): filetime = timeconvert(timestr) if filetime is None: return filetime + # Ignore obviously invalid dates + if filetime == 0: + return try: os.utime(filename, (time.time(), filetime)) except: From 68f54207a381aa21002af7e992ca587470b74d23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 20 Jun 2013 13:43:44 +0200 Subject: [PATCH 19/31] SteamIE: only verify the age if needed Also use the _html_search_regex function --- youtube_dl/InfoExtractors.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index db089403f..76f39b09e 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3365,6 +3365,8 @@ class SteamIE(InfoExtractor): (?P\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' @classmethod def suitable(cls, url): @@ -3374,11 +3376,19 @@ class SteamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') - videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID - self.report_age_confirmation() + + videourl = self._VIDEO_PAGE_TEMPLATE % gameID webpage = self._download_webpage(videourl, gameID) - game_title = re.search(r'', webpage).group('game_title') - + + if re.search('

Please enter your birth date to continue:

', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % gameID + self.report_age_confirmation() + webpage = self._download_webpage(videourl, gameID) + + self.report_extraction(gameID) + game_title = self._html_search_regex(r'', + webpage, 'game title') + urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' From 5c6760193199530da1e66a1e412b58e238786f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 20 Jun 2013 13:52:15 +0200 Subject: [PATCH 20/31] Revert "Fix GooglePlusIE: the video_page url has changed of place" The old method is working again. This reverts commit 449d5c910cefdbfdfc5aa13d682dc46988a5318e. --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 76f39b09e..59f65aca3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3128,7 +3128,7 @@ class GooglePlusIE(InfoExtractor): webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video - video_page = self._search_regex('href="(https\://plus\.google\.com/photos/.*?)"', + video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') From 377fdf5dde6a8fe477f5c3cd721df82112d41de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 20 Jun 2013 14:02:21 +0200 Subject: [PATCH 21/31] Update the TumblrIE: the video is no longer available --- test/tests.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/tests.json b/test/tests.json index 30ebd964b..418897731 100644 --- a/test/tests.json +++ b/test/tests.json @@ -430,11 +430,11 @@ }, { "name": "Tumblr", - "url": "http://birthdayproject2012.tumblr.com/post/17258355236/a-sample-video-from-leeann-if-you-need-an-idea", - "file": "17258355236.mp4", - "md5": "7c6a514d691b034ccf8567999e9e88a3", + "url": "http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja", + "file": "53364321212.mp4", + "md5": "0716d3dd51baf68a28b40fdf1251494e", "info_dict": { - "title": "Calling all Pris! - A sample video from LeeAnn. (If you need an idea..." + "title": "Rafael Lemos | Tumblr" } }, { From 587c68b2cdef65766d4dc408b04bd67cf8d2daf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 20 Jun 2013 14:15:29 +0200 Subject: [PATCH 22/31] DailymotionIE: fix the extraction of the video uploader and use _search_regex for getting it --- youtube_dl/InfoExtractors.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 59f65aca3..c80a74ef6 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -940,16 +940,10 @@ class DailymotionIE(InfoExtractor): video_title = unescapeHTML(mobj.group('title')) video_uploader = None - mobj = re.search(r'(?im)[^<]+?
]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.report_warning(u'unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) + video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', + # Looking for official user + r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) From 038a3a1a614c2d67233f5822ae439da877b797ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 20 Jun 2013 14:37:43 +0200 Subject: [PATCH 23/31] RBMARadioIE: fix the extraction of the JSON data --- youtube_dl/InfoExtractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index c80a74ef6..d9c555643 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3488,8 +3488,8 @@ class RBMARadioIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - json_data = self._search_regex(r'', - webpage, u'json data') + json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', + webpage, u'json data', flags=re.MULTILINE) try: data = json.loads(json_data) From 31eead52e764569fc95ffaf56b25850fa260988b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 14 Jun 2013 14:14:02 +0200 Subject: [PATCH 24/31] YoutubePlaylistIE: try to extract the url of the entries from the media$group dictionary Extracting it from content can return rtsp urls. --- youtube_dl/InfoExtractors.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d9c555643..418c23f74 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1606,9 +1606,15 @@ class YoutubePlaylistIE(InfoExtractor): # Number of videos is a multiple of self._MAX_RESULTS break - videos += [ (entry['yt$position']['$t'], entry['content']['src']) - for entry in response['feed']['entry'] - if 'content' in entry ] + for entry in response['feed']['entry']: + index = entry['yt$position']['$t'] + if 'media$group' in entry and 'media$player' in entry['media$group']: + videos.append((index, entry['media$group']['media$player']['url'])) + # Using this field can cause problems: + # https://github.com/rg3/youtube-dl/issues/886 + elif 'content' in entry: + videos.append((index, entry['content']['src'])) + if len(response['feed']['entry']) < self._MAX_RESULTS: break From f4c8bbcfc2c28d434a5696c88fe001ded68155ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Thu, 20 Jun 2013 20:51:20 +0200 Subject: [PATCH 25/31] TEDIE: download the best quality video and use the new _search_regex functions Also extracts the description. --- test/tests.json | 4 ++-- youtube_dl/InfoExtractors.py | 44 ++++++++++++++++-------------------- 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/test/tests.json b/test/tests.json index 418897731..3e0db297d 100644 --- a/test/tests.json +++ b/test/tests.json @@ -363,10 +363,10 @@ "name": "TED", "url": "http://www.ted.com/talks/dan_dennett_on_our_consciousness.html", "file": "102.mp4", - "md5": "7bc087e71d16f18f9b8ab9fa62a8a031", + "md5": "8cd9dfa41ee000ce658fd48fb5d89a61", "info_dict": { "title": "Dan Dennett: The illusion of consciousness", - "thumbnail": "http://images.ted.com/images/ted/488_389x292.jpg" + "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922" } }, { diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d9c555643..17e0f8323 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -3792,10 +3792,6 @@ class TEDIE(InfoExtractor): self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] - def _talk_video_link(self,mediaSlug): - '''Returns the video link for that mediaSlug''' - return 'http://download.ted.com/talks/%s.mp4' % mediaSlug - def _playlist_videos_info(self,url,name,playlist_id=0): '''Returns the videos of the playlist''' video_RE=r''' @@ -3808,9 +3804,8 @@ class TEDIE(InfoExtractor): m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) - playlist_RE = r'div class="headline">(\s*?)

(\s*?)(?P.*?)' - m_playlist = re.search(playlist_RE, webpage) - playlist_title = m_playlist.group('playlist_title') + playlist_title = self._html_search_regex(r'div class="headline">\s*?

\s*?(.*?)', + webpage, 'playlist title') playlist_entries = [] for m_video, m_name in zip(m_videos,m_names): @@ -3821,27 +3816,28 @@ class TEDIE(InfoExtractor): def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" - m=re.match(self._VALID_URL, url,re.VERBOSE) - videoName=m.group('name') - webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + m = re.match(self._VALID_URL, url,re.VERBOSE) + video_name = m.group('name') + webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) + self.report_extraction(video_name) # If the url includes the language we get the title translated - title_RE=r'(?P.*)</span>' - title=re.search(title_RE, webpage).group('title') - info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) - "id":(?P<videoID>[\d]+).*? - "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' - thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' - thumb_match=re.search(thumb_RE,webpage) - info_match=re.search(info_RE,webpage,re.VERBOSE) - video_id=info_match.group('videoID') - mediaSlug=info_match.group('mediaSlug') - video_url=self._talk_video_link(mediaSlug) + title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>', + webpage, 'title') + json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', + webpage, 'json data') + info = json.loads(json_data) + desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>', + webpage, 'description', flags = re.DOTALL) + + thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', + webpage, 'thumbnail') info = { - 'id': video_id, - 'url': video_url, + 'id': info['id'], + 'url': info['htmlStreams'][-1]['file'], 'ext': 'mp4', 'title': title, - 'thumbnail': thumb_match.group('thumbnail') + 'thumbnail': thumbnail, + 'description': desc, } return info From fcfa1885481488c431c434bf44a0a13a467e91b4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 21 Jun 2013 00:29:31 +0200 Subject: [PATCH 26/31] Show which IEs are slow during release --- devscripts/release.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devscripts/release.sh b/devscripts/release.sh index b2a91f817..b8efdab47 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -22,7 +22,7 @@ if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit /bin/echo -e "\n### First of all, testing..." make cleanall -nosetests --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1 +nosetests --verbose --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1 /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py From 759d525301477bd04607b05cf583ffa68951d0b5 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 21 Jun 2013 00:33:44 +0200 Subject: [PATCH 27/31] release 2013.06.21 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1cda7fa74..7c6757efe 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.05.23' +__version__ = '2013.06.21' From 50d2376769725fcab6472d09919d78f92cb19005 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 21 Jun 2013 01:22:47 +0200 Subject: [PATCH 28/31] Leave out sig if not present (#896) --- youtube_dl/InfoExtractors.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 17e0f8323..8d228d40d 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -732,8 +732,11 @@ class YoutubeIE(InfoExtractor): for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): url_data = compat_parse_qs(url_data_str) if 'itag' in url_data and 'url' in url_data: - url = url_data['url'][0] + '&signature=' + url_data['sig'][0] - if not 'ratebypass' in url: url += '&ratebypass=yes' + url = url_data['url'][0] + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + if 'ratebypass' not in url: + url += '&ratebypass=yes' url_map[url_data['itag'][0]] = url format_limit = self._downloader.params.get('format_limit', None) From e704f4d3785b318d7f4d3a034424b54ce50beb88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 22 Jun 2013 12:14:24 +0200 Subject: [PATCH 29/31] YoutubeIE: If not subtitles language is given default to English for automatic captions (related #901) --- youtube_dl/InfoExtractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 8d228d40d..ed2beac78 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -420,7 +420,7 @@ class YoutubeIE(InfoExtractor): def _request_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') + sub_lang = self._downloader.params.get('subtitleslang') or 'en' sub_format = self._downloader.params.get('subtitlesformat') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) From 953dd93a481dc879033e92a8f1547782af04d6ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 22 Jun 2013 12:32:27 +0200 Subject: [PATCH 30/31] YoutubePlaylistIE: don't look into entry['content']['src'], accruing to the docs this can return live stream urls --- youtube_dl/InfoExtractors.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 418c23f74..f5006d23e 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1610,11 +1610,6 @@ class YoutubePlaylistIE(InfoExtractor): index = entry['yt$position']['$t'] if 'media$group' in entry and 'media$player' in entry['media$group']: videos.append((index, entry['media$group']['media$player']['url'])) - # Using this field can cause problems: - # https://github.com/rg3/youtube-dl/issues/886 - elif 'content' in entry: - videos.append((index, entry['content']['src'])) - if len(response['feed']['entry']) < self._MAX_RESULTS: break From 346b5ce8fdcb114b48b26fc883a0d7e249dc55d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 22 Jun 2013 14:15:33 +0200 Subject: [PATCH 31/31] YoutubeIE: report warnings instead of errors if the subtitles are not found (related #901) For example when downloading a playlist some videos may not have subtitles but the download shouldn't stop. --- youtube_dl/InfoExtractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 15417f05a..619ddeba1 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -699,14 +699,14 @@ class YoutubeIE(InfoExtractor): pass else: # We report the original error - self._downloader.report_error(sub_error) + self._downloader.report_warning(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) for video_subtitle in video_subtitles: (sub_error, sub_lang, sub) = video_subtitle if sub_error: - self._downloader.report_error(sub_error) + self._downloader.report_warning(sub_error) if self._downloader.params.get('listsubtitles', False): sub_lang_list = self._list_available_subtitles(video_id)