From 3d5f7a3947a8d304bc7ad46217f171996e95c475 Mon Sep 17 00:00:00 2001 From: Johannes Knoedtel Date: Mon, 12 Jan 2015 22:26:20 +0100 Subject: [PATCH 01/40] [utils] Prevent override of custom headers. The dict of headers of request objects in urllib has its keys always capitalized. This causes the lookup to fail and overwrite the header. If for example a Extractor tries to add a "User-Agent" header the internal representation in the request object is "User-agent". The header is therefore clobbered by the "User-Agent" in std_headers, because the strings are not equal. --- youtube_dl/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 98732e8e9..daf94abd1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -611,7 +611,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): def http_request(self, req): for h, v in std_headers.items(): - if h not in req.headers: + # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 + # The dict keys are capitalized because of this bug by urllib + if h.capitalize() not in req.headers: req.add_header(h, v) if 'Youtubedl-no-compression' in req.headers: if 'Accept-encoding' in req.headers: From e205db3bcd7f7a9b0beaa13a4774bdcb26091e2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 16 Jan 2015 13:29:01 +0100 Subject: [PATCH 02/40] FFmpegEmbedSubtitlePP: don't fail if the video doesn't have an audio stream (fixes #4718) Instead of specifying which streams ffmpeg must copy, we tell it to copy all. --- youtube_dl/postprocessor/ffmpeg.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index 26b99e43c..a75277778 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -475,7 +475,13 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): filename = information['filepath'] input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs] - opts = ['-map', '0:0', '-map', '0:1', '-c:v', 'copy', '-c:a', 'copy'] + opts = [ + '-map', '0', + '-c', 'copy', + # Don't copy the existing subtitles, we may be running the + # postprocessor a second time + '-map', '-0:s', + ] for (i, lang) in enumerate(sub_langs): opts.extend(['-map', '%d:0' % (i + 1), '-c:s:%d' % i, 'mov_text']) lang_code = self._conver_lang_code(lang) From 2875cf01bb2229cb6cceeac72fb6dd770f7a96d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 16 Jan 2015 13:37:37 +0100 Subject: [PATCH 03/40] FFmpegEmbedSubtitlePP: simplify command --- youtube_dl/postprocessor/ffmpeg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index a75277778..5b0ff32b1 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -481,15 +481,15 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): # Don't copy the existing subtitles, we may be running the # postprocessor a second time '-map', '-0:s', + '-c:s', 'mov_text', ] for (i, lang) in enumerate(sub_langs): - opts.extend(['-map', '%d:0' % (i + 1), '-c:s:%d' % i, 'mov_text']) + opts.extend(['-map', '%d:0' % (i + 1)]) lang_code = self._conver_lang_code(lang) if lang_code is not None: opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) - opts.extend(['-f', 'mp4']) - temp_filename = filename + '.temp' + temp_filename = prepend_extension(filename, 'temp') self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename) self.run_ffmpeg_multiple_files(input_files, temp_filename, opts) os.remove(encodeFilename(filename)) From 4f4f6428224b641eb97ffd4b90be9d67034fc64a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 16 Jan 2015 13:44:36 +0100 Subject: [PATCH 04/40] [npo] Remove unused import --- youtube_dl/extractor/npo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 8d496488b..175b14583 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -1,7 +1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( fix_xml_ampersands, From 9d22a7dfb07ad6ff23680e11238744b2bb1735aa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 16 Jan 2015 13:44:44 +0100 Subject: [PATCH 05/40] [fourtube] Fix extraction --- youtube_dl/extractor/fourtube.py | 76 ++++++++++++++++++-------------- youtube_dl/utils.py | 2 +- 2 files changed, 45 insertions(+), 33 deletions(-) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 7187e0752..b2284ab01 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -7,10 +7,9 @@ from ..compat import ( compat_urllib_request, ) from ..utils import ( - clean_html, parse_duration, + parse_iso8601, str_to_int, - unified_strdate, ) @@ -28,68 +27,81 @@ class FourTubeIE(InfoExtractor): 'uploader': 'WCP Club', 'uploader_id': 'wcp-club', 'upload_date': '20131031', + 'timestamp': 1383263892, 'duration': 583, + 'view_count': int, + 'like_count': int, + 'categories': list, } } def _real_extract(self, url): video_id = self._match_id(url) - webpage_url = 'http://www.4tube.com/videos/' + video_id - webpage = self._download_webpage(webpage_url, video_id) + webpage = self._download_webpage(url, video_id) - self.report_extraction(video_id) + title = self._html_search_meta('name', webpage) + timestamp = parse_iso8601(self._html_search_meta( + 'uploadDate', webpage)) + thumbnail = self._html_search_meta('thumbnailUrl', webpage) + uploader_id = self._html_search_regex( + r'', + webpage, 'uploader id') + uploader = self._html_search_regex( + r'', + webpage, 'uploader') - playlist_json = self._html_search_regex(r'var playerConfigPlaylist\s+=\s+([^;]+)', webpage, 'Playlist') - media_id = self._search_regex(r'idMedia:\s*(\d+)', playlist_json, 'Media Id') - sources = self._search_regex(r'sources:\s*\[([^\]]*)\]', playlist_json, 'Sources').split(',') - title = self._search_regex(r'title:\s*"([^"]*)', playlist_json, 'Title') - thumbnail_url = self._search_regex(r'image:\s*"([^"]*)', playlist_json, 'Thumbnail', fatal=False) + categories_html = self._search_regex( + r'(?s)>\s*Categories / Tags\s*.*?
    (.*?)
', + webpage, 'categories', fatal=False) + categories = None + if categories_html: + categories = [ + c.strip() for c in re.findall( + r'(?s)
  • (.*?)', categories_html)] - uploader_str = self._search_regex(r'Uploaded by(.*?)', webpage, 'uploader', fatal=False) - mobj = re.search(r'(?P[^<]+)', uploader_str) - (uploader, uploader_id) = (mobj.group('name'), mobj.group('id')) if mobj else (clean_html(uploader_str), None) + view_count = str_to_int(self._search_regex( + r'', + webpage, 'view count', fatal=False)) + like_count = str_to_int(self._search_regex( + r'', + webpage, 'like count', fatal=False)) + duration = parse_duration(self._html_search_meta('duration', webpage)) - upload_date = None - view_count = None - duration = None - description = self._html_search_meta('description', webpage, 'description') - if description: - upload_date = self._search_regex(r'Published Date: (\d{2} [a-zA-Z]{3} \d{4})', description, 'upload date', - fatal=False) - if upload_date: - upload_date = unified_strdate(upload_date) - view_count = self._search_regex(r'Views: ([\d,\.]+)', description, 'view count', fatal=False) - if view_count: - view_count = str_to_int(view_count) - duration = parse_duration(self._search_regex(r'Length: (\d+m\d+s)', description, 'duration', fatal=False)) + params_js = self._search_regex( + r'\$\.ajax\(url,\ opts\);\s*\}\s*\}\)\(([0-9,\[\] ]+)\)', + webpage, 'initialization parameters' + ) + params = self._parse_json('[%s]' % params_js, video_id) + media_id = params[0] + sources = ['%s' % p for p in params[2]] - token_url = "http://tkn.4tube.com/{0}/desktop/{1}".format(media_id, "+".join(sources)) + token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format( + media_id, '+'.join(sources)) headers = { b'Content-Type': b'application/x-www-form-urlencoded', b'Origin': b'http://www.4tube.com', } token_req = compat_urllib_request.Request(token_url, b'{}', headers) tokens = self._download_json(token_req, video_id) - formats = [{ 'url': tokens[format]['token'], 'format_id': format + 'p', 'resolution': format + 'p', 'quality': int(format), } for format in sources] - self._sort_formats(formats) return { 'id': video_id, 'title': title, 'formats': formats, - 'thumbnail': thumbnail_url, + 'categories': categories, + 'thumbnail': thumbnail, 'uploader': uploader, 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'timestamp': timestamp, + 'like_count': like_count, 'view_count': view_count, 'duration': duration, 'age_limit': 18, - 'webpage_url': webpage_url, } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2546fa45d..7832ed87f 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1277,7 +1277,7 @@ def parse_duration(s): s = s.strip() m = re.match( - r'''(?ix)T? + r'''(?ix)(?:P?T)? (?: (?P[0-9.]+)\s*(?:mins?|minutes?)\s*| (?P[0-9.]+)\s*(?:hours?)| From cce81f192c15f3766adb7340315a877bc4fc592b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 16 Jan 2015 14:20:25 +0100 Subject: [PATCH 06/40] [bandcamp:album] Fix title extraction (Fixes #4721) --- youtube_dl/extractor/bandcamp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index b45d68a61..aea0263d6 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -161,7 +161,8 @@ class BandcampAlbumIE(InfoExtractor): entries = [ self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) for t_path in tracks_paths] - title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title') + title = self._search_regex( + r'album_title\s*:\s*"(.*?)"', webpage, 'title', fatal=False) return { '_type': 'playlist', 'id': playlist_id, From 910c55205293097e79b2a134b69b572984e37846 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 16 Jan 2015 14:20:38 +0100 Subject: [PATCH 07/40] release 2015.01.16 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 458813b2d..63a79a7ee 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2015.01.15.1' +__version__ = '2015.01.16' From ba319696a99f34342a321d23bb41d3d442a773cc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sat, 17 Jan 2015 23:56:34 +0100 Subject: [PATCH 08/40] [options] Clarify that --password can be left out (#4723) --- youtube_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/options.py b/youtube_dl/options.py index a30974efd..f25c12e52 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -264,7 +264,7 @@ def parseOpts(overrideArguments=None): authentication.add_option( '-p', '--password', dest='password', metavar='PASSWORD', - help='account password') + help='account password. If this option is left out, youtube-dl will ask interactively.') authentication.add_option( '-2', '--twofactor', dest='twofactor', metavar='TWOFACTOR', From fdb2ed7455d509a5f0b17ad5c1d721d5484bde8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 18 Jan 2015 08:09:18 +0600 Subject: [PATCH 09/40] [abc7news] Add extractor (Closes #4734) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/abc7news.py | 68 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 youtube_dl/extractor/abc7news.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0902eb437..2b9d4455d 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals from .abc import ABCIE +from .abc7news import Abc7NewsIE from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE from .adobetv import AdobeTVIE diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py new file mode 100644 index 000000000..c04949c21 --- /dev/null +++ b/youtube_dl/extractor/abc7news.py @@ -0,0 +1,68 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import parse_iso8601 + + +class Abc7NewsIE(InfoExtractor): + _VALID_URL = r'https?://abc7news\.com(?:/[^/]+/(?P[^/]+))?/(?P\d+)' + _TESTS = [ + { + 'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/', + 'info_dict': { + 'id': '472581', + 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', + 'ext': 'mp4', + 'title': 'East Bay museum celebrates history of synthesized music', + 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10', + 'thumbnail': 're:^https?://.*\.jpg$', + 'timestamp': 1421123075, + 'upload_date': '20150113', + 'uploader': 'Jonathan Bloom', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + { + 'url': 'http://abc7news.com/472581', + 'only_matching': True, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id + + webpage = self._download_webpage(url, display_id) + + m3u8 = self._html_search_meta( + 'contentURL', webpage, 'm3u8 url', fatal=True) + + formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') + self._sort_formats(formats) + + title = self._og_search_title(webpage).strip() + description = self._og_search_description(webpage).strip() + thumbnail = self._og_search_thumbnail(webpage) + timestamp = parse_iso8601(self._search_regex( + r'
    \s*