From 9ccab970342ee043b422f0a67532d4d47c15d932 Mon Sep 17 00:00:00 2001 From: john Date: Sat, 8 Sep 2018 13:46:40 -0700 Subject: [PATCH 1/7] Added nicovideo search extractor --- README.md | 19 ++++++----- youtube_dl/extractor/extractors.py | 2 +- youtube_dl/extractor/niconico.py | 52 +++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index dd068a462..cb69ef6f6 100644 --- a/README.md +++ b/README.md @@ -77,8 +77,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo repairs broken URLs, but emits an error if this is not possible instead of searching. --ignore-config Do not read configuration files. When given - in the global configuration file - /etc/youtube-dl.conf: Do not read the user + in the global configuration file /etc + /youtube-dl.conf: Do not read the user configuration in ~/.config/youtube- dl/config (%APPDATA%/youtube-dl/config.txt on Windows) @@ -108,8 +108,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading. - --geo-bypass Bypass geographic restriction via faking - X-Forwarded-For HTTP header + --geo-bypass Bypass geographic restriction via faking X + -Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header --geo-bypass-country CODE Force bypass geographic restriction with @@ -258,12 +258,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo jar in --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information - permanently. By default - $XDG_CACHE_HOME/youtube-dl or - ~/.cache/youtube-dl . At the moment, only - YouTube player files (for videos with - obfuscated signatures) are cached, but that - may change. + permanently. By default $XDG_CACHE_HOME + /youtube-dl or ~/.cache/youtube-dl . At the + moment, only YouTube player files (for + videos with obfuscated signatures) are + cached, but that may change. --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 7dc569724..eb90049de 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -730,7 +730,7 @@ from .nick import ( NickNightIE, NickRuIE, ) -from .niconico import NiconicoIE, NiconicoPlaylistIE +from .niconico import NiconicoIE, NiconicoPlaylistIE, NicovideoIE from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 76b412ff1..e4d986f73 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -3,8 +3,10 @@ from __future__ import unicode_literals import json import datetime +import re +import datetime -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( compat_parse_qs, compat_urlparse, @@ -468,3 +470,51 @@ class NiconicoPlaylistIE(InfoExtractor): 'id': list_id, 'entries': entries, } + + #USAGE: youtube-dl "nicosearch:" +class NicovideoIE(SearchInfoExtractor): + IE_DESC = 'Nico video search' + _MAX_RESULTS = 100000 + _SEARCH_KEY = 'nicosearch' + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" + entries = [] + currDate = datetime.datetime.now().date() + + while True: + search_url = "http://www.nicovideo.jp/search/%s?sort=f&order=d" % (query) + print(search_url) + r = self._get_entries_for_date(search_url, query, currDate) + + #did we gather more entries in the last few pages than were asked for? If so, only add as many as are needed to reach the desired number. + m = n - len(entries) + entries += r[0:min(m, len(r))] + + #for a given search, nicovideo will show a maximum of 50 pages. My way around this is specifying a date for the search, down to the date, which for the most part + #is a guarantee that the number of pages in the search results will not exceed 50. For any given search for a day, we extract everything available, and move on, until + #finding as many entries as were requested. + currDate -= datetime.timedelta(days=1) + if(len(entries) >= n): + break + + return { + '_type': 'playlist', + 'id': query, + 'entries': entries + } + + def _get_entries_for_date(self, url, query, date, pageNumber = 1): + link = url + "&page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date) + results = self._download_webpage(link, query, note='Downloading results page %s for date %s' % (pageNumber, date)) + entries = [] + r = re.findall(r'= 32): + entries += self._get_entries_for_date(url, query, date, pageNumber + 1) + return entries From efae7c1a5d335a021809c95b13de04fe5b28bf36 Mon Sep 17 00:00:00 2001 From: john Date: Sat, 8 Sep 2018 15:00:05 -0700 Subject: [PATCH 2/7] made regex more robust, fixed coding conventions --- youtube_dl/extractor/niconico.py | 33 ++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index e4d986f73..fce5755de 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import json import datetime import re -import datetime from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -471,50 +470,52 @@ class NiconicoPlaylistIE(InfoExtractor): 'entries': entries, } - #USAGE: youtube-dl "nicosearch:" + +# USAGE: youtube-dl "nicosearch:" class NicovideoIE(SearchInfoExtractor): IE_DESC = 'Nico video search' _MAX_RESULTS = 100000 _SEARCH_KEY = 'nicosearch' + def _get_n_results(self, query, n): """Get a specified number of results for a query""" entries = [] currDate = datetime.datetime.now().date() - - while True: + + while True: search_url = "http://www.nicovideo.jp/search/%s?sort=f&order=d" % (query) print(search_url) r = self._get_entries_for_date(search_url, query, currDate) - #did we gather more entries in the last few pages than were asked for? If so, only add as many as are needed to reach the desired number. + # did we gather more entries in the last few pages than were asked for? If so, only add as many as are needed to reach the desired number. m = n - len(entries) entries += r[0:min(m, len(r))] - - #for a given search, nicovideo will show a maximum of 50 pages. My way around this is specifying a date for the search, down to the date, which for the most part - #is a guarantee that the number of pages in the search results will not exceed 50. For any given search for a day, we extract everything available, and move on, until - #finding as many entries as were requested. + + # for a given search, nicovideo will show a maximum of 50 pages. My way around this is specifying a date for the search, down to the date, which for the most part + # is a guarantee that the number of pages in the search results will not exceed 50. For any given search for a day, we extract everything available, and move on, until + # finding as many entries as were requested. currDate -= datetime.timedelta(days=1) if(len(entries) >= n): break - + return { '_type': 'playlist', 'id': query, 'entries': entries - } + } - def _get_entries_for_date(self, url, query, date, pageNumber = 1): + def _get_entries_for_date(self, url, query, date, pageNumber=1): link = url + "&page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date) results = self._download_webpage(link, query, note='Downloading results page %s for date %s' % (pageNumber, date)) entries = [] - r = re.findall(r'= 32): entries += self._get_entries_for_date(url, query, date, pageNumber + 1) return entries From bfbeb00e0c9f7310826f3771d20c736fcdd066bf Mon Sep 17 00:00:00 2001 From: john Date: Sat, 8 Sep 2018 15:10:00 -0700 Subject: [PATCH 3/7] Added date bugfix --- youtube_dl/extractor/niconico.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index fce5755de..0ad6479b1 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -484,7 +484,6 @@ class NicovideoIE(SearchInfoExtractor): while True: search_url = "http://www.nicovideo.jp/search/%s?sort=f&order=d" % (query) - print(search_url) r = self._get_entries_for_date(search_url, query, currDate) # did we gather more entries in the last few pages than were asked for? If so, only add as many as are needed to reach the desired number. @@ -495,7 +494,7 @@ class NicovideoIE(SearchInfoExtractor): # is a guarantee that the number of pages in the search results will not exceed 50. For any given search for a day, we extract everything available, and move on, until # finding as many entries as were requested. currDate -= datetime.timedelta(days=1) - if(len(entries) >= n): + if(len(entries) >= n or currDate < datetime.datetime(2007, 1, 1)): break return { @@ -506,7 +505,7 @@ class NicovideoIE(SearchInfoExtractor): def _get_entries_for_date(self, url, query, date, pageNumber=1): link = url + "&page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date) - results = self._download_webpage(link, query, note='Downloading results page %s for date %s' % (pageNumber, date)) + results = self._download_webpage(link, query, note='Extracting results from page %s for date %s' % (pageNumber, date)) entries = [] r = re.findall(r''') data-video-id=['|"](..[0-9]{1,8})''', results) From 63cc9b4110b5fb3a13344f542db22c87f65a64d3 Mon Sep 17 00:00:00 2001 From: john Date: Sat, 8 Sep 2018 15:35:44 -0700 Subject: [PATCH 4/7] undid whatever I did to the readme --- README.md | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index cb69ef6f6..dd068a462 100644 --- a/README.md +++ b/README.md @@ -77,8 +77,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo repairs broken URLs, but emits an error if this is not possible instead of searching. --ignore-config Do not read configuration files. When given - in the global configuration file /etc - /youtube-dl.conf: Do not read the user + in the global configuration file + /etc/youtube-dl.conf: Do not read the user configuration in ~/.config/youtube- dl/config (%APPDATA%/youtube-dl/config.txt on Windows) @@ -108,8 +108,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo proxy specified by --proxy (or none, if the option is not present) is used for the actual downloading. - --geo-bypass Bypass geographic restriction via faking X - -Forwarded-For HTTP header + --geo-bypass Bypass geographic restriction via faking + X-Forwarded-For HTTP header --no-geo-bypass Do not bypass geographic restriction via faking X-Forwarded-For HTTP header --geo-bypass-country CODE Force bypass geographic restriction with @@ -258,11 +258,12 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo jar in --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information - permanently. By default $XDG_CACHE_HOME - /youtube-dl or ~/.cache/youtube-dl . At the - moment, only YouTube player files (for - videos with obfuscated signatures) are - cached, but that may change. + permanently. By default + $XDG_CACHE_HOME/youtube-dl or + ~/.cache/youtube-dl . At the moment, only + YouTube player files (for videos with + obfuscated signatures) are cached, but that + may change. --no-cache-dir Disable filesystem caching --rm-cache-dir Delete all filesystem cache files From d61d495d5ed18fee27435afe8dc013616bf33d97 Mon Sep 17 00:00:00 2001 From: john Date: Fri, 14 Sep 2018 14:53:36 -0700 Subject: [PATCH 5/7] date bug fix --- youtube_dl/extractor/niconico.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 0ad6479b1..97f8f5737 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -494,7 +494,7 @@ class NicovideoIE(SearchInfoExtractor): # is a guarantee that the number of pages in the search results will not exceed 50. For any given search for a day, we extract everything available, and move on, until # finding as many entries as were requested. currDate -= datetime.timedelta(days=1) - if(len(entries) >= n or currDate < datetime.datetime(2007, 1, 1)): + if(len(entries) >= n or currDate < datetime.date(2007, 1, 1)): break return { From ef212dc2fefddd1228d02c211c5033fc878addeb Mon Sep 17 00:00:00 2001 From: john Date: Mon, 31 Dec 2018 16:00:11 -0800 Subject: [PATCH 6/7] Made changes --- youtube_dl/extractor/niconico.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 97f8f5737..708e819bf 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -483,7 +483,7 @@ class NicovideoIE(SearchInfoExtractor): currDate = datetime.datetime.now().date() while True: - search_url = "http://www.nicovideo.jp/search/%s?sort=f&order=d" % (query) + search_url = "http://www.nicovideo.jp/search/%s" % query r = self._get_entries_for_date(search_url, query, currDate) # did we gather more entries in the last few pages than were asked for? If so, only add as many as are needed to reach the desired number. @@ -504,17 +504,21 @@ class NicovideoIE(SearchInfoExtractor): } def _get_entries_for_date(self, url, query, date, pageNumber=1): - link = url + "&page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date) - results = self._download_webpage(link, query, note='Extracting results from page %s for date %s' % (pageNumber, date)) - entries = [] - r = re.findall(r''') data-video-id=['|"](..[0-9]{1,8})''', results) + while True: + link = url + "?page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date) + results = self._download_webpage(link, "None", note='Extracting results from page %s for date %s' % (pageNumber, date)) + entries = [] + r = re.findall(r'(?<=data-video-id=)["\']?(?P.*?)(?=["\'])', results) - for item in r: - e = self.url_result("http://www.nicovideo.jp/watch/" + str(item), 'Niconico') - entries.append(e) + for item in r: + e = self.url_result("http://www.nicovideo.jp/watch/" + item, 'Niconico') + entries.append(e) + + # each page holds a maximum of 32 entries. If we've seen 32 entries on the current page, + # it's possible there may be another, so we can check. It's a little awkward, but it works. + if(len(r) < 32): + break + + pageNumber += 1 - # each page holds a maximum of 32 entries. If we've seen 32 entries on the current page, - # it's possible there may be another, so we can check. It's a little awkward, but it works. - if(len(r) >= 32): - entries += self._get_entries_for_date(url, query, date, pageNumber + 1) return entries From 335f037e80b69d8ca68fc3df79d5d3cd6ed7dac3 Mon Sep 17 00:00:00 2001 From: john Date: Mon, 31 Dec 2018 16:18:51 -0800 Subject: [PATCH 7/7] Added query argument --- youtube_dl/extractor/niconico.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 708e819bf..6f22f9c8a 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -506,7 +506,7 @@ class NicovideoIE(SearchInfoExtractor): def _get_entries_for_date(self, url, query, date, pageNumber=1): while True: link = url + "?page=" + str(pageNumber) + "&start=" + str(date) + "&end=" + str(date) - results = self._download_webpage(link, "None", note='Extracting results from page %s for date %s' % (pageNumber, date)) + results = self._download_webpage(link, "None", query={"Search_key": query}, note='Extracting results from page %s for date %s' % (pageNumber, date)) entries = [] r = re.findall(r'(?<=data-video-id=)["\']?(?P.*?)(?=["\'])', results)