From 469c1dc4e3575203d388c6317b1b528bc1b9ba6c Mon Sep 17 00:00:00 2001 From: hodayabu Date: Mon, 20 Apr 2020 13:40:18 +0300 Subject: [PATCH 1/7] tiktok youtube-dl --- test/ci/test_tiktok.py | 15 ++++++++ youtube_dl/extractor/tiktok.py | 63 ++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 7 deletions(-) create mode 100644 test/ci/test_tiktok.py diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py new file mode 100644 index 000000000..130097688 --- /dev/null +++ b/test/ci/test_tiktok.py @@ -0,0 +1,15 @@ +import unittest +import youtube_dl + + +class MyTestCase(unittest.TestCase): + def test_something(self): + url = 'https://www.tiktok.com/@danieltbraun/video/6817099671043853574' + params = {} + ydl = youtube_dl.YoutubeDL(params) + info = ydl.extract_info(url, download=False) + self.assertEquals(info['title'], "She got a face full of DUSTBIN #foryou") + + +if __name__ == '__main__': + unittest.main() diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 66088b9ab..24ed25557 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,6 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +from newspaper import Article +from bs4 import BeautifulSoup +import requests +import json from .common import InfoExtractor from ..utils import ( compat_str, @@ -69,7 +73,8 @@ class TikTokIE(TikTokBaseIE): https?:// (?: (?:m\.)?tiktok\.com/v| - (?:www\.)?tiktok\.com/share/video + (?:www\.)?tiktok\.com/share/video| + (?:www\.|)tiktok\.com\/@(?:.*?)\/video ) /(?P\d+) ''' @@ -94,12 +99,56 @@ class TikTokIE(TikTokBaseIE): }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://m.tiktok.com/v/%s.html' % video_id, video_id) - data = self._parse_json(self._search_regex( - r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) - return self._extract_aweme(data) + #extract meta data using the official api + res = requests.get('https://www.tiktok.com/oembed?url='+url) + #json contains: provider url, titile, html, author_namee, height, thumbnail_width, width, version, + #author_url, thumbnail_height, thumbnail_url, type, provider_name (tiktok) + json= res.json() + + #extract metadata with beautifulSoup + #class - jsx-1038045583 jsx-3192540912 jsx-2150087249 video-meta-count conatins likes and comments + result = requests.get(url) + src = result.content + soup = BeautifulSoup(result.text, 'html.parser') + + meta_data= soup.find_all("div",{ "class": "jsx-1715470091.desktop-container"}) + print (meta_data) + + # + # + # video_id = self._match_id(url) + # webpage = self._download_webpage(url, video_id) + # s_rejex=self._search_regex(r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data') + # data = self._parse_json(s_rejex, video_id) + # #return self.info_dict() + #return self._extract_aweme(data) + return None + + # def info_dict(self,video_id,video_title,formats,uploader, timestamp, thumbnail, view_count, uploader_id, is_live, live_status + # , likes_count, shares_count, subtitles, comment_count, ): + # info_dict = { + # 'id': video_id, + # 'title': video_title, + # 'formats': formats, + # 'uploader': uploader, + # 'timestamp': timestamp, + # 'thumbnail': thumbnail, + # 'view_count': view_count, + # 'uploader_id': uploader_id, + # 'is_live': is_live, + # 'live_status': live_status, + # 'like_count': likes_count, + # 'share_count': shares_count, + # 'subtitles': subtitles, + # 'comment_count': comment_count, + # 'other_posts_view_count': other_posts_view_count, + # 'uploader_handle': uploader_handle, + # '_internal_data': { + # 'page': webpage, + # 'api_response_list': [tahoe_data.primary, tahoe_data.secondary] + # } + # } + # return info_dict class TikTokUserIE(TikTokBaseIE): From 82381d8dd4bd97e51aef2cda2c05b0ed684dbd7e Mon Sep 17 00:00:00 2001 From: hodayabu Date: Mon, 20 Apr 2020 15:27:31 +0300 Subject: [PATCH 2/7] tiktok youtube-dl --- test/ci/test_tiktok.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 130097688..7d2f87047 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -2,8 +2,8 @@ import unittest import youtube_dl -class MyTestCase(unittest.TestCase): - def test_something(self): +class TikTokTestYoutubeDl(unittest.TestCase): + def test_meta_data(self): url = 'https://www.tiktok.com/@danieltbraun/video/6817099671043853574' params = {} ydl = youtube_dl.YoutubeDL(params) From 817be403ebd45c8f20049b00a4f414a17f1270f1 Mon Sep 17 00:00:00 2001 From: hodayabu Date: Mon, 20 Apr 2020 20:54:18 +0300 Subject: [PATCH 3/7] tiktok extractor --- test/ci/__init__.py | 0 test/ci/test_tiktok.py | 3 +- youtube_dl/extractor/tiktok.py | 117 +++++++++++++++++++-------------- 3 files changed, 71 insertions(+), 49 deletions(-) create mode 100644 test/ci/__init__.py diff --git a/test/ci/__init__.py b/test/ci/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 7d2f87047..7f0294c2a 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -8,7 +8,8 @@ class TikTokTestYoutubeDl(unittest.TestCase): params = {} ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) - self.assertEquals(info['title'], "She got a face full of DUSTBIN #foryou") + self.assertEquals(info['share_count'], 110) + if __name__ == '__main__': diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 24ed25557..140db07bc 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals - -from newspaper import Article +import ast from bs4 import BeautifulSoup import requests import json @@ -16,6 +15,9 @@ from ..utils import ( ) +# add to requirements.txt- bs4, newspaper, requests + + class TikTokBaseIE(InfoExtractor): def _extract_aweme(self, data): video = data['video'] @@ -68,6 +70,8 @@ class TikTokBaseIE(InfoExtractor): } + + class TikTokIE(TikTokBaseIE): _VALID_URL = r'''(?x) https?:// @@ -99,57 +103,74 @@ class TikTokIE(TikTokBaseIE): }] def _real_extract(self, url): - #extract meta data using the official api - res = requests.get('https://www.tiktok.com/oembed?url='+url) - #json contains: provider url, titile, html, author_namee, height, thumbnail_width, width, version, - #author_url, thumbnail_height, thumbnail_url, type, provider_name (tiktok) - json= res.json() + video_id = url.split('/')[-1] - #extract metadata with beautifulSoup - #class - jsx-1038045583 jsx-3192540912 jsx-2150087249 video-meta-count conatins likes and comments - result = requests.get(url) - src = result.content - soup = BeautifulSoup(result.text, 'html.parser') + # extract meta data using the official api + # Response json contains: provider url, title, html, author_namee, height, thumbnail_width, width, version, + # author_url, thumbnail_height, thumbnail_url, type, provider_name (tiktok) - meta_data= soup.find_all("div",{ "class": "jsx-1715470091.desktop-container"}) - print (meta_data) + json_api = self._download_json('https://www.tiktok.com/oembed?url=' + url, video_id) - # - # - # video_id = self._match_id(url) - # webpage = self._download_webpage(url, video_id) - # s_rejex=self._search_regex(r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data') - # data = self._parse_json(s_rejex, video_id) - # #return self.info_dict() - #return self._extract_aweme(data) - return None + # extract metadata with beautifulSoup + webpage = self._download_webpage(url, video_id) + soup = BeautifulSoup(webpage, features="html.parser") + h2 = soup.find_all("h2", {"class": "jsx-1038045583 jsx-3192540912 jsx-2150087249 video-meta-count"}) + data = h2[0].text.split(' ') + likes_count = self.numeric_convert(data[0]) + comments_count = self.numeric_convert(data[3]) + json_next_data = soup.find(id='__NEXT_DATA__') + props = json_next_data.contents[0] + json_data_encode = json.dumps(props.encode('utf-8')) + ast_le = ast.literal_eval(json_data_encode) + data_dict = json.loads(ast_le) + timestamp = self.numeric_convert(data_dict['props']['pageProps']['videoData']['itemInfos']['createTime']) - # def info_dict(self,video_id,video_title,formats,uploader, timestamp, thumbnail, view_count, uploader_id, is_live, live_status - # , likes_count, shares_count, subtitles, comment_count, ): - # info_dict = { - # 'id': video_id, - # 'title': video_title, - # 'formats': formats, - # 'uploader': uploader, - # 'timestamp': timestamp, - # 'thumbnail': thumbnail, - # 'view_count': view_count, - # 'uploader_id': uploader_id, - # 'is_live': is_live, - # 'live_status': live_status, - # 'like_count': likes_count, - # 'share_count': shares_count, - # 'subtitles': subtitles, - # 'comment_count': comment_count, - # 'other_posts_view_count': other_posts_view_count, - # 'uploader_handle': uploader_handle, - # '_internal_data': { - # 'page': webpage, - # 'api_response_list': [tahoe_data.primary, tahoe_data.secondary] - # } - # } - # return info_dict + shares = data_dict['props']['pageProps']['videoData']['itemInfos']['shareCount'] + views = data_dict['props']['pageProps']['videoData']['itemInfos']['playCount'] + duration = data_dict['props']['pageProps']['videoData']['itemInfos']['video']['videoMeta']['duration'] + provider_id = data_dict['props']['pageProps']['videoData']['itemInfos']['authorId'] + # TO-DO- check on formats + + return self.info_dict(video_id, str(url), json_api['title'], + json_api['author_name'], timestamp, json_api['thumbnail_url'], + views, provider_id, False, 'not_live', likes_count, shares, '', comments_count,duration) + + def numeric_convert(self, unicode): + if 'K' in unicode: + unicode=unicode[:-1] + return int(float(unicode)*1000) + if 'M' in unicode: + unicode=unicode[:-1] + return int(float(unicode)*100000) + else: + return int(unicode) + + + + def info_dict (self, video_id, url, video_title, + uploader, timestamp, thumbnail, + view_count, uploader_id, is_live, live_status + , likes_count, shares_count, subtitles, comment_count, duration): + info_dict = { + 'id': video_id, + 'url': url, + 'title': video_title, + 'uploader': uploader, + 'timestamp': timestamp, + 'thumbnail': thumbnail, + 'view_count': view_count, + 'uploader_id': uploader_id, + 'is_live': is_live, + 'live_status': live_status, + 'like_count': likes_count, + 'share_count': shares_count, + 'subtitles': subtitles, + 'comment_count': comment_count, + 'duration': duration + + } + return info_dict class TikTokUserIE(TikTokBaseIE): _VALID_URL = r'''(?x) From 8cddcb03222eeb6ca4bb1e8a441dcfb7c3449d5a Mon Sep 17 00:00:00 2001 From: hodayabu Date: Mon, 20 Apr 2020 21:07:13 +0300 Subject: [PATCH 4/7] video info platform tiktok --- test/ci/test_tiktok.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 7f0294c2a..2fe6cc4b9 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -8,7 +8,8 @@ class TikTokTestYoutubeDl(unittest.TestCase): params = {} ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) - self.assertEquals(info['share_count'], 110) + self.assertEquals(info['share_count'], 121) + From 17880cbaca971d631304ed6cfbe7c1365e8b4110 Mon Sep 17 00:00:00 2001 From: hodayabu Date: Tue, 21 Apr 2020 14:32:32 +0300 Subject: [PATCH 5/7] tiktok fix code --- test/ci/test_tiktok.py | 21 +++++++++--- youtube_dl/extractor/tiktok.py | 62 ++++++++++------------------------ 2 files changed, 35 insertions(+), 48 deletions(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 2fe6cc4b9..5e050e6e2 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -4,13 +4,26 @@ import youtube_dl class TikTokTestYoutubeDl(unittest.TestCase): def test_meta_data(self): - url = 'https://www.tiktok.com/@danieltbraun/video/6817099671043853574' + url = 'https://www.tiktok.com/@oriangaon/video/6807126376001441030' params = {} ydl = youtube_dl.YoutubeDL(params) info = ydl.extract_info(url, download=False) - self.assertEquals(info['share_count'], 121) - - + self.assertEquals(info['id'], '6807126376001441030') + self.assertEquals(info['url'], 'https://www.tiktok.com/@oriangaon/video/6807126376001441030') + self.assertEquals(info['title'], '#foryou #foyou Mmmmm....,,') + self.assertEquals(info['uploader'], 'Oriangaon') + self.assertEquals(info['timestamp'], 1584907616) + self.assertEquals(info['thumbnail'], + 'https://p16-va-default.akamaized.net/obj/tos-maliva-p-0068/d1a8fbd3e42dda3a1baa01ee9edad289') + self.assertGreaterEqual(info['view_count'], 79864) + self.assertEquals(info['uploader_id'], '6772113344733955077') + self.assertFalse(info['is_live']) + self.assertEquals(info['live_status'], 'not_live') + self.assertGreaterEqual(info['like_count'], 2213) + self.assertGreaterEqual(info['share_count'], 109) + self.assertGreaterEqual(info['comment_count'], 40) + self.assertEquals(info['duration'], 10) + self.assertEquals(info['ext'], 'mp.4') if __name__ == '__main__': diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 140db07bc..9d742465e 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import ast from bs4 import BeautifulSoup -import requests import json from .common import InfoExtractor from ..utils import ( @@ -11,11 +10,7 @@ from ..utils import ( int_or_none, str_or_none, try_get, - url_or_none, -) - - -# add to requirements.txt- bs4, newspaper, requests + url_or_none) class TikTokBaseIE(InfoExtractor): @@ -70,8 +65,6 @@ class TikTokBaseIE(InfoExtractor): } - - class TikTokIE(TikTokBaseIE): _VALID_URL = r'''(?x) https?:// @@ -103,55 +96,34 @@ class TikTokIE(TikTokBaseIE): }] def _real_extract(self, url): - video_id = url.split('/')[-1] - - # extract meta data using the official api - # Response json contains: provider url, title, html, author_namee, height, thumbnail_width, width, version, - # author_url, thumbnail_height, thumbnail_url, type, provider_name (tiktok) - + video_id = self._match_id(url) json_api = self._download_json('https://www.tiktok.com/oembed?url=' + url, video_id) - # extract metadata with beautifulSoup webpage = self._download_webpage(url, video_id) soup = BeautifulSoup(webpage, features="html.parser") - h2 = soup.find_all("h2", {"class": "jsx-1038045583 jsx-3192540912 jsx-2150087249 video-meta-count"}) - data = h2[0].text.split(' ') - likes_count = self.numeric_convert(data[0]) - comments_count = self.numeric_convert(data[3]) json_next_data = soup.find(id='__NEXT_DATA__') props = json_next_data.contents[0] json_data_encode = json.dumps(props.encode('utf-8')) ast_le = ast.literal_eval(json_data_encode) data_dict = json.loads(ast_le) - timestamp = self.numeric_convert(data_dict['props']['pageProps']['videoData']['itemInfos']['createTime']) - shares = data_dict['props']['pageProps']['videoData']['itemInfos']['shareCount'] - views = data_dict['props']['pageProps']['videoData']['itemInfos']['playCount'] - duration = data_dict['props']['pageProps']['videoData']['itemInfos']['video']['videoMeta']['duration'] - provider_id = data_dict['props']['pageProps']['videoData']['itemInfos']['authorId'] - - # TO-DO- check on formats + item_info = data_dict['props']['pageProps']['videoData']['itemInfos'] + timestamp = int(item_info['createTime']) + shares = item_info['shareCount'] + views = item_info['playCount'] + duration = item_info['video']['videoMeta']['duration'] + provider_id = item_info['authorId'] + comments_count = item_info['commentCount'] + likes_count = item_info['diggCount'] return self.info_dict(video_id, str(url), json_api['title'], json_api['author_name'], timestamp, json_api['thumbnail_url'], - views, provider_id, False, 'not_live', likes_count, shares, '', comments_count,duration) + views, provider_id, False, 'not_live', likes_count, shares, '', comments_count, duration) - def numeric_convert(self, unicode): - if 'K' in unicode: - unicode=unicode[:-1] - return int(float(unicode)*1000) - if 'M' in unicode: - unicode=unicode[:-1] - return int(float(unicode)*100000) - else: - return int(unicode) - - - - def info_dict (self, video_id, url, video_title, - uploader, timestamp, thumbnail, - view_count, uploader_id, is_live, live_status - , likes_count, shares_count, subtitles, comment_count, duration): + def info_dict(self, video_id, url, video_title, + uploader, timestamp, thumbnail, + view_count, uploader_id, is_live, live_status + , likes_count, shares_count, subtitles, comment_count, duration): info_dict = { 'id': video_id, 'url': url, @@ -167,11 +139,13 @@ class TikTokIE(TikTokBaseIE): 'share_count': shares_count, 'subtitles': subtitles, 'comment_count': comment_count, - 'duration': duration + 'duration': duration, + 'ext':'mp.4' } return info_dict + class TikTokUserIE(TikTokBaseIE): _VALID_URL = r'''(?x) https?:// From b9f1ba3c017ae7b235d7b1efd9be36f2802069af Mon Sep 17 00:00:00 2001 From: hodayabu Date: Tue, 21 Apr 2020 17:57:08 +0300 Subject: [PATCH 6/7] video info platform tiktok --- youtube_dl/extractor/tiktok.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 9d742465e..4be1ef562 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -118,12 +118,12 @@ class TikTokIE(TikTokBaseIE): return self.info_dict(video_id, str(url), json_api['title'], json_api['author_name'], timestamp, json_api['thumbnail_url'], - views, provider_id, False, 'not_live', likes_count, shares, '', comments_count, duration) + views, provider_id, False, 'not_live', likes_count, shares, '', comments_count, duration, json_api['html']) def info_dict(self, video_id, url, video_title, uploader, timestamp, thumbnail, view_count, uploader_id, is_live, live_status - , likes_count, shares_count, subtitles, comment_count, duration): + , likes_count, shares_count, subtitles, comment_count, duration, embed_code): info_dict = { 'id': video_id, 'url': url, @@ -140,8 +140,8 @@ class TikTokIE(TikTokBaseIE): 'subtitles': subtitles, 'comment_count': comment_count, 'duration': duration, - 'ext':'mp.4' - + 'ext':'mp.4', + 'embed_code': embed_code } return info_dict From 06b483099c8d4b854f551fa16a1a3c2f1be98db8 Mon Sep 17 00:00:00 2001 From: bhodaya Date: Sun, 26 Apr 2020 09:19:15 +0300 Subject: [PATCH 7/7] add embed code --- test/ci/test_tiktok.py | 1 + youtube_dl/extractor/tiktok.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/test/ci/test_tiktok.py b/test/ci/test_tiktok.py index 5e050e6e2..c0a93d61f 100644 --- a/test/ci/test_tiktok.py +++ b/test/ci/test_tiktok.py @@ -24,6 +24,7 @@ class TikTokTestYoutubeDl(unittest.TestCase): self.assertGreaterEqual(info['comment_count'], 40) self.assertEquals(info['duration'], 10) self.assertEquals(info['ext'], 'mp.4') + self.assertGreater(len(info['embed_code']),0) if __name__ == '__main__': diff --git a/youtube_dl/extractor/tiktok.py b/youtube_dl/extractor/tiktok.py index 4be1ef562..4a2d3b94f 100644 --- a/youtube_dl/extractor/tiktok.py +++ b/youtube_dl/extractor/tiktok.py @@ -1,4 +1,3 @@ -# coding: utf-8 from __future__ import unicode_literals import ast from bs4 import BeautifulSoup