[Flipagram] add new extractor

I saw that there is already another pull request adding an extractor for this website, but it does not extract all the necessary metadata and has been inactive for some time.
This commit is contained in:
TRox1972 2016-05-19 02:20:11 +02:00
parent dd81769c62
commit 444d749924
3 changed files with 114 additions and 0 deletions

View File

@ -833,6 +833,13 @@ class InfoExtractor(object):
'title': unescapeHTML(json_ld.get('headline')),
'description': unescapeHTML(json_ld.get('articleBody')),
})
elif item_type == 'VideoObject':
info.update({
'title': unescapeHTML(json_ld.get('name')),
'description': unescapeHTML(json_ld.get('description')),
'upload_date': unified_strdate(json_ld.get('upload_date')),
'url': unescapeHTML(json_ld.get('contentUrl')),
})
return dict((k, v) for k, v in info.items() if v is not None)
@staticmethod

View File

@ -240,6 +240,7 @@ from .fivemin import FiveMinIE
from .fivetv import FiveTVIE
from .fktv import FKTVIE
from .flickr import FlickrIE
from .flipagram import FlipagramIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
from .formula1 import Formula1IE

View File

@ -0,0 +1,106 @@
# coding: utf-8
from __future__ import unicode_literals
import time
from .common import InfoExtractor
from ..utils import (
parse_iso8601,
unified_strdate,
int_or_none,
)
class FlipagramIE(InfoExtractor):
    """Extractor for single "flipagram" pages on flipagram.com (``/f/<id>``).

    Metadata is read from two places in the page: the JSON object assigned
    to ``window.reactH2O`` (user, counts, covers, comments, media URLs) and
    the page's JSON-LD block (title and description).
    """

    _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?_]+)'
    _TESTS = [{
        'url': 'https://flipagram.com/f/myrWjW9RJw',
        'md5': '541988fb6c4c7c375215ea22a4a21841',
        'info_dict': {
            'id': 'myrWjW9RJw',
            'title': 'Flipagram by crystaldolce featuring King and Lionheart by Of Monsters and Men',
            'description': 'Herbie\'s first bannana🍌🐢🍌. #animals #pets #reptile #tortoise #sulcata #tort #justatreat #snacktime #bannanas #rescuepets #ofmonstersandmen @animals',
            'ext': 'mp4',
            'uploader': 'Crystal Dolce',
            'creator': 'Crystal Dolce',
            'uploader_id': 'crystaldolce',
        }
    }, {
        'url': 'https://flipagram.com/f/nyvTSJMKId',
        'only_matching': True,
    }]

    def c_date_to_iso(self, c_date):
        """Convert a date like '04/25/2016 00:23:24 UTC' to ISO 8601.

        Returns the date formatted as ``YYYY-MM-DDTHH:MM:SS``, or None when
        ``c_date`` is empty, not a string, or not in the expected format.
        Returning None (rather than raising) keeps one malformed comment
        date from aborting the whole extraction.
        """
        if not c_date:
            return None
        try:
            parsed = time.strptime(c_date, '%m/%d/%Y %H:%M:%S %Z')
        except (TypeError, ValueError):
            return None
        return time.strftime('%Y-%m-%dT%H:%M:%S', parsed)

    def _real_extract(self, url):
        """Download the flipagram page at ``url`` and build its info dict.

        Returns a dict with the video id, title/description from JSON-LD,
        the available formats (optional audio preview plus the video),
        thumbnails, uploader details, counters, comments and tags.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        user_data = self._parse_json(
            self._search_regex(
                r'window\.reactH2O\s*=\s*({.+});', webpage, 'user data'),
            video_id)
        content_data = self._search_json_ld(webpage, video_id)

        # ``or {}`` instead of a .get() default: the default is not used
        # when the key is present with a JSON null value.
        flipagram = user_data.get('flipagram') or {}
        counts = flipagram.get('counts') or {}
        user = flipagram.get('user') or {}
        video = flipagram.get('video') or {}

        thumbnails = []
        for cover in flipagram.get('covers') or []:
            cover_url = cover.get('url')
            if not cover_url:
                continue
            thumbnails.append({
                'url': self._proto_relative_url(cover_url),
                'width': int_or_none(cover.get('width')),
                'height': int_or_none(cover.get('height')),
            })

        comments = []
        video_comments = (user_data.get('comments') or {}).get(video_id) or {}
        for comment in video_comments.get('items') or []:
            text = comment.get('comment') or []
            author = comment.get('user') or {}
            comments.append({
                'author': author.get('name'),
                'author_id': author.get('username'),
                'id': comment.get('id'),
                'text': text[0] if text else '',
                # c_date_to_iso yields None for a missing/invalid date.
                # NOTE(review): parse_iso8601 is expected to pass None
                # through unchanged - confirm in utils.
                'timestamp': parse_iso8601(
                    self.c_date_to_iso(comment.get('created'))),
            })

        # The first 'story' entry is skipped and the remaining entries are
        # flattened into one list; a missing 'story' yields no tags.
        story = flipagram.get('story') or []
        tags = [tag for item in story[1:] for tag in item]

        formats = []
        track = (flipagram.get('music') or {}).get('track') or {}
        preview_url = track.get('previewUrl')
        if preview_url:
            formats.append({
                'url': preview_url,
                'ext': 'm4a',
                'vcodec': 'none',
            })
        video_url = video.get('url')
        # Skip the video entry when no URL is present rather than emitting
        # a format whose 'url' is None.
        if video_url:
            formats.append({
                'url': video_url,
                'ext': 'mp4',
                'width': int_or_none(video.get('width')),
                'height': int_or_none(video.get('height')),
                'filesize': int_or_none(video.get('size')),
            })

        # NOTE(review): 'iso801Created' looks like a typo for
        # 'iso8601Created'; the correctly spelled key is tried first and the
        # original spelling is kept as a fallback - confirm against the
        # site's JSON.
        created_iso = (
            flipagram.get('iso8601Created') or flipagram.get('iso801Created'))

        return {
            'id': video_id,
            'title': content_data.get('title'),
            'formats': formats,
            'ext': 'mp4',
            'thumbnails': thumbnails,
            'description': content_data.get('description'),
            'uploader': user.get('name'),
            'creator': user.get('name'),
            'timestamp': parse_iso8601(created_iso),
            'upload_date': unified_strdate(flipagram.get('created')),
            'uploader_id': user.get('username'),
            'view_count': int_or_none(counts.get('plays')),
            'repost_count': int_or_none(counts.get('reflips')),
            'comment_count': int_or_none(counts.get('comments')),
            'comments': comments,
            'tags': tags,
        }