[gfycat] Add new extractor

alex 2015-04-15 18:13:16 -04:00
parent 9fc03aa87c
commit 1d0141565d
2 changed files with 105 additions and 0 deletions

youtube_dl/extractor/__init__.py

@ -185,6 +185,7 @@ from .gametrailers import GametrailersIE
from .gazeta import GazetaIE
from .gdcvault import GDCVaultIE
from .generic import GenericIE
from .gfycat import GfycatIE
from .giantbomb import GiantBombIE
from .giga import GigaIE
from .glide import GlideIE

youtube_dl/extractor/gfycat.py

@ -0,0 +1,104 @@
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
)


class GfycatIE(InfoExtractor):
    _VALID_URL = r'https?://(?:\w+\.)?gfycat\.com/(?P<id>[a-zA-Z]+)(?:\.(?P<ext>gif|webm|mp4))?'
    _TESTS = [{
        'url': 'http://gfycat.com/RequiredUnkemptBuzzard',
        'info_dict': {
            'id': 'RequiredUnkemptBuzzard',
            'title': 'Headshot!',
            'ext': 'mp4',
        },
    }, {
        'url': 'https://giant.gfycat.com/RequiredUnkemptBuzzard.gif',
        'info_dict': {
            'id': 'RequiredUnkemptBuzzard',
            'title': 'Headshot!',
            'ext': 'gif',
        },
    }]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Extension explicitly requested in the URL (e.g. a direct .gif link), if any
        requested_ext = mobj.group('ext')

        webpage = self._download_webpage(
            'http://gfycat.com/' + video_id, video_id)

        width = int_or_none(self._search_regex(
            r'gfyWidth[\s=]*?"(\d+?)"', webpage, 'width', fatal=False))
        height = int_or_none(self._search_regex(
            r'gfyHeight[\s=]*?"(\d+?)"', webpage, 'height', fatal=False))
        frame_rate = int_or_none(self._search_regex(
            r'gfyFrameRate[\s=]*?"(\d+?)"', webpage, 'frame rate', fatal=False))
        frame_count = int_or_none(self._search_regex(
            r'gfyNumFrames[\s=]*?"(\d+?)"', webpage, 'frame count', fatal=False))
        view_count = int_or_none(self._search_regex(
            r'gfyViews[\s=]*?"(\d+?)"', webpage, 'view count', fatal=False))
        title = self._search_regex(
            r'class="gfyTitle">([^<]*)', webpage, 'title', fatal=False)
        duration = (
            float(frame_count) / frame_rate
            if frame_count and frame_rate else None)

        formats = []
        for quality, fext in enumerate(('webm', 'gif', 'mp4')):
            furl = re.search(
                r'gfy%sUrl[\s=]*?"(.*?)"' % fext.title(), webpage)
            if not furl:
                # This rendition is not referenced on the page
                continue
            fsize = re.search(
                r'gfy%sSize[\s=]*?"(.*?)"' % fext.title(), webpage)
            filesize = int_or_none(fsize.group(1)) if fsize else None
            formats.append({
                'format_id': fext,
                'url': self._proto_relative_url(furl.group(1)),
                'ext': fext,
                'width': width,
                'height': height,
                'fps': frame_rate,
                'filesize': filesize,
                'vbr': filesize / duration / 1024 if filesize and duration else None,
                # gfycat clips carry no audio track
                'acodec': 'none',
                # Prefer the extension that was explicitly requested in the URL
                'preference': 1000 if fext == requested_ext else quality,
                'http_headers': {
                    'User-Agent': 'youtube-dl (like wget)',
                },
            })
        if not formats:
            raise ExtractorError(
                'No sources found for gfycat %s; make sure the URL points to '
                'the page embedding the clip' % video_id, expected=True)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
        }
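
Not part of the commit, but a minimal sketch of how the new extractor can be exercised from a checkout that includes this change, by feeding a gfycat URL through the public YoutubeDL API and inspecting the extracted metadata; the bundled test cases can also be run the usual way with python test/test_download.py TestDownload.test_Gfycat.

# Illustrative only (not part of this commit): run the new extractor
# through the YoutubeDL API without downloading anything.
from youtube_dl import YoutubeDL

with YoutubeDL({'quiet': True}) as ydl:
    info = ydl.extract_info(
        'http://gfycat.com/RequiredUnkemptBuzzard', download=False)
    print(info['title'])
    print([f['format_id'] for f in info['formats']])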