generic: dynamically find extractor for iframes/embeds/etc if static methods fail

2015-07-13 10:47:14 -05:00 · 2015-07-13 10:47:14 -05:00 · 7cdeaca34b
commit 7cdeaca34b
parent 41c0d2f8cb
1 changed files with 62 additions and 0 deletions
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -970,6 +970,57 @@ class GenericIE(InfoExtractor):
            'title': title,
        }

+    def _extract_plugin_embeds(self, webpage, url):
+        match = re.findall(
+            r'<(?:[^>]+?data-video-url|meta[^>]+?content|(?:embed|iframe)[^>]+?src)\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\1', webpage)
+
+        # In addition to 'generic', ignore matches from these plugins
+        # ..however _extract_plugin_embeds should run last
+        notbefore_blacklist = {
+            # test 37 (Wistia) http://thoughtworks.wistia.com/medias/uxjb0lwrcz
+            #  duplicate embed causes test failure
+            'Wistia': True,
+            # test 46 for rtl.nl (http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen)
+            #  has a broken youtube embed, download & test failure
+            'youtube': True,
+        }
+        elist = []
+        if not match:
+            return elist
+        # eliminate duplicate checks
+        checked = {url: True}
+        for m in match:
+            u=unescapeHTML(m[1])
+
+            if checked.get(u,False) == True:
+               continue
+            checked[u] = True
+
+            for ie in self._downloader._ies:
+               found = False
+               if ie.IE_NAME == self.IE_NAME:
+                  continue
+               if not ie.working():
+                  continue
+               if notbefore_blacklist.get(ie.IE_NAME,False) == True:
+                  continue
+               if ie.suitable(u):
+                   print (' EMBED ['+ie.IE_NAME+'] '+u)
+                   found = True
+                   elist.append({
+                        '_type': 'url',
+                        'url': u,
+                        'ie_key': ie.ie_key(),
+                   })
+                   break
+            if not found:
+                 #self._downloader.params.get('verbose', False):
+                print (' EMBED [?!] '+u)
+        if elist:
+            print(''+str(len(elist))+' embeds')
+        return elist
+
+
    def _real_extract(self, url):
        if url.startswith('//'):
            return {
@ -1603,6 +1654,17 @@ class GenericIE(InfoExtractor):
                self._proto_relative_url(unescapeHTML(mobj.group(1))),
                'AdobeTVVideo')

+        # Last-ditch attempt to find matching plugin for embeds
+        # (this can potentially replace alot of code above)
+        elist = self._extract_plugin_embeds(webpage, url)
+        if elist:
+            return {
+                '_type': 'playlist',
+                'title': video_title,
+                'id': video_id,
+                'entries': elist,
+            }
+
        def check_video(vurl):
            if YoutubeIE.suitable(vurl):
                return True