Identify multiple HTML5 video assets in a webpage

2014-01-22 11:00:29 +05:30 · 2014-01-22 11:00:29 +05:30 · 55347a7d59
commit 55347a7d59
parent 4e4799176e
2 changed files with 19 additions and 1 deletion
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -312,6 +312,18 @@ class InfoExtractor(object):
        if video_id is not None:
            video_info['id'] = video_id
        return video_info
+
+    @staticmethod
+    def video_result(video_url=None, video_id=None, uploader=None, video_title=None):
+        """Returns a dict with '_type' set to 'video' holding the given url, id, uploader and title"""
+        # NOTE(review): no 'ie' argument is accepted here, so none is stored in the result - confirm this is intended
+        video_info = {'_type': 'video',
+                      'url': video_url,
+                      'id': video_id,
+                      'uploader': uploader,
+                       'title': video_title}
+        return video_info
+
    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -348,7 +348,13 @@ class GenericIE(InfoExtractor):
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            # HTML5 video
-            mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
+            matches = re.findall(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)
+            if matches:
+                urlrs = [self.video_result(unescapeHTML(tuppl), video_id, video_uploader,video_title)
+                for tuppl in matches]
+                return self.playlist_result(
+                    urlrs, playlist_id=video_id, playlist_title=video_title)
+
        if mobj is None:
            raise ExtractorError('Unsupported URL: %s' % url)