[ProSiebenSat1] Improve title extraction (#13915)

With this commit, titles taken from og:title are preferred over the old title extraction. Some tests had to be adjusted, but I have verified that the newly extracted titles are equally good or better.
2017-09-05 20:17:16 +02:00 · 2017-09-05 20:17:16 +02:00 · 36b93e52e2
commit 36b93e52e2
parent bc35f07537
1 changed files with 8 additions and 6 deletions
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py
@ -148,7 +148,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
            'info_dict': {
                'id': '2104602',
                'ext': 'mp4',
-                'title': 'Episode 18 - Staffel 2',
+                'title': 'CIRCUS HALLIGALLI - Episode 18 - Staffel 2',
                'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
                'upload_date': '20131231',
                'duration': 5845.04,
@ -255,7 +255,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
            'info_dict': {
                'id': '2572814',
                'ext': 'mp4',
-                'title': 'Andreas Kümmert: Rocket Man',
+                'title': 'The Voice of Germany - Andreas Kümmert: Rocket Man',
                'description': 'md5:6ddb02b0781c6adf778afea606652e38',
                'upload_date': '20131017',
                'duration': 469.88,
@ -292,7 +292,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
            'info_dict': {
                'id': '4187506',
                'ext': 'mp4',
-                'title': 'Best of Circus HalliGalli',
+                'title': 'Best of Circus HalliGalli - Circus Halligalli - 7TV',
                'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9',
                'upload_date': '20151229',
            },
@ -385,9 +385,11 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE):
    def _extract_clip(self, url, webpage):
        clip_id = self._html_search_regex(
            self._CLIPID_REGEXES, webpage, 'clip id')
-        title = self._html_search_regex(
+        title = self._og_search_title(webpage)
+        if title is None:
+            self._html_search_regex(
                self._TITLE_REGEXES, webpage, 'title',
-            default=None) or self._og_search_title(webpage)
+                default=None) 
        info = self._extract_video_info(url, clip_id)
        description = self._html_search_regex(
            self._DESCRIPTION_REGEXES, webpage, 'description', default=None)