[anvato] Improve extraction (closes #12913)
* Promote to regular shortcut-based extractor
* Add mcp to access key mapping table
* Add support for embeds extraction
* Add support for anvato embeds in generic extractor
This commit is contained in:
		
							parent
							
								
									a1ebfd4494
								
							
						
					
					
						commit
						7986c3abcd
					
				@ -5,6 +5,7 @@ import base64
 | 
				
			|||||||
import hashlib
 | 
					import hashlib
 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import random
 | 
					import random
 | 
				
			||||||
 | 
					import re
 | 
				
			||||||
import time
 | 
					import time
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .common import InfoExtractor
 | 
					from .common import InfoExtractor
 | 
				
			||||||
@ -16,6 +17,7 @@ from ..utils import (
 | 
				
			|||||||
    intlist_to_bytes,
 | 
					    intlist_to_bytes,
 | 
				
			||||||
    int_or_none,
 | 
					    int_or_none,
 | 
				
			||||||
    strip_jsonp,
 | 
					    strip_jsonp,
 | 
				
			||||||
 | 
					    unescapeHTML,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -26,6 +28,8 @@ def md5_text(s):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class AnvatoIE(InfoExtractor):
 | 
					class AnvatoIE(InfoExtractor):
 | 
				
			||||||
 | 
					    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
    # Copied from anvplayer.min.js
 | 
					    # Copied from anvplayer.min.js
 | 
				
			||||||
    _ANVACK_TABLE = {
 | 
					    _ANVACK_TABLE = {
 | 
				
			||||||
        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
 | 
					        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
 | 
				
			||||||
@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
 | 
				
			|||||||
        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
 | 
					        'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    _MCP_TO_ACCESS_KEY_TABLE = {
 | 
				
			||||||
 | 
					        'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
 | 
				
			||||||
 | 
					        'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
 | 
				
			||||||
 | 
					        'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
 | 
				
			||||||
 | 
					        'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
 | 
				
			||||||
 | 
					        'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
 | 
				
			||||||
 | 
					        'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
 | 
				
			||||||
 | 
					        'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
 | 
				
			||||||
 | 
					        'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
 | 
				
			||||||
 | 
					        'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
 | 
				
			||||||
 | 
					        'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
 | 
				
			||||||
 | 
					        'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
 | 
				
			||||||
 | 
					        'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
 | 
				
			||||||
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
 | 
				
			||||||
    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
 | 
					    _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, *args, **kwargs):
 | 
					    def __init__(self, *args, **kwargs):
 | 
				
			||||||
@ -217,9 +237,42 @@ class AnvatoIE(InfoExtractor):
 | 
				
			|||||||
            'subtitles': subtitles,
 | 
					            'subtitles': subtitles,
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    @staticmethod
 | 
				
			||||||
 | 
					    def _extract_urls(ie, webpage, video_id):
 | 
				
			||||||
 | 
					        entries = []
 | 
				
			||||||
 | 
					        for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
 | 
				
			||||||
 | 
					            anvplayer_data = ie._parse_json(
 | 
				
			||||||
 | 
					                mobj.group('anvp'), video_id, transform_source=unescapeHTML,
 | 
				
			||||||
 | 
					                fatal=False)
 | 
				
			||||||
 | 
					            if not anvplayer_data:
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            video = anvplayer_data.get('video')
 | 
				
			||||||
 | 
					            if not isinstance(video, compat_str) or not video.isdigit():
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            access_key = anvplayer_data.get('accessKey')
 | 
				
			||||||
 | 
					            if not access_key:
 | 
				
			||||||
 | 
					                mcp = anvplayer_data.get('mcp')
 | 
				
			||||||
 | 
					                if mcp:
 | 
				
			||||||
 | 
					                    access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
 | 
				
			||||||
 | 
					                        mcp.lower())
 | 
				
			||||||
 | 
					            if not access_key:
 | 
				
			||||||
 | 
					                continue
 | 
				
			||||||
 | 
					            entries.append(ie.url_result(
 | 
				
			||||||
 | 
					                'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
 | 
				
			||||||
 | 
					                video_id=video))
 | 
				
			||||||
 | 
					        return entries
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _extract_anvato_videos(self, webpage, video_id):
 | 
					    def _extract_anvato_videos(self, webpage, video_id):
 | 
				
			||||||
        anvplayer_data = self._parse_json(self._html_search_regex(
 | 
					        anvplayer_data = self._parse_json(
 | 
				
			||||||
            r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
 | 
					            self._html_search_regex(
 | 
				
			||||||
            'Anvato player data'), video_id)
 | 
					                self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
 | 
				
			||||||
 | 
					            video_id)
 | 
				
			||||||
        return self._get_anvato_videos(
 | 
					        return self._get_anvato_videos(
 | 
				
			||||||
            anvplayer_data['accessKey'], anvplayer_data['video'])
 | 
					            anvplayer_data['accessKey'], anvplayer_data['video'])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _real_extract(self, url):
 | 
				
			||||||
 | 
					        mobj = re.match(self._VALID_URL, url)
 | 
				
			||||||
 | 
					        access_key, video_id = mobj.group('access_key_or_mcp', 'id')
 | 
				
			||||||
 | 
					        if access_key not in self._ANVACK_TABLE:
 | 
				
			||||||
 | 
					            access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
 | 
				
			||||||
 | 
					        return self._get_anvato_videos(access_key, video_id)
 | 
				
			||||||
 | 
				
			|||||||
@ -41,6 +41,7 @@ from .alphaporno import AlphaPornoIE
 | 
				
			|||||||
from .amcnetworks import AMCNetworksIE
 | 
					from .amcnetworks import AMCNetworksIE
 | 
				
			||||||
from .animeondemand import AnimeOnDemandIE
 | 
					from .animeondemand import AnimeOnDemandIE
 | 
				
			||||||
from .anitube import AnitubeIE
 | 
					from .anitube import AnitubeIE
 | 
				
			||||||
 | 
					from .anvato import AnvatoIE
 | 
				
			||||||
from .anysex import AnySexIE
 | 
					from .anysex import AnySexIE
 | 
				
			||||||
from .aol import AolIE
 | 
					from .aol import AolIE
 | 
				
			||||||
from .allocine import AllocineIE
 | 
					from .allocine import AllocineIE
 | 
				
			||||||
 | 
				
			|||||||
@ -86,6 +86,7 @@ from .openload import OpenloadIE
 | 
				
			|||||||
from .videopress import VideoPressIE
 | 
					from .videopress import VideoPressIE
 | 
				
			||||||
from .rutube import RutubeIE
 | 
					from .rutube import RutubeIE
 | 
				
			||||||
from .limelight import LimelightBaseIE
 | 
					from .limelight import LimelightBaseIE
 | 
				
			||||||
 | 
					from .anvato import AnvatoIE
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class GenericIE(InfoExtractor):
 | 
					class GenericIE(InfoExtractor):
 | 
				
			||||||
@ -1677,6 +1678,15 @@ class GenericIE(InfoExtractor):
 | 
				
			|||||||
            },
 | 
					            },
 | 
				
			||||||
            'playlist_mincount': 5,
 | 
					            'playlist_mincount': 5,
 | 
				
			||||||
        },
 | 
					        },
 | 
				
			||||||
 | 
					        {
 | 
				
			||||||
 | 
					            'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
 | 
				
			||||||
 | 
					            'info_dict': {
 | 
				
			||||||
 | 
					                'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
 | 
				
			||||||
 | 
					                'title': 'Standoff with Walnut Creek murder suspect ends',
 | 
				
			||||||
 | 
					                'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
 | 
				
			||||||
 | 
					            },
 | 
				
			||||||
 | 
					            'playlist_mincount': 4,
 | 
				
			||||||
 | 
					        },
 | 
				
			||||||
        # {
 | 
					        # {
 | 
				
			||||||
        #     # TODO: find another test
 | 
					        #     # TODO: find another test
 | 
				
			||||||
        #     # http://schema.org/VideoObject
 | 
					        #     # http://schema.org/VideoObject
 | 
				
			||||||
@ -2537,6 +2547,12 @@ class GenericIE(InfoExtractor):
 | 
				
			|||||||
                'limelight:media:%s' % mobj.group('id'),
 | 
					                'limelight:media:%s' % mobj.group('id'),
 | 
				
			||||||
                {'source_url': url}), 'LimelightMedia', mobj.group('id'))
 | 
					                {'source_url': url}), 'LimelightMedia', mobj.group('id'))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # Look for Anvato embeds
 | 
				
			||||||
 | 
					        anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
 | 
				
			||||||
 | 
					        if anvato_urls:
 | 
				
			||||||
 | 
					            return self.playlist_result(
 | 
				
			||||||
 | 
					                anvato_urls, video_id, video_title, video_description)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Look for AdobeTVVideo embeds
 | 
					        # Look for AdobeTVVideo embeds
 | 
				
			||||||
        mobj = re.search(
 | 
					        mobj = re.search(
 | 
				
			||||||
            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
 | 
					            r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user