Use get_element_by_class() from utils to get rid of yet another regex. This function used to return only the content of the element, not the element itself (with its tag and attributes). The whole group of get_element_by_X() functions is a bit of a misnomer, as they all return the *content* of the element rather than the element itself. All of these functions can now return the whole element when their `include_tag` parameter is set to `True`. It defaults to `False`, so no other code is affected by this change. Tests have been added to test/test_utils.py accordingly. This uncovered a bug that prevented elements whose class name starts with a hyphen from being found; it has been fixed by correcting the regex used in get_elements_by_class().
69 lines
2.6 KiB
Python
69 lines
2.6 KiB
Python
from __future__ import unicode_literals
|
|
|
|
from .common import InfoExtractor
|
|
from ..utils import (
    clean_html,
    ExtractorError,
    extract_attributes,
    get_element_by_class,
    unified_strdate,
)
|
|
|
|
|
|
class ArchiveOrgIE(InfoExtractor):
    """Information extractor for archive.org ``details`` and ``embed`` pages."""

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$'
    _TESTS = [{
        'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'md5': '8af1d4cf447933ed3c7f4871162602db',
        'info_dict': {
            'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect',
            'ext': 'ogg',
            'title': '1968 Demo - FJCC Conference Presentation Reel #1',
            'description': 'md5:da45c349df039f1cc8075268eb1b5c25',
            'upload_date': '19681210',
            'uploader': 'SRI International'
        }
    }, {
        'url': 'https://archive.org/details/Cops1922',
        'md5': '0869000b4ce265e8ca62738b336b268a',
        'info_dict': {
            'id': 'Cops1922',
            'ext': 'mp4',
            'title': 'Buster Keaton\'s "Cops" (1922)',
            'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
        }
    }, {
        'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
        'only_matching': True,
    }]

    @staticmethod
    def _first_metadata_value(metadata, field):
        """Return the first value stored under ``field`` in ``metadata``.

        A list value yields its first element, or None when the list is
        empty. A missing field yields None. Any other value is returned
        unchanged, so a bare string is not truncated to its first character.
        """
        values = metadata.get(field)
        if isinstance(values, (list, tuple)):
            return values[0] if values else None
        return values

    def _real_extract(self, url):
        """Build the info dict for the archive.org item referenced by ``url``.

        The embed page is downloaded to obtain the jwplayer playlist, and the
        ``details`` page is queried with ``output=json`` for the metadata.

        Raises ExtractorError when the playlist element or its ``value``
        attribute cannot be found in the embed page.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://archive.org/embed/' + video_id, video_id)

        # The whole element (tag included) is requested because the playlist
        # JSON is read from its ``value`` attribute, not from its content.
        playlist_element = get_element_by_class(
            'js-play8-playlist', webpage, include_tag=True)
        # NOTE(review): get_element_by_class presumably returns None when no
        # element matches - confirm against utils. Failing here gives a clear
        # extractor error instead of a TypeError further down.
        if not playlist_element:
            raise ExtractorError('Unable to find the jwplayer playlist element')
        playlist_json = extract_attributes(playlist_element).get('value')
        if not playlist_json:
            raise ExtractorError(
                'The jwplayer playlist element has no value attribute')

        jwplayer_playlist = self._parse_json(playlist_json, video_id)
        info = self._parse_jwplayer_data(
            {'playlist': jwplayer_playlist}, video_id, base_url=url)

        # ``or {}`` also covers a response where 'metadata' is present but null.
        metadata = self._download_json(
            'http://archive.org/details/' + video_id, video_id, query={
                'output': 'json',
            }).get('metadata') or {}

        first = self._first_metadata_value
        info.update({
            # Prefer the metadata title; fall back to the jwplayer one.
            'title': first(metadata, 'title') or info.get('title'),
            'description': clean_html(first(metadata, 'description')),
        })
        # Uploader and upload date are only set on single entries,
        # not on playlist results.
        if info.get('_type') != 'playlist':
            info.update({
                'uploader': first(metadata, 'creator'),
                'upload_date': unified_strdate(first(metadata, 'date')),
            })
        return info
|