diff --git a/test/test_utils.py b/test/test_utils.py index 0896f4150..74a7792fc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1401,8 +1401,49 @@ Line 1 ''' self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('foo', html, include_tag=True), 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) + html = ''' + + ''' + + self.assertEqual(get_element_by_class('foo', html), None) + self.assertEqual(get_element_by_class('foo', html, include_tag=True), '') + + html = ''' + + ''' + + self.assertEqual(get_element_by_class('foo', html), '') + self.assertEqual(get_element_by_class('foo', html, include_tag=True), '') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice') + self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('_test_underscore', html), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice') + self.assertEqual(get_element_by_class('↑-unicode', html), 'nice') + def test_get_element_by_attribute(self): html = ''' nice diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 909dc0aaf..80ad653b1 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..utils import ( clean_html, extract_attributes, + get_element_by_class, unified_strdate, ) @@ -41,9 +42,8 @@ class ArchiveOrgIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - input_element_with_playlist = self._search_regex( - r'(<\s*input.*\s*class\s*=\s*[\'"].*\s*js-play8-playlist\s*.*[\'"]\s*.*>)', - webpage, 
'jwplayer playlist') + input_element_with_playlist = get_element_by_class( + 'js-play8-playlist', webpage, include_tag=True) jwplayer_playlist = self._parse_json(extract_attributes( input_element_with_playlist)['value'], video_id) info = self._parse_jwplayer_data( diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f6204692a..4149f4dc5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1926,32 +1926,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): return n.attrib[key] -def get_element_by_id(id, html): - """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html) +def get_element_by_id(id, html, include_tag=False): + """ + Return the content of the tag with the specified ID in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + return get_element_by_attribute('id', id, html, include_tag=include_tag) -def get_element_by_class(class_name, html): - """Return the content of the first tag with the specified class in the passed HTML document""" - retval = get_elements_by_class(class_name, html) +def get_element_by_class(class_name, html, include_tag=False): + """ + Return the content of the first tag with the specified class in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + retval = get_elements_by_class(class_name, html, include_tag) return retval[0] if retval else None -def get_element_by_attribute(attribute, value, html, escape_value=True): - retval = get_elements_by_attribute(attribute, value, html, escape_value) +def get_element_by_attribute(attribute, value, html, escape_value=True, + include_tag=False): + """ + Return the content of the first tag with the specified attribute in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. 
+ """ + retval = get_elements_by_attribute(attribute, value, html, escape_value, + include_tag) return retval[0] if retval else None -def get_elements_by_class(class_name, html): - """Return the content of all tags with the specified class in the passed HTML document as a list""" +def get_elements_by_class(class_name, html, include_tag=False): + """ + Return the content of all tags with the specified class in the passed HTML document as a list. + + The whole elements, including their tags, are returned when `include_tag` is `True`. + """ return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), - html, escape_value=False) + 'class', r'[^\'"]*(? + \s*(?:\/\s*>|> (?P.*?) - + ) ''' % (re.escape(attribute), value), html): - res = m.group('content') + res = m.group(0) if include_tag else m.group('content') + if res is None: + continue if res.startswith('"') or res.startswith("'"): res = res[1:-1] @@ -1981,7 +2006,10 @@ class HTMLAttributeParser(compat_HTMLParser): compat_HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): - self.attrs = dict(attrs) + # Make sure we're looking at the first attributes. Later ones are from + # embedded elements. + if not self.attrs: + self.attrs = dict(attrs) def extract_attributes(html_element):