diff --git a/test/test_utils.py b/test/test_utils.py
index 0896f4150..74a7792fc 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1401,8 +1401,49 @@ Line 1
'''
self.assertEqual(get_element_by_class('foo', html), 'nice')
+ self.assertEqual(get_element_by_class('foo', html, include_tag=True), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
+ html = '''
+
+ '''
+
+ self.assertEqual(get_element_by_class('foo', html), None)
+ self.assertEqual(get_element_by_class('foo', html, include_tag=True), '')
+
+ html = '''
+
+ '''
+
+ self.assertEqual(get_element_by_class('foo', html), '')
+ self.assertEqual(get_element_by_class('foo', html, include_tag=True), '')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice')
+ self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), 'nice')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('_test_underscore', html), 'nice')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice')
+ self.assertEqual(get_element_by_class('↑-unicode', html), 'nice')
+
def test_get_element_by_attribute(self):
html = '''
nice
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index 909dc0aaf..80ad653b1 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -4,6 +4,7 @@ from .common import InfoExtractor
from ..utils import (
clean_html,
extract_attributes,
+ get_element_by_class,
unified_strdate,
)
@@ -41,9 +42,8 @@ class ArchiveOrgIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://archive.org/embed/' + video_id, video_id)
- input_element_with_playlist = self._search_regex(
- r'(<\s*input.*\s*class\s*=\s*[\'"].*\s*js-play8-playlist\s*.*[\'"]\s*.*>)',
- webpage, 'jwplayer playlist')
+ input_element_with_playlist = get_element_by_class(
+ 'js-play8-playlist', webpage, include_tag=True)
jwplayer_playlist = self._parse_json(extract_attributes(
input_element_with_playlist)['value'], video_id)
info = self._parse_jwplayer_data(
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index f6204692a..4149f4dc5 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1926,32 +1926,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
return n.attrib[key]
-def get_element_by_id(id, html):
- """Return the content of the tag with the specified ID in the passed HTML document"""
- return get_element_by_attribute('id', id, html)
+def get_element_by_id(id, html, include_tag=False):
+ """
+ Return the content of the tag with the specified ID in the passed HTML document.
+
+ The whole element, including its tag, is returned when `include_tag` is `True`.
+ """
+ return get_element_by_attribute('id', id, html, include_tag)
-def get_element_by_class(class_name, html):
- """Return the content of the first tag with the specified class in the passed HTML document"""
- retval = get_elements_by_class(class_name, html)
+def get_element_by_class(class_name, html, include_tag=False):
+ """
+ Return the content of the first tag with the specified class in the passed HTML document.
+
+ The whole element, including its tag, is returned when `include_tag` is `True`.
+ """
+ retval = get_elements_by_class(class_name, html, include_tag)
return retval[0] if retval else None
-def get_element_by_attribute(attribute, value, html, escape_value=True):
- retval = get_elements_by_attribute(attribute, value, html, escape_value)
+def get_element_by_attribute(attribute, value, html, escape_value=True,
+ include_tag=False):
+ """
+ Return the content of the first tag with the specified attribute in the passed HTML document.
+
+ The whole element, including its tag, is returned when `include_tag` is `True`.
+ """
+ retval = get_elements_by_attribute(attribute, value, html, escape_value,
+ include_tag)
return retval[0] if retval else None
-def get_elements_by_class(class_name, html):
- """Return the content of all tags with the specified class in the passed HTML document as a list"""
+def get_elements_by_class(class_name, html, include_tag=False):
+ """
+ Return the content of all tags with the specified class in the passed HTML document as a list.
+
+ The whole elements, including their tags, are returned when `include_tag` is `True`.
+ """
return get_elements_by_attribute(
- 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
- html, escape_value=False)
+ 'class', r'[^\'"]*(?
+ \s*(?:\/\s*>|>
(?P<content>.*?)
- </\1>
+ </\1>)
''' % (re.escape(attribute), value), html):
- res = m.group('content')
+ res = m.group(0) if include_tag else m.group('content')
+ if res is None:
+ continue
if res.startswith('"') or res.startswith("'"):
res = res[1:-1]
@@ -1981,7 +2006,10 @@ class HTMLAttributeParser(compat_HTMLParser):
compat_HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
- self.attrs = dict(attrs)
+ # Make sure we're looking at the first attributes. Later ones are from
+ # embedded elements.
+ if not self.attrs:
+ self.attrs = dict(attrs)
def extract_attributes(html_element):