[utils] Introduce clean_html_markdown

This commit is contained in:
Jeremie J. Jarosh 2019-08-25 14:22:51 -05:00
parent 393cc31d5e
commit 2e9c31369e
2 changed files with 81 additions and 0 deletions

View File

@ -20,6 +20,7 @@ from youtube_dl.utils import (
args_to_str,
encode_base_n,
clean_html,
clean_html_markdown,
date_from_str,
DateRange,
detect_exe_version,
@ -1025,6 +1026,33 @@ class TestUtil(unittest.TestCase):
self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')
def test_clean_html_markdown(self):
    """Render a representative HTML document and compare the markdown."""
    # Fixture exercising headings, paragraphs, every emphasis tag variant
    # (em/strong/b/i, including nesting), an unordered list and a <br>.
    source_html = (
        '<div id="out" class="markdown-body"><h1>Happy Text</h1>\n'
        '<p>When you do it your way you can go <em>anywhere</em> you choose. And just raise cain. I thought today we would make a happy little stream that\'s just running through the woods here. I was <strong>blessed</strong> with a very steady hand; and it comes in very handy when you\'re doing these little delicate things. You have to allow the paint to break to <strong><em>make it beautiful</em></strong>. Let\'s do it again then, what the heck.</p>\n'
        '<h2>This is your creation - and it\'s just as unique and special as you are.</h2>\n'
        '<p>Paint <b>anything</b> you want on the canvas. Create your own world. By now you should be quite happy about what\'s happening here. You can\'t have light without dark. You can\'t know <i>happiness</i> unless you\'ve known <em>sorrow</em>. Let\'s get crazy.</p>\n'
        '<ul>\n'
        '<li>You can spend all day playing with mountains.</li>\n'
        '<li>We\'ll put a happy little sky in here.</li>\n'
        '</ul>\n'
        '<p>I like to beat the brush. There we go.<br>\n'
        'We don\'t need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out.</p>\n'
        '</div>'
    )
    # The markdown the fixture above is expected to turn into.
    expected_markdown = (
        "# Happy Text\n"
        "\n"
        "When you do it your way you can go *anywhere* you choose. And just raise cain. I thought today we would make a happy little stream that's just running through the woods here. I was **blessed** with a very steady hand; and it comes in very handy when you're doing these little delicate things. You have to allow the paint to break to ***make it beautiful***. Let's do it again then, what the heck.\n"
        "\n"
        "## This is your creation - and it's just as unique and special as you are.\n"
        "\n"
        "Paint **anything** you want on the canvas. Create your own world. By now you should be quite happy about what's happening here. You can't have light without dark. You can't know *happiness* unless you've known *sorrow*. Let's get crazy.\n"
        "\n"
        "- You can spend all day playing with mountains. \n"
        "- We'll put a happy little sky in here. \n"
        "\n"
        "I like to beat the brush. There we go. \n"
        "We don't need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out."
    )
    actual_markdown = clean_html_markdown(source_html)
    self.assertEqual(actual_markdown, expected_markdown)
def test_intlist_to_bytes(self):
self.assertEqual(
intlist_to_bytes([0, 1, 127, 128, 255]),

View File

@ -2023,6 +2023,59 @@ def clean_html(html):
return html.strip()
def clean_html_markdown(html):
    """Clean an HTML snippet into readable markdown.

    Paragraphs, line breaks, headings (h1-h6), lists and emphasis tags are
    rewritten as markdown; every remaining tag is dropped and HTML entities
    are unescaped. Items of both <ul> and <ol> lists are rendered with '-'
    bullets. Leading whitespace is removed from every line, runs of blank
    lines are collapsed into one, and the result is stripped.

    Returns None unchanged when html is None.
    """
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Line breaks in the HTML source carry no meaning, so flatten them.
    # CRLF and bare CR are flattened too: a leftover '\r' would otherwise be
    # treated as a line boundary by splitlines() below and leak source line
    # wrapping into the output.
    html = re.sub(r'\r\n?|\n', ' ', html)

    # Paragraphs and line breaks. <br> has to be handled before the emphasis
    # rules so that it is rewritten here rather than reaching the generic
    # tag stripping untouched.
    html = re.sub(r'(?u)<\s*br\s*/?\s*>\s*', ' \n', html)
    html = re.sub(r'(?u)<\s*/?\s*p\b[^>]*>', '\n\n', html)

    # Headings: <hN> opens a block prefixed with N '#' characters.
    html = re.sub(
        r'(?u)<\s*h([1-6])\b[^>]*>',
        lambda mobj: '\n\n%s ' % ('#' * int(mobj.group(1))),
        html)
    html = re.sub(r'(?u)<\s*/\s*h[123456]\b[^>]*>', '\n\n', html)

    # Lists: the containers become paragraph breaks, each item a bullet.
    html = re.sub(r'(?u)<\s*/?\s*(ul|ol)\b[^>]*>', '\n\n', html)
    html = re.sub(r'(?u)<\s*li\b[^>]*>', '\n- ', html)

    # Emphasis: opening and closing tags map to the same marker.
    html = re.sub(r'(?u)<\s*/?\s*(i|em)\b[^>]*>', '*', html)
    html = re.sub(r'(?u)<\s*/?\s*(b|strong)\b[^>]*>', '**', html)

    # Strip whatever tags are left, then decode entities.
    html = re.sub('<.*?>', '', html)
    html = unescapeHTML(html)

    # Drop leading whitespace per line and collapse consecutive blank lines
    # into a single one. Trailing whitespace is kept on purpose: the <br>
    # replacement above relies on it.
    lines = []
    previous_blank = False
    for line in html.splitlines():
        line = line.lstrip()
        if line:
            lines.append(line)
            previous_blank = False
        elif not previous_blank:
            lines.append('')
            previous_blank = True
    return '\n'.join(lines).strip()
def sanitize_open(filename, open_mode):
"""Try to open the given filename, and slightly tweak it if this fails.