[utils] Introduce clean_html_markdown

2019-08-25 14:22:51 -05:00 · 2019-08-25 14:22:51 -05:00 · 2e9c31369e
commit 2e9c31369e
parent 393cc31d5e
2 changed files with 81 additions and 0 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -20,6 +20,7 @@ from youtube_dl.utils import (
    args_to_str,
    encode_base_n,
    clean_html,
+    clean_html_markdown,
    date_from_str,
    DateRange,
    detect_exe_version,
@ -1025,6 +1026,33 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')
        self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')

+    def test_clean_html_markdown(self):
+        self.assertEqual(clean_html_markdown(
+            '<div id="out" class="markdown-body"><h1>Happy Text</h1>\n'
+            '<p>When you do it your way you can go <em>anywhere</em> you choose. And just raise cain. I thought today we would make a happy little stream that\'s just running through the woods here. I was <strong>blessed</strong> with a very steady hand; and it comes in very handy when you\'re doing these little delicate things. You have to allow the paint to break to <strong><em>make it beautiful</em></strong>. Let\'s do it again then, what the heck.</p>\n'
+            '<h2>This is your creation - and it\'s just as unique and special as you are.</h2>\n'
+            '<p>Paint <b>anything</b> you want on the canvas. Create your own world. By now you should be quite happy about what\'s happening here. You can\'t have light without dark. You can\'t know <i>happiness</i> unless you\'ve known <em>sorrow</em>. Let\'s get crazy.</p>\n'
+            '<ul>\n'
+            '<li>You can spend all day playing with mountains.</li>\n'
+            '<li>We\'ll put a happy little sky in here.</li>\n'
+            '</ul>\n'
+            '<p>I like to beat the brush. There we go.<br>\n'
+            'We don\'t need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out.</p>\n'
+            '</div>'),
+            "# Happy Text\n"
+            "\n"
+            "When you do it your way you can go *anywhere* you choose. And just raise cain. I thought today we would make a happy little stream that's just running through the woods here. I was **blessed** with a very steady hand; and it comes in very handy when you're doing these little delicate things. You have to allow the paint to break to ***make it beautiful***. Let's do it again then, what the heck.\n"
+            "\n"
+            "## This is your creation - and it's just as unique and special as you are.\n"
+            "\n"
+            "Paint **anything** you want on the canvas. Create your own world. By now you should be quite happy about what's happening here. You can't have light without dark. You can't know *happiness* unless you've known *sorrow*. Let's get crazy.\n"
+            "\n"
+            "- You can spend all day playing with mountains. \n"
+            "- We'll put a happy little sky in here. \n"
+            "\n"
+            "I like to beat the brush. There we go.  \n"
+            "We don't need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out.")
+
    def test_intlist_to_bytes(self):
        self.assertEqual(
            intlist_to_bytes([0, 1, 127, 128, 255]),
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -2023,6 +2023,59 @@ def clean_html(html):
    return html.strip()


+def clean_html_markdown(html):
+    """Clean an HTML snippet into readable markdown"""
+
+    if html is None:  # Convenience for sanitizing descriptions etc.
+        return html
+
+    # Remove Newlines
+    html = html.replace('\n', ' ')
+
+    # Paragraphs and Line Breaks
+    html = re.sub(r'(?u)<\s*br\s*/?\s*>\s*', '  \n', html)
+    html = re.sub(r'(?u)<\s*/?\s*p\b[^>]*>', '\n\n', html)
+
+    # Headings
+    html = re.sub(r'(?u)<\s*h1\b[^>]*>', '\n\n# ', html)
+    html = re.sub(r'(?u)<\s*h2\b[^>]*>', '\n\n## ', html)
+    html = re.sub(r'(?u)<\s*h3\b[^>]*>', '\n\n### ', html)
+    html = re.sub(r'(?u)<\s*h4\b[^>]*>', '\n\n#### ', html)
+    html = re.sub(r'(?u)<\s*h5\b[^>]*>', '\n\n##### ', html)
+    html = re.sub(r'(?u)<\s*h6\b[^>]*>', '\n\n###### ', html)
+    html = re.sub(r'(?u)<\s*/\s*h[123456]\b[^>]*>', '\n\n', html)
+
+    # Lists
+    html = re.sub(r'(?u)<\s*/?\s*(ul|ol)\b[^>]*>', '\n\n', html)
+    html = re.sub(r'(?u)<\s*li\b[^>]*>', '\n- ', html)
+
+    # Emphasis
+    html = re.sub(r'(?u)<\s*/?\s*(i|em)\b[^>]*>', '*', html)
+    html = re.sub(r'(?u)<\s*/?\s*(b|strong)\b[^>]*>', '**', html)
+
+    # Strip html tags
+    html = re.sub('<.*?>', '', html)
+    # Replace html entities
+    html = unescapeHTML(html)
+
+    # remove duplicate blank lines
+    cleaned_text = ''
+    sequential_blank_lines = 0
+    for line in html.splitlines():
+        line = line.lstrip()
+        if not line:
+            sequential_blank_lines += 1
+            if sequential_blank_lines <= 1:
+                cleaned_text += '\n'
+            continue
+        else:
+            sequential_blank_lines = 0
+
+        cleaned_text += line + '\n'
+
+    return cleaned_text.strip()
+
+
 def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.