author     Mavridis Philippe <mavridisf@gmail.com>  2022-02-05 17:44:26 +0000
committer  Slávek Banko <slavek.banko@axis.cz>      2022-04-17 18:35:01 +0200
commit     95279fbf6dfeb43d80590740a9259d7caa614177 (patch)
tree       8c3ff4de77102f1e55357dd81d650e9f22be69a0 /tdemarkdown/md4c/test/normalize.py
parent     a291f3a0a3fab073c009f77a36745c5c7bd48e9b (diff)
Add tdemarkdown part - embeddable lightweight markdown viewing component.
TDEMarkdown is based on the md4c library and uses TDEHTML for rendering its output. For enhanced safety, everything not needed for viewing is turned off on the HTML widget. It integrates nicely into Konqueror and supports both CommonMark and GitHub Markdown syntaxes.

Signed-off-by: Mavridis Philippe <mavridisf@gmail.com>

Prepare to merge tdemarkdown into tdelibs.

Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'tdemarkdown/md4c/test/normalize.py')
-rwxr-xr-x  tdemarkdown/md4c/test/normalize.py  194
1 file changed, 194 insertions(+), 0 deletions(-)
diff --git a/tdemarkdown/md4c/test/normalize.py b/tdemarkdown/md4c/test/normalize.py
new file mode 100755
index 000000000..f8ece18d5
--- /dev/null
+++ b/tdemarkdown/md4c/test/normalize.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+from html.parser import HTMLParser
+import urllib.parse
+
+try:
+    from html.parser import HTMLParseError
+except ImportError:
+    # HTMLParseError was removed in Python 3.5. It could never be
+    # thrown, so we define a placeholder instead.
+    class HTMLParseError(Exception):
+        pass
+
+from html.entities import name2codepoint
+import sys
+import re
+import html
+
+# Normalization code, adapted from
+# https://github.com/karlcow/markdown-testsuite/
+significant_attrs = ["alt", "href", "src", "title"]
+whitespace_re = re.compile(r'\s+')
+class MyHTMLParser(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.convert_charrefs = False
+        self.last = "starttag"
+        self.in_pre = False
+        self.output = ""
+        self.last_tag = ""
+    def handle_data(self, data):
+        after_tag = self.last == "endtag" or self.last == "starttag"
+        after_block_tag = after_tag and self.is_block_tag(self.last_tag)
+        if after_tag and self.last_tag == "br":
+            data = data.lstrip('\n')
+        if not self.in_pre:
+            data = whitespace_re.sub(' ', data)
+        if after_block_tag and not self.in_pre:
+            if self.last == "starttag":
+                data = data.lstrip()
+            elif self.last == "endtag":
+                data = data.strip()
+        self.output += data
+        self.last = "data"
+    def handle_endtag(self, tag):
+        if tag == "pre":
+            self.in_pre = False
+        elif self.is_block_tag(tag):
+            self.output = self.output.rstrip()
+        self.output += "</" + tag + ">"
+        self.last_tag = tag
+        self.last = "endtag"
+    def handle_starttag(self, tag, attrs):
+        if tag == "pre":
+            self.in_pre = True
+        if self.is_block_tag(tag):
+            self.output = self.output.rstrip()
+        self.output += "<" + tag
+        # For now we don't strip out 'extra' attributes, because of
+        # raw HTML test cases.
+        # attrs = filter(lambda attr: attr[0] in significant_attrs, attrs)
+        if attrs:
+            attrs.sort()
+            for (k, v) in attrs:
+                self.output += " " + k
+                if k in ['href', 'src'] and v is not None:
+                    self.output += ("=" + '"' +
+                            urllib.parse.quote(urllib.parse.unquote(v), safe='/') + '"')
+                elif v is not None:
+                    self.output += ("=" + '"' + html.escape(v, quote=True) + '"')
+ self.output += ">"
+ self.last_tag = tag
+ self.last = "starttag"
+ def handle_startendtag(self, tag, attrs):
+ """Ignore closing tag for self-closing """
+ self.handle_starttag(tag, attrs)
+ self.last_tag = tag
+ self.last = "endtag"
+ def handle_comment(self, data):
+ self.output += '<!--' + data + '-->'
+ self.last = "comment"
+ def handle_decl(self, data):
+ self.output += '<!' + data + '>'
+ self.last = "decl"
+ def unknown_decl(self, data):
+ self.output += '<!' + data + '>'
+ self.last = "decl"
+ def handle_pi(self,data):
+ self.output += '<?' + data + '>'
+ self.last = "pi"
+ def handle_entityref(self, name):
+ try:
+ c = chr(name2codepoint[name])
+ except KeyError:
+ c = None
+ self.output_char(c, '&' + name + ';')
+ self.last = "ref"
+ def handle_charref(self, name):
+ try:
+ if name.startswith("x"):
+ c = chr(int(name[1:], 16))
+ else:
+ c = chr(int(name))
+ except ValueError:
+ c = None
+ self.output_char(c, '&' + name + ';')
+ self.last = "ref"
+ # Helpers.
+ def output_char(self, c, fallback):
+ if c == '<':
+ self.output += "&lt;"
+ elif c == '>':
+ self.output += "&gt;"
+ elif c == '&':
+ self.output += "&amp;"
+ elif c == '"':
+ self.output += "&quot;"
+ elif c == None:
+ self.output += fallback
+ else:
+ self.output += c
+
+ def is_block_tag(self,tag):
+ return (tag in ['article', 'header', 'aside', 'hgroup', 'blockquote',
+ 'hr', 'iframe', 'body', 'li', 'map', 'button', 'object', 'canvas',
+ 'ol', 'caption', 'output', 'col', 'p', 'colgroup', 'pre', 'dd',
+ 'progress', 'div', 'section', 'dl', 'table', 'td', 'dt',
+ 'tbody', 'embed', 'textarea', 'fieldset', 'tfoot', 'figcaption',
+ 'th', 'figure', 'thead', 'footer', 'tr', 'form', 'ul',
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'video', 'script', 'style'])
+
+def normalize_html(html):
+    r"""
+    Return normalized form of HTML which ignores insignificant output
+    differences:
+
+    * Multiple inner whitespaces are collapsed to a single space (except
+      in pre tags):
+
+        >>> normalize_html("<p>a \t b</p>")
+        '<p>a b</p>'
+
+        >>> normalize_html("<p>a \t\nb</p>")
+        '<p>a b</p>'
+
+    * Whitespace surrounding block-level tags is removed.
+
+        >>> normalize_html("<p>a b</p>")
+        '<p>a b</p>'
+
+        >>> normalize_html(" <p>a b</p>")
+        '<p>a b</p>'
+
+        >>> normalize_html("<p>a b</p> ")
+        '<p>a b</p>'
+
+        >>> normalize_html("\n\t<p>\n\t\ta b\t\t</p>\n\t")
+        '<p>a b</p>'
+
+        >>> normalize_html("<i>a b</i> ")
+        '<i>a b</i> '
+
+    * Self-closing tags are converted to open tags.
+
+        >>> normalize_html("<br />")
+        '<br>'
+
+    * Attributes are sorted and lowercased.
+
+        >>> normalize_html('<a title="bar" HREF="foo">x</a>')
+        '<a href="foo" title="bar">x</a>'
+
+    * References are converted to unicode, except that '<', '>', '&', and
+      '"' are rendered using entities.
+
+        >>> normalize_html("&forall;&amp;&gt;&lt;&quot;")
+        '\u2200&amp;&gt;&lt;&quot;'
+
+    """
+    html_chunk_re = re.compile(r"(<!\[CDATA\[.*?\]\]>|<[^>]*>|[^<]+)")
+    try:
+        parser = MyHTMLParser()
+        # We work around HTMLParser's limitations parsing CDATA
+        # by breaking the input into chunks and passing CDATA chunks
+        # through verbatim.
+        for chunk in re.finditer(html_chunk_re, html):
+            if chunk.group(0)[:8] == "<![CDATA":
+                parser.output += chunk.group(0)
+            else:
+                parser.feed(chunk.group(0))
+        parser.close()
+        return parser.output
+    except HTMLParseError as e:
+        sys.stderr.write("Normalization error: " + e.msg + "\n")
+        return html  # on error, return unnormalized HTML
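
For context, a minimal sketch of how a test harness might use this normalizer. It is illustrative only and not part of this commit: the render callable stands in for the md4c-based renderer, and the sample strings are made up.

# usage_sketch.py -- illustrative only; `render` is a hypothetical stand-in
# for the md4c-based renderer and is NOT part of this commit.
from normalize import normalize_html

def matches(render, markdown_src, expected_html):
    # Normalize both sides so insignificant whitespace and attribute
    # ordering differences do not cause false test failures.
    return normalize_html(render(markdown_src)) == normalize_html(expected_html)

if __name__ == "__main__":
    fake_render = lambda src: "<p>hello   <em>world</em></p>\n"
    print(matches(fake_render, "hello *world*",
                  "<p>hello <em>world</em></p>"))   # -> True

Comparing normalized strings rather than raw renderer output is what lets spec-style tests tolerate formatting-only differences, as described in the normalize_html docstring above.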