serializers.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. # markdown/serializers.py
  2. #
  3. # Add x/html serialization to ElementTree
  4. # Taken from ElementTree 1.3 preview with slight modifications
  5. #
  6. # Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
  7. #
  8. # fredrik@pythonware.com
  9. # https://www.pythonware.com/
  10. #
  11. # --------------------------------------------------------------------
  12. # The ElementTree toolkit is
  13. #
  14. # Copyright (c) 1999-2007 by Fredrik Lundh
  15. #
  16. # By obtaining, using, and/or copying this software and/or its
  17. # associated documentation, you agree that you have read, understood,
  18. # and will comply with the following terms and conditions:
  19. #
  20. # Permission to use, copy, modify, and distribute this software and
  21. # its associated documentation for any purpose and without fee is
  22. # hereby granted, provided that the above copyright notice appears in
  23. # all copies, and that both that copyright notice and this permission
  24. # notice appear in supporting documentation, and that the name of
  25. # Secret Labs AB or the author not be used in advertising or publicity
  26. # pertaining to distribution of the software without specific, written
  27. # prior permission.
  28. #
  29. # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
  30. # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
  31. # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
  32. # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
  33. # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
  34. # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
  35. # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
  36. # OF THIS SOFTWARE.
  37. # --------------------------------------------------------------------
  38. from xml.etree.ElementTree import ProcessingInstruction
  39. from xml.etree.ElementTree import Comment, ElementTree, QName
  40. import re
  41. __all__ = ['to_html_string', 'to_xhtml_string']
  42. HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
  43. "img", "input", "isindex", "link", "meta", "param")
  44. RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|\#x[0-9a-f]+|[0-9a-z]+);)', re.I)
  45. try:
  46. HTML_EMPTY = set(HTML_EMPTY)
  47. except NameError: # pragma: no cover
  48. pass
  49. def _raise_serialization_error(text): # pragma: no cover
  50. raise TypeError(
  51. "cannot serialize {!r} (type {})".format(text, type(text).__name__)
  52. )
  53. def _escape_cdata(text):
  54. # escape character data
  55. try:
  56. # it's worth avoiding do-nothing calls for strings that are
  57. # shorter than 500 character, or so. assume that's, by far,
  58. # the most common case in most applications.
  59. if "&" in text:
  60. # Only replace & when not part of an entity
  61. text = RE_AMP.sub('&', text)
  62. if "<" in text:
  63. text = text.replace("<", "&lt;")
  64. if ">" in text:
  65. text = text.replace(">", "&gt;")
  66. return text
  67. except (TypeError, AttributeError): # pragma: no cover
  68. _raise_serialization_error(text)
  69. def _escape_attrib(text):
  70. # escape attribute value
  71. try:
  72. if "&" in text:
  73. # Only replace & when not part of an entity
  74. text = RE_AMP.sub('&amp;', text)
  75. if "<" in text:
  76. text = text.replace("<", "&lt;")
  77. if ">" in text:
  78. text = text.replace(">", "&gt;")
  79. if "\"" in text:
  80. text = text.replace("\"", "&quot;")
  81. if "\n" in text:
  82. text = text.replace("\n", "&#10;")
  83. return text
  84. except (TypeError, AttributeError): # pragma: no cover
  85. _raise_serialization_error(text)
  86. def _escape_attrib_html(text):
  87. # escape attribute value
  88. try:
  89. if "&" in text:
  90. # Only replace & when not part of an entity
  91. text = RE_AMP.sub('&amp;', text)
  92. if "<" in text:
  93. text = text.replace("<", "&lt;")
  94. if ">" in text:
  95. text = text.replace(">", "&gt;")
  96. if "\"" in text:
  97. text = text.replace("\"", "&quot;")
  98. return text
  99. except (TypeError, AttributeError): # pragma: no cover
  100. _raise_serialization_error(text)
  101. def _serialize_html(write, elem, format):
  102. tag = elem.tag
  103. text = elem.text
  104. if tag is Comment:
  105. write("<!--%s-->" % _escape_cdata(text))
  106. elif tag is ProcessingInstruction:
  107. write("<?%s?>" % _escape_cdata(text))
  108. elif tag is None:
  109. if text:
  110. write(_escape_cdata(text))
  111. for e in elem:
  112. _serialize_html(write, e, format)
  113. else:
  114. namespace_uri = None
  115. if isinstance(tag, QName):
  116. # QNAME objects store their data as a string: `{uri}tag`
  117. if tag.text[:1] == "{":
  118. namespace_uri, tag = tag.text[1:].split("}", 1)
  119. else:
  120. raise ValueError('QName objects must define a tag.')
  121. write("<" + tag)
  122. items = elem.items()
  123. if items:
  124. items = sorted(items) # lexical order
  125. for k, v in items:
  126. if isinstance(k, QName):
  127. # Assume a text only QName
  128. k = k.text
  129. if isinstance(v, QName):
  130. # Assume a text only QName
  131. v = v.text
  132. else:
  133. v = _escape_attrib_html(v)
  134. if k == v and format == 'html':
  135. # handle boolean attributes
  136. write(" %s" % v)
  137. else:
  138. write(' {}="{}"'.format(k, v))
  139. if namespace_uri:
  140. write(' xmlns="%s"' % (_escape_attrib(namespace_uri)))
  141. if format == "xhtml" and tag.lower() in HTML_EMPTY:
  142. write(" />")
  143. else:
  144. write(">")
  145. if text:
  146. if tag.lower() in ["script", "style"]:
  147. write(text)
  148. else:
  149. write(_escape_cdata(text))
  150. for e in elem:
  151. _serialize_html(write, e, format)
  152. if tag.lower() not in HTML_EMPTY:
  153. write("</" + tag + ">")
  154. if elem.tail:
  155. write(_escape_cdata(elem.tail))
  156. def _write_html(root, format="html"):
  157. assert root is not None
  158. data = []
  159. write = data.append
  160. _serialize_html(write, root, format)
  161. return "".join(data)
  162. # --------------------------------------------------------------------
  163. # public functions
  164. def to_html_string(element):
  165. return _write_html(ElementTree(element).getroot(), format="html")
  166. def to_xhtml_string(element):
  167. return _write_html(ElementTree(element).getroot(), format="xhtml")