striphtml.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. """
  2. Strip HTML (previously named Plain HTML).
  3. pymdownx.striphtml
  4. An extension for Python Markdown.
  5. Strip classes, styles, and ids from html
  6. MIT license.
  7. Copyright (c) 2014 - 2017 Isaac Muse <isaacmuse@gmail.com>
  8. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
  9. documentation files (the "Software"), to deal in the Software without restriction, including without limitation
  10. the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
  11. and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
  12. The above copyright notice and this permission notice shall be included in all copies or substantial portions
  13. of the Software.
  14. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
  15. TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  16. THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
  17. CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  18. DEALINGS IN THE SOFTWARE.
  19. """
  20. from markdown import Extension
  21. from markdown.postprocessors import Postprocessor
  22. import re
# Verbose-mode pattern that tokenizes the HTML handed to the postprocessor.
# `StripHtmlPostprocessor.run` applies it with `sub`, and `repl` inspects the
# named groups to decide what to emit. The alternatives, in order:
#
#   comments  - an HTML comment. The first form also consumes the whitespace
#               around the comment when it is followed by a line break; the
#               second form is the bare comment on its own.
#   scripts   - a whole <style>/<script> element, split into `script_open`
#               (the `<name`), `script_attr` (its attributes) and
#               `script_rest` (from `>` through the matching closing tag), so
#               the element's content is carried through as one piece.
#   open/attr/close - any other opening tag: `<name`, its attribute list, and
#               the terminating `>` or `/>` (`self_close` captures the slash).
#   close_tag - a closing tag such as `</p>`.
#
# re.DOTALL lets the `.*?` in `script_rest` span multiple lines.
RE_TAG_HTML = re.compile(
    r'''(?x)
(?:
(?P<comments>(?:\r?\n?\s*)<!--[\s\S]*?-->(?:\s*)(?=\r?\n)|<!--[\s\S]*?-->)|
(?P<scripts>
(?P<script_open><(?P<script_name>style|script))
(?P<script_attr>(?:\s+[\w\-:]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'`=<>]+))?)*)
(?P<script_rest>\s*>.*?</(?P=script_name)\s*>)
)|
(?P<open><(?P<name>[\w\:\.\-]+))
(?P<attr>(?:\s+[\w\-:]+(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'`=<>]+))?)*)
(?P<close>\s*(?P<self_close>/)?>)|
(?P<close_tag></(?P<close_name>[\w\:\.\-]+)\s*>)
)
''',
    re.DOTALL | re.UNICODE
)
# Template (not yet compiled) for the pattern that matches the attributes to
# remove. The `%s` placeholder is filled by `StripHtmlPostprocessor.__init__`
# with a `|`-joined alternation of attribute-name patterns. Each repetition
# matches leading whitespace, one of those names, and a mandatory `=value`
# (double-quoted, single-quoted, or unquoted), so an attribute written without
# a value is not matched. The whole group is repeated with `*`, so a run of
# adjacent unwanted attributes is consumed as a single match.
TAG_BAD_ATTR = r'''(?x)
(?P<attr>
(?:
\s+(?:%s)
(?:\s*=\s*(?:"[^"]*"|'[^']*'|[^\s"'`=<>]+))
)*
)
'''
  48. class StripHtmlPostprocessor(Postprocessor):
  49. """Post processor to strip out unwanted content."""
  50. def __init__(self, strip_comments, strip_js_on_attributes, strip_attributes, md):
  51. """Initialize."""
  52. self.strip_comments = strip_comments
  53. self.re_attributes = None
  54. attributes = [re.escape(a.strip()) for a in strip_attributes]
  55. if strip_js_on_attributes:
  56. attributes.append(r'on[\w]+')
  57. if attributes:
  58. self.re_attributes = re.compile(
  59. TAG_BAD_ATTR % '|'.join(attributes),
  60. re.DOTALL | re.UNICODE
  61. )
  62. super(StripHtmlPostprocessor, self).__init__(md)
  63. def repl(self, m):
  64. """Replace comments and unwanted attributes."""
  65. if m.group('comments'):
  66. tag = '' if self.strip_comments else m.group('comments')
  67. else:
  68. if m.group('scripts'):
  69. tag = m.group('script_open')
  70. if self.re_attributes is not None:
  71. tag += self.re_attributes.sub('', m.group('script_attr'))
  72. else:
  73. tag += m.group('script_attr')
  74. tag += m.group('script_rest')
  75. elif m.group('close_tag'):
  76. tag = m.group(0)
  77. else:
  78. tag = m.group('open')
  79. if self.re_attributes is not None:
  80. tag += self.re_attributes.sub('', m.group('attr'))
  81. else:
  82. tag += m.group('attr')
  83. tag += m.group('close')
  84. return tag
  85. def run(self, text):
  86. """Strip out ids and classes for a simplified HTML output."""
  87. strip = self.strip_comments or self.strip_js_on_attributes or self.re_attributes
  88. return RE_TAG_HTML.sub(self.repl, text) if strip else text
  89. class StripHtmlExtension(Extension):
  90. """StripHTML extension."""
  91. def __init__(self, *args, **kwargs):
  92. """Initialize."""
  93. self.config = {
  94. 'strip_comments': [
  95. True,
  96. "Strip HTML comments at the end of processing. "
  97. "- Default: True"
  98. ],
  99. 'strip_attributes': [
  100. [],
  101. "A string of attributes separated by spaces."
  102. "- Default: 'id class style']"
  103. ],
  104. 'strip_js_on_attributes': [
  105. True,
  106. "Strip JavaScript script attribues with the pattern on*. "
  107. " - Default: True"
  108. ]
  109. }
  110. super(StripHtmlExtension, self).__init__(*args, **kwargs)
  111. def extendMarkdown(self, md):
  112. """Strip unwanted HTML attributes and/or comments."""
  113. md.registerExtension(self)
  114. config = self.getConfigs()
  115. striphtml = StripHtmlPostprocessor(
  116. config.get('strip_comments'),
  117. config.get('strip_js_on_attributes'),
  118. config.get('strip_attributes'),
  119. md
  120. )
  121. md.postprocessors.register(striphtml, "strip-html", 1)
  122. def makeExtension(*args, **kwargs):
  123. """Return extension."""
  124. return StripHtmlExtension(*args, **kwargs)