core.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. """
  2. Python Markdown
  3. A Python implementation of John Gruber's Markdown.
  4. Documentation: https://python-markdown.github.io/
  5. GitHub: https://github.com/Python-Markdown/markdown/
  6. PyPI: https://pypi.org/project/Markdown/
  7. Started by Manfred Stienstra (http://www.dwerg.net/).
  8. Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
  9. Currently maintained by Waylan Limberg (https://github.com/waylan),
  10. Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
  11. Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
  12. Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
  13. Copyright 2004 Manfred Stienstra (the original version)
  14. License: BSD (see LICENSE.md for details).
  15. """
  16. import codecs
  17. import sys
  18. import logging
  19. import importlib
  20. from . import util
  21. from .preprocessors import build_preprocessors
  22. from .blockprocessors import build_block_parser
  23. from .treeprocessors import build_treeprocessors
  24. from .inlinepatterns import build_inlinepatterns
  25. from .postprocessors import build_postprocessors
  26. from .extensions import Extension
  27. from .serializers import to_html_string, to_xhtml_string
# Names exported by `from <this module> import *`.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']

# Module-level logger, registered under the fixed name 'MARKDOWN'.
logger = logging.getLogger('MARKDOWN')
  30. class Markdown:
  31. """Convert Markdown to HTML."""
  32. doc_tag = "div" # Element used to wrap document - later removed
  33. output_formats = {
  34. 'html': to_html_string,
  35. 'xhtml': to_xhtml_string,
  36. }
  37. def __init__(self, **kwargs):
  38. """
  39. Creates a new Markdown instance.
  40. Keyword arguments:
  41. * extensions: A list of extensions.
  42. If an item is an instance of a subclass of `markdown.extension.Extension`, the instance will be used
  43. as-is. If an item is of type string, first an entry point will be loaded. If that fails, the string is
  44. assumed to use Python dot notation (`path.to.module:ClassName`) to load a markdown.Extension subclass. If
  45. no class is specified, then a `makeExtension` function is called within the specified module.
  46. * extension_configs: Configuration settings for extensions.
  47. * output_format: Format of output. Supported formats are:
  48. * "xhtml": Outputs XHTML style tags. Default.
  49. * "html": Outputs HTML style tags.
  50. * tab_length: Length of tabs in the source. Default: 4
  51. """
  52. self.tab_length = kwargs.get('tab_length', 4)
  53. self.ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
  54. '(', ')', '>', '#', '+', '-', '.', '!']
  55. self.block_level_elements = [
  56. # Elements which are invalid to wrap in a `<p>` tag.
  57. # See https://w3c.github.io/html/grouping-content.html#the-p-element
  58. 'address', 'article', 'aside', 'blockquote', 'details', 'div', 'dl',
  59. 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3',
  60. 'h4', 'h5', 'h6', 'header', 'hr', 'main', 'menu', 'nav', 'ol', 'p', 'pre',
  61. 'section', 'table', 'ul',
  62. # Other elements which Markdown should not be mucking up the contents of.
  63. 'canvas', 'dd', 'dt', 'group', 'iframe', 'li', 'math', 'noscript', 'output',
  64. 'progress', 'script', 'style', 'tbody', 'td', 'th', 'thead', 'tr', 'video'
  65. ]
  66. self.registeredExtensions = []
  67. self.docType = ""
  68. self.stripTopLevelTags = True
  69. self.build_parser()
  70. self.references = {}
  71. self.htmlStash = util.HtmlStash()
  72. self.registerExtensions(extensions=kwargs.get('extensions', []),
  73. configs=kwargs.get('extension_configs', {}))
  74. self.set_output_format(kwargs.get('output_format', 'xhtml'))
  75. self.reset()
  76. def build_parser(self):
  77. """ Build the parser from the various parts. """
  78. self.preprocessors = build_preprocessors(self)
  79. self.parser = build_block_parser(self)
  80. self.inlinePatterns = build_inlinepatterns(self)
  81. self.treeprocessors = build_treeprocessors(self)
  82. self.postprocessors = build_postprocessors(self)
  83. return self
  84. def registerExtensions(self, extensions, configs):
  85. """
  86. Register extensions with this instance of Markdown.
  87. Keyword arguments:
  88. * extensions: A list of extensions, which can either
  89. be strings or objects.
  90. * configs: A dictionary mapping extension names to config options.
  91. """
  92. for ext in extensions:
  93. if isinstance(ext, str):
  94. ext = self.build_extension(ext, configs.get(ext, {}))
  95. if isinstance(ext, Extension):
  96. ext._extendMarkdown(self)
  97. logger.debug(
  98. 'Successfully loaded extension "%s.%s".'
  99. % (ext.__class__.__module__, ext.__class__.__name__)
  100. )
  101. elif ext is not None:
  102. raise TypeError(
  103. 'Extension "{}.{}" must be of type: "{}.{}"'.format(
  104. ext.__class__.__module__, ext.__class__.__name__,
  105. Extension.__module__, Extension.__name__
  106. )
  107. )
  108. return self
  109. def build_extension(self, ext_name, configs):
  110. """
  111. Build extension from a string name, then return an instance.
  112. First attempt to load an entry point. The string name must be registered as an entry point in the
  113. `markdown.extensions` group which points to a subclass of the `markdown.extensions.Extension` class.
  114. If multiple distributions have registered the same name, the first one found is returned.
  115. If no entry point is found, assume dot notation (`path.to.module:ClassName`). Load the specified class and
  116. return an instance. If no class is specified, import the module and call a `makeExtension` function and return
  117. the Extension instance returned by that function.
  118. """
  119. configs = dict(configs)
  120. entry_points = [ep for ep in util.INSTALLED_EXTENSIONS if ep.name == ext_name]
  121. if entry_points:
  122. ext = entry_points[0].load()
  123. return ext(**configs)
  124. # Get class name (if provided): `path.to.module:ClassName`
  125. ext_name, class_name = ext_name.split(':', 1) if ':' in ext_name else (ext_name, '')
  126. try:
  127. module = importlib.import_module(ext_name)
  128. logger.debug(
  129. 'Successfully imported extension module "%s".' % ext_name
  130. )
  131. except ImportError as e:
  132. message = 'Failed loading extension "%s".' % ext_name
  133. e.args = (message,) + e.args[1:]
  134. raise
  135. if class_name:
  136. # Load given class name from module.
  137. return getattr(module, class_name)(**configs)
  138. else:
  139. # Expect makeExtension() function to return a class.
  140. try:
  141. return module.makeExtension(**configs)
  142. except AttributeError as e:
  143. message = e.args[0]
  144. message = "Failed to initiate extension " \
  145. "'%s': %s" % (ext_name, message)
  146. e.args = (message,) + e.args[1:]
  147. raise
  148. def registerExtension(self, extension):
  149. """ This gets called by the extension """
  150. self.registeredExtensions.append(extension)
  151. return self
  152. def reset(self):
  153. """
  154. Resets all state variables so that we can start with a new text.
  155. """
  156. self.htmlStash.reset()
  157. self.references.clear()
  158. for extension in self.registeredExtensions:
  159. if hasattr(extension, 'reset'):
  160. extension.reset()
  161. return self
  162. def set_output_format(self, format):
  163. """ Set the output format for the class instance. """
  164. self.output_format = format.lower().rstrip('145') # ignore num
  165. try:
  166. self.serializer = self.output_formats[self.output_format]
  167. except KeyError as e:
  168. valid_formats = list(self.output_formats.keys())
  169. valid_formats.sort()
  170. message = 'Invalid Output Format: "%s". Use one of %s.' \
  171. % (self.output_format,
  172. '"' + '", "'.join(valid_formats) + '"')
  173. e.args = (message,) + e.args[1:]
  174. raise
  175. return self
  176. def is_block_level(self, tag):
  177. """Check if the tag is a block level HTML tag."""
  178. if isinstance(tag, str):
  179. return tag.lower().rstrip('/') in self.block_level_elements
  180. # Some ElementTree tags are not strings, so return False.
  181. return False
  182. def convert(self, source):
  183. """
  184. Convert markdown to serialized XHTML or HTML.
  185. Keyword arguments:
  186. * source: Source text as a Unicode string.
  187. Markdown processing takes place in five steps:
  188. 1. A bunch of "preprocessors" munge the input text.
  189. 2. BlockParser() parses the high-level structural elements of the
  190. pre-processed text into an ElementTree.
  191. 3. A bunch of "treeprocessors" are run against the ElementTree. One
  192. such treeprocessor runs InlinePatterns against the ElementTree,
  193. detecting inline markup.
  194. 4. Some post-processors are run against the text after the ElementTree
  195. has been serialized into text.
  196. 5. The output is written to a string.
  197. """
  198. # Fixup the source text
  199. if not source.strip():
  200. return '' # a blank unicode string
  201. try:
  202. source = str(source)
  203. except UnicodeDecodeError as e: # pragma: no cover
  204. # Customise error message while maintaining original trackback
  205. e.reason += '. -- Note: Markdown only accepts unicode input!'
  206. raise
  207. # Split into lines and run the line preprocessors.
  208. self.lines = source.split("\n")
  209. for prep in self.preprocessors:
  210. self.lines = prep.run(self.lines)
  211. # Parse the high-level elements.
  212. root = self.parser.parseDocument(self.lines).getroot()
  213. # Run the tree-processors
  214. for treeprocessor in self.treeprocessors:
  215. newRoot = treeprocessor.run(root)
  216. if newRoot is not None:
  217. root = newRoot
  218. # Serialize _properly_. Strip top-level tags.
  219. output = self.serializer(root)
  220. if self.stripTopLevelTags:
  221. try:
  222. start = output.index(
  223. '<%s>' % self.doc_tag) + len(self.doc_tag) + 2
  224. end = output.rindex('</%s>' % self.doc_tag)
  225. output = output[start:end].strip()
  226. except ValueError: # pragma: no cover
  227. if output.strip().endswith('<%s />' % self.doc_tag):
  228. # We have an empty document
  229. output = ''
  230. else:
  231. # We have a serious problem
  232. raise ValueError('Markdown failed to strip top-level '
  233. 'tags. Document=%r' % output.strip())
  234. # Run the text post-processors
  235. for pp in self.postprocessors:
  236. output = pp.run(output)
  237. return output.strip()
  238. def convertFile(self, input=None, output=None, encoding=None):
  239. """Converts a markdown file and returns the HTML as a unicode string.
  240. Decodes the file using the provided encoding (defaults to utf-8),
  241. passes the file content to markdown, and outputs the html to either
  242. the provided stream or the file with provided name, using the same
  243. encoding as the source file. The 'xmlcharrefreplace' error handler is
  244. used when encoding the output.
  245. **Note:** This is the only place that decoding and encoding of unicode
  246. takes place in Python-Markdown. (All other code is unicode-in /
  247. unicode-out.)
  248. Keyword arguments:
  249. * input: File object or path. Reads from stdin if `None`.
  250. * output: File object or path. Writes to stdout if `None`.
  251. * encoding: Encoding of input and output files. Defaults to utf-8.
  252. """
  253. encoding = encoding or "utf-8"
  254. # Read the source
  255. if input:
  256. if isinstance(input, str):
  257. input_file = codecs.open(input, mode="r", encoding=encoding)
  258. else:
  259. input_file = codecs.getreader(encoding)(input)
  260. text = input_file.read()
  261. input_file.close()
  262. else:
  263. text = sys.stdin.read()
  264. if not isinstance(text, str): # pragma: no cover
  265. text = text.decode(encoding)
  266. text = text.lstrip('\ufeff') # remove the byte-order mark
  267. # Convert
  268. html = self.convert(text)
  269. # Write to file or stdout
  270. if output:
  271. if isinstance(output, str):
  272. output_file = codecs.open(output, "w",
  273. encoding=encoding,
  274. errors="xmlcharrefreplace")
  275. output_file.write(html)
  276. output_file.close()
  277. else:
  278. writer = codecs.getwriter(encoding)
  279. output_file = writer(output, errors="xmlcharrefreplace")
  280. output_file.write(html)
  281. # Don't close here. User may want to write more.
  282. else:
  283. # Encode manually and write bytes to stdout.
  284. html = html.encode(encoding, "xmlcharrefreplace")
  285. try:
  286. # Write bytes directly to buffer (Python 3).
  287. sys.stdout.buffer.write(html)
  288. except AttributeError: # pragma: no cover
  289. # Probably Python 2, which works with bytes by default.
  290. sys.stdout.write(html)
  291. return self
"""
EXPORTED FUNCTIONS
=============================================================================

These are the two functions we really mean to export: `markdown()` and
`markdownFromFile()`.
"""
  298. def markdown(text, **kwargs):
  299. """Convert a markdown string to HTML and return HTML as a unicode string.
  300. This is a shortcut function for `Markdown` class to cover the most
  301. basic use case. It initializes an instance of Markdown, loads the
  302. necessary extensions and runs the parser on the given text.
  303. Keyword arguments:
  304. * text: Markdown formatted text as Unicode or ASCII string.
  305. * Any arguments accepted by the Markdown class.
  306. Returns: An HTML document as a string.
  307. """
  308. md = Markdown(**kwargs)
  309. return md.convert(text)
  310. def markdownFromFile(**kwargs):
  311. """Read markdown code from a file and write it to a file or a stream.
  312. This is a shortcut function which initializes an instance of Markdown,
  313. and calls the convertFile method rather than convert.
  314. Keyword arguments:
  315. * input: a file name or readable object.
  316. * output: a file name or writable object.
  317. * encoding: Encoding of input and output.
  318. * Any arguments accepted by the Markdown class.
  319. """
  320. md = Markdown(**kwargs)
  321. md.convertFile(kwargs.get('input', None),
  322. kwargs.get('output', None),
  323. kwargs.get('encoding', None))