preprocessors.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370
  1. """
  2. Python Markdown
  3. A Python implementation of John Gruber's Markdown.
  4. Documentation: https://python-markdown.github.io/
  5. GitHub: https://github.com/Python-Markdown/markdown/
  6. PyPI: https://pypi.org/project/Markdown/
  7. Started by Manfred Stienstra (http://www.dwerg.net/).
  8. Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
  9. Currently maintained by Waylan Limberg (https://github.com/waylan),
  10. Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
  11. Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
  12. Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
  13. Copyright 2004 Manfred Stienstra (the original version)
  14. License: BSD (see LICENSE.md for details).
  15. PRE-PROCESSORS
  16. =============================================================================
  17. Preprocessors work on source text before we start doing anything too
  18. complicated.
  19. """
  20. from . import util
  21. import re
  22. def build_preprocessors(md, **kwargs):
  23. """ Build the default set of preprocessors used by Markdown. """
  24. preprocessors = util.Registry()
  25. preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
  26. preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
  27. preprocessors.register(ReferencePreprocessor(md), 'reference', 10)
  28. return preprocessors
  29. class Preprocessor(util.Processor):
  30. """
  31. Preprocessors are run after the text is broken into lines.
  32. Each preprocessor implements a "run" method that takes a pointer to a
  33. list of lines of the document, modifies it as necessary and returns
  34. either the same pointer or a pointer to a new list.
  35. Preprocessors must extend markdown.Preprocessor.
  36. """
  37. def run(self, lines):
  38. """
  39. Each subclass of Preprocessor should override the `run` method, which
  40. takes the document as a list of strings split by newlines and returns
  41. the (possibly modified) list of lines.
  42. """
  43. pass # pragma: no cover
  44. class NormalizeWhitespace(Preprocessor):
  45. """ Normalize whitespace for consistent parsing. """
  46. def run(self, lines):
  47. source = '\n'.join(lines)
  48. source = source.replace(util.STX, "").replace(util.ETX, "")
  49. source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
  50. source = source.expandtabs(self.md.tab_length)
  51. source = re.sub(r'(?<=\n) +\n', '\n', source)
  52. return source.split('\n')
  53. class HtmlBlockPreprocessor(Preprocessor):
  54. """Remove html blocks from the text and store them for later retrieval."""
  55. right_tag_patterns = ["</%s>", "%s>"]
  56. attrs_pattern = r"""
  57. \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
  58. | # OR
  59. \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
  60. | # OR
  61. \s+(?P<attr2>[^>"'/= ]+) # attr
  62. """
  63. left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
  64. attrs_pattern
  65. attrs_re = re.compile(attrs_pattern, re.VERBOSE)
  66. left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
  67. markdown_in_raw = False
  68. def _get_left_tag(self, block):
  69. m = self.left_tag_re.match(block)
  70. if m:
  71. tag = m.group('tag')
  72. raw_attrs = m.group('attrs')
  73. attrs = {}
  74. if raw_attrs:
  75. for ma in self.attrs_re.finditer(raw_attrs):
  76. if ma.group('attr'):
  77. if ma.group('value'):
  78. attrs[ma.group('attr').strip()] = ma.group('value')
  79. else:
  80. attrs[ma.group('attr').strip()] = ""
  81. elif ma.group('attr1'):
  82. if ma.group('value1'):
  83. attrs[ma.group('attr1').strip()] = ma.group(
  84. 'value1'
  85. )
  86. else:
  87. attrs[ma.group('attr1').strip()] = ""
  88. elif ma.group('attr2'):
  89. attrs[ma.group('attr2').strip()] = ""
  90. return tag, len(m.group(0)), attrs
  91. else:
  92. tag = block[1:].split(">", 1)[0].lower()
  93. return tag, len(tag)+2, {}
  94. def _recursive_tagfind(self, ltag, rtag, start_index, block):
  95. while 1:
  96. i = block.find(rtag, start_index)
  97. if i == -1:
  98. return -1
  99. j = block.find(ltag, start_index)
  100. # if no ltag, or rtag found before another ltag, return index
  101. if (j > i or j == -1):
  102. return i + len(rtag)
  103. # another ltag found before rtag, use end of ltag as starting
  104. # point and search again
  105. j = block.find('>', j)
  106. start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
  107. if start_index == -1:
  108. # HTML potentially malformed- ltag has no corresponding
  109. # rtag
  110. return -1
  111. def _get_right_tag(self, left_tag, left_index, block):
  112. for p in self.right_tag_patterns:
  113. tag = p % left_tag
  114. i = self._recursive_tagfind(
  115. "<%s" % left_tag, tag, left_index, block
  116. )
  117. if i > 2:
  118. return tag.lstrip("<").rstrip(">"), i
  119. return block.rstrip()[-left_index:-1].lower(), len(block)
  120. def _equal_tags(self, left_tag, right_tag):
  121. if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
  122. return True
  123. if ("/" + left_tag) == right_tag:
  124. return True
  125. if (right_tag == "--" and left_tag == "--"):
  126. return True
  127. elif left_tag == right_tag[1:] and right_tag[0] == "/":
  128. return True
  129. else:
  130. return False
  131. def _is_oneliner(self, tag):
  132. return (tag in ['hr', 'hr/'])
  133. def _stringindex_to_listindex(self, stringindex, items):
  134. """
  135. Same effect as concatenating the strings in items,
  136. finding the character to which stringindex refers in that string,
  137. and returning the index of the item in which that character resides.
  138. """
  139. items.append('dummy')
  140. i, count = 0, 0
  141. while count <= stringindex:
  142. count += len(items[i])
  143. i += 1
  144. return i - 1
  145. def _nested_markdown_in_html(self, items):
  146. """Find and process html child elements of the given element block."""
  147. for i, item in enumerate(items):
  148. if self.left_tag_re.match(item):
  149. left_tag, left_index, attrs = \
  150. self._get_left_tag(''.join(items[i:]))
  151. right_tag, data_index = self._get_right_tag(
  152. left_tag, left_index, ''.join(items[i:]))
  153. right_listindex = \
  154. self._stringindex_to_listindex(data_index, items[i:]) + i
  155. if 'markdown' in attrs.keys():
  156. items[i] = items[i][left_index:] # remove opening tag
  157. placeholder = self.md.htmlStash.store_tag(
  158. left_tag, attrs, i + 1, right_listindex + 1)
  159. items.insert(i, placeholder)
  160. if len(items) - right_listindex <= 1: # last nest, no tail
  161. right_listindex -= 1
  162. items[right_listindex] = items[right_listindex][
  163. :-len(right_tag) - 2] # remove closing tag
  164. else: # raw html
  165. if len(items) - right_listindex <= 1: # last element
  166. right_listindex -= 1
  167. if right_listindex <= i:
  168. right_listindex = i + 1
  169. placeholder = self.md.htmlStash.store('\n\n'.join(
  170. items[i:right_listindex]))
  171. del items[i:right_listindex]
  172. items.insert(i, placeholder)
  173. return items
  174. def run(self, lines):
  175. text = "\n".join(lines)
  176. new_blocks = []
  177. text = text.rsplit("\n\n")
  178. items = []
  179. left_tag = ''
  180. right_tag = ''
  181. in_tag = False # flag
  182. while text:
  183. block = text[0]
  184. if block.startswith("\n"):
  185. block = block[1:]
  186. text = text[1:]
  187. if block.startswith("\n"):
  188. block = block[1:]
  189. if not in_tag:
  190. if block.startswith("<") and len(block.strip()) > 1:
  191. if block[1:4] == "!--":
  192. # is a comment block
  193. left_tag, left_index, attrs = "--", 2, {}
  194. else:
  195. left_tag, left_index, attrs = self._get_left_tag(block)
  196. right_tag, data_index = self._get_right_tag(left_tag,
  197. left_index,
  198. block)
  199. # keep checking conditions below and maybe just append
  200. if data_index < len(block) and (self.md.is_block_level(left_tag) or left_tag == '--'):
  201. text.insert(0, block[data_index:])
  202. block = block[:data_index]
  203. if not (self.md.is_block_level(left_tag) or block[1] in ["!", "?", "@", "%"]):
  204. new_blocks.append(block)
  205. continue
  206. if self._is_oneliner(left_tag):
  207. new_blocks.append(block.strip())
  208. continue
  209. if block.rstrip().endswith(">") \
  210. and self._equal_tags(left_tag, right_tag):
  211. if self.markdown_in_raw and 'markdown' in attrs.keys():
  212. block = block[left_index:-len(right_tag) - 2]
  213. new_blocks.append(self.md.htmlStash.
  214. store_tag(left_tag, attrs, 0, 2))
  215. new_blocks.extend([block])
  216. else:
  217. new_blocks.append(
  218. self.md.htmlStash.store(block.strip()))
  219. continue
  220. else:
  221. # if is block level tag and is not complete
  222. if (not self._equal_tags(left_tag, right_tag)) and \
  223. (self.md.is_block_level(left_tag) or left_tag == "--"):
  224. items.append(block.strip())
  225. in_tag = True
  226. else:
  227. new_blocks.append(
  228. self.md.htmlStash.store(block.strip())
  229. )
  230. continue
  231. else:
  232. new_blocks.append(block)
  233. else:
  234. items.append(block)
  235. # Need to evaluate all items so we can calculate relative to the left index.
  236. right_tag, data_index = self._get_right_tag(left_tag, left_index, ''.join(items))
  237. # Adjust data_index: relative to items -> relative to last block
  238. prev_block_length = 0
  239. for item in items[:-1]:
  240. prev_block_length += len(item)
  241. data_index -= prev_block_length
  242. if self._equal_tags(left_tag, right_tag):
  243. # if find closing tag
  244. if data_index < len(block):
  245. # we have more text after right_tag
  246. items[-1] = block[:data_index]
  247. text.insert(0, block[data_index:])
  248. in_tag = False
  249. if self.markdown_in_raw and 'markdown' in attrs.keys():
  250. items[0] = items[0][left_index:]
  251. items[-1] = items[-1][:-len(right_tag) - 2]
  252. if items[len(items) - 1]: # not a newline/empty string
  253. right_index = len(items) + 3
  254. else:
  255. right_index = len(items) + 2
  256. new_blocks.append(self.md.htmlStash.store_tag(
  257. left_tag, attrs, 0, right_index))
  258. placeholderslen = len(self.md.htmlStash.tag_data)
  259. new_blocks.extend(
  260. self._nested_markdown_in_html(items))
  261. nests = len(self.md.htmlStash.tag_data) - \
  262. placeholderslen
  263. self.md.htmlStash.tag_data[-1 - nests][
  264. 'right_index'] += nests - 2
  265. else:
  266. new_blocks.append(
  267. self.md.htmlStash.store('\n\n'.join(items)))
  268. items = []
  269. if items:
  270. if self.markdown_in_raw and 'markdown' in attrs.keys():
  271. items[0] = items[0][left_index:]
  272. items[-1] = items[-1][:-len(right_tag) - 2]
  273. if items[len(items) - 1]: # not a newline/empty string
  274. right_index = len(items) + 3
  275. else:
  276. right_index = len(items) + 2
  277. new_blocks.append(
  278. self.md.htmlStash.store_tag(
  279. left_tag, attrs, 0, right_index))
  280. placeholderslen = len(self.md.htmlStash.tag_data)
  281. new_blocks.extend(self._nested_markdown_in_html(items))
  282. nests = len(self.md.htmlStash.tag_data) - placeholderslen
  283. self.md.htmlStash.tag_data[-1 - nests][
  284. 'right_index'] += nests - 2
  285. else:
  286. new_blocks.append(
  287. self.md.htmlStash.store('\n\n'.join(items)))
  288. new_blocks.append('\n')
  289. new_text = "\n\n".join(new_blocks)
  290. return new_text.split("\n")
  291. class ReferencePreprocessor(Preprocessor):
  292. """ Remove reference definitions from text and store for later use. """
  293. TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
  294. RE = re.compile(
  295. r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
  296. )
  297. TITLE_RE = re.compile(r'^%s$' % TITLE)
  298. def run(self, lines):
  299. new_text = []
  300. while lines:
  301. line = lines.pop(0)
  302. m = self.RE.match(line)
  303. if m:
  304. id = m.group(1).strip().lower()
  305. link = m.group(2).lstrip('<').rstrip('>')
  306. t = m.group(5) or m.group(6) or m.group(7)
  307. if not t:
  308. # Check next line for title
  309. tm = self.TITLE_RE.match(lines[0])
  310. if tm:
  311. lines.pop(0)
  312. t = tm.group(2) or tm.group(3) or tm.group(4)
  313. self.md.references[id] = (link, t)
  314. # Preserve the line to prevent raw HTML indexing issue.
  315. # https://github.com/Python-Markdown/markdown/issues/584
  316. new_text.append('')
  317. else:
  318. new_text.append(line)
  319. return new_text # + "\n"