| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370 |
- """
- Python Markdown
- A Python implementation of John Gruber's Markdown.
- Documentation: https://python-markdown.github.io/
- GitHub: https://github.com/Python-Markdown/markdown/
- PyPI: https://pypi.org/project/Markdown/
- Started by Manfred Stienstra (http://www.dwerg.net/).
- Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
- Currently maintained by Waylan Limberg (https://github.com/waylan),
- Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
- Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
- Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
- Copyright 2004 Manfred Stienstra (the original version)
- License: BSD (see LICENSE.md for details).
- PRE-PROCESSORS
- =============================================================================
- Preprocessors work on source text before we start doing anything too
- complicated.
- """
- from . import util
- import re
- def build_preprocessors(md, **kwargs):
- """ Build the default set of preprocessors used by Markdown. """
- preprocessors = util.Registry()
- preprocessors.register(NormalizeWhitespace(md), 'normalize_whitespace', 30)
- preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
- preprocessors.register(ReferencePreprocessor(md), 'reference', 10)
- return preprocessors
- class Preprocessor(util.Processor):
- """
- Preprocessors are run after the text is broken into lines.
- Each preprocessor implements a "run" method that takes a pointer to a
- list of lines of the document, modifies it as necessary and returns
- either the same pointer or a pointer to a new list.
- Preprocessors must extend markdown.Preprocessor.
- """
- def run(self, lines):
- """
- Each subclass of Preprocessor should override the `run` method, which
- takes the document as a list of strings split by newlines and returns
- the (possibly modified) list of lines.
- """
- pass # pragma: no cover
- class NormalizeWhitespace(Preprocessor):
- """ Normalize whitespace for consistent parsing. """
- def run(self, lines):
- source = '\n'.join(lines)
- source = source.replace(util.STX, "").replace(util.ETX, "")
- source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
- source = source.expandtabs(self.md.tab_length)
- source = re.sub(r'(?<=\n) +\n', '\n', source)
- return source.split('\n')
- class HtmlBlockPreprocessor(Preprocessor):
- """Remove html blocks from the text and store them for later retrieval."""
- right_tag_patterns = ["</%s>", "%s>"]
- attrs_pattern = r"""
- \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value"
- | # OR
- \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+) # attr=value
- | # OR
- \s+(?P<attr2>[^>"'/= ]+) # attr
- """
- left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % \
- attrs_pattern
- attrs_re = re.compile(attrs_pattern, re.VERBOSE)
- left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
- markdown_in_raw = False
- def _get_left_tag(self, block):
- m = self.left_tag_re.match(block)
- if m:
- tag = m.group('tag')
- raw_attrs = m.group('attrs')
- attrs = {}
- if raw_attrs:
- for ma in self.attrs_re.finditer(raw_attrs):
- if ma.group('attr'):
- if ma.group('value'):
- attrs[ma.group('attr').strip()] = ma.group('value')
- else:
- attrs[ma.group('attr').strip()] = ""
- elif ma.group('attr1'):
- if ma.group('value1'):
- attrs[ma.group('attr1').strip()] = ma.group(
- 'value1'
- )
- else:
- attrs[ma.group('attr1').strip()] = ""
- elif ma.group('attr2'):
- attrs[ma.group('attr2').strip()] = ""
- return tag, len(m.group(0)), attrs
- else:
- tag = block[1:].split(">", 1)[0].lower()
- return tag, len(tag)+2, {}
- def _recursive_tagfind(self, ltag, rtag, start_index, block):
- while 1:
- i = block.find(rtag, start_index)
- if i == -1:
- return -1
- j = block.find(ltag, start_index)
- # if no ltag, or rtag found before another ltag, return index
- if (j > i or j == -1):
- return i + len(rtag)
- # another ltag found before rtag, use end of ltag as starting
- # point and search again
- j = block.find('>', j)
- start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
- if start_index == -1:
- # HTML potentially malformed- ltag has no corresponding
- # rtag
- return -1
- def _get_right_tag(self, left_tag, left_index, block):
- for p in self.right_tag_patterns:
- tag = p % left_tag
- i = self._recursive_tagfind(
- "<%s" % left_tag, tag, left_index, block
- )
- if i > 2:
- return tag.lstrip("<").rstrip(">"), i
- return block.rstrip()[-left_index:-1].lower(), len(block)
- def _equal_tags(self, left_tag, right_tag):
- if left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
- return True
- if ("/" + left_tag) == right_tag:
- return True
- if (right_tag == "--" and left_tag == "--"):
- return True
- elif left_tag == right_tag[1:] and right_tag[0] == "/":
- return True
- else:
- return False
- def _is_oneliner(self, tag):
- return (tag in ['hr', 'hr/'])
- def _stringindex_to_listindex(self, stringindex, items):
- """
- Same effect as concatenating the strings in items,
- finding the character to which stringindex refers in that string,
- and returning the index of the item in which that character resides.
- """
- items.append('dummy')
- i, count = 0, 0
- while count <= stringindex:
- count += len(items[i])
- i += 1
- return i - 1
- def _nested_markdown_in_html(self, items):
- """Find and process html child elements of the given element block."""
- for i, item in enumerate(items):
- if self.left_tag_re.match(item):
- left_tag, left_index, attrs = \
- self._get_left_tag(''.join(items[i:]))
- right_tag, data_index = self._get_right_tag(
- left_tag, left_index, ''.join(items[i:]))
- right_listindex = \
- self._stringindex_to_listindex(data_index, items[i:]) + i
- if 'markdown' in attrs.keys():
- items[i] = items[i][left_index:] # remove opening tag
- placeholder = self.md.htmlStash.store_tag(
- left_tag, attrs, i + 1, right_listindex + 1)
- items.insert(i, placeholder)
- if len(items) - right_listindex <= 1: # last nest, no tail
- right_listindex -= 1
- items[right_listindex] = items[right_listindex][
- :-len(right_tag) - 2] # remove closing tag
- else: # raw html
- if len(items) - right_listindex <= 1: # last element
- right_listindex -= 1
- if right_listindex <= i:
- right_listindex = i + 1
- placeholder = self.md.htmlStash.store('\n\n'.join(
- items[i:right_listindex]))
- del items[i:right_listindex]
- items.insert(i, placeholder)
- return items
- def run(self, lines):
- text = "\n".join(lines)
- new_blocks = []
- text = text.rsplit("\n\n")
- items = []
- left_tag = ''
- right_tag = ''
- in_tag = False # flag
- while text:
- block = text[0]
- if block.startswith("\n"):
- block = block[1:]
- text = text[1:]
- if block.startswith("\n"):
- block = block[1:]
- if not in_tag:
- if block.startswith("<") and len(block.strip()) > 1:
- if block[1:4] == "!--":
- # is a comment block
- left_tag, left_index, attrs = "--", 2, {}
- else:
- left_tag, left_index, attrs = self._get_left_tag(block)
- right_tag, data_index = self._get_right_tag(left_tag,
- left_index,
- block)
- # keep checking conditions below and maybe just append
- if data_index < len(block) and (self.md.is_block_level(left_tag) or left_tag == '--'):
- text.insert(0, block[data_index:])
- block = block[:data_index]
- if not (self.md.is_block_level(left_tag) or block[1] in ["!", "?", "@", "%"]):
- new_blocks.append(block)
- continue
- if self._is_oneliner(left_tag):
- new_blocks.append(block.strip())
- continue
- if block.rstrip().endswith(">") \
- and self._equal_tags(left_tag, right_tag):
- if self.markdown_in_raw and 'markdown' in attrs.keys():
- block = block[left_index:-len(right_tag) - 2]
- new_blocks.append(self.md.htmlStash.
- store_tag(left_tag, attrs, 0, 2))
- new_blocks.extend([block])
- else:
- new_blocks.append(
- self.md.htmlStash.store(block.strip()))
- continue
- else:
- # if is block level tag and is not complete
- if (not self._equal_tags(left_tag, right_tag)) and \
- (self.md.is_block_level(left_tag) or left_tag == "--"):
- items.append(block.strip())
- in_tag = True
- else:
- new_blocks.append(
- self.md.htmlStash.store(block.strip())
- )
- continue
- else:
- new_blocks.append(block)
- else:
- items.append(block)
- # Need to evaluate all items so we can calculate relative to the left index.
- right_tag, data_index = self._get_right_tag(left_tag, left_index, ''.join(items))
- # Adjust data_index: relative to items -> relative to last block
- prev_block_length = 0
- for item in items[:-1]:
- prev_block_length += len(item)
- data_index -= prev_block_length
- if self._equal_tags(left_tag, right_tag):
- # if find closing tag
- if data_index < len(block):
- # we have more text after right_tag
- items[-1] = block[:data_index]
- text.insert(0, block[data_index:])
- in_tag = False
- if self.markdown_in_raw and 'markdown' in attrs.keys():
- items[0] = items[0][left_index:]
- items[-1] = items[-1][:-len(right_tag) - 2]
- if items[len(items) - 1]: # not a newline/empty string
- right_index = len(items) + 3
- else:
- right_index = len(items) + 2
- new_blocks.append(self.md.htmlStash.store_tag(
- left_tag, attrs, 0, right_index))
- placeholderslen = len(self.md.htmlStash.tag_data)
- new_blocks.extend(
- self._nested_markdown_in_html(items))
- nests = len(self.md.htmlStash.tag_data) - \
- placeholderslen
- self.md.htmlStash.tag_data[-1 - nests][
- 'right_index'] += nests - 2
- else:
- new_blocks.append(
- self.md.htmlStash.store('\n\n'.join(items)))
- items = []
- if items:
- if self.markdown_in_raw and 'markdown' in attrs.keys():
- items[0] = items[0][left_index:]
- items[-1] = items[-1][:-len(right_tag) - 2]
- if items[len(items) - 1]: # not a newline/empty string
- right_index = len(items) + 3
- else:
- right_index = len(items) + 2
- new_blocks.append(
- self.md.htmlStash.store_tag(
- left_tag, attrs, 0, right_index))
- placeholderslen = len(self.md.htmlStash.tag_data)
- new_blocks.extend(self._nested_markdown_in_html(items))
- nests = len(self.md.htmlStash.tag_data) - placeholderslen
- self.md.htmlStash.tag_data[-1 - nests][
- 'right_index'] += nests - 2
- else:
- new_blocks.append(
- self.md.htmlStash.store('\n\n'.join(items)))
- new_blocks.append('\n')
- new_text = "\n\n".join(new_blocks)
- return new_text.split("\n")
- class ReferencePreprocessor(Preprocessor):
- """ Remove reference definitions from text and store for later use. """
- TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
- RE = re.compile(
- r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL
- )
- TITLE_RE = re.compile(r'^%s$' % TITLE)
- def run(self, lines):
- new_text = []
- while lines:
- line = lines.pop(0)
- m = self.RE.match(line)
- if m:
- id = m.group(1).strip().lower()
- link = m.group(2).lstrip('<').rstrip('>')
- t = m.group(5) or m.group(6) or m.group(7)
- if not t:
- # Check next line for title
- tm = self.TITLE_RE.match(lines[0])
- if tm:
- lines.pop(0)
- t = tm.group(2) or tm.group(3) or tm.group(4)
- self.md.references[id] = (link, t)
- # Preserve the line to prevent raw HTML indexing issue.
- # https://github.com/Python-Markdown/markdown/issues/584
- new_text.append('')
- else:
- new_text.append(line)
- return new_text # + "\n"
|