# Python Markdown # A Python implementation of John Gruber's Markdown. # Documentation: https://python-markdown.github.io/ # GitHub: https://github.com/Python-Markdown/markdown/ # PyPI: https://pypi.org/project/Markdown/ # Started by Manfred Stienstra (http://www.dwerg.net/). # Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org). # Currently maintained by Waylan Limberg (https://github.com/waylan), # Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser). # Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later) # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) # Copyright 2004 Manfred Stienstra (the original version) # License: BSD (see LICENSE.md for details). """ Tree processors manipulate the tree created by block processors. They can even create an entirely new `ElementTree` object. This is an excellent place for creating summaries, adding collected references, or last minute adjustments. """ from __future__ import annotations import re import xml.etree.ElementTree as etree from typing import TYPE_CHECKING, Any from . import util from . import inlinepatterns if TYPE_CHECKING: # pragma: no cover from markdown import Markdown def build_treeprocessors(md: Markdown, **kwargs: Any) -> util.Registry[Treeprocessor]: """ Build the default `treeprocessors` for Markdown. """ treeprocessors = util.Registry() treeprocessors.register(InlineProcessor(md), 'inline', 20) treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10) treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0) return treeprocessors def isString(s: object) -> bool: """ Return `True` if object is a string but not an [`AtomicString`][markdown.util.AtomicString]. """ if not isinstance(s, util.AtomicString): return isinstance(s, str) return False class Treeprocessor(util.Processor): """ `Treeprocessor`s are run on the `ElementTree` object before serialization. Each `Treeprocessor` implements a `run` method that takes a pointer to an `Element` and modifies it as necessary. `Treeprocessors` must extend `markdown.Treeprocessor`. """ def run(self, root: etree.Element) -> etree.Element | None: """ Subclasses of `Treeprocessor` should implement a `run` method, which takes a root `Element`. This method can return another `Element` object, and the existing root `Element` will be replaced, or it can modify the current tree and return `None`. """ pass # pragma: no cover class InlineProcessor(Treeprocessor): """ A `Treeprocessor` that traverses a tree, applying inline patterns. """ def __init__(self, md: Markdown): self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX self.__placeholder_suffix = util.ETX self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ + len(self.__placeholder_suffix) self.__placeholder_re = util.INLINE_PLACEHOLDER_RE self.md = md self.inlinePatterns = md.inlinePatterns self.ancestors: list[str] = [] def __makePlaceholder(self, type: str) -> tuple[str, str]: """ Generate a placeholder """ id = "%04d" % len(self.stashed_nodes) hash = util.INLINE_PLACEHOLDER % id return hash, id def __findPlaceholder(self, data: str, index: int) -> tuple[str | None, int]: """ Extract id from data string, start from index. Arguments: data: String. index: Index, from which we start search. Returns: Placeholder id and string index, after the found placeholder. """ m = self.__placeholder_re.search(data, index) if m: return m.group(1), m.end() else: return None, index + 1 def __stashNode(self, node: etree.Element | str, type: str) -> str: """ Add node to stash. """ placeholder, id = self.__makePlaceholder(type) self.stashed_nodes[id] = node return placeholder def __handleInline(self, data: str, patternIndex: int = 0) -> str: """ Process string with inline patterns and replace it with placeholders. Arguments: data: A line of Markdown text. patternIndex: The index of the `inlinePattern` to start with. Returns: String with placeholders. """ if not isinstance(data, util.AtomicString): startIndex = 0 count = len(self.inlinePatterns) while patternIndex < count: data, matched, startIndex = self.__applyPattern( self.inlinePatterns[patternIndex], data, patternIndex, startIndex ) if not matched: patternIndex += 1 return data def __processElementText(self, node: etree.Element, subnode: etree.Element, isText: bool = True) -> None: """ Process placeholders in `Element.text` or `Element.tail` of Elements popped from `self.stashed_nodes`. Arguments: node: Parent node. subnode: Processing node. isText: Boolean variable, True - it's text, False - it's a tail. """ if isText: text = subnode.text subnode.text = None else: text = subnode.tail subnode.tail = None childResult = self.__processPlaceholders(text, subnode, isText) if not isText and node is not subnode: pos = list(node).index(subnode) + 1 else: pos = 0 childResult.reverse() for newChild in childResult: node.insert(pos, newChild[0]) def __processPlaceholders( self, data: str | None, parent: etree.Element, isText: bool = True ) -> list[tuple[etree.Element, list[str]]]: """ Process string with placeholders and generate `ElementTree` tree. Arguments: data: String with placeholders instead of `ElementTree` elements. parent: Element, which contains processing inline data. isText: Boolean variable, True - it's text, False - it's a tail. Returns: List with `ElementTree` elements with applied inline patterns. """ def linkText(text: str | None) -> None: if text: if result: if result[-1][0].tail: result[-1][0].tail += text else: result[-1][0].tail = text elif not isText: if parent.tail: parent.tail += text else: parent.tail = text else: if parent.text: parent.text += text else: parent.text = text result = [] strartIndex = 0 while data: index = data.find(self.__placeholder_prefix, strartIndex) if index != -1: id, phEndIndex = self.__findPlaceholder(data, index) if id in self.stashed_nodes: node = self.stashed_nodes.get(id) if index > 0: text = data[strartIndex:index] linkText(text) if not isinstance(node, str): # it's Element for child in [node] + list(node): if child.tail: if child.tail.strip(): self.__processElementText( node, child, False ) if child.text: if child.text.strip(): self.__processElementText(child, child) else: # it's just a string linkText(node) strartIndex = phEndIndex continue strartIndex = phEndIndex result.append((node, self.ancestors[:])) else: # wrong placeholder end = index + len(self.__placeholder_prefix) linkText(data[strartIndex:end]) strartIndex = end else: text = data[strartIndex:] if isinstance(data, util.AtomicString): # We don't want to loose the `AtomicString` text = util.AtomicString(text) linkText(text) data = "" return result def __applyPattern( self, pattern: inlinepatterns.Pattern, data: str, patternIndex: int, startIndex: int = 0 ) -> tuple[str, bool, int]: """ Check if the line fits the pattern, create the necessary elements, add it to `stashed_nodes`. Arguments: data: The text to be processed. pattern: The pattern to be checked. patternIndex: Index of current pattern. startIndex: String index, from which we start searching. Returns: String with placeholders instead of `ElementTree` elements. """ new_style = isinstance(pattern, inlinepatterns.InlineProcessor) for exclude in pattern.ANCESTOR_EXCLUDES: if exclude.lower() in self.ancestors: return data, False, 0 if new_style: match = None # Since `handleMatch` may reject our first match, # we iterate over the buffer looking for matches # until we can't find any more. for match in pattern.getCompiledRegExp().finditer(data, startIndex): node, start, end = pattern.handleMatch(match, data) if start is None or end is None: startIndex += match.end(0) match = None continue break else: # pragma: no cover match = pattern.getCompiledRegExp().match(data[startIndex:]) leftData = data[:startIndex] if not match: return data, False, 0 if not new_style: # pragma: no cover node = pattern.handleMatch(match) start = match.start(0) end = match.end(0) if node is None: return data, True, end if not isinstance(node, str): if not isinstance(node.text, util.AtomicString): # We need to process current node too for child in [node] + list(node): if not isString(node): if child.text: self.ancestors.append(child.tag.lower()) child.text = self.__handleInline( child.text, patternIndex + 1 ) self.ancestors.pop() if child.tail: child.tail = self.__handleInline( child.tail, patternIndex ) placeholder = self.__stashNode(node, pattern.type()) if new_style: return "{}{}{}".format(data[:start], placeholder, data[end:]), True, 0 else: # pragma: no cover return "{}{}{}{}".format(leftData, match.group(1), placeholder, match.groups()[-1]), True, 0 def __build_ancestors(self, parent: etree.Element | None, parents: list[str]) -> None: """Build the ancestor list.""" ancestors = [] while parent is not None: if parent is not None: ancestors.append(parent.tag.lower()) parent = self.parent_map.get(parent) ancestors.reverse() parents.extend(ancestors) def run(self, tree: etree.Element, ancestors: list[str] | None = None) -> etree.Element: """Apply inline patterns to a parsed Markdown tree. Iterate over `Element`, find elements with inline tag, apply inline patterns and append newly created Elements to tree. To avoid further processing of string with inline patterns, instead of normal string, use subclass [`AtomicString`][markdown.util.AtomicString]: node.text = markdown.util.AtomicString("This will not be processed.") Arguments: tree: `Element` object, representing Markdown tree. ancestors: List of parent tag names that precede the tree node (if needed). Returns: An element tree object with applied inline patterns. """ self.stashed_nodes: dict[str, etree.Element | str] = {} # Ensure a valid parent list, but copy passed in lists # to ensure we don't have the user accidentally change it on us. tree_parents = [] if ancestors is None else ancestors[:] self.parent_map = {c: p for p in tree.iter() for c in p} stack = [(tree, tree_parents)] while stack: currElement, parents = stack.pop() self.ancestors = parents self.__build_ancestors(currElement, self.ancestors) insertQueue = [] for child in currElement: if child.text and not isinstance( child.text, util.AtomicString ): self.ancestors.append(child.tag.lower()) text = child.text child.text = None lst = self.__processPlaceholders( self.__handleInline(text), child ) for item in lst: self.parent_map[item[0]] = child stack += lst insertQueue.append((child, lst)) self.ancestors.pop() if child.tail: tail = self.__handleInline(child.tail) dumby = etree.Element('d') child.tail = None tailResult = self.__processPlaceholders(tail, dumby, False) if dumby.tail: child.tail = dumby.tail pos = list(currElement).index(child) + 1 tailResult.reverse() for newChild in tailResult: self.parent_map[newChild[0]] = currElement currElement.insert(pos, newChild[0]) if len(child): self.parent_map[child] = currElement stack.append((child, self.ancestors[:])) for element, lst in insertQueue: for i, obj in enumerate(lst): newChild = obj[0] element.insert(i, newChild) return tree class PrettifyTreeprocessor(Treeprocessor): """ Add line breaks to the html document. """ def _prettifyETree(self, elem: etree.Element) -> None: """ Recursively add line breaks to `ElementTree` children. """ i = "\n" if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']: if (not elem.text or not elem.text.strip()) \ and len(elem) and self.md.is_block_level(elem[0].tag): elem.text = i for e in elem: if self.md.is_block_level(e.tag): self._prettifyETree(e) if not elem.tail or not elem.tail.strip(): elem.tail = i def run(self, root: etree.Element) -> None: """ Add line breaks to `Element` object and its children. """ self._prettifyETree(root) # Do `
`'s separately as they are often in the middle of # inline content and missed by `_prettifyETree`. brs = root.iter('br') for br in brs: if not br.tail or not br.tail.strip(): br.tail = '\n' else: br.tail = '\n%s' % br.tail # Clean up extra empty lines at end of code blocks. pres = root.iter('pre') for pre in pres: if len(pre) and pre[0].tag == 'code': code = pre[0] # Only prettify code containing text only if not len(code) and code.text is not None: code.text = util.AtomicString(code.text.rstrip() + '\n') class UnescapeTreeprocessor(Treeprocessor): """ Restore escaped chars """ RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX)) def _unescape(self, m: re.Match[str]) -> str: return chr(int(m.group(1))) def unescape(self, text: str) -> str: return self.RE.sub(self._unescape, text) def run(self, root: etree.Element) -> None: """ Loop over all elements and unescape all text. """ for elem in root.iter(): # Unescape text content if elem.text and not elem.tag == 'code': elem.text = self.unescape(elem.text) # Unescape tail content if elem.tail: elem.tail = self.unescape(elem.tail) # Unescape attribute values for key, value in elem.items(): elem.set(key, self.unescape(value))