# Table of Contents Extension for Python-Markdown # =============================================== # See https://Python-Markdown.github.io/extensions/toc # for documentation. # Original code Copyright 2008 [Jack Miller](https://codezen.org/) # All changes Copyright 2008-2024 The Python Markdown Project # License: [BSD](https://opensource.org/licenses/bsd-license.php) """ Add table of contents support to Python-Markdown. See the [documentation](https://Python-Markdown.github.io/extensions/toc) for details. """ from __future__ import annotations from . import Extension from ..treeprocessors import Treeprocessor from ..util import parseBoolValue, AMP_SUBSTITUTE, deprecated, HTML_PLACEHOLDER_RE, AtomicString from ..treeprocessors import UnescapeTreeprocessor from ..serializers import RE_AMP import re import html import unicodedata from copy import deepcopy import xml.etree.ElementTree as etree from typing import TYPE_CHECKING, Any, Iterator, MutableSet if TYPE_CHECKING: # pragma: no cover from markdown import Markdown def slugify(value: str, separator: str, unicode: bool = False) -> str: """ Slugify a string, to make it URL friendly. """ if not unicode: # Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty` value = unicodedata.normalize('NFKD', value) value = value.encode('ascii', 'ignore').decode('ascii') value = re.sub(r'[^\w\s-]', '', value).strip().lower() return re.sub(r'[{}\s]+'.format(separator), separator, value) def slugify_unicode(value: str, separator: str) -> str: """ Slugify a string, to make it URL friendly while preserving Unicode characters. """ return slugify(value, separator, unicode=True) IDCOUNT_RE = re.compile(r'^(.*)_([0-9]+)$') def unique(id: str, ids: MutableSet[str]) -> str: """ Ensure id is unique in set of ids. Append '_1', '_2'... if not """ while id in ids or not id: m = IDCOUNT_RE.match(id) if m: id = '%s_%d' % (m.group(1), int(m.group(2))+1) else: id = '%s_%d' % (id, 1) ids.add(id) return id @deprecated('Use `render_inner_html` and `striptags` instead.') def get_name(el: etree.Element) -> str: """Get title name.""" text = [] for c in el.itertext(): if isinstance(c, AtomicString): text.append(html.unescape(c)) else: text.append(c) return ''.join(text).strip() @deprecated('Use `run_postprocessors`, `render_inner_html` and/or `striptags` instead.') def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str: """ Extract raw HTML from stash, reduce to plain text and swap with placeholder. """ def _html_sub(m: re.Match[str]) -> str: """ Substitute raw html with plain text. """ try: raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))] except (IndexError, TypeError): # pragma: no cover return m.group(0) # Strip out tags and/or entities - leaving text res = re.sub(r'(<[^>]+>)', '', raw) if strip_entities: res = re.sub(r'(&[\#a-zA-Z0-9]+;)', '', res) return res return HTML_PLACEHOLDER_RE.sub(_html_sub, text) def unescape(text: str) -> str: """ Unescape Markdown backslash escaped text. """ c = UnescapeTreeprocessor() return c.unescape(text) def strip_tags(text: str) -> str: """ Strip HTML tags and return plain text. Note: HTML entities are unaffected. """ # A comment could contain a tag, so strip comments first while (start := text.find('', start)) != -1: text = f'{text[:start]}{text[end + 3:]}' while (start := text.find('<')) != -1 and (end := text.find('>', start)) != -1: text = f'{text[:start]}{text[end + 1:]}' # Collapse whitespace text = ' '.join(text.split()) return text def escape_cdata(text: str) -> str: """ Escape character data. """ if "&" in text: # Only replace & when not part of an entity text = RE_AMP.sub('&', text) if "<" in text: text = text.replace("<", "<") if ">" in text: text = text.replace(">", ">") return text def run_postprocessors(text: str, md: Markdown) -> str: """ Run postprocessors from Markdown instance on text. """ for pp in md.postprocessors: text = pp.run(text) return text.strip() def render_inner_html(el: etree.Element, md: Markdown) -> str: """ Fully render inner html of an `etree` element as a string. """ # The `UnescapeTreeprocessor` runs after `toc` extension so run here. text = unescape(md.serializer(el)) # strip parent tag start = text.index('>') + 1 end = text.rindex('<') text = text[start:end].strip() return run_postprocessors(text, md) def remove_fnrefs(root: etree.Element) -> etree.Element: """ Remove footnote references from a copy of the element, if any are present. """ # Remove footnote references, which look like this: `...`. # If there are no `sup` elements, then nothing to do. if next(root.iter('sup'), None) is None: return root root = deepcopy(root) # Find parent elements that contain `sup` elements. for parent in root.findall('.//sup/..'): carry_text = "" for child in reversed(parent): # Reversed for the ability to mutate during iteration. # Remove matching footnote references but carry any `tail` text to preceding elements. if child.tag == 'sup' and child.get('id', '').startswith('fnref'): carry_text = f'{child.tail or ""}{carry_text}' parent.remove(child) elif carry_text: child.tail = f'{child.tail or ""}{carry_text}' carry_text = "" if carry_text: parent.text = f'{parent.text or ""}{carry_text}' return root def nest_toc_tokens(toc_list): """Given an unsorted list with errors and skips, return a nested one. [{'level': 1}, {'level': 2}] => [{'level': 1, 'children': [{'level': 2, 'children': []}]}] A wrong list is also converted: [{'level': 2}, {'level': 1}] => [{'level': 2, 'children': []}, {'level': 1, 'children': []}] """ ordered_list = [] if len(toc_list): # Initialize everything by processing the first entry last = toc_list.pop(0) last['children'] = [] levels = [last['level']] ordered_list.append(last) parents = [] # Walk the rest nesting the entries properly while toc_list: t = toc_list.pop(0) current_level = t['level'] t['children'] = [] # Reduce depth if current level < last item's level if current_level < levels[-1]: # Pop last level since we know we are less than it levels.pop() # Pop parents and levels we are less than or equal to to_pop = 0 for p in reversed(parents): if current_level <= p['level']: to_pop += 1 else: # pragma: no cover break if to_pop: levels = levels[:-to_pop] parents = parents[:-to_pop] # Note current level as last levels.append(current_level) # Level is the same, so append to # the current parent (if available) if current_level == levels[-1]: (parents[-1]['children'] if parents else ordered_list).append(t) # Current level is > last item's level, # So make last item a parent and append current as child else: last['children'].append(t) parents.append(last) levels.append(current_level) last = t return ordered_list class TocTreeprocessor(Treeprocessor): """ Step through document and build TOC. """ def __init__(self, md: Markdown, config: dict[str, Any]): super().__init__(md) self.marker: str = config["marker"] self.title: str = config["title"] self.base_level = int(config["baselevel"]) - 1 self.slugify = config["slugify"] self.sep = config["separator"] self.toc_class = config["toc_class"] self.title_class: str = config["title_class"] self.use_anchors: bool = parseBoolValue(config["anchorlink"]) self.anchorlink_class: str = config["anchorlink_class"] self.use_permalinks = parseBoolValue(config["permalink"], False) if self.use_permalinks is None: self.use_permalinks = config["permalink"] self.permalink_class: str = config["permalink_class"] self.permalink_title: str = config["permalink_title"] self.permalink_leading: bool | None = parseBoolValue(config["permalink_leading"], False) self.header_rgx = re.compile("[Hh][123456]") if isinstance(config["toc_depth"], str) and '-' in config["toc_depth"]: self.toc_top, self.toc_bottom = [int(x) for x in config["toc_depth"].split('-')] else: self.toc_top = 1 self.toc_bottom = int(config["toc_depth"]) def iterparent(self, node: etree.Element) -> Iterator[tuple[etree.Element, etree.Element]]: """ Iterator wrapper to get allowed parent and child all at once. """ # We do not allow the marker inside a header as that # would causes an endless loop of placing a new TOC # inside previously generated TOC. for child in node: if not self.header_rgx.match(child.tag) and child.tag not in ['pre', 'code']: yield node, child yield from self.iterparent(child) def replace_marker(self, root: etree.Element, elem: etree.Element) -> None: """ Replace marker with elem. """ for (p, c) in self.iterparent(root): text = ''.join(c.itertext()).strip() if not text: continue # To keep the output from screwing up the # validation by putting a `
` inside of a `

` # we actually replace the `

` in its entirety. # The `

` element may contain more than a single text content # (`nl2br` can introduce a `
`). In this situation, `c.text` returns # the very first content, ignore children contents or tail content. # `len(c) == 0` is here to ensure there is only text in the `

`. if c.text and c.text.strip() == self.marker and len(c) == 0: for i in range(len(p)): if p[i] == c: p[i] = elem break def set_level(self, elem: etree.Element) -> None: """ Adjust header level according to base level. """ level = int(elem.tag[-1]) + self.base_level if level > 6: level = 6 elem.tag = 'h%d' % level def add_anchor(self, c: etree.Element, elem_id: str) -> None: anchor = etree.Element("a") anchor.text = c.text anchor.attrib["href"] = "#" + elem_id anchor.attrib["class"] = self.anchorlink_class c.text = "" for elem in c: anchor.append(elem) while len(c): c.remove(c[0]) c.append(anchor) def add_permalink(self, c: etree.Element, elem_id: str) -> None: permalink = etree.Element("a") permalink.text = ("%spara;" % AMP_SUBSTITUTE if self.use_permalinks is True else self.use_permalinks) permalink.attrib["href"] = "#" + elem_id permalink.attrib["class"] = self.permalink_class if self.permalink_title: permalink.attrib["title"] = self.permalink_title if self.permalink_leading: permalink.tail = c.text c.text = "" c.insert(0, permalink) else: c.append(permalink) def build_toc_div(self, toc_list: list) -> etree.Element: """ Return a string div given a toc list. """ div = etree.Element("div") div.attrib["class"] = self.toc_class # Add title to the div if self.title: header = etree.SubElement(div, "span") if self.title_class: header.attrib["class"] = self.title_class header.text = self.title def build_etree_ul(toc_list: list, parent: etree.Element) -> etree.Element: ul = etree.SubElement(parent, "ul") for item in toc_list: # List item link, to be inserted into the toc div li = etree.SubElement(ul, "li") link = etree.SubElement(li, "a") link.text = item.get('name', '') link.attrib["href"] = '#' + item.get('id', '') if item['children']: build_etree_ul(item['children'], li) return ul build_etree_ul(toc_list, div) if 'prettify' in self.md.treeprocessors: self.md.treeprocessors['prettify'].run(div) return div def run(self, doc: etree.Element) -> None: # Get a list of id attributes used_ids = set() for el in doc.iter(): if "id" in el.attrib: used_ids.add(el.attrib["id"]) toc_tokens = [] for el in doc.iter(): if isinstance(el.tag, str) and self.header_rgx.match(el.tag): self.set_level(el) innerhtml = render_inner_html(remove_fnrefs(el), self.md) name = strip_tags(innerhtml) # Do not override pre-existing ids if "id" not in el.attrib: el.attrib["id"] = unique(self.slugify(html.unescape(name), self.sep), used_ids) data_toc_label = '' if 'data-toc-label' in el.attrib: data_toc_label = run_postprocessors(unescape(el.attrib['data-toc-label']), self.md) # Overwrite name with sanitized value of `data-toc-label`. name = escape_cdata(strip_tags(data_toc_label)) # Remove the data-toc-label attribute as it is no longer needed del el.attrib['data-toc-label'] if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom: toc_tokens.append({ 'level': int(el.tag[-1]), 'id': el.attrib["id"], 'name': name, 'html': innerhtml, 'data-toc-label': data_toc_label }) if self.use_anchors: self.add_anchor(el, el.attrib["id"]) if self.use_permalinks not in [False, None]: self.add_permalink(el, el.attrib["id"]) toc_tokens = nest_toc_tokens(toc_tokens) div = self.build_toc_div(toc_tokens) if self.marker: self.replace_marker(doc, div) # serialize and attach to markdown instance. toc = self.md.serializer(div) for pp in self.md.postprocessors: toc = pp.run(toc) self.md.toc_tokens = toc_tokens self.md.toc = toc class TocExtension(Extension): TreeProcessorClass = TocTreeprocessor def __init__(self, **kwargs): self.config = { 'marker': [ '[TOC]', 'Text to find and replace with Table of Contents. Set to an empty string to disable. ' 'Default: `[TOC]`.' ], 'title': [ '', 'Title to insert into TOC `

`. Default: an empty string.' ], 'title_class': [ 'toctitle', 'CSS class used for the title. Default: `toctitle`.' ], 'toc_class': [ 'toc', 'CSS class(es) used for the link. Default: `toclink`.' ], 'anchorlink': [ False, 'True if header should be a self link. Default: `False`.' ], 'anchorlink_class': [ 'toclink', 'CSS class(es) used for the link. Defaults: `toclink`.' ], 'permalink': [ 0, 'True or link text if a Sphinx-style permalink should be added. Default: `False`.' ], 'permalink_class': [ 'headerlink', 'CSS class(es) used for the link. Default: `headerlink`.' ], 'permalink_title': [ 'Permanent link', 'Title attribute of the permalink. Default: `Permanent link`.' ], 'permalink_leading': [ False, 'True if permalinks should be placed at start of the header, rather than end. Default: False.' ], 'baselevel': ['1', 'Base level for headers. Default: `1`.'], 'slugify': [ slugify, 'Function to generate anchors based on header text. Default: `slugify`.' ], 'separator': ['-', 'Word separator. Default: `-`.'], 'toc_depth': [ 6, 'Define the range of section levels to include in the Table of Contents. A single integer ' '(b) defines the bottom section level (

..) only. A string consisting of two digits ' 'separated by a hyphen in between (`2-5`) defines the top (t) and the bottom (b) (..). ' 'Default: `6` (bottom).' ], } """ Default configuration options. """ super().__init__(**kwargs) def extendMarkdown(self, md): """ Add TOC tree processor to Markdown. """ md.registerExtension(self) self.md = md self.reset() tocext = self.TreeProcessorClass(md, self.getConfigs()) md.treeprocessors.register(tocext, 'toc', 5) def reset(self) -> None: self.md.toc = '' self.md.toc_tokens = [] def makeExtension(**kwargs): # pragma: no cover return TocExtension(**kwargs)